In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import plotly.express as px
import plotly.graph_objects as go


In [None]:
# Load the movie/show catalogue into a DataFrame and preview the first rows.
# NOTE(review): '/content/...' looks like a Colab-style absolute path — adjust
# it (or move it to a config constant) when running elsewhere.
data = pd.read_csv('/content/n_movies.csv')
data.head()

Unnamed: 0,title,year,certificate,duration,genre,rating,description,stars,votes
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,Decades after their 1984 All Valley Karate Tou...,"['Ralph Macchio, ', 'William Zabka, ', 'Courtn...",177031
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"['Claire Foy, ', 'Olivia Colman, ', 'Imelda St...",199885
2,Better Call Saul,(2015–2022),TV-MA,46 min,"Crime, Drama",8.9,The trials and tribulations of criminal lawyer...,"['Bob Odenkirk, ', 'Rhea Seehorn, ', 'Jonathan...",501384
3,Devil in Ohio,(2022),TV-MA,356 min,"Drama, Horror, Mystery",5.9,When a psychiatrist shelters a mysterious cult...,"['Emily Deschanel, ', 'Sam Jaeger, ', 'Gerardo...",9773
4,Cyberpunk: Edgerunners,(2022– ),TV-MA,24 min,"Animation, Action, Adventure",8.6,A Street Kid trying to survive in a technology...,"['Zach Aguilar, ', 'Kenichiro Ohashi, ', 'Emi ...",15413


In [None]:
# Summarise dtypes and non-null counts to see which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9957 entries, 0 to 9956
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        9957 non-null   object 
 1   year         9430 non-null   object 
 2   certificate  6504 non-null   object 
 3   duration     7921 non-null   object 
 4   genre        9884 non-null   object 
 5   rating       8784 non-null   float64
 6   description  9957 non-null   object 
 7   stars        9957 non-null   object 
 8   votes        8784 non-null   object 
dtypes: float64(1), object(8)
memory usage: 700.2+ KB


In [None]:
# List the column names of the loaded frame.
data.columns

Index(['title', 'year', 'certificate', 'duration', 'genre', 'rating',
       'description', 'stars', 'votes'],
      dtype='object')

In [None]:
# Count missing values per column.
data.isnull().sum()

Unnamed: 0,0
title,0
year,527
certificate,3453
duration,2036
genre,73
rating,1173
description,0
stars,0
votes,1173


In [None]:
# Fill missing values in categorical columns with 'Unknown'.
# The filled columns are assigned back to `data` rather than calling
# `data[col].fillna(..., inplace=True)`: that form mutates a temporary Series,
# triggers a FutureWarning, and has no effect on `data` from pandas 3.0 on.
categorical_cols = ['certificate', 'duration', 'genre']
data[categorical_cols] = data[categorical_cols].fillna('Unknown')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['certificate'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['duration'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting

In [None]:
# Fill missing values in numerical columns with the mean.
# Assign the result back instead of using a chained `inplace=True` call, which
# operates on a temporary Series and stops updating `data` in pandas 3.0.
data['rating'] = data['rating'].fillna(data['rating'].mean())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['rating'].fillna(data['rating'].mean(), inplace=True)


In [None]:
# For the 'votes' column, convert to numeric first, then fill missing with mean.
# The column is loaded as text (object dtype); any ',' thousands separators are
# stripped before the float conversion. The dtype guard keeps this cell
# re-runnable: once 'votes' is already numeric, the `.str` accessor would raise.
votes = data['votes']
if not pd.api.types.is_numeric_dtype(votes):
    votes = votes.str.replace(',', '', regex=False)
data['votes'] = votes.astype(float)

# Assign the filled column back rather than using a chained `inplace=True`
# call, which stops updating `data` in pandas 3.0.
data['votes'] = data['votes'].fillna(data['votes'].mean())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['votes'].fillna(data['votes'].mean(), inplace=True)


In [None]:
# Re-check dtypes and non-null counts after the fill steps. 'year' is not
# filled by any of the preceding cells, so it still reports missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9957 entries, 0 to 9956
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        9957 non-null   object 
 1   year         9430 non-null   object 
 2   certificate  9957 non-null   object 
 3   duration     9957 non-null   object 
 4   genre        9957 non-null   object 
 5   rating       9957 non-null   float64
 6   description  9957 non-null   object 
 7   stars        9957 non-null   object 
 8   votes        9957 non-null   float64
dtypes: float64(2), object(7)
memory usage: 700.2+ KB


In [None]:
# Combine relevant text columns into a single string
data['features'] = data['genre'] + ' ' + data['description'] + ' ' + data['stars']

display(data[['title', 'features']].head())

Unnamed: 0,title,features
0,Cobra Kai,"Action, Comedy, Drama Decades after their 1984..."
1,The Crown,"Biography, Drama, History Follows the politica..."
2,Better Call Saul,"Crime, Drama The trials and tribulations of cr..."
3,Devil in Ohio,"Drama, Horror, Mystery When a psychiatrist she..."
4,Cyberpunk: Edgerunners,"Animation, Action, Adventure A Street Kid tryi..."


In [None]:
# TF-IDF vectoriser that drops scikit-learn's built-in English stop words;
# every other setting is left at its default.
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# Learn the vocabulary and IDF weights from the combined 'features' text and
# transform it into the TF-IDF document-term matrix.
tfidf_matrix = tfidf.fit_transform(data['features'])

In [None]:
# (number of rows in `data`, number of vocabulary terms)
tfidf_matrix.shape

(9957, 40961)

In [None]:
# Pairwise similarity between every pair of rows. The vectoriser above is built
# without a `norm` argument, and TfidfVectorizer L2-normalises rows by default,
# so the plain dot product from linear_kernel equals cosine similarity.
# NOTE(review): the result is a dense square matrix (one row and column per
# title), so memory grows quadratically with the dataset size.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Expect a square matrix: one row and one column per row of `data`.
cosine_sim.shape

(9957, 9957)

In [None]:
# Construct a reverse mapping from movie title to the positional row of its
# first occurrence. Titles repeat in the dataset, so duplicates must be removed
# on the *index* (the titles); Series.drop_duplicates() compares the values
# (row numbers, which are all unique) and therefore would remove nothing.
indices = pd.Series(range(len(data)), index=data['title'])
indices = indices[~indices.index.duplicated(keep='first')]


def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up. If it occurs more than once in ``data``, the first
        occurrence is used as the query row.
    cosine_sim : array-like, shape (len(data), len(data))
        Pairwise similarity matrix whose rows/columns follow the row order of
        ``data``. Defaults to the matrix computed earlier in the notebook.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar rows, most similar first. Rows
        carrying the queried title itself are never returned.

    Raises
    ------
    KeyError
        If ``title`` is not present in the dataset.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if title not in indices.index:
        raise KeyError(f"Title not found in dataset: {title!r}")

    idx = int(indices[title])
    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()

    # Exclude the query row explicitly instead of assuming it sorts first, and
    # exclude duplicate rows of the same title so a title is never recommended
    # back to itself.
    candidates = np.flatnonzero((data['title'] != title).to_numpy())

    # Stable sort on the negated scores: descending similarity, with ties kept
    # in original row order.
    order = np.argsort(-scores[candidates], kind='stable')
    movie_indices = candidates[order][:top_n]

    # Return the top-N most similar movies
    return data['title'].iloc[movie_indices]

In [None]:
# Demo: recommendations for 'The Crown', printed under a heading line.
recommendations = get_recommendations('The Crown')
print("Recommendations for the Crown", recommendations, sep="\n")

Recommendations for the Crown
9954                          The Crown
105                       Downton Abbey
1400    Chicken Run: Dawn of the Nugget
2654                        First Match
682                               Locke
401                   The Lost Daughter
6729                     The 80's India
243                         Broadchurch
1664         Scrooge: A Christmas Carol
5062     Kevin Hart: Don't F**k This Up
Name: title, dtype: object


In [None]:
# Demo: recommendations for 'Better Call Saul', printed under a heading line.
recommendations = get_recommendations('Better Call Saul')
print("Recommendations for Better Call Saul", recommendations, sep="\n")

Recommendations for Better Call Saul
211     El Camino: A Breaking Bad Movie
436      A Series of Unfortunate Events
2397                    W/Bob and David
28                        Modern Family
9939                      Modern Family
1599            Inside Man: Most Wanted
144                          Doc Martin
4210      Jimmy Carr: His Dark Material
722             I Am Not Okay with This
2520                      Hell and Back
Name: title, dtype: object
