In [1]:
import ast
import nltk
import pickle
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Use a specific seaborn style in matplotlib
# plt.style.use("seaborn-darkgrid")  # Replace "seaborn" with a valid style name
sns.set_style("darkgrid")
warnings.filterwarnings("ignore")

In [2]:
movies = pd.read_csv("/Users/anjalisingh/Documents/ML LAB/Movie Recommender/tmdb_5000_movies.csv")
credits = pd.read_csv("/Users/anjalisingh/Documents/ML LAB/Movie Recommender/tmdb_5000_credits.csv")


In [4]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [5]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [6]:
# Merging all the columns of movies and credits into movies, excluding title from credits since it will get repeated.

movies = movies.merge(credits, on = "title")

In [7]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [8]:
# Picking the most relevant columns on which recommendations will be made.
movies = movies[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew", "vote_average"]]

In [9]:
credits=movies[["movie_id", "vote_average"]]

In [10]:
credits.head(1
             )

Unnamed: 0,movie_id,vote_average
0,19995,7.2


In [11]:
movies = movies[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]

In [12]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [13]:
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [14]:
# Removing the tuples/data points with missing data.

movies.dropna(inplace = True)

In [15]:
# Checking whether there is any duplicate data.

movies.duplicated().sum()

0

In [16]:
# Extracting useful information from raw data of genres & keywords for the creation of tags.

def convert1(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(i["name"])

    return List

In [17]:
movies["genres"] = movies["genres"].apply(convert1)

In [18]:
movies["keywords"] = movies["keywords"].apply(convert1)

In [19]:
# Extracting top three actors/actresses from raw data of cast.

def convert2(obj):
    List = []
    count = 0

    for i in ast.literal_eval(obj):
        if count < 3:
            List.append(i["name"])
            count += 1
        else:
            break

    return List

In [20]:
movies["cast"] = movies["cast"].apply(convert2)

In [21]:
# Extracting director from raw data of crew.

def fetch_director(obj):
    List = []

    for i in ast.literal_eval(obj):
        if i["job"] == "Director":
            List.append(i["name"])
            break

    return List

In [22]:
movies["crew"] = movies["crew"].apply(fetch_director)

In [23]:
# Converting string into a list of elements.

movies["overview"] = movies["overview"].apply(lambda x : x.split())

In [24]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [25]:
# Replacing spaces in the elements of genres for each tuple.

movies["genres"] = movies["genres"].apply(lambda x : [i.replace(" ", "") for i in x])

In [26]:
# Replacing spaces in the elements of keywords for each tuple.

movies["keywords"] = movies["keywords"].apply(lambda x : [i.replace(" ", "") for i in x])

In [27]:
# Replacing spaces in the elements of cast for each tuple.

movies["cast"] = movies["cast"].apply(lambda x : [i.replace(" ", "") for i in x])

In [28]:
# Replacing spaces in the elements of crew for each tuple.

movies["crew"] = movies["crew"].apply(lambda x : [i.replace(" ", "") for i in x])

In [29]:
movies["tags"] = movies["overview"] + movies["genres"] + movies["keywords"] + movies["cast"] + movies["crew"]

In [30]:
# Creating a new dataframe.

df = movies[["movie_id", "title", "tags"]]

In [31]:
df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [32]:
# Converting elements of tags into a string.

df["tags"] = df["tags"].apply(lambda x : " ".join(x))

In [33]:
df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."


In [34]:
# Converting the string into lower case.

df["tags"] = df["tags"].apply(lambda x : x.lower())

In [35]:
df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."


In [36]:
# Choosing top 5000 most frequently words amongst all the tags.

cv = CountVectorizer(max_features = 5000, stop_words = "english")

In [37]:
# Transforming every movie in the form of a vector.

vectors = cv.fit_transform(df["tags"]).toarray()

In [38]:
ps = PorterStemmer()

In [39]:
# Stemming the words like loving or loved into love etc.

def stem(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)

In [40]:
df["tags"] = df["tags"].apply(stem)

In [41]:
# Similarities between the vectors of movies i.e greater the similarity more will be the chances of it getting recommended.

similarity = cosine_similarity(vectors)

In [42]:
movie_id_to_vote_average = dict(zip(credits["movie_id"], credits["vote_average"]))
# **This creates a dictionary for quick lookup of vote_average using movie_id.**

In [43]:
# Create a new DataFrame that includes movie_id, title, and tags
df_with_ids = movies[["movie_id", "title", "tags"]].copy()

# Ensure movie_id is integrated cleanly into this new DataFrame
df_with_ids = pd.merge(df_with_ids, credits[["movie_id", "vote_average"]], on="movie_id", how="left")
# **This integrates `vote_average` from `credits` into the new DataFrame.**

# Check the structure of the new DataFrame
print(df_with_ids.head())

   movie_id                                     title  \
0     19995                                    Avatar   
1       285  Pirates of the Caribbean: At World's End   
2    206647                                   Spectre   
3     49026                     The Dark Knight Rises   
4     49529                               John Carter   

                                                tags  vote_average  
0  [In, the, 22nd, century,, a, paraplegic, Marin...           7.2  
1  [Captain, Barbossa,, long, believed, to, be, d...           6.9  
2  [A, cryptic, message, from, Bond’s, past, send...           6.3  
3  [Following, the, death, of, District, Attorney...           7.6  
4  [John, Carter, is, a, war-weary,, former, mili...           6.1  


RECCOMENDATION

In [44]:
def recommend(movie):
    # Get the index of the given movie from the dataframe `df`
    movie_index = df[df["title"] == movie].index[0]
    
    # Get the similarity scores for the movie with all other movies
    distances = similarity[movie_index]

    # Sort the movies based on their similarity score (greater similarity = higher rank)
    # Exclude the first one (the movie itself) by starting from index 1
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    # Collect similar movie titles and their respective vote averages
    similar_movies = []
    vote_averages = []
    similarity_scores = []

    for i in movies_list:
        movie_title = df.iloc[i[0]]["title"]  # Access movie title
        movie_id = df.iloc[i[0]]["movie_id"]  # Access movie_id to get the vote_average from the credits DataFrame
        similarity_score = i[1]
        
        # Retrieve the vote_average from the movie_id_to_vote_average dictionary
        movie_vote_average = movie_id_to_vote_average.get(movie_id, 0)  # Default to 0 if not found
        
        similar_movies.append(movie_title)
        vote_averages.append(movie_vote_average)
        similarity_scores.append(similarity_score)
    
    # Compute the weighted vote average (CNN - Cosine-Weighted Average)
    weighted_vote_average = np.average(vote_averages, weights=similarity_scores)
    
    # Print the top 5 recommended movies and the new weighted vote average
    print(f"Top 5 recommended movies for '{movie}':")
    for title in similar_movies:
        print(title)
    
    print(f"\nNew Weighted Vote Average for '{movie}': {weighted_vote_average:.2f}")

In [45]:
recommend("Inception")

Top 5 recommended movies for 'Inception':
Duplex
The Helix... Loaded
Star Trek II: The Wrath of Khan
Timecop
Chicago Overcoat

New Weighted Vote Average for 'Inception': 5.92


In [49]:
pickle.dump(df, open("/Users/anjalisingh/Documents/ML LAB/Movie Recommender/movies.pkl", "wb"))

In [50]:
pickle.dump(similarity, open("/Users/anjalisingh/Documents/ML LAB/Movie Recommender/similarity.pkl", "wb"))

In [52]:
pickle.dump(credits, open("/Users/anjalisingh/Documents/ML LAB/Movie Recommender/credits.pkl", "wb"))

print("credits.pkl has been created and saved.")

credits.pkl has been created and saved.
