In [1]:
# Libraries
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import ast

In [2]:
# Dataset
movies = pd.read_csv(r'C:\Users\HP\OneDrive\Desktop\movies.csv', usecols=['movieId','title','genres'], dtype={'movieId':'int32','title':'str','genres':'str'})
ratings = pd.read_csv(r'C:\Users\HP\OneDrive\Desktop\ratings.csv', usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})
tags = pd.read_csv(r'C:\Users\HP\OneDrive\Desktop\tags.csv')
links = pd.read_csv(r'C:\Users\HP\OneDrive\Desktop\links.csv')
tmbd_movies = pd.read_csv(r'C:\Users\HP\OneDrive\Desktop\tmbd_movies.csv')
tmbd_credits = pd.read_csv(r'C:\Users\HP\OneDrive\Desktop\tmbd_credits.csv')

In [3]:
print(movies.shape)
print(ratings.shape)
print(tags.shape)
print(links.shape)
print(tmbd_movies.shape)
print(tmbd_credits.shape)

(9742, 3)
(100836, 3)
(3683, 4)
(9742, 3)
(4803, 20)
(4803, 4)


In [4]:
print(links["tmdbId"].isna().sum())
links.dropna(axis=0,inplace = True)

8


In [5]:
movies_links = pd.merge(movies, links, on='movieId', how='inner')
new_movies = pd.merge(movies_links, tmbd_movies, left_on='tmdbId', right_on='id', how='inner')
new_movies.shape

(3537, 25)

In [6]:
new_movies = new_movies.merge(tmbd_credits,left_on='title_y',right_on="title",how="inner")

In [7]:
new_movies.drop(["homepage","status","title_y","title","movie_id","tagline","genres_y","imdbId","tmdbId","id","original_title","original_language","production_companies","production_countries","release_date","spoken_languages","vote_average","keywords","overview"],axis = 1,inplace=True)

In [8]:
new_movies

Unnamed: 0,movieId,title_x,genres_x,budget,popularity,revenue,runtime,vote_count,cast,crew
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,30000000,73.640445,373554033,81.0,5269,"[{""cast_id"": 14, ""character"": ""Woody (voice)"",...","[{""credit_id"": ""52fe4284c3a36847f8024f55"", ""de..."
1,10,GoldenEye (1995),Action|Adventure|Thriller,58000000,59.824565,352194034,130.0,1174,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""52fe426ec3a36847f801e16f"", ""de..."
2,11,"American President, The (1995)",Comedy|Drama|Romance,62000000,11.056763,107879496,106.0,195,"[{""cast_id"": 1, ""character"": ""Andrew Shepherd""...","[{""credit_id"": ""52fe44dac3a36847f80adfa3"", ""de..."
3,14,Nixon (1995),Drama,44000000,3.770161,13681765,192.0,71,"[{""cast_id"": 1, ""character"": ""Richard Nixon"", ...","[{""credit_id"": ""52fe43c59251416c7501d705"", ""de..."
4,15,Cutthroat Island (1995),Action|Adventure|Romance,98000000,7.029308,10017322,119.0,136,"[{""cast_id"": 1, ""character"": ""Morgan Adams"", ""...","[{""credit_id"": ""52fe42f4c3a36847f802f69f"", ""de..."
...,...,...,...,...,...,...,...,...,...,...
3536,160644,Indignation (2016),Drama,0,5.625989,0,110.0,69,"[{""cast_id"": 0, ""character"": ""Marcus Messner"",...","[{""credit_id"": ""58514b91c3a3682dfe017405"", ""de..."
3537,160954,Nerve (2016),Drama|Thriller,20000000,62.933899,83707310,96.0,2181,"[{""cast_id"": 4, ""character"": ""Vee Delmonico"", ...","[{""credit_id"": ""57993c2f925141234800341d"", ""de..."
3538,161127,The Infiltrator (2016),Crime|Drama,25000000,24.672234,15436808,127.0,573,"[{""cast_id"": 2, ""character"": ""Robert Mazur"", ""...","[{""credit_id"": ""578af3a79251417aca003525"", ""de..."
3539,161580,Bad Moms (2016),Comedy,20000000,42.512296,183936074,100.0,1252,"[{""cast_id"": 0, ""character"": ""Amy Mitchell"", ""...","[{""credit_id"": ""5690c7adc3a3686b52001c68"", ""de..."


In [9]:
print(new_movies.isna().sum())

movieId       0
title_x       0
genres_x      0
budget        0
popularity    0
revenue       0
runtime       0
vote_count    0
cast          0
crew          0
dtype: int64


In [10]:
new_movies.rename(columns={'title_x': 'title','genres_x' : 'genres'}, inplace=True)


In [11]:
# Merge movies and tags
grouped_tags = tags.groupby('movieId')['tag'].apply(list)
grouped_tags_df = pd.DataFrame(grouped_tags)
new_movies = pd.merge(new_movies, grouped_tags_df, left_on='movieId', right_index=True, how='left')

In [12]:
# Preprocess movie data
new_movies['tag'] = new_movies['tag'].apply(lambda x: x if isinstance(x, list) else [])
new_movies['genres'] = new_movies['genres'].apply(lambda x: x.split('|'))
new_movies['year'] = new_movies['title'].apply(lambda x: re.findall('\((\d{4})\)', x))
new_movies['year'] = new_movies['year'].apply(lambda x: x[0] if x else '')
new_movies['title'] = new_movies['title'].str.replace('\(\d{4}\)', '', regex=True).str.strip()

In [13]:
def preprocess_cast(text):
    L = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 3:
            L.append(i['name'])
        counter+=1
    return L 

new_movies['cast'] = new_movies['cast'].apply(preprocess_cast)


In [14]:
def preprocess_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L
new_movies['crew'] = new_movies['crew'].apply(preprocess_cast)


In [15]:
def filter_data(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

new_movies['cast'] = new_movies['cast'].apply(filter_data)
new_movies['crew'] = new_movies['crew'].apply(filter_data)

In [16]:
new_movies

Unnamed: 0,movieId,title,genres,budget,popularity,revenue,runtime,vote_count,cast,crew,tag,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",30000000,73.640445,373554033,81.0,5269,"[TomHanks, TimAllen, DonRickles]","[AndrewStanton, AndrewStanton, AndrewStanton]","[pixar, pixar, fun]",1995
1,10,GoldenEye,"[Action, Adventure, Thriller]",58000000,59.824565,352194034,130.0,1174,"[PierceBrosnan, SeanBean, IzabellaScorupco]","[EricSerra, TinaTurner, IanFleming]",[],1995
2,11,"American President, The","[Comedy, Drama, Romance]",62000000,11.056763,107879496,106.0,195,"[MichaelDouglas, AnnetteBening, MichaelJ.Fox]","[JohnSeale, RobReiner, RobReiner]","[politics, president]",1995
3,14,Nixon,[Drama],44000000,3.770161,13681765,192.0,71,"[AnthonyHopkins, JoanAllen, PowersBoothe]","[JohnWilliams, RobertRichardson, OliverStone]","[politics, president]",1995
4,15,Cutthroat Island,"[Action, Adventure, Romance]",98000000,7.029308,10017322,119.0,136,"[GeenaDavis, MatthewModine, FrankLangella]","[PeterLevy, MaggieGray, NormanGarwood]",[],1995
...,...,...,...,...,...,...,...,...,...,...,...,...
3536,160644,Indignation,[Drama],0,5.625989,0,110.0,69,"[LoganLerman, SarahGadon, TracyLetts]","[AvyKaufman, AnthonyBregman, JamesSchamus]",[],2016
3537,160954,Nerve,"[Drama, Thriller]",20000000,62.933899,83707310,96.0,2181,"[EmmaRoberts, DaveFranco, EmilyMeade]","[JohnPapsidera, PetrHlinomaz, RandallPoster]",[],2016
3538,161127,The Infiltrator,"[Crime, Drama]",25000000,24.672234,15436808,127.0,573,"[BryanCranston, DianeKruger, JohnLeguizamo]","[LuisCarballar, CrispianSallis, GailStevens]",[],2016
3539,161580,Bad Moms,[Comedy],20000000,42.512296,183936074,100.0,1252,"[MilaKunis, KristenBell, KathrynHahn]","[JonLucas, JonLucas, ScottMoore]",[],2016


In [17]:
new_movies["movie_document"] = new_movies['genres'] + new_movies['crew'] + new_movies['cast'] + new_movies['tag']
new_movies['movie_document'] = new_movies['movie_document'].apply(lambda x: " ".join(x))


In [18]:
# Compute TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(new_movies['movie_document'])


In [19]:
# Compute cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Create a reverse map of indices and movie titles
indices = pd.Series(new_movies.index, index=new_movies['title']).drop_duplicates()

In [20]:
# Function to recommend popular movies to new users
def recommend_popular_movies(n=3):
    popular_movies = new_movies.sort_values('popularity', ascending=False).head(n)
    return popular_movies['title']

popular_recommendations = recommend_popular_movies()
print(popular_recommendations)


3443         Minions
3255    Interstellar
3381        Deadpool
Name: title, dtype: object


In [21]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]
    movie_indices = [i[0] for i in sim_scores]
    return new_movies['title'].iloc[movie_indices]

In [22]:
get_recommendations("Toy Story")

643     Bug's Life, A
868       Toy Story 2
1592     Finding Nemo
Name: title, dtype: object