In [1]:
%pip install -q ipywidgets

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer     
from sklearn.metrics.pairwise import cosine_similarity
from ipywidgets import *

In [3]:
movies=pd.read_csv('movies.csv')
ratings=pd.read_csv('ratings.csv')

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 161.5+ KB


In [7]:
movies.shape

(10329, 3)

In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [9]:
ratings.shape

(105339, 4)

In [10]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


In [11]:
# Split 'A|B|C' genre strings into lists. Values that are not strings (for
# example lists left behind by an earlier run of this cell) pass through
# unchanged, so re-running the cell does not wipe out the already-split lists.
movies['genres']=movies['genres'].map(lambda g:g.split("|") if isinstance(g,str) else g)

In [12]:
movies2=movies.explode('genres')
movies2.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy


In [13]:
movies2['genres'].unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'War', 'Musical', 'Documentary',
       'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

In [14]:
movies2['genres'].nunique()

20

In [15]:
movies2=movies2[movies2['genres']!='(no genres listed)']

In [16]:
movies2['genres'].unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'War', 'Musical', 'Documentary',
       'Western', 'Film-Noir'], dtype=object)

In [17]:
movies2['genres'].nunique()

19

In [18]:
merged_data=pd.merge(ratings,movies2,on=['movieId'],how='inner')

In [19]:
# Per (genre, title) pair: mean rating and number of ratings.
popularity=(
    merged_data
    .groupby(['genres','title'])['rating']
    .agg(['mean','size'])
    .reset_index()
)
# Replace the generated column labels with readable ones.
popularity.columns=["Genres","Title","Average_Ratings","Number_of_Ratings"]
popularity

Unnamed: 0,Genres,Title,Average_Ratings,Number_of_Ratings
0,Action,'71 (2014),3.500000,1
1,Action,'Hellboy': The Seeds of Creation (2004),3.000000,1
2,Action,10 to Midnight (1983),2.500000,1
3,Action,12 Rounds (2009),2.875000,4
4,Action,13 Assassins (Jûsan-nin no shikaku) (2010),3.500000,5
...,...,...,...,...
23093,Western,Wyatt Earp (1994),3.200000,30
23094,Western,Young Guns (1988),3.375000,36
23095,Western,Young Guns II (1990),3.083333,12
23096,Western,Young Ones (2014),2.000000,1


In [20]:
popularity[(popularity['Genres']=='Action')&(popularity['Number_of_Ratings']>=50)].sort_values(by="Average_Ratings",ascending=False).head(7)

Unnamed: 0,Genres,Title,Average_Ratings,Number_of_Ratings
1179,Action,Princess Mononoke (Mononoke-hime) (1997),4.384615,52
1076,Action,North by Northwest (1959),4.273973,73
975,Action,"Matrix, The (1999)",4.264368,261
1433,Action,Star Wars: Episode V - The Empire Strikes Back...,4.22807,228
1331,Action,Seven Samurai (Shichinin no samurai) (1954),4.217742,62
1199,Action,Raiders of the Lost Ark (Indiana Jones and the...,4.212054,224
747,Action,Inception (2010),4.18932,103


In [21]:
def TopNPopularMovies(genre,threshold,topN,data=None):
    """Return the best-rated movies of one genre, ranked by mean rating.

    Parameters
    ----------
    genre : str
        Genre label; compared for exact equality with the ``genres`` column.
    threshold : int
        Minimum number of ratings a title must have to be included.
    topN : int
        Maximum number of rows to return. Values below 1 give an empty table.
    data : pandas.DataFrame, optional
        Frame with ``genres``, ``title`` and ``rating`` columns (one row per
        rating per genre). When omitted, the notebook-level ``merged_data``
        is used, which keeps existing three-argument calls working.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sno.``, ``Movie Title``, ``Average Movie Rating`` and
        ``Number of Reviews``, indexed 0..n-1 and sorted by average rating,
        highest first.
    """
    if data is None:
        data=merged_data

    # Mean rating and rating count for every (genre, title) pair.
    popularity=data.groupby(['genres','title'])['rating'].agg(['mean','size']).reset_index()
    popularity.columns=["Genres","Title","Average_Ratings","Number_of_Ratings"]

    eligible=popularity[(popularity['Genres']==genre)&(popularity['Number_of_Ratings']>=threshold)]

    # head() with a negative count would return "all but the last n" rows, so
    # clamp at zero. reset_index gives a fresh frame that is safe to modify
    # (no assignment onto a slice of `popularity`).
    topNrecommendations=(
        eligible
        .sort_values(by="Average_Ratings",ascending=False)
        .head(max(topN,0))
        .reset_index(drop=True)
        .rename(columns={
            'Genres':'Genre',
            'Title':'Movie Title',
            'Average_Ratings':'Average Movie Rating',
            'Number_of_Ratings':'Number of Reviews',
        })
    )
    topNrecommendations['Sno.']=range(1,len(topNrecommendations)+1)
    return topNrecommendations[['Sno.','Movie Title','Average Movie Rating','Number of Reviews']]

In [22]:
genre='Adventure'
threshold=50
topN=10
TopNPopularMovies(genre=genre,threshold=threshold,topN=topN)

Unnamed: 0,Sno.,Movie Title,Average Movie Rating,Number of Reviews
0,1,Princess Mononoke (Mononoke-hime) (1997),4.384615,52
1,2,Monty Python and the Holy Grail (1975),4.301948,154
2,3,North by Northwest (1959),4.273973,73
3,4,Spirited Away (Sen to Chihiro no kamikakushi) ...,4.236111,72
4,5,Star Wars: Episode V - The Empire Strikes Back...,4.22807,228
5,6,Seven Samurai (Shichinin no samurai) (1954),4.217742,62
6,7,Raiders of the Lost Ark (Indiana Jones and the...,4.212054,224
7,8,Star Wars: Episode IV - A New Hope (1977),4.188645,273
8,9,Lawrence of Arabia (1962),4.166667,60
9,10,"Princess Bride, The (1987)",4.163743,171


In [23]:
# Collapse the exploded rows back to one row per title, joining that title's
# genre labels into a single space-separated string.
movies3=(
    movies2
    .groupby('title')['genres']
    .agg(" ".join)
    .reset_index()
)

In [24]:
tf=TfidfVectorizer(analyzer='word',ngram_range=(1,3),stop_words='english')
tf

In [25]:
tf_matrix=tf.fit_transform(movies3['genres'])

In [26]:
cosine_sim = cosine_similarity(tf_matrix,tf_matrix)

In [27]:
cosine_sim

array([[1.        , 0.02677945, 0.02931913, ..., 0.10229517, 0.        ,
        0.        ],
       [0.02677945, 1.        , 0.        , ..., 0.03626651, 0.02411583,
        0.02863994],
       [0.02931913, 0.        , 1.        , ..., 0.        , 0.        ,
        0.35526663],
       ...,
       [0.10229517, 0.03626651, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.02411583, 0.        , ..., 0.        , 1.        ,
        0.07090711],
       [0.        , 0.02863994, 0.35526663, ..., 0.        , 0.07090711,
        1.        ]])

In [30]:
def recommendation_genre(movie_df,similarity_matrix,movie_title,topN):
    """Return the ``topN`` movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Frame with a ``title`` column; row *i* corresponds to row *i* of
        ``similarity_matrix``.
    similarity_matrix : array-like
        Square matrix of pairwise similarity scores, indexed by row position.
    movie_title : str
        Title to look up; must match a value in ``movie_df['title']`` exactly.
    topN : int
        Maximum number of recommendations. Values below 1 give an empty table.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sno.`` and ``Movie Title``, indexed 0..n-1, most similar
        first. ``movie_df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``movie_title`` does not occur in ``movie_df['title']``.
    """
    # Look the title up in the frame that was passed in, by row position, so
    # the function does not depend on a notebook-level variable.
    is_query=(movie_df['title']==movie_title).to_numpy()
    if not is_query.any():
        raise KeyError(f"Movie title not found: {movie_title!r}")
    query_pos=int(np.flatnonzero(is_query)[0])

    scores=np.asarray(similarity_matrix[query_pos],dtype=float).ravel()

    # Stable descending sort: tied scores keep their original row order.
    ranked=np.argsort(-scores,kind='stable')

    # Drop the query movie explicitly. Slicing off the first sorted entry is
    # wrong when another movie ties with it at the top score.
    ranked=ranked[~is_query[ranked]][:max(topN,0)]

    # rename/reset_index return new frames, so nothing is written back onto a
    # slice of movie_df.
    matching_df=(
        movie_df.iloc[ranked]
        .rename(columns={'title':'Movie Title'})
        .reset_index(drop=True)
    )
    matching_df['Sno.']=range(1,len(matching_df)+1)
    return matching_df[['Sno.','Movie Title']]

In [58]:
recommendation_genre(movie_df=movies3,similarity_matrix=cosine_sim,movie_title='Inception (2010)',topN=10)

Unnamed: 0,Sno.,Movie Title
0,1,Strange Days (1995)
1,2,Watchmen (2009)
2,3,Super 8 (2011)
3,4,Coherence (2013)
4,5,Donnie Darko (2001)
5,6,"Forgotten, The (2004)"
6,7,"Jacket, The (2005)"
7,8,Moon (2009)
8,9,"Prestige, The (2006)"
9,10,Soylent Green (1973)


In [42]:
# --- Popularity tab: genre, minimum review count, number of titles ---
# Sort the genre labels so the dropdown order is the same on every run;
# iterating a set of strings is not order-stable across interpreter sessions.
genres=Dropdown(options=sorted(movies2['genres'].dropna().unique()),description="Genres",style={"description_width":'initial'})
# Start from usable values (the same ones as the worked example earlier in the
# notebook) so that the first click does not ask for 0 rows.
num_reviews=IntText(value=50,description="Minimum Reviews",style={"description_width":'initial'})
num_recommendations_1=IntText(value=10,description="Number of Recommendations",style={"description_width":'initial'})

b1=Button(description="RECOMMEND ME",style={"description_width":'initial'})
h1=HBox([num_reviews,num_recommendations_1])
popularity_tab=VBox([genres,h1,b1])


# --- Content tab: movie title and number of similar titles ---
title=Textarea(description="Movie Title",style={"description_width":'initial'})
num_recommendations_2=IntText(value=10,description="Number of Recommendations",style={"description_width":'initial'})

h2=HBox([title,num_recommendations_2])
b2=Button(description="RECOMMEND ME",style={"description_width":'initial'})
content_tab=VBox([h2,b2])


tabs=[popularity_tab,content_tab]
wid=widgets.Tab(children=tabs)

names=['Popularity Based Recommendations','Content Based Recommendations']
# Plain loop rather than a list comprehension used only for its side effects.
# The loop variable is deliberately not called `title`, which is the Textarea.
for tab_index,tab_name in enumerate(names):
    wid.set_title(tab_index,tab_name)

display(wid)

Tab(children=(VBox(children=(Dropdown(description='Genres', options=('Musical', 'War', 'Film-Noir', 'Comedy', …

In [59]:
# Most recent recommendation table. Defined up front so that evaluating
# `output` before any button has been clicked shows None instead of raising
# NameError.
output=None

def b1_clicked(b):
    """Click handler for the popularity tab.

    Reads the genre, minimum review count and requested row count from the
    widgets and stores the resulting table in the global ``output``.
    """
    global output
    output=TopNPopularMovies(genre=genres.value,threshold=num_reviews.value,topN=num_recommendations_1.value)
b1.on_click(b1_clicked)

def b2_clicked(b):
    """Click handler for the content tab.

    Reads the movie title and requested row count from the widgets and stores
    the resulting table in the global ``output``.
    """
    global output
    # Text typed into the Textarea can carry stray spaces or a trailing
    # newline, which would make the exact-match title lookup fail.
    requested_title=title.value.strip()
    result=recommendation_genre(movie_df=movies3,similarity_matrix=cosine_sim,movie_title=requested_title,topN=num_recommendations_2.value)
    output=result
b2.on_click(b2_clicked)

In [60]:
display(wid)

Tab(children=(VBox(children=(Dropdown(description='Genres', index=4, options=('Musical', 'War', 'Film-Noir', '…

In [65]:
output

Unnamed: 0,Sno.,Movie Title,Average Movie Rating,Number of Reviews
0,1,North by Northwest (1959),4.273973,73
1,2,Life Is Beautiful (La Vita è bella) (1997),4.253425,73
2,3,Casablanca (1942),4.236,125
3,4,Annie Hall (1977),4.205882,68
4,5,My Fair Lady (1964),4.17,50
5,6,Vertigo (1958),4.169492,59
6,7,Good Will Hunting (1997),4.167857,140
7,8,West Side Story (1961),4.166667,42
8,9,"Princess Bride, The (1987)",4.163743,171
9,10,American Beauty (1999),4.157407,216
