In [3]:
from google.colab import drive

drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/Movie_Recommendation_Dataset/'  #change dir to your project folder

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
import pandas as pd


In [5]:
# reading movies dataset
movies = pd.read_csv(root_path+'movies.csv')

In [6]:
movies


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [7]:
import re

In [8]:
# the function to clean titles from unwanted commas and spaces to implement search engine

def clean_title(title):
  return re.sub("[^a-zA-Z0-9 ]","",title)


In [9]:
# using pandas built-in function apply to take and pass each movie title to the clean title function

movies['clean_title']=movies['title'].apply(clean_title)

In [10]:
# Building term frequency matrix

# Inverse Document Frequency

# TF * IDF Matix

from sklearn.feature_extraction.text import TfidfVectorizer



In [11]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [12]:
tfidf_matrix = vectorizer.fit_transform(movies['clean_title'])
tfidf_matrix

<62423x170073 sparse matrix of type '<class 'numpy.float64'>'
	with 446566 stored elements in Compressed Sparse Row format>

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [14]:
# lets test out a few results

exp_movie = 'Harry Potter'
exp_movie = clean_title(exp_movie)
vec = vectorizer.transform([exp_movie])
similarity = cosine_similarity(vec,tfidf_matrix).flatten()
similarity

array([0., 0., 0., ..., 0., 0., 0.])

In [15]:
# lets test out a few results

exp_movie = 'Jumanji'
exp_movie = clean_title(exp_movie)
vec = vectorizer.transform([exp_movie])
similarity = cosine_similarity(vec,tfidf_matrix).flatten()
# now finding the indices with more similarity
indices = np.argpartition(similarity,-5)[-5:]
results = movies.iloc[indices][::-1]
results

Unnamed: 0,movieId,title,genres,clean_title
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
49687,179401,Jumanji: Welcome to the Jungle (2017),Action|Adventure|Children,Jumanji Welcome to the Jungle 2017
20807,107565,"Fuck You, Goethe (Fack Ju Göhte) (2013)",Comedy,Fuck You Goethe Fack Ju Ghte 2013
20803,107548,Ice Quake (2010),Action|Sci-Fi|Thriller,Ice Quake 2010
20804,107557,Fun Size (2012),Comedy,Fun Size 2012


In [16]:
# the function for search 

def search(title):
  title = clean_title(title)
  vec = vectorizer.transform([title])
  similarity = cosine_similarity(vec,tfidf_matrix).flatten()
  # now finding the indices with more similarity
  indices = np.argpartition(similarity,-5)[-5:]
  results = movies.iloc[indices][::-1]
  return results

# Now Building an interactive search box widget

In [17]:
!pip install ipython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
import ipywidgets as widget
from IPython.display import display

movie_input = widget.Text(
    value = 'Toy Story',
    description = 'Movie Title',
    disabled = False
)

movie_list = widget.Output()

def on_type(data):
  with movie_list:
    movie_list.clear_output()
    title = data['new']
    if(len(title)>5):
      display(search(title))

movie_input.observe(on_type,names='value')

display(movie_input,movie_list)

Text(value='Toy Story', description='Movie Title')

Output()

In [19]:
ratings = pd.read_csv(root_path+'ratings.csv')

In [20]:
ratings


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [21]:
# now finding the similar users who watched the similar movie like us for example movie no.1 and have given ratings greater than 4
movie_id = 1
similar_users= ratings[(ratings['movieId']==movie_id) & (ratings['rating']>4)]['userId'].unique()
similar_users


array([    36,     75,     86, ..., 162527, 162530, 162533])

In [22]:
# find those movieIds for which the users exist in the similar users
similar_users_rec = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating']>4)]['movieId']
similar_users_rec

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [23]:
# now narrowing down recommendations
# by considering only greater than 10% of the users similar to us, liked the movies and rated 5 stars

similar_users_rec=similar_users_rec.value_counts()/len(similar_users)

similar_users_rec = similar_users_rec[similar_users_rec>.1]


similar_users_rec

1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: movieId, Length: 113, dtype: float64

In [24]:
# finding those users who have watched movies of similar users and are  with highest ratings 

all_users = ratings[(ratings['movieId'].isin(similar_users_rec.index)) & (ratings['rating'] > 4)]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [25]:
# what percentage of users recommend each movie

all_users_rec = all_users['movieId'].value_counts()/len(all_users['userId'].unique())
all_users_rec

318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: movieId, Length: 113, dtype: float64

**Creating a Recommendation Score**

---



In [27]:
rec_percentages = pd.concat([similar_users_rec,all_users_rec],axis=1)
rec_percentages.columns = ["Similar","All"]
rec_percentages

Unnamed: 0,Similar,All
1,1.000000,0.124728
32,0.160711,0.100293
34,0.130555,0.052229
47,0.225909,0.144469
50,0.275604,0.200513
...,...,...
59315,0.104593,0.054269
60069,0.170640,0.076307
68954,0.159172,0.064944
78499,0.152960,0.035131


In [28]:
# In order to differentiate, the difference between these columns should be significant

rec_percentages['score']=rec_percentages["Similar"] / rec_percentages['All']


In [29]:
# Now sorting the results

rec_percentages.sort_values('score',ascending = False)
rec_percentages

Unnamed: 0,Similar,All,score
1,1.000000,0.124728,8.017414
32,0.160711,0.100293,1.602424
34,0.130555,0.052229,2.499660
47,0.225909,0.144469,1.563719
50,0.275604,0.200513,1.374497
...,...,...,...
59315,0.104593,0.054269,1.927310
60069,0.170640,0.076307,2.236221
68954,0.159172,0.064944,2.450924
78499,0.152960,0.035131,4.354038


In [32]:
# Now getting only the top 10 recommendations and merging with movies dataframe, on the basis of movieId attribute

rec_percentages.head(10).merge(movies,left_index=True,right_on='movieId')


Unnamed: 0,Similar,All,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
31,0.160711,0.100293,1.602424,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,Twelve Monkeys aka 12 Monkeys 1995
33,0.130555,0.052229,2.49966,34,Babe (1995),Children|Drama,Babe 1995
46,0.225909,0.144469,1.563719,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,Seven aka Se7en 1995
49,0.275604,0.200513,1.374497,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,Usual Suspects The 1995
108,0.239873,0.160872,1.491075,110,Braveheart (1995),Action|Drama|War,Braveheart 1995
109,0.11553,0.081704,1.413997,111,Taxi Driver (1976),Crime|Drama|Thriller,Taxi Driver 1976
148,0.190231,0.090578,2.10019,150,Apollo 13 (1995),Adventure|Drama|IMAX,Apollo 13 1995
257,0.40377,0.222207,1.817089,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,Star Wars Episode IV A New Hope 1977
289,0.134962,0.095346,1.415493,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,Lon The Professional aka The Professional Lon ...


**Recommendation Engine**

---



In [38]:
def recomendationEngine(movie_id):
  # This section is about finding similar user who have watched the same movie
  similar_users= ratings[(ratings['movieId']==movie_id) & (ratings['rating']>4)]['userId'].unique()
  similar_users_rec = ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating']>4)]['movieId']

  # Adjusting the values to show only those where over 10% of users that have recommended the movie
  similar_users_rec=similar_users_rec.value_counts()/len(similar_users)
  similar_users_rec = similar_users_rec[similar_users_rec>.1]

  # this section is about how commen the recommendations among all the users
  all_users = ratings[(ratings['movieId'].isin(similar_users_rec.index)) & (ratings['rating'] > 4)]
  all_users_rec = all_users['movieId'].value_counts()/len(all_users['userId'].unique())

  # Now creating the recommendation score
  rec_percentages = pd.concat([similar_users_rec,all_users_rec],axis=1)
  rec_percentages.columns = ["Similar","All"]

  rec_percentages['score']=rec_percentages["Similar"] / rec_percentages['All']
  rec_percentages.sort_values('score',ascending = False)
  
  # Now getting only the top 10 recommendations and merging with movies dataframe, on the basis of movieId attribute
  res=rec_percentages.head(10).merge(movies,left_index=True,right_on='movieId')

  return res



***Interactive Movie Recommendation System***

---




In [42]:
# Input and output Widget
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

In [44]:
def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(recomendationEnginer(movie_id))

movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()