In [2]:
import pandas as pd
import numpy as np

In [11]:
# Load the movie catalogue and the user ratings, report their sizes, then preview the ratings.
movies, ratings = (pd.read_csv(csv_path) for csv_path in ("movies.csv", "ratings.csv"))
for frame in (movies, ratings):
  print(frame.shape)
ratings.head()

(9742, 3)
(100836, 4)


0         True
1         True
2         True
3         True
4         True
          ... 
100831    True
100832    True
100833    True
100834    True
100835    True
Name: movieId, Length: 100836, dtype: bool

In [12]:
# Keep only movies that have at least one rating, then only ratings whose movie is still present.
has_rating = movies['movieId'].isin(ratings['movieId'])
movies = movies[has_rating]
has_movie = ratings['movieId'].isin(movies['movieId'])
ratings = ratings[has_movie]
for frame in (movies, ratings):
  print(frame.shape)


(9724, 3)
(100836, 4)


# Content Based Recommendation

* Based on similar titles (we are only considering title similarity as content similarity)


In [13]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [76]:
# Write a function to clean the titles: replace parentheses and other special characters

import re  #Importing regular expression library

def clean_title(title):
  """Return `title` with every character other than a-z, A-Z and 0-9 replaced by a space.

  A space (rather than nothing) is substituted so that neighbouring words stay
  separated when the text is later split into n-grams by TfidfVectorizer.
  """
  non_alphanumeric = re.compile("[^a-zA-Z0-9]")
  return non_alphanumeric.sub(" ", title)

In [77]:

# These cleaned titles are what the user's search text is compared against.
movies['clean_title'] = movies['title'].map(clean_title)

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,2): features are built from single words and from two-word sequences of each title
tfv=TfidfVectorizer(ngram_range=(1,2))

In [79]:
# NOTE: despite its name, `cos_similar` holds the sparse TF-IDF matrix returned by fit_transform
# (one row per entry of movies['clean_title']), not cosine similarities.
# search_title() later compares a query vector against this matrix.
cos_similar=tfv.fit_transform(movies['clean_title'])
cos_similar

<9724x33124 sparse matrix of type '<class 'numpy.float64'>'
	with 70477 stored elements in Compressed Sparse Row format>

In [82]:
# Series keyed by cleaned title whose values are the matching `movies` index labels.
# NOTE(review): not referenced by any other cell visible in this notebook - confirm it is still needed.
indices_df=pd.Series(movies.index,index=movies['clean_title'])


In [99]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search_title(x, top_n=6):
  """Return the `top_n` rows of `movies` whose cleaned titles best match the query `x`.

  The query is cleaned with clean_title(), vectorised with the already fitted
  `tfv`, and compared by cosine similarity against `cos_similar` (the TF-IDF
  matrix fitted on movies['clean_title']).

  Parameters
  ----------
  x : str
      Free-text movie title typed by the user.
  top_n : int, default 6
      Number of best-matching rows to return.

  Returns
  -------
  pandas.DataFrame
      Rows of `movies`, ordered from most to least similar.
  """
  title=clean_title(x)
  title_vector=tfv.transform([title])
  similarity=cosine_similarity(title_vector, cos_similar).flatten()
  # A stable argsort on the negated scores gives descending similarity while keeping
  # tied scores in their original row order, without building a Python list of tuples.
  top_positions=np.argsort(-similarity,kind="stable")[:top_n]
  # Row i of `cos_similar` was built from the i-th row of `movies`, so positional lookup is correct.
  return movies.iloc[top_positions]


In [104]:
#Building Interactive search box widgets

import ipywidgets as widgets     #importing widgets
from IPython.display import display     #For Showing outputs

#Input Widget

# Text box the user types a movie title into
movie_input=widgets.Text(value="Toy Story",description="Movie Title",disabled=False)

# Output area that shows the search results
movie_list=widgets.Output()

def on_input(data):
  """Refresh `movie_list` with title matches whenever the text box value changes."""
  with movie_list:
    movie_list.clear_output()
    title=data['new']  # the change event behaves like a dict; 'new' holds the updated text
    if len(title) <= 5:
      return
    display(search_title(title))

# Call on_input every time the text box value changes
movie_input.observe(on_input,names="value")

display(movie_input,movie_list)


Text(value='Toy Story', description='Movie Title')

Output()

# Collaborative Filtering

In [105]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [106]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.8 MB


In [107]:
# When a movie is searched we will find users that liked that movie and find other movies that those users liked

In [108]:
# Users who rated movieId 1 with 4 or more, i.e. users that liked the same movie
is_target_movie = ratings['movieId'].eq(1)
is_liked = ratings['rating'].ge(4)
similar_users = ratings[is_target_movie & is_liked]
similar_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
516,5,1,4.0,847434962
874,7,1,4.5,1106635946
1667,17,1,4.5,1305696483
2274,19,1,4.0,965705637
...,...,...,...,...
95864,601,1,4.0,1521467801
96100,603,1,4.0,963178147
97143,605,1,4.0,1277097561
98479,607,1,4.0,964744033


In [116]:
# Every rating of 4 or more given by the similar users
by_similar_user = ratings['userId'].isin(similar_users['userId'])
rated_highly = ratings['rating'] >= 4
similar_user_recs = ratings[by_similar_user & rated_highly]
similar_user_recs


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100830,610,166528,4.0,1493879365
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047


In [134]:
# For each movie, the fraction of similar users who rated it highly;
# keep only the movies liked by more than 20% of the similar users.
liked_share = similar_user_recs['movieId'].value_counts().div(len(similar_users))
top_similar_users = liked_share[liked_share.gt(.20)]

In [129]:
# We have to narrow these movies down to the ones that only the users most similar to us liked.
# For example, Toy Story is liked by everyone, not just by the users most similar to us, so we will measure how much the general crowd likes each movie and filter those out

In [131]:
# Ratings of 4 or more, from all users, for the candidate movies
all_users_liked=ratings[ratings['movieId'].isin(top_similar_users.index) & (ratings['rating'] >= 4)]

# For each candidate movie, the fraction of those users who rated it highly.
# The intermediate frame has its own name so this cell can be re-run safely:
# previously `all_users` was overwritten with a value computed from itself.
all_users=all_users_liked['movieId'].value_counts()/all_users_liked['userId'].nunique()

In [140]:
# Put the two shares side by side, aligned on movieId
rec_percentages = pd.concat([top_similar_users, all_users], axis=1, keys=['similar', 'all'])
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.241379
318,0.585034,0.449918
356,0.551020,0.408867
260,0.530612,0.330049
296,0.503401,0.400657
...,...,...
4475,,0.001642
4478,,0.001642
4488,,0.001642
4583,,0.001642


In [143]:
# Score = share among similar users divided by share among all users.
# A high ratio means the movie is liked by similar users more than by the average population.
rec_percentages['score'] = rec_percentages['similar'].div(rec_percentages['all'])
ranked = rec_percentages.sort_values("score", ascending=False)
resulting_movies = ranked.iloc[:10]

In [145]:
# Attach title and genres: the index of `resulting_movies` holds the movieId values
collaborative_recs = resulting_movies.merge(movies, left_index=True, right_on="movieId")

In [146]:
collaborative_recs

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.241379,4.142857,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
2355,0.312925,0.091954,3.403061,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
785,0.210884,0.083744,2.518207,1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical,Mary Poppins 1964
32,0.292517,0.123153,2.375238,34,Babe (1995),Children|Drama,Babe 1995
815,0.29932,0.126437,2.367347,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
1067,0.22449,0.09688,2.317191,1387,Jaws (1975),Action|Horror,Jaws 1975
2103,0.210884,0.091954,2.293367,2797,Big (1988),Comedy|Drama|Fantasy|Romance,Big 1988
701,0.217687,0.095238,2.285714,919,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical,Wizard of Oz The 1939
2038,0.258503,0.114943,2.24898,2716,Ghostbusters (a.k.a. Ghost Busters) (1984),Action|Comedy|Sci-Fi,Ghostbusters a k a Ghost Busters 1984
2195,0.231293,0.10509,2.200893,2918,Ferris Bueller's Day Off (1986),Comedy,Ferris Bueller s Day Off 1986


In [153]:
# Draws a random row position in `movies`. No seed is set in the visible cells,
# so the value shown below will differ from run to run.
np.random.choice(movies.shape[0])

5006

In [156]:
# NOTE(review): this expression is not the last line of the cell, so its value is not displayed.
movies.iloc[5006]
# NOTE(review): find_similar_movies is defined in a later cell of this notebook (In [149]),
# so this call fails on a top-to-bottom run. 7773 is presumably the movieId of the row
# selected above - confirm.
find_similar_movies(7773)

Unnamed: 0,score,title,genres
5835,295.5,"League of Ordinary Gentlemen, A (2004)",Documentary
5818,295.5,Hostage (2005),Action|Crime|Drama|Thriller
5015,295.5,"Bang, Bang, You're Dead (2002)",Drama
4247,295.5,Poolhall Junkies (2002),Comedy|Drama|Thriller
5914,295.5,It's All Gone Pete Tong (2004),Comedy|Drama|Musical
5767,295.5,Elektra (2005),Action|Adventure|Crime|Drama
5360,295.5,"Final Cut, The (2004)",Sci-Fi|Thriller
4116,295.5,Empire (2002),Crime|Drama
3555,295.5,"Last Castle, The (2001)",Action
1611,295.5,"Avengers, The (1998)",Action|Adventure


In [149]:
def find_similar_movies(movie_id,min_rating=4,min_share=.20,top_n=10,ratings_df=None,movies_df=None):
  """Recommend movies liked by the users who also liked `movie_id`.

  "Liked" means a rating of at least `min_rating`. Users who liked `movie_id`
  are the similar users. Each candidate movie is scored as
  (share of similar users who liked it) / (share of all users who liked it),
  so movies favoured specifically by similar users rank above movies that
  everybody likes.

  Parameters
  ----------
  movie_id : int
      movieId of the movie to find recommendations for.
  min_rating : float, default 4
      Lowest rating that counts as "liked".
  min_share : float, default 0.20
      A movie is a candidate only if more than this fraction of the similar
      users liked it.
  top_n : int, default 10
      Number of recommendations to return.
  ratings_df : pandas.DataFrame, optional
      Frame with 'userId', 'movieId' and 'rating' columns. Defaults to the
      notebook-level `ratings`.
  movies_df : pandas.DataFrame, optional
      Frame with 'movieId', 'title' and 'genres' columns. Defaults to the
      notebook-level `movies`.

  Returns
  -------
  pandas.DataFrame
      Columns 'score', 'title', 'genres', best score first. Empty when no
      user liked `movie_id`.
  """
  ratings_df=ratings if ratings_df is None else ratings_df
  movies_df=movies if movies_df is None else movies_df

  # Every step below only looks at "liked" ratings, so filter once up front
  liked=ratings_df[ratings_df['rating'] >= min_rating]

  similar_users=liked[liked['movieId']==movie_id]     # users who liked the queried movie
  if similar_users.empty:
    # Nobody liked this movie: nothing to recommend, and this avoids dividing by zero below
    return pd.DataFrame(columns=['score','title','genres'])

  similar_user_recs=liked[liked['userId'].isin(similar_users['userId'])]  # everything the similar users liked

  # Fraction of similar users who liked each movie, restricted to movies above the threshold
  similar_share=similar_user_recs['movieId'].value_counts()/len(similar_users)
  similar_share=similar_share[similar_share>min_share]

  # Fraction of users (among those who liked any candidate) who liked each candidate movie
  candidate_likes=liked[liked['movieId'].isin(similar_share.index)]
  all_share=candidate_likes['movieId'].value_counts()/candidate_likes['userId'].nunique()

  rec_percentages=pd.concat([similar_share,all_share],axis=1)  # aligned on movieId
  rec_percentages.columns=['similar','all']

  # A high ratio means similar users like the movie more than the general public does
  rec_percentages['score']=rec_percentages['similar']/rec_percentages['all']

  resulting_movies=rec_percentages.sort_values(by="score",ascending=False).head(top_n)

  # The index of resulting_movies holds the movieId values; join it to the catalogue for title and genres
  collaborative_recs=pd.merge(resulting_movies,movies_df,left_index=True,right_on="movieId")[['score',"title","genres"]]

  return collaborative_recs







In [150]:
find_similar_movies(1)

Unnamed: 0,score,title,genres
0,3.986395,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2355,3.274538,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
785,2.423103,Mary Poppins (1964),Children|Comedy|Fantasy|Musical
32,2.285533,Babe (1995),Children|Drama
815,2.27794,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical
1067,2.229678,Jaws (1975),Action|Horror
2103,2.206754,Big (1988),Comedy|Drama|Fantasy|Romance
701,2.19939,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical
2038,2.164043,Ghostbusters (a.k.a. Ghost Busters) (1984),Action|Comedy|Sci-Fi
2195,2.117772,Ferris Bueller's Day Off (1986),Comedy


# Hybrid Recommendation System

* We will join the content-based filtering recommendations with collaborative filtering
* We will use the movie id received from the content-based search to find similar users in collaborative filtering

* We will create an interactive widget that combines these two recommendation filters



In [158]:

#Input Widget

# Text box the user types a movie title into
movie_input=widgets.Text(value="Toy Story",description="Movie Title",disabled=False)

# Output area that shows the recommendations
movie_list=widgets.Output()

def on_input(data):
  """Show collaborative recommendations for the best title match of the typed text."""
  with movie_list:
    movie_list.clear_output()
    title=data['new']  # the change event behaves like a dict; 'new' holds the updated text
    if len(title) <= 5:
      return
    best_match=search_title(title).iloc[0]
    display(find_similar_movies(best_match['movieId']))

# Call on_input every time the text box value changes
movie_input.observe(on_input,names="value")

display(movie_input,movie_list)

Text(value='Toy Story', description='Movie Title')

Output()

In [161]:
def movie_recommendation(default_title="Toy Story",min_chars=6):
  """Display a text box that shows hybrid recommendations for the typed movie title.

  On every change of the text, the best title match from search_title() is
  passed to find_similar_movies() and the resulting table is shown below the box.

  Parameters
  ----------
  default_title : str, default "Toy Story"
      Text initially shown in the input box.
  min_chars : int, default 6
      Minimum number of typed characters before a search is run. Pass a
      smaller value to allow short titles.
  """
  movie_input=widgets.Text(
      value=default_title,
      description="Movie Title",
      disabled=False
  )

  # Output area that holds the recommendation table
  movie_list=widgets.Output()

  def on_input(data):
    """Refresh the output area for the new text-box value."""
    with movie_list:
      movie_list.clear_output()
      title=data['new']  # the change event behaves like a dict; 'new' holds the updated text
      if len(title) < min_chars:
        return
      result=search_title(title)
      movie_id=result['movieId'].iloc[0]  # movieId of the closest title match
      display(find_similar_movies(movie_id))

  # Call on_input every time the text box value changes
  movie_input.observe(on_input,names="value")

  display(movie_input,movie_list)

In [162]:
movie_recommendation()

Text(value='Toy Story', description='Movie Title')

Output()

In [163]:
movie_recommendation()

Text(value='Toy Story', description='Movie Title')

Output()

In [164]:
movie_recommendation()

Text(value='Toy Story', description='Movie Title')

Output()