In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn
import re

In [2]:
pip install scikit-surprise

Note: you may need to restart the kernel to use updated packages.


In [3]:
from surprise import Reader, Dataset, SVD, accuracy, NMF, KNNBasic, KNNWithMeans
from surprise.model_selection import cross_validate

# Import and Check Dataset

In [4]:
#from google.colab import drive
#drive.mount('/content/drive')

In [5]:
#First we get our datasets from movielens: 
#https://grouplens.org/datasets/movielens/latest/

#this link contains two datasets
#one for 100k and another for 25m
#we will mainly be using the 25m 

links = pd.read_csv("links.csv")
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")

In [6]:
links.dropna(inplace=True)
print(links.isnull().sum())
links.head(10)

movieId    0
imdbId     0
tmdbId     0
dtype: int64


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
5,6,113277,949.0
6,7,114319,11860.0
7,8,112302,45325.0
8,9,114576,9091.0
9,10,113189,710.0


In [7]:
movies.genres = movies.genres.str.split('|')
print(movies.isnull().sum())
movies.head(10)

movieId    0
title      0
genres     0
dtype: int64


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
5,6,Heat (1995),"[Action, Crime, Thriller]"
6,7,Sabrina (1995),"[Comedy, Romance]"
7,8,Tom and Huck (1995),"[Adventure, Children]"
8,9,Sudden Death (1995),[Action]
9,10,GoldenEye (1995),"[Action, Adventure, Thriller]"


In [8]:
movies[movies['title'] == 'Toy Story (1995)'].movieId[0]

1

In [9]:
ratings.dropna(inplace=True)
print(ratings.isnull().sum())
ratings.head(10)

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264
5,1,1590,2.5,1256677236
6,1,1591,1.5,1256677475
7,1,2134,4.5,1256677464
8,1,2478,4.0,1256677239
9,1,2840,3.0,1256677500


In [10]:
print(len(tags['tag'].unique()))
tags.dropna(inplace=True)
print(tags.isnull().sum())
tags.head(10)

74715
userId       0
movieId      0
tag          0
timestamp    0
dtype: int64


Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,Medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,space action,1442169421
4,14,318,imdb top 250,1442615195
5,14,318,justice,1442615192
6,14,480,Dinosaurs,1443148563
7,14,593,psychothriller,1444014286
8,14,1682,philosophy,1442615158
9,14,1682,surveillance,1442615167


In [11]:
print("Movies:",movies.shape)
print("Ratings:",ratings.shape)

Movies: (58098, 3)
Ratings: (27753444, 4)


# Simple Recommender

In [12]:
movies_ratings_merged=movies.merge(ratings, on='movieId')
movies_ratings_merged.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",4,4.0,1113765937
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",10,5.0,948885850
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",14,4.5,1442169375
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",15,4.0,1370810063
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",22,4.0,1237622631


In [13]:
movies_average_rating=movies_ratings_merged.groupby('title')['rating'].mean().sort_values(ascending=False).reset_index().rename(columns={'rating':'Average Rating'})
print(movies_average_rating.shape)
movies_average_rating.head()


(53817, 2)


Unnamed: 0,title,Average Rating
0,VHS Revolution (2017),5.0
1,We Are Mountains (1969),5.0
2,Dragon Lee Vs. The 5 Brothers (1978),5.0
3,Living on Love (1937),5.0
4,Change of Plans (2011),5.0


In [14]:
movies_rating_count=movies_ratings_merged.groupby('title')['rating'].count().sort_values(ascending=True).reset_index().rename(columns={'rating':'Rating Count'}) 
movies_rating_count_avg=movies_rating_count.merge(movies_average_rating, on='title')
print(movies_rating_count_avg.shape)
movies_rating_count_avg.head()

(53817, 3)


Unnamed: 0,title,Rating Count,Average Rating
0,Good Times (Beste Zeit) (2007),1,3.0
1,Tokyo Girl (2008),1,2.5
2,The Ape Woman (1964),1,3.5
3,"Byron, Ballad for a Daemon (1998)",1,0.5
4,Radio Arrow (1998),1,4.0


In [15]:
movies_genres = movies_rating_count_avg.merge(movies, on='title')
movies_genres

Unnamed: 0,title,Rating Count,Average Rating,movieId,genres
0,Good Times (Beste Zeit) (2007),1,3.000000,114355,"[Action, Drama, Thriller]"
1,Tokyo Girl (2008),1,2.500000,149436,[(no genres listed)]
2,The Ape Woman (1964),1,3.500000,137652,"[Comedy, Drama]"
3,"Byron, Ballad for a Daemon (1998)",1,0.500000,174915,[Drama]
4,Radio Arrow (1998),1,4.000000,182415,"[Comedy, Drama]"
...,...,...,...,...,...
53890,"Matrix, The (1999)",84545,4.149695,2571,"[Action, Sci-Fi, Thriller]"
53891,"Silence of the Lambs, The (1991)",87899,4.151412,593,"[Crime, Horror, Thriller]"
53892,Pulp Fiction (1994),92406,4.173971,296,"[Comedy, Crime, Drama, Thriller]"
53893,Forrest Gump (1994),97040,4.056585,356,"[Comedy, Drama, Romance, War]"


In [16]:
popularity_threshold = 2000
popular_movies= movies_genres[movies_genres['Rating Count']>=popularity_threshold]
print(popular_movies.shape)
popular_movies.sort_values(by='Average Rating', ascending=False).head(15)

(2569, 5)


Unnamed: 0,title,Rating Count,Average Rating,movieId,genres
53894,"Shawshank Redemption, The (1994)",97999,4.424188,318,"[Crime, Drama]"
53877,"Godfather, The (1972)",60904,4.332893,858,"[Crime, Drama]"
53879,"Usual Suspects, The (1995)",62180,4.291959,50,"[Crime, Mystery, Thriller]"
53834,"Godfather: Part II, The (1974)",38875,4.263035,1221,"[Crime, Drama]"
53887,Schindler's List (1993),71516,4.257502,527,"[Drama, War]"
53495,Seven Samurai (Shichinin no samurai) (1954),14578,4.254116,2019,"[Action, Adventure, Drama]"
53608,12 Angry Men (1957),17931,4.237075,1203,[Drama]
53692,Rear Window (1954),22264,4.230799,904,"[Mystery, Thriller]"
53882,Fight Club (1999),65678,4.230663,2959,"[Action, Crime, Drama, Thriller]"
53845,One Flew Over the Cuckoo's Nest (1975),42181,4.22292,1193,[Drama]


In [17]:
def Genre_recommender(movies_genres, filter_genre, popularity_threshold = 1000):

  mask = movies_genres['genres'].apply(lambda x: filter_genre in x)
  df = movies_genres[mask]
  popular_movies = df[df['Rating Count']>=popularity_threshold]
  popular_movies = popular_movies.sort_values('Average Rating', ascending=False).head(300)

  return(popular_movies)

In [18]:
Genre_recommender(movies_genres, 'Comedy', popularity_threshold=1000).head(15)

Unnamed: 0,title,Rating Count,Average Rating,movieId,genres
53778,Dr. Strangelove or: How I Learned to Stop Worr...,29484,4.208876,750,"[Comedy, War]"
53892,Pulp Fiction (1994),92406,4.173971,296,"[Comedy, Crime, Drama, Thriller]"
53755,Life Is Beautiful (La Vita è bella) (1997),26995,4.163178,2324,"[Comedy, Drama, Romance, War]"
52200,"Thin Man, The (1934)",3704,4.150783,950,"[Comedy, Crime]"
53841,Monty Python and the Holy Grail (1975),40866,4.148351,1136,"[Adventure, Comedy, Fantasy]"
53606,"Sting, The (1973)",17906,4.141629,1234,"[Comedy, Crime]"
52491,It Happened One Night (1934),4750,4.134211,905,"[Comedy, Romance]"
53439,Intouchables (2011),13573,4.127533,92259,"[Comedy, Drama]"
53593,Wallace & Gromit: The Wrong Trousers (1993),17337,4.127531,1148,"[Animation, Children, Comedy, Crime]"
53849,"Princess Bride, The (1987)",42878,4.124808,1197,"[Action, Adventure, Comedy, Fantasy, Romance]"


In [19]:
import re
def clean_title_date(title):
  return(re.sub("[^a-zA-Z0-9 ]", "", title))


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(popular_movies["title"].apply(clean_title_date))

In [21]:
from sklearn.metrics.pairwise import cosine_similarity
def search_title(title):
  title = clean_title_date(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()
  indices = np.argpartition(similarity, -10)[-10:]
  results = popular_movies.iloc[indices][::-1]
  return results

In [22]:
search_title('Gladiator')

Unnamed: 0,title,Rating Count,Average Rating,movieId,genres
53894,"Shawshank Redemption, The (1994)",97999,4.424188,318,"[Crime, Drama]"
52181,Hot Shots! (1991),3610,3.260249,5541,"[Action, Comedy, Romance, War]"
52235,Gladiator (1992),3853,3.914093,8132,"[Action, Drama]"
53859,Gladiator (2000),48666,3.956335,3578,"[Action, Adventure, Drama]"
52180,Two Weeks Notice (2002),3610,3.226731,5957,"[Comedy, Romance]"
52179,Raise the Red Lantern (Da hong deng long gao g...,3609,4.115406,1280,[Drama]
52184,Aeon Flux (2005),3624,2.911838,37386,"[Action, Sci-Fi]"
52183,"Limey, The (1999)",3619,3.621995,2912,"[Crime, Drama, Thriller]"
52182,Ender's Game (2013),3613,3.40465,106002,"[Action, Adventure, Sci-Fi, IMAX]"
52185,Room (2015),3631,3.979895,140174,[Drama]


In [23]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)

movie_list = widgets.Output()

def on_type(data):
  with movie_list:
    movie_list.clear_output()
    title = data['new']
    if len(title) > 3:
      display(search_title(title))

movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

# K Nearest Neighbors Collaborative Filtering

Due to the size of the dataset we need to only use users and movies that reach certain popularity thresholds

In [24]:
print(movies_ratings_merged.shape)
print(len(movies_ratings_merged['movieId'].unique()))
print(len(movies_ratings_merged['userId'].unique()))
print(53889*283228)

(27753444, 6)
53889
283228
15262873692


In [25]:
movie_ratings_threshold = int(1000)

movies_ratings_merged['count'] = movies_ratings_merged.groupby('movieId')['userId'].transform('count')
df2 = movies_ratings_merged.loc[movies_ratings_merged['count'] >= movie_ratings_threshold].drop(columns='count')
df2

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",4,4.0,1113765937
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",10,5.0,948885850
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",14,4.5,1442169375
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",15,4.0,1370810063
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",22,4.0,1237622631
...,...,...,...,...,...,...
27744150,187593,Deadpool 2 (2018),"[Action, Comedy, Sci-Fi]",282342,2.5,1528309531
27744151,187593,Deadpool 2 (2018),"[Action, Comedy, Sci-Fi]",282414,4.0,1527212481
27744152,187593,Deadpool 2 (2018),"[Action, Comedy, Sci-Fi]",282475,4.5,1531158389
27744153,187593,Deadpool 2 (2018),"[Action, Comedy, Sci-Fi]",282670,3.5,1535485882


In [26]:
user_ratings_threshold = int(100)

df2['count'] = df2.groupby('userId')['movieId'].transform('count')
df3 = df2.loc[df2['count'] >= user_ratings_threshold].drop(columns='count')
df3['title'] = df3['title'].apply(clean_title_date)
df3

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story 1995,"[Adventure, Animation, Children, Comedy, Fantasy]",4,4.0,1113765937
1,1,Toy Story 1995,"[Adventure, Animation, Children, Comedy, Fantasy]",10,5.0,948885850
2,1,Toy Story 1995,"[Adventure, Animation, Children, Comedy, Fantasy]",14,4.5,1442169375
3,1,Toy Story 1995,"[Adventure, Animation, Children, Comedy, Fantasy]",15,4.0,1370810063
9,1,Toy Story 1995,"[Adventure, Animation, Children, Comedy, Fantasy]",38,5.0,979742296
...,...,...,...,...,...,...
27744150,187593,Deadpool 2 2018,"[Action, Comedy, Sci-Fi]",282342,2.5,1528309531
27744151,187593,Deadpool 2 2018,"[Action, Comedy, Sci-Fi]",282414,4.0,1527212481
27744152,187593,Deadpool 2 2018,"[Action, Comedy, Sci-Fi]",282475,4.5,1531158389
27744153,187593,Deadpool 2 2018,"[Action, Comedy, Sci-Fi]",282670,3.5,1535485882


In [27]:
df4=df3.pivot(index='title',columns='userId',values='rating').fillna(0)
df4

userId,4,10,14,15,19,26,36,38,42,43,...,283170,283171,283183,283184,283185,283195,283204,283206,283224,283228
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Cloverfield Lane 2016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You 1999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.5,0.0,4.0,0.0,0.0
10000 BC 2008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians 1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians One Hundred and One Dalmatians 1961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,4.5,0.0,5.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
batteries not included 1987,4.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
burbs The 1989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
eXistenZ 1999,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xXx 2002,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
from scipy.sparse import csr_matrix
movie_features_df_matrix = csr_matrix(df4.values)
movie_features_df_matrix

<3931x65244 sparse matrix of type '<class 'numpy.float64'>'
	with 18705850 stored elements in Compressed Sparse Row format>

In [29]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [30]:
titles = df4.index[:]
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(titles)

In [31]:
def knn_recommender(movie_id):
  model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
  model_knn.fit(movie_features_df_matrix)
  distances, indices = model_knn.kneighbors(df4.iloc[movie_id,:].values.reshape(1, -1), n_neighbors = 11)
  for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(df4.index[movie_id]))
    else:
        print('{0}: {1}, with distance of {2}'.format(i, df4.index[indices.flatten()[i]], distances.flatten()[i]))

def search_pivot(title):
  #title = clean_title_date(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()
  indices = np.argpartition(similarity, -10)[-10:]
  results = df4.iloc[indices][::-1]
  results = results.replace(0, np.nan)
  #results['Average Rating'] = results.mean(axis=1)
  #results['# of Reviews'] = results.count(axis=1)
  
  return results.index[0]

def find_index(title):
  index = np.asarray((df4.index == (title))==True).nonzero()
  return(index[0][0])


In [32]:
df4[df4.index == search_pivot("Toy Story")]

userId,4,10,14,15,19,26,36,38,42,43,...,283170,283171,283183,283184,283185,283195,283204,283206,283224,283228
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story 1995,4.0,5.0,4.5,4.0,0.0,0.0,0.0,5.0,4.0,5.0,...,0.0,0.0,5.0,4.0,0.0,5.0,0.0,0.0,0.0,4.5


In [33]:
distances, indices = model_knn.kneighbors(df4[df4.index == search_pivot("Toy Story")].values.reshape(1, -1), n_neighbors = 11)

In [34]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(df4[df4.index == search_pivot("Toy Story")].index[0]))
    else:
        print('{0}: {1}, with distance of {2}!'.format(i, df4.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Toy Story 1995:

1: Forrest Gump 1994, with distance of 0.2947991707335017!
2: Jurassic Park 1993, with distance of 0.30386620861245783!
3: Star Wars Episode IV  A New Hope 1977, with distance of 0.30677729258024!
4: Back to the Future 1985, with distance of 0.30700315479423423!
5: Toy Story 2 1999, with distance of 0.3083871569734219!
6: Star Wars Episode V  The Empire Strikes Back 1980, with distance of 0.3224837399544378!
7: Lion King The 1994, with distance of 0.3248278114283806!
8: Star Wars Episode VI  Return of the Jedi 1983, with distance of 0.3260653129345069!
9: Matrix The 1999, with distance of 0.3313326486650491!
10: Raiders of the Lost Ark Indiana Jones and the Raiders of the Lost Ark 1981, with distance of 0.3315553201574142!


In [35]:
knn_recommender(find_index('Toy Story 1995'))

Recommendations for Toy Story 1995:

1: Forrest Gump 1994, with distance of 0.2947991707335017
2: Jurassic Park 1993, with distance of 0.30386620861245783
3: Star Wars Episode IV  A New Hope 1977, with distance of 0.30677729258024
4: Back to the Future 1985, with distance of 0.30700315479423423
5: Toy Story 2 1999, with distance of 0.3083871569734219
6: Star Wars Episode V  The Empire Strikes Back 1980, with distance of 0.3224837399544378
7: Lion King The 1994, with distance of 0.3248278114283806
8: Star Wars Episode VI  Return of the Jedi 1983, with distance of 0.3260653129345069
9: Matrix The 1999, with distance of 0.3313326486650491
10: Raiders of the Lost Ark Indiana Jones and the Raiders of the Lost Ark 1981, with distance of 0.3315553201574142


In [36]:

movie_title = widgets.Combobox(
  value='',
  placeholder='Choose Movie',
  options=list(df4.index[0:3931]),
  description='Select Movie:',
  ensure_option=True,
  disabled=False
)

import ipywidgets as widgets
from IPython.display import display
button = widgets.Button(description="Get Recs!")

def on_button_clicked(b):
    knn_recommender(find_index(movie_title.value))

button.on_click(on_button_clicked)

display(movie_title, button)


Combobox(value='', description='Select Movie:', ensure_option=True, options=('10 Cloverfield Lane 2016', '10 T…

Button(description='Get Recs!', style=ButtonStyle())

# Content Based Recommender

In [37]:
from ast import literal_eval

In [38]:
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,Medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,space action,1442169421
4,14,318,imdb top 250,1442615195
...,...,...,...,...
1108992,283206,73017,fun,1264379059
1108993,283206,73017,homoerotic subtext,1264379058
1108994,283206,73017,pacing,1264379058
1108995,283206,73017,plot,1264379058


In [39]:
tags['tag'] = tags['tag'].str.lower()
tags['tag'] = tags['tag'].str.replace(' ', '')
tags['tag'] = tags['tag'].str.strip()
print(len(tags['tag'].unique()))
print(len(tags['movieId'].unique()))
print(len(tags['userId'].unique()))
tags['tag'].value_counts()

66389
45981
19325


sci-fi           9953
action           7232
atmospheric      6995
comedy           6681
surreal          5572
                 ... 
ex-fugitive         1
foliesbergère       1
hofmann             1
innersanctum        1
lenseflare          1
Name: tag, Length: 66389, dtype: int64

In [40]:
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,spaceaction,1442169421
4,14,318,imdbtop250,1442615195
...,...,...,...,...
1108992,283206,73017,fun,1264379059
1108993,283206,73017,homoeroticsubtext,1264379058
1108994,283206,73017,pacing,1264379058
1108995,283206,73017,plot,1264379058


In [41]:
movies_tags_merged=tags.merge(movies, on='movieId')
movies_tags_merged

Unnamed: 0,userId,movieId,tag,timestamp,title,genres
0,14,110,epic,1443148538,Braveheart (1995),"[Action, Drama, War]"
1,14,110,medieval,1443148532,Braveheart (1995),"[Action, Drama, War]"
2,815,110,overrated,1150006110,Braveheart (1995),"[Action, Drama, War]"
3,2577,110,oscar(bestpicture),1378324225,Braveheart (1995),"[Action, Drama, War]"
4,3086,110,epic,1463675332,Braveheart (1995),"[Action, Drama, War]"
...,...,...,...,...,...,...
1108976,282119,164453,engineer,1475084794,Failure of Engineer Garin (1973),"[Adventure, Drama, Sci-Fi, Thriller]"
1108977,282119,164453,sovietclassics,1475084778,Failure of Engineer Garin (1973),"[Adventure, Drama, Sci-Fi, Thriller]"
1108978,283000,91960,amazinglybad,1337484461,"Magic Christmas Tree, The (1964)","[Children, Comedy, Fantasy]"
1108979,283000,91960,effeminatetree,1337484484,"Magic Christmas Tree, The (1964)","[Children, Comedy, Fantasy]"


In [42]:
movies_tags_grouped = movies_tags_merged.groupby(['movieId'])['tag'].apply(lambda x: "{%s}" % ','.join(x))
movies_tags_grouped

movieId
1         {animated,buddymovie,cartoon,cgi,comedy,comput...
2         {fantasy,adaptedfrom:book,animals,badcgi,based...
3         {moldy,old,annmargaret,burgessmeredith,darylha...
4         {characters,girlmovie,characters,chickflick,ba...
5         {stevemartin,stevemartin,pregnancy,remake,agin...
                                ...                        
193761    {concert,hostage,kidnapping,operasinger,rebel,...
193811                            {doppelganger,juliedelpy}
193837                {aliens,anime,cat,spacemarines,virus}
193864    {alfredolanda,anticonception,comic,family,hous...
193868    {blackandwhite,comedy,friendship,fun,marriage,...
Name: tag, Length: 45981, dtype: object

In [43]:
movies_all_tags=movies.merge(movies_tags_grouped, on='movieId')
movies_all_tags

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","{animated,buddymovie,cartoon,cgi,comedy,comput..."
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","{fantasy,adaptedfrom:book,animals,badcgi,based..."
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","{moldy,old,annmargaret,burgessmeredith,darylha..."
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","{characters,girlmovie,characters,chickflick,ba..."
4,5,Father of the Bride Part II (1995),[Comedy],"{stevemartin,stevemartin,pregnancy,remake,agin..."
...,...,...,...,...
45976,193761,Bel Canto (2018),"[Drama, Thriller]","{concert,hostage,kidnapping,operasinger,rebel,..."
45977,193811,Burning Shadow (2018),[Thriller],"{doppelganger,juliedelpy}"
45978,193837,Lily C.A.T. (1987),"[Animation, Horror, Sci-Fi]","{aliens,anime,cat,spacemarines,virus}"
45979,193864,No somos de piedra (1968),[Comedy],"{alfredolanda,anticonception,comic,family,hous..."


In [44]:
def clean_title(title):
  clean_title = re.sub("[^a-zA-Z0-9 ]", "", title)
  return(clean_title.rstrip("0123456789"))


In [45]:
movies_all_tags['clean_title'] = movies_all_tags['title'].apply(clean_title)
movies_all_tags

Unnamed: 0,movieId,title,genres,tag,clean_title
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","{animated,buddymovie,cartoon,cgi,comedy,comput...",Toy Story
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","{fantasy,adaptedfrom:book,animals,badcgi,based...",Jumanji
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","{moldy,old,annmargaret,burgessmeredith,darylha...",Grumpier Old Men
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","{characters,girlmovie,characters,chickflick,ba...",Waiting to Exhale
4,5,Father of the Bride Part II (1995),[Comedy],"{stevemartin,stevemartin,pregnancy,remake,agin...",Father of the Bride Part II
...,...,...,...,...,...
45976,193761,Bel Canto (2018),"[Drama, Thriller]","{concert,hostage,kidnapping,operasinger,rebel,...",Bel Canto
45977,193811,Burning Shadow (2018),[Thriller],"{doppelganger,juliedelpy}",Burning Shadow
45978,193837,Lily C.A.T. (1987),"[Animation, Horror, Sci-Fi]","{aliens,anime,cat,spacemarines,virus}",Lily CAT
45979,193864,No somos de piedra (1968),[Comedy],"{alfredolanda,anticonception,comic,family,hous...",No somos de piedra


In [46]:
movies_genres_tags = movies_all_tags
movies_genres_tags['tag'] = movies_genres_tags['tag'].str.lstrip('{').str.rstrip('}')
movies_genres_tags['tag_list'] = movies_genres_tags['tag'].str.split(',')
movies_genres_tags['title_genres_tags'] = movies_genres_tags['genres'] + movies_genres_tags['tag_list']
movies_genres_tags.drop(['tag', 'genres', 'tag_list'], inplace=True, axis=1)
movies_genres_tags

Unnamed: 0,movieId,title,clean_title,title_genres_tags
0,1,Toy Story (1995),Toy Story,"[Adventure, Animation, Children, Comedy, Fanta..."
1,2,Jumanji (1995),Jumanji,"[Adventure, Children, Fantasy, fantasy, adapte..."
2,3,Grumpier Old Men (1995),Grumpier Old Men,"[Comedy, Romance, moldy, old, annmargaret, bur..."
3,4,Waiting to Exhale (1995),Waiting to Exhale,"[Comedy, Drama, Romance, characters, girlmovie..."
4,5,Father of the Bride Part II (1995),Father of the Bride Part II,"[Comedy, stevemartin, stevemartin, pregnancy, ..."
...,...,...,...,...
45976,193761,Bel Canto (2018),Bel Canto,"[Drama, Thriller, concert, hostage, kidnapping..."
45977,193811,Burning Shadow (2018),Burning Shadow,"[Thriller, doppelganger, juliedelpy]"
45978,193837,Lily C.A.T. (1987),Lily CAT,"[Animation, Horror, Sci-Fi, aliens, anime, cat..."
45979,193864,No somos de piedra (1968),No somos de piedra,"[Comedy, alfredolanda, anticonception, comic, ..."


In [47]:
movies_genres_tags['title_genres_tags'] = movies_genres_tags['title_genres_tags'].astype(str) + " " + movies_genres_tags['clean_title']
movies_genres_tags

Unnamed: 0,movieId,title,clean_title,title_genres_tags
0,1,Toy Story (1995),Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."
1,2,Jumanji (1995),Jumanji,"['Adventure', 'Children', 'Fantasy', 'fantasy'..."
2,3,Grumpier Old Men (1995),Grumpier Old Men,"['Comedy', 'Romance', 'moldy', 'old', 'annmarg..."
3,4,Waiting to Exhale (1995),Waiting to Exhale,"['Comedy', 'Drama', 'Romance', 'characters', '..."
4,5,Father of the Bride Part II (1995),Father of the Bride Part II,"['Comedy', 'stevemartin', 'stevemartin', 'preg..."
...,...,...,...,...
45976,193761,Bel Canto (2018),Bel Canto,"['Drama', 'Thriller', 'concert', 'hostage', 'k..."
45977,193811,Burning Shadow (2018),Burning Shadow,"['Thriller', 'doppelganger', 'juliedelpy'] Bur..."
45978,193837,Lily C.A.T. (1987),Lily CAT,"['Animation', 'Horror', 'Sci-Fi', 'aliens', 'a..."
45979,193864,No somos de piedra (1968),No somos de piedra,"['Comedy', 'alfredolanda', 'anticonception', '..."


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

In [49]:
tf = TfidfVectorizer(analyzer= 'word',ngram_range=(1,1),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies_genres_tags['title_genres_tags'].astype(str))
tfidf_matrix.shape

(45981, 85641)

In [50]:
tfidf_matrix

<45981x85641 sparse matrix of type '<class 'numpy.float64'>'
	with 673199 stored elements in Compressed Sparse Row format>

In [51]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [52]:
cosine_sim.shape

(45981, 45981)

In [53]:
movies_genres_tags = movies_genres_tags.reset_index()
titles = movies_genres_tags['title']
indices = pd.Series(movies_genres_tags.index, index=movies_genres_tags['title'])

movies_genres_tags

Unnamed: 0,index,movieId,title,clean_title,title_genres_tags
0,0,1,Toy Story (1995),Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."
1,1,2,Jumanji (1995),Jumanji,"['Adventure', 'Children', 'Fantasy', 'fantasy'..."
2,2,3,Grumpier Old Men (1995),Grumpier Old Men,"['Comedy', 'Romance', 'moldy', 'old', 'annmarg..."
3,3,4,Waiting to Exhale (1995),Waiting to Exhale,"['Comedy', 'Drama', 'Romance', 'characters', '..."
4,4,5,Father of the Bride Part II (1995),Father of the Bride Part II,"['Comedy', 'stevemartin', 'stevemartin', 'preg..."
...,...,...,...,...,...
45976,45976,193761,Bel Canto (2018),Bel Canto,"['Drama', 'Thriller', 'concert', 'hostage', 'k..."
45977,45977,193811,Burning Shadow (2018),Burning Shadow,"['Thriller', 'doppelganger', 'juliedelpy'] Bur..."
45978,45978,193837,Lily C.A.T. (1987),Lily CAT,"['Animation', 'Horror', 'Sci-Fi', 'aliens', 'a..."
45979,45979,193864,No somos de piedra (1968),No somos de piedra,"['Comedy', 'alfredolanda', 'anticonception', '..."


In [54]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return movies_genres_tags.iloc[movie_indices]

In [55]:
from sklearn.metrics.pairwise import cosine_similarity
def search_title(title):
  title = clean_title_date(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()
  indices = np.argpartition(similarity, -10)[-10:]
  results = popular_movies.iloc[indices][::-1]
  return results

In [56]:
get_recommendations('13 Going on 30 (2004)').head(10)


Unnamed: 0,index,movieId,title,clean_title,title_genres_tags
11119,11119,50792,Catch and Release (2006),Catch and Release,"['Comedy', 'Drama', 'Romance', 'jennifergarner..."
9938,9938,36525,Just Like Heaven (2005),Just Like Heaven,"['Comedy', 'Fantasy', 'Romance', 'chickflick',..."
2559,2559,2797,Big (1988),Big,"['Comedy', 'Drama', 'Fantasy', 'Romance', 'com..."
1805,1805,2014,Freaky Friday (1977),Freaky Friday,"['Children', 'Comedy', 'Fantasy', 'switchingpl..."
9248,9248,31221,Elektra (2005),Elektra,"['Action', 'Adventure', 'Crime', 'Drama', 'com..."
14206,14206,74450,Valentine's Day (2010),Valentines Day,"['Comedy', 'Romance', 'comedy', 'ashtonkutcher..."
7761,7761,8813,We Don't Live Here Anymore (2004),We Dont Live Here Anymore,"['Drama', 'underrated', 'adultery', 'forbidden..."
23138,23138,116161,Foxcatcher (2014),Foxcatcher,"['Drama', 'markruffalo', 'wrestling', 'cynical..."
13666,13666,71520,"Invention of Lying, The (2009)",Invention of Lying The,"['Comedy', 'aboutlies', 'atheism', 'fantasy', ..."
5766,5766,6157,Daredevil (2003),Daredevil,"['Action', 'Crime', 'notasgoodasothercomicfilm..."


In [57]:
import ipywidgets as widgets
from IPython.display import display

movie_title = widgets.Combobox(
  value='',
  placeholder='Choose Movie',
  options=list(movies_genres_tags['title']),
  description='Select Movie:',
  ensure_option=True,
  disabled=False
)

button = widgets.Button(description="Get Recs!")

def on_button_clicked(b):
  print(get_recommendations(movie_title.value).title.head(10))

(button.on_click(on_button_clicked))

display(movie_title, button)


Combobox(value='', description='Select Movie:', ensure_option=True, options=('Toy Story (1995)', 'Jumanji (199…

Button(description='Get Recs!', style=ButtonStyle())

In [None]:
del cosine_sim, tfidf_matrix

# SVD Collaborative Filtering

In [58]:
reader = Reader()

In [65]:
movie_ratings_threshold = int(2000)
user_ratings_threshold = int(200)

movies_ratings_merged['count'] = movies_ratings_merged.groupby('movieId')['userId'].transform('count')
df2 = movies_ratings_merged.loc[movies_ratings_merged['count'] >= movie_ratings_threshold].drop(columns='count')
df2['count'] = df2.groupby('userId')['movieId'].transform('count')
df2 = df2.loc[df2['count'] >= user_ratings_threshold].drop(columns='count')
df2

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",4,4.0,1113765937
10,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",42,4.0,1251234072
11,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",43,5.0,1085779023
12,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",51,4.0,922276164
14,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",55,5.0,1059057617
...,...,...,...,...,...,...
27714106,179819,Star Wars: The Last Jedi (2017),"[Action, Adventure, Fantasy, Sci-Fi]",282421,3.0,1523662958
27714107,179819,Star Wars: The Last Jedi (2017),"[Action, Adventure, Fantasy, Sci-Fi]",282475,3.5,1521134945
27714108,179819,Star Wars: The Last Jedi (2017),"[Action, Adventure, Fantasy, Sci-Fi]",282670,2.5,1527720202
27714109,179819,Star Wars: The Last Jedi (2017),"[Action, Adventure, Fantasy, Sci-Fi]",282808,4.0,1517156568


In [66]:
data = Dataset.load_from_df(df2[['userId', 'movieId', 'rating']], reader)

In [67]:
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.7378  0.7378  0.7376  0.7377  0.0001  
MAE (testset)     0.5597  0.5598  0.5595  0.5597  0.0001  
Fit time          93.86   92.15   92.29   92.76   0.77    
Test time         43.66   33.58   32.84   36.69   4.94    


{'test_rmse': array([0.7377527 , 0.73780216, 0.73764498]),
 'test_mae': array([0.55970498, 0.55981893, 0.55951931]),
 'fit_time': (93.85506558418274, 92.14925265312195, 92.28592538833618),
 'test_time': (43.65939712524414, 33.575533866882324, 32.839865922927856)}

In [68]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1cc71baa7c0>

In [69]:
test = df2[df2['userId'] == 4]
test[test['movieId'] == 5]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
114186,5,Father of the Bride Part II (1995),[Comedy],4,2.0,1123990453


In [70]:
svd.predict(4, 5)

Prediction(uid=4, iid=5, r_ui=None, est=1.8890108628129079, details={'was_impossible': False})

In [71]:
import IPython
import ipywidgets as widgets
from IPython.display import display

In [72]:
len(df2['userId'].unique())

30177

In [73]:
def get_movie_id(Movie_Title):
  return int(movies[movies['title'] == str(Movie_Title)].movieId)

In [74]:
get_movie_id(Movie_Title='Jumanji (1995)')

2

In [75]:
users = list(df2['userId'].value_counts().index.sort_values())
list_string = map(str, users)
print(list(list_string))

['4', '19', '42', '43', '51', '55', '56', '73', '79', '81', '88', '100', '114', '134', '138', '147', '160', '173', '176', '183', '196', '214', '235', '239', '248', '255', '258', '268', '277', '294', '295', '313', '319', '321', '332', '339', '343', '357', '363', '372', '374', '378', '382', '384', '402', '408', '428', '443', '449', '450', '458', '465', '471', '473', '491', '497', '502', '505', '526', '540', '549', '553', '559', '572', '590', '593', '601', '603', '605', '627', '658', '667', '669', '670', '697', '698', '716', '719', '738', '758', '776', '788', '804', '807', '814', '815', '818', '828', '830', '842', '854', '856', '859', '861', '864', '867', '869', '890', '907', '911', '917', '920', '926', '930', '942', '953', '970', '972', '977', '982', '995', '996', '1000', '1010', '1024', '1028', '1030', '1035', '1040', '1045', '1051', '1059', '1064', '1068', '1073', '1075', '1077', '1086', '1089', '1093', '1094', '1117', '1118', '1120', '1132', '1143', '1153', '1158', '1165', '1171', '11

In [76]:
import ipywidgets as widgets
from IPython.display import display

movie_title = widgets.Combobox(
  value='',
  placeholder='Choose Movie',
  options=list(popular_movies['title'].sort_values(ascending=True)),
  description='Select Movie:',
  ensure_option=True,
  disabled=False
)

user_id = widgets.Combobox(
  value='',
  placeholder='Choose User',
  options=(list(map(str, list(df2['userId'].value_counts().index.sort_values())))),
  description='Select User:',
  ensure_option=True,
  disabled=False
)

button = widgets.Button(description="Get Predicted Score!")

def on_button_clicked(b):
  print("Movie: ", movie_title.value)
  print("Movie ID: ", get_movie_id(movie_title.value))
  print("User: ", int(user_id.value))
  prediction = svd.predict(int(user_id.value), get_movie_id(movie_title.value))
  print(prediction)

button.on_click(on_button_clicked)

display(movie_title, user_id, button)

Combobox(value='', description='Select Movie:', ensure_option=True, options=("'burbs, The (1989)", '(500) Days…

Combobox(value='', description='Select User:', ensure_option=True, options=('4', '19', '42', '43', '51', '55',…

Button(description='Get Predicted Score!', style=ButtonStyle())

Movie:  


TypeError: ignored

Movie:  


TypeError: ignored

# Non Negative Matrix Factorization

In [77]:
nmf = NMF()
cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8379  0.8360  0.8370  0.8369  0.0008  
MAE (testset)     0.6406  0.6388  0.6401  0.6398  0.0008  
Fit time          167.06  163.41  162.54  164.34  1.96    
Test time         42.81   38.16   31.04   37.33   4.84    


{'test_rmse': array([0.83792544, 0.83596483, 0.83695513]),
 'test_mae': array([0.64062948, 0.63877086, 0.64009173]),
 'fit_time': (167.06482768058777, 163.40985822677612, 162.54181742668152),
 'test_time': (42.807225942611694, 38.1556282043457, 31.035223722457886)}

In [78]:
nmf.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1cda2eb73a0>

In [79]:
nmf.predict(4,5)

Prediction(uid=4, iid=5, r_ui=None, est=2.9609744845057815, details={'was_impossible': False})

# KNNBasic 

In [59]:
links_small = pd.read_csv("links_small.csv")
movies_small = pd.read_csv("movies_small.csv")
ratings_small = pd.read_csv("ratings_small.csv")
tags_small = pd.read_csv("tags_small.csv")

movie_ratings_merged_small =movies_small.merge(ratings_small, on='movieId')
movie_ratings_merged_small

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [60]:
movie_ratings_threshold_small = int(1)
user_ratings_threshold_small = int(1)

movie_ratings_merged_small['count'] = movie_ratings_merged_small.groupby('movieId')['userId'].transform('count')
df2_small = movie_ratings_merged_small.loc[movie_ratings_merged_small['count'] >= movie_ratings_threshold_small].drop(columns='count')
df2_small['count'] = df2_small.groupby('userId')['movieId'].transform('count')
df2_small = df2_small.loc[df2_small['count'] >= user_ratings_threshold_small].drop(columns='count')
df2_small

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [61]:
data_small = Dataset.load_from_df(df2_small[['userId', 'movieId', 'rating']], reader)

In [62]:
knnBasic = KNNBasic()
cross_validate(knnBasic, data_small, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9591  0.9574  0.9614  0.9593  0.0016  
MAE (testset)     0.7347  0.7345  0.7395  0.7362  0.0023  
Fit time          0.09    0.09    0.08    0.09    0.00    
Test time         1.42    1.36    1.44    1.40    0.04    


{'test_rmse': array([0.95907767, 0.95741664, 0.96142629]),
 'test_mae': array([0.73465802, 0.73445061, 0.73950362]),
 'fit_time': (0.08707904815673828, 0.09208369255065918, 0.08307576179504395),
 'test_time': (1.4162888526916504, 1.3552320003509521, 1.4381434917449951)}

In [63]:
nmf = NMF()
cross_validate(nmf, data_small, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9300  0.9354  0.9344  0.9333  0.0024  
MAE (testset)     0.7132  0.7154  0.7145  0.7144  0.0009  
Fit time          1.48    1.59    1.42    1.50    0.07    
Test time         0.16    0.15    0.15    0.15    0.00    


{'test_rmse': array([0.92997835, 0.93543164, 0.93441118]),
 'test_mae': array([0.71321014, 0.71535643, 0.7145144 ]),
 'fit_time': (1.4793453216552734, 1.5934498310089111, 1.4222948551177979),
 'test_time': (0.15512514114379883, 0.14913535118103027, 0.1461319923400879)}

In [64]:
svd = SVD()
cross_validate(svd, data_small, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8858  0.8805  0.8736  0.8800  0.0050  
MAE (testset)     0.6811  0.6767  0.6715  0.6764  0.0039  
Fit time          0.64    0.65    0.62    0.64    0.01    
Test time         0.16    0.26    0.16    0.19    0.05    


{'test_rmse': array([0.88579423, 0.88045597, 0.87363181]),
 'test_mae': array([0.68107294, 0.67669228, 0.67146255]),
 'fit_time': (0.6425838470458984, 0.645587682723999, 0.6245682239532471),
 'test_time': (0.15914535522460938, 0.2622382640838623, 0.1561415195465088)}

# Hybrid Recommender

In [80]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [81]:
movies_genres_tags = movies_genres_tags.reset_index()
titles = movies_genres_tags['title']
indices = pd.Series(movies_genres_tags.index, index=movies_genres_tags['title'])

movies_genres_tags

Unnamed: 0,level_0,index,movieId,title,clean_title,title_genres_tags
0,0,0,1,Toy Story (1995),Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."
1,1,1,2,Jumanji (1995),Jumanji,"['Adventure', 'Children', 'Fantasy', 'fantasy'..."
2,2,2,3,Grumpier Old Men (1995),Grumpier Old Men,"['Comedy', 'Romance', 'moldy', 'old', 'annmarg..."
3,3,3,4,Waiting to Exhale (1995),Waiting to Exhale,"['Comedy', 'Drama', 'Romance', 'characters', '..."
4,4,4,5,Father of the Bride Part II (1995),Father of the Bride Part II,"['Comedy', 'stevemartin', 'stevemartin', 'preg..."
...,...,...,...,...,...,...
45976,45976,45976,193761,Bel Canto (2018),Bel Canto,"['Drama', 'Thriller', 'concert', 'hostage', 'k..."
45977,45977,45977,193811,Burning Shadow (2018),Burning Shadow,"['Thriller', 'doppelganger', 'juliedelpy'] Bur..."
45978,45978,45978,193837,Lily C.A.T. (1987),Lily CAT,"['Animation', 'Horror', 'Sci-Fi', 'aliens', 'a..."
45979,45979,45979,193864,No somos de piedra (1968),No somos de piedra,"['Comedy', 'alfredolanda', 'anticonception', '..."


In [82]:
df2

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",4,4.0,1113765937
10,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",42,4.0,1251234072
11,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",43,5.0,1085779023
12,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",51,4.0,922276164
14,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",55,5.0,1059057617
...,...,...,...,...,...,...
27714106,179819,Star Wars: The Last Jedi (2017),"[Action, Adventure, Fantasy, Sci-Fi]",282421,3.0,1523662958
27714107,179819,Star Wars: The Last Jedi (2017),"[Action, Adventure, Fantasy, Sci-Fi]",282475,3.5,1521134945
27714108,179819,Star Wars: The Last Jedi (2017),"[Action, Adventure, Fantasy, Sci-Fi]",282670,2.5,1527720202
27714109,179819,Star Wars: The Last Jedi (2017),"[Action, Adventure, Fantasy, Sci-Fi]",282808,4.0,1517156568


In [83]:
popular_movies.sort_values('Rating Count', ascending=False)

Unnamed: 0,title,Rating Count,Average Rating,movieId,genres
53894,"Shawshank Redemption, The (1994)",97999,4.424188,318,"[Crime, Drama]"
53893,Forrest Gump (1994),97040,4.056585,356,"[Comedy, Drama, Romance, War]"
53892,Pulp Fiction (1994),92406,4.173971,296,"[Comedy, Crime, Drama, Thriller]"
53891,"Silence of the Lambs, The (1991)",87899,4.151412,593,"[Crime, Horror, Thriller]"
53890,"Matrix, The (1999)",84545,4.149695,2571,"[Action, Sci-Fi, Thriller]"
...,...,...,...,...,...
51330,Man on Wire (2008),2006,3.829262,60766,[Documentary]
51329,"Life Less Ordinary, A (1997)",2004,3.185878,1658,"[Romance, Thriller]"
51328,"Big Blue, The (Grand bleu, Le) (1988)",2003,3.839241,1216,"[Adventure, Drama, Romance]"
51327,Scary Movie 4 (2006),2001,2.327086,44972,"[Comedy, Horror]"


In [84]:
indices

title
Toy Story (1995)                          0
Jumanji (1995)                            1
Grumpier Old Men (1995)                   2
Waiting to Exhale (1995)                  3
Father of the Bride Part II (1995)        4
                                      ...  
Bel Canto (2018)                      45976
Burning Shadow (2018)                 45977
Lily C.A.T. (1987)                    45978
No somos de piedra (1968)             45979
Dos tipos de cuidado (1953)           45980
Length: 45981, dtype: int64

In [85]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return movies_genres_tags.iloc[movie_indices]

In [86]:
get_recommendations("Toy Story (1995)")

Unnamed: 0,level_0,index,movieId,title,clean_title,title_genres_tags
2866,2866,2866,3114,Toy Story 2 (1999),Toy Story 2,"['Adventure', 'Animation', 'Children', 'Comedy..."
2132,2132,2132,2355,"Bug's Life, A (1998)",Bugs Life A,"['Adventure', 'Animation', 'Children', 'Comedy..."
4548,4548,4548,4886,"Monsters, Inc. (2001)",Monsters Inc,"['Adventure', 'Animation', 'Children', 'Comedy..."
5970,5970,5970,6377,Finding Nemo (2003),Finding Nemo,"['Adventure', 'Animation', 'Children', 'Comedy..."
14703,14703,14703,78499,Toy Story 3 (2010),Toy Story 3,"['Adventure', 'Animation', 'Children', 'Comedy..."
35350,35350,35350,157296,Finding Dory (2016),Finding Dory,"['Adventure', 'Animation', 'Comedy', 'adventur..."
19799,19799,19799,103141,Monsters University (2013),Monsters University,"['Adventure', 'Animation', 'Comedy', 'freindsh..."
7904,7904,7904,8961,"Incredibles, The (2004)",Incredibles The,"['Action', 'Adventure', 'Animation', 'Children..."
4867,4867,4867,5218,Ice Age (2002),Ice Age,"['Adventure', 'Animation', 'Children', 'Comedy..."
10563,10563,10563,45517,Cars (2006),Cars,"['Animation', 'Children', 'Comedy', 'redemptio..."


In [87]:
def hybrid(title, userId):
  movies = get_recommendations(title)
  movies['Estimate'] = movies.apply(lambda row : svd.predict(int(userId), row['movieId'], 3)[3], axis=1)
  print()
  return(movies.sort_values('Estimate', ascending=False)[['title','movieId', 'Estimate']].head(20))

hybrid('Toy Story (1995)', 4)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['Estimate'] = movies.apply(lambda row : svd.predict(int(userId), row['movieId'], 3)[3], axis=1)


Unnamed: 0,title,movieId,Estimate
7904,"Incredibles, The (2004)",8961,4.57942
13170,Up (2009),68954,4.254888
4867,Ice Age (2002),5218,4.21413
4548,"Monsters, Inc. (2001)",4886,4.197496
10563,Cars (2006),45517,4.165443
11132,Ratatouille (2007),50872,4.072102
5970,Finding Nemo (2003),6377,4.031853
14703,Toy Story 3 (2010),78499,3.919259
41924,Coco (2017),177765,3.890216
35350,Finding Dory (2016),157296,3.745923


In [88]:
movie_title = widgets.Combobox(
  value='',
  placeholder='Choose Movie',
  options=list(popular_movies['title'].sort_values(ascending=True)),
  description='Select Movie:',
  ensure_option=True,
  disabled=False
)

user_id = widgets.Combobox(
  value='',
  placeholder='Choose User',
  options=(list(map(str, list(df2['userId'].value_counts().index.sort_values())))),
  description='Select User:',
  ensure_option=True,
  disabled=False
)

button = widgets.Button(description="Get Predicted Score!")

def on_button_clicked(b):
  print("Movie: ", movie_title.value)
  print("Movie ID: ", get_movie_id(movie_title.value))
  print("User: ", (user_id.value))
  hybrid_predictions = hybrid(movie_title.value, int(user_id.value))
  print(hybrid_predictions)

button.on_click(on_button_clicked)

display(movie_title, user_id, button)

Combobox(value='', description='Select Movie:', ensure_option=True, options=("'burbs, The (1989)", '(500) Days…

Combobox(value='', description='Select User:', ensure_option=True, options=('4', '19', '42', '43', '51', '55',…

Button(description='Get Predicted Score!', style=ButtonStyle())

Movie:  Toy Story (1995)
Movie ID:  1
User:  4

                                   title  movieId  Estimate
7904             Incredibles, The (2004)     8961  4.579420
13170                          Up (2009)    68954  4.254888
4867                      Ice Age (2002)     5218  4.214130
4548               Monsters, Inc. (2001)     4886  4.197496
10563                        Cars (2006)    45517  4.165443
11132                 Ratatouille (2007)    50872  4.072102
5970                 Finding Nemo (2003)     6377  4.031853
14703                 Toy Story 3 (2010)    78499  3.919259
41924                        Coco (2017)   177765  3.890216
35350                Finding Dory (2016)   157296  3.745923
19799         Monsters University (2013)   103141  3.667244
24264  Toy Story That Time Forgot (2014)   120474  3.656751
10251                    Luxo Jr. (1986)    42191  3.656751
29176           The Good Dinosaur (2015)   136016  3.656751
18145                 Knick Knack (1989)    95856  3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['Estimate'] = movies.apply(lambda row : svd.predict(int(userId), row['movieId'], 3)[3], axis=1)
