# Movie Recommender System

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from ast import literal_eval
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

In [2]:
data1 = pd.read_csv('tmdb_5000_movies.csv')
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [3]:
data2 = pd.read_csv('tmdb_5000_credits.csv')
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [4]:
data2 = data2.drop(['title'], axis=1)

In [5]:
# Attach cast/crew to every movie: `id` in the movies file and `movie_id` in
# the credits file carry the same key.
# validate='one_to_one' makes pandas raise a MergeError if either key column
# contains duplicates, instead of silently multiplying rows in the result.
data = data1.merge(data2, left_on='id', right_on='movie_id', validate='one_to_one')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [6]:
# Remove columns that the cells visible below never reference; `movie_id` is
# the right-hand join key and duplicates `id` after the merge.
data = data.drop(columns=['budget', 'homepage', 'revenue', 'runtime', 'movie_id'])
data.head()

Unnamed: 0,genres,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


## Trending Movies
Let's find the top trending movies based on their popularity score.

In [7]:
# sort_values returns a new frame, so `data` itself keeps its original order.
trend_movies = data.sort_values(by='popularity', ascending=False)
trend_movies.loc[:, ['title', 'popularity', 'vote_average']].head(10)

Unnamed: 0,title,popularity,vote_average
546,Minions,875.581305,6.4
95,Interstellar,724.247784,8.1
788,Deadpool,514.569956,7.4
94,Guardians of the Galaxy,481.098624,7.9
127,Mad Max: Fury Road,434.278564,7.2
28,Jurassic World,418.708552,6.5
199,Pirates of the Caribbean: The Curse of the Bla...,271.972889,7.5
82,Dawn of the Planet of the Apes,243.791743,7.3
200,The Hunger Games: Mockingjay - Part 1,206.227151,6.6
88,Big Hero 6,203.73459,7.8


Let's try to find the trending movies within each genre.

In [8]:
genre = data.copy()
genre['genres'] = genre['genres'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [9]:
# Produce one row per (movie, genre) pair.
# Series.explode repeats the movie's index label for every list element and
# yields NaN for an empty genre list, so the join below lines up with the
# original rows without constructing a pd.Series for every movie.
temp = genre['genres'].explode().rename('genre')
# NOTE: this rebinds `genre` and removes its 'genres' column, so the cell can
# only be run once after the previous cell.
genre = genre.drop('genres', axis=1).join(temp)
genre['genre'].unique()

array(['Action', 'Adventure', 'Fantasy', 'Science Fiction', 'Crime',
       'Drama', 'Thriller', 'Animation', 'Family', 'Western', 'Comedy',
       'Romance', 'Horror', 'Mystery', 'History', 'War', 'Music',
       'Documentary', 'Foreign', 'TV Movie', nan], dtype=object)

In [10]:
def top_genre(g, movies=None, quantile=0.9, top_n=10):
    """Return the best-rated movies of one genre, ranked by weighted rating.

    The weighted rating is ``v/(v+m) * R + m/(v+m) * C`` where ``v`` is the
    movie's vote count, ``R`` its average vote, ``m`` the vote-count cut-off
    and ``C`` the mean vote of all movies in the genre.

    Parameters
    ----------
    g : str
        Genre name to filter on (compared against the ``'genre'`` column).
    movies : pandas.DataFrame, optional
        Frame with ``'genre'``, ``'title'``, ``'vote_average'`` and
        ``'vote_count'`` columns. Defaults to the notebook-level ``genre``
        frame, which matches the original behaviour.
    quantile : float, default 0.9
        Quantile of ``vote_count`` (within the genre) used as the cut-off ``m``.
    top_n : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``vote_average``, ``vote_count`` and ``wr``, sorted
        by ``wr`` descending. Empty when no movie matches the genre.
    """
    if movies is None:
        movies = genre

    tg = movies[movies['genre'] == g]
    C = tg['vote_average'].mean()
    m = tg['vote_count'].quantile(quantile)

    # Filter first, then copy, so only the qualifying rows are duplicated.
    tg_mod = tg.loc[tg['vote_count'] >= m].copy()

    # Column arithmetic instead of a row-wise apply: same numbers, and it also
    # works when the selection is empty (unknown genre).
    v = tg_mod['vote_count']
    tg_mod['wr'] = (v / (v + m)) * tg_mod['vote_average'] + (m / (v + m)) * C

    tg_mod = tg_mod.sort_values('wr', ascending=False)
    return tg_mod[['title', 'vote_average', 'vote_count', 'wr']].head(top_n)

In [11]:
top_genre('Comedy')

Unnamed: 0,title,vote_average,vote_count,wr
809,Forrest Gump,8.2,7927,7.865313
77,Inside Out,8.0,6560,7.642508
2285,Back to the Future,8.0,6079,7.619461
298,The Wolf of Wall Street,7.9,6571,7.56038
1532,The Grand Budapest Hotel,8.0,4519,7.518861
88,Big Hero 6,7.8,6135,7.459066
66,Up,7.7,6870,7.40618
697,The Truman Show,7.8,4537,7.367022
1541,Toy Story,7.7,5269,7.335453
1260,Amélie,7.8,3310,7.253794


In [12]:
top_genre('Action')

Unnamed: 0,title,vote_average,vote_count,wr
65,The Dark Knight,8.2,12002,7.734742
96,Inception,8.1,13752,7.701648
329,The Lord of the Rings: The Return of the King,8.1,8064,7.500484
262,The Lord of the Rings: The Fellowship of the Ring,8.0,8705,7.459642
94,Guardians of the Galaxy,7.9,9742,7.427664
1990,The Empire Strikes Back,8.2,5879,7.420952
2912,Star Wars,8.1,6624,7.412603
330,The Lord of the Rings: The Two Towers,8.0,7487,7.398055
634,The Matrix,7.9,8907,7.395086
571,Inglourious Basterds,7.9,6430,7.265208


## Finding the top rated movies
For this we will use IMDB's weighted rating (WR) as the metric to score the movies.

$$WR = \frac{v}{v+m} \cdot R + \frac{m}{v+m} \cdot C$$

where, 

	- v: number of votes for the movie

	- m: min votes required to be listed on the chart

	- R: avg rating of the movie

	- C: mean vote across the whole set

In [13]:
# Mean votes
C = data['vote_average'].mean()
C

6.092171559442011

To set the minimum number of votes required, we use the 90th percentile of `vote_count`: a movie must have at least as many votes as 90% of the movies in the dataset.

In [14]:
m = data['vote_count'].quantile(0.9)
m

1838.4000000000015

Selecting the movies with vote counts >= m

In [15]:
# Keep only movies that reach the vote threshold `m`; the explicit copy makes
# `top_charts` independent of `data` for the column added later.
top_charts = data.loc[data['vote_count'].ge(m)].copy()
top_charts.head()

Unnamed: 0,genres,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [16]:
def wr(data, m=m, C=C):
    """Compute the weighted rating ``v/(v+m) * R + m/(v+m) * C``.

    Parameters
    ----------
    data : mapping-like
        Anything indexable by ``'vote_count'`` (v) and ``'vote_average'`` (R),
        e.g. a DataFrame row.
    m : number
        Vote-count threshold; defaults to the notebook-level ``m`` as it was
        when this function was defined.
    C : number
        Mean vote; defaults to the notebook-level ``C`` as it was when this
        function was defined.

    Returns
    -------
    The weighted rating for ``data``.
    """
    votes = data['vote_count']
    rating = data['vote_average']
    total = votes + m

    return (votes / total) * rating + (m / total) * C

Applying the weighted rating to `top_charts`

In [17]:
# wr() only does arithmetic on the 'vote_count' and 'vote_average' columns, so
# it can score the whole frame in one vectorised call instead of row by row.
top_charts['wr'] = wr(top_charts)

In [18]:
top_charts = top_charts.sort_values('wr', ascending=False)
top_charts[['title', 'vote_average', 'wr']].head(10)

Unnamed: 0,title,vote_average,wr
1881,The Shawshank Redemption,8.5,8.059258
662,Fight Club,8.3,7.939256
65,The Dark Knight,8.2,7.92002
3232,Pulp Fiction,8.3,7.904645
96,Inception,8.1,7.863239
3337,The Godfather,8.4,7.851236
95,Interstellar,8.1,7.809479
809,Forrest Gump,8.2,7.803188
329,The Lord of the Rings: The Return of the King,8.1,7.727243
1990,The Empire Strikes Back,8.2,7.697884


## Content Based Filtering

### Recommendation based on the overview of the movies
We will use the overview column and preprocess the text using TFIDF vectors.

Term frequency (TF) is the relative frequency of a word in a document, given by **(term instances/total instances)**. Inverse document frequency (IDF) measures how rare a term is across all documents and is given by **log(number of documents/documents with term)**. The overall importance of each word to the document in which it appears is TF * IDF.

In [19]:
# Filling NaN with empty string
data['overview'] = data['overview'].fillna('')

In [20]:
vectorizer = TfidfVectorizer(stop_words='english')

tfidf_matrix = vectorizer.fit_transform(data['overview'])
tfidf_matrix.shape

(4803, 20978)

In [21]:
cos_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cos_sim[0]

array([1., 0., 0., ..., 0., 0., 0.])

In [22]:
titles = data['title']
# Reverse lookup: movie title -> row label in `data`.
indices = pd.Series(data.index, index=data['title'])
# Several movies can share a title; keep only the first occurrence so that
# indices[title] is always a single integer rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

In [23]:
# Function to get the top 10 recommendations for a selected movie on the basis of similarity scores
def get_recommendations(title, cos_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the notebook-level ``indices`` Series.
    cos_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Entries of the notebook-level ``titles`` Series for the ``top_n`` most
        similar movies, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    ind = indices[title]
    # With duplicated titles the lookup returns a Series; use the first match.
    if isinstance(ind, pd.Series):
        ind = ind.iloc[0]

    scores = np.asarray(cos_sim[ind])
    # Stable descending order: ties keep their original row order, exactly as
    # sorted(..., reverse=True) did.
    order = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly instead of assuming it is ranked first;
    # that assumption fails when its score ties with others (e.g. all zeros).
    order = order[order != ind][:top_n]
    return titles.iloc[order]

In [24]:
get_recommendations('The Avengers', cos_sim)

7               Avengers: Age of Ultron
3144                            Plastic
1715                            Timecop
4124                 This Thing of Ours
3311              Thank You for Smoking
3033                      The Corruptor
588     Wall Street: Money Never Sleeps
2136         Team America: World Police
1468                       The Fountain
1286                        Snowpiercer
Name: title, dtype: object

These are the recommendations we got for 'The Avengers' movie.

The results are not that great in terms of what the user would expect. Since 'The Avengers' is a Marvel production, the user should be shown more closely related movies.

Let's try to improve this model by considering more features from the dataset.

### Recommendations based on cast, crew, keywords

We will extract the director's name, actors, top 3 genres and keywords related to each movie and join them all as one feature and use cosine similarity to get the results.

In [25]:
# converting features into python objects
cols = ['genres', 'keywords', 'cast', 'crew']
data3 = data.copy()
for col in cols:
	data3[col] = data3[col].apply(literal_eval)

In [26]:
# get the director's name from the crew feature
def get_director(data):
    """Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    data : list of dict
        Parsed ``crew`` entry of one movie; each dict is expected to carry
        ``'job'`` and ``'name'`` keys.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when no director is listed or when
        ``data`` is not a list (mirrors the guard used in ``get_list``).
    """
    if not isinstance(data, list):
        return np.nan
    # .get() tolerates crew entries without a 'job' key instead of raising.
    return next(
        (member['name'] for member in data if member.get('job') == 'Director'),
        np.nan,
    )


# get a list of top 3 keywords, genres, cast
def get_list(data):
    """Return up to the first three ``'name'`` values from a list of dicts.

    Anything that is not a list yields an empty list.
    """
    if not isinstance(data, list):
        return []
    # Collect every name first, then truncate, as the original did.
    all_names = [entry['name'] for entry in data]
    return all_names[:3]

In [27]:
data3['director'] = data3['crew'].apply(get_director)

cols = ['genres', 'keywords', 'cast']
for col in cols:
	data3[col] = data3[col].apply(get_list)

data3.head(1)

Unnamed: 0,genres,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,"[Action, Adventure, Fantasy]",19995,"[culture clash, future, space war]",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{'credit_id': '52fe48009251416c750aca23', 'de...",James Cameron


The next step would be to convert the names and keyword instances into lowercase and strip all the spaces between them.

In [28]:
def clean_data(data):
    """Lower-case the input and remove its spaces.

    A list is cleaned element by element, a string is cleaned directly, and
    any other value is replaced by an empty string.
    """
    if isinstance(data, list):
        return [item.replace(' ', '').lower() for item in data]
    if isinstance(data, str):
        return data.replace(' ', '').lower()
    return ''

In [29]:
cols = ['genres', 'keywords', 'cast', 'director']
for col in cols:
	data3[col] = data3[col].apply(clean_data)

data3.head(1)

Unnamed: 0,genres,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,"[action, adventure, fantasy]",19995,"[cultureclash, future, spacewar]",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[samworthington, zoesaldana, sigourneyweaver]","[{'credit_id': '52fe48009251416c750aca23', 'de...",jamescameron


In [30]:
def get_soup(data):
    """Join genres, keywords, director and cast into one space-separated string.

    ``data`` must provide list values under ``'genres'``, ``'keywords'`` and
    ``'cast'`` and a string under ``'director'``.
    """
    parts = [
        ' '.join(data['genres']),
        ' '.join(data['keywords']),
        data['director'],
        ' '.join(data['cast']),
    ]
    return ' '.join(parts)

data3['soup'] = data3.apply(get_soup, axis=1)

In [31]:
cVectorizer = CountVectorizer(stop_words='english')
count_matrix = cVectorizer.fit_transform(data3['soup'])

cos_sim2 = cosine_similarity(count_matrix, count_matrix)

In [32]:
indices = pd.Series(data3.index, index=data3['title'])

In [33]:
get_recommendations('The Avengers', cos_sim2)

7                  Avengers: Age of Ultron
26              Captain America: Civil War
79                              Iron Man 2
169     Captain America: The First Avenger
174                    The Incredible Hulk
85     Captain America: The Winter Soldier
31                              Iron Man 3
33                   X-Men: The Last Stand
68                                Iron Man
94                 Guardians of the Galaxy
Name: title, dtype: object

Compared with the overview-based recommendations above, these results are much more relevant.

## Collaborative Filtering

Here we will use the ratings dataset which contains the userID, movieId and the ratings which users have given to the selected movies.

We will build a user-based CF model here and use it to predict the rating a user would give to a movie. Such a system recommends to a user the products that similar users have liked.

We will use Singular Value Decomposition (SVD) to map each user and each item into a shared latent-factor space.

scikit-surprise package is used to build recommender system based on rating data.

In [34]:
ratings = pd.read_csv('ratings_small.csv')
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [35]:
ratings['userId'].unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

In [36]:
# Keep the three columns surprise expects, in (user, item, rating) order.
# Selecting columns instead of dropping 'timestamp' lets this cell be re-run.
ratings = ratings[['userId', 'movieId', 'rating']]

# Reader() defaults to a 1-5 rating scale; take the bounds from the data so
# that ratings outside that range (if any, e.g. half-star values below 1)
# are represented correctly.
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
# NOTE(review): this rebinds `data`, which held the movies DataFrame until now.
data = Dataset.load_from_df(ratings, reader)

In [37]:
# Fix the randomness so the scores below can be reproduced:
# - np.random.seed seeds numpy's global RNG, which surprise is expected to fall
#   back to when no random_state is passed (here: the fold shuffling) -
#   TODO confirm against the installed surprise version;
# - random_state seeds the SVD factor initialisation.
np.random.seed(42)
svd = SVD(random_state=42)
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9035  0.8918  0.8994  0.8997  0.8894  0.8968  0.0053  
MAE (testset)     0.6950  0.6858  0.6932  0.6889  0.6859  0.6898  0.0038  
Fit time          5.59    5.22    5.57    6.76    6.58    5.95    0.61    
Test time         0.27    0.20    0.27    0.19    0.28    0.24    0.04    


{'test_rmse': array([0.90347656, 0.89177874, 0.89935041, 0.89974712, 0.88941978]),
 'test_mae': array([0.6949936 , 0.68582181, 0.69323493, 0.68885627, 0.68586236]),
 'fit_time': (5.588315010070801,
  5.2240753173828125,
  5.5701048374176025,
  6.763876914978027,
  6.579406976699829),
 'test_time': (0.26928210258483887,
  0.2014162540435791,
  0.27227163314819336,
  0.1906275749206543,
  0.2772233486175537)}

We got a mean RMSE of about 0.90 across the 5 folds.

Let's train our model on the full dataset so we can make predictions.

In [38]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x195e8996c70>

In [39]:
ratings[ratings['userId']==1]

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0
5,1,1263,2.0
6,1,1287,2.0
7,1,1293,2.0
8,1,1339,3.5
9,1,1343,2.0


As our model is trained, lets see how the user with id=1 is likely to rate the movie "Avatar" with movieid=19995.

In [45]:
# NOTE(review): 19995 is Avatar's `id` in the TMDB movies table; confirm that
# ratings_small.csv uses the same id space for `movieId` before trusting this.
svd.predict(uid=1, iid=19995)

Prediction(uid=1, iid=19995, r_ui=None, est=2.677927991177004, details={'was_impossible': False})

We got an estimated rating of about 2.68.