In [1]:
import numpy as np
import pandas as pd
import pickle
import matrix_factorization_utilities
import scipy.sparse as sp
from scipy.sparse.linalg import svds

In [2]:
ratings = pd.read_csv('Dataset/ratings.csv')

In [3]:
len(ratings)

1048575

In [4]:
ratings = ratings[['userId', 'movieId','rating']]

In [5]:
ratings_df = ratings.groupby(['userId','movieId']).aggregate(np.max)

In [6]:
len(ratings_df)

1048575

In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [8]:
ratings_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
userId,movieId,Unnamed: 2_level_1
1,2,3.5
1,29,3.5
1,32,3.5
1,47,3.5
1,50,3.5


In [9]:
len(ratings['userId'].unique())

7120

In [10]:
#count_ratings = ratings.groupby('rating').count()
#count_ratings['perc_total']=round(count_ratings['userId']*100/count_ratings['userId'].sum(),1)

In [11]:
#count_ratings

In [12]:
movie_list = pd.read_csv('Dataset/movies.csv')

In [13]:
len(movie_list)

27278

In [14]:
movie_list.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [15]:
#tags = pd.read_csv('Dataset/tags.csv')

In [16]:
#tags.head()

In [17]:
genres = movie_list['genres']

In [18]:
genres.head()

0    Adventure|Animation|Children|Comedy|Fantasy
1                     Adventure|Children|Fantasy
2                                 Comedy|Romance
3                           Comedy|Drama|Romance
4                                         Comedy
Name: genres, dtype: object

In [19]:
genre_list = ""
for index,row in movie_list.iterrows():
        genre_list += row.genres + "|"
#split the string into a list of values
genre_list_split = genre_list.split('|')
#de-duplicate values
new_list = list(set(genre_list_split))
#remove the value that is blank
new_list.remove('')
#inspect list of genres
new_list

['Children',
 'IMAX',
 'Documentary',
 'Animation',
 'Mystery',
 'Film-Noir',
 'Action',
 '(no genres listed)',
 'Thriller',
 'Drama',
 'Crime',
 'Horror',
 'Romance',
 'Sci-Fi',
 'Musical',
 'Adventure',
 'Western',
 'Fantasy',
 'War',
 'Comedy']

In [20]:
#Enriching the movies dataset by adding the various genres columns.
movies_with_genres = movie_list.copy()

for genre in new_list :
    movies_with_genres[genre] = movies_with_genres.apply(lambda _:int(genre in _.genres), axis = 1)

In [21]:
movies_with_genres.head()

Unnamed: 0,movieId,title,genres,Children,IMAX,Documentary,Animation,Mystery,Film-Noir,Action,...,Crime,Horror,Romance,Sci-Fi,Musical,Adventure,Western,Fantasy,War,Comedy
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [22]:
no_of_users = len(ratings['userId'].unique())
no_of_movies = len(ratings['movieId'].unique())

sparsity = round(1.0 - len(ratings)/(1.0*(no_of_movies*no_of_users)),3)
print(sparsity)

0.99


In [23]:
len(ratings['movieId'].unique())

14026

In [24]:
avg_movie_rating = pd.DataFrame(ratings.groupby('movieId')['rating'].agg(['mean','count']))
avg_movie_rating['movieId']= avg_movie_rating.index

In [25]:
avg_movie_rating.head()

Unnamed: 0_level_0,mean,count,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.959323,2569,1
2,3.268398,1155,2
3,3.186861,685,3
4,3.0,138,4
5,3.143836,657,5


In [26]:
len(avg_movie_rating)

14026

In [27]:
#Get the average movie rating across all movies 
avg_rating_all=ratings['rating'].mean()
avg_rating_all
#set a minimum threshold for number of reviews that the movie has to have
min_reviews=30
min_reviews
movie_score = avg_movie_rating.loc[avg_movie_rating['count']>min_reviews]
movie_score.head()

Unnamed: 0_level_0,mean,count,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.959323,2569,1
2,3.268398,1155,2
3,3.186861,685,3
4,3.0,138,4
5,3.143836,657,5


In [28]:
len(movie_score)

4173

In [29]:
#create a function for weighted rating score based off count of reviews
def weighted_rating(x, m=min_reviews, C=avg_rating_all):
    v = x['count']
    R = x['mean']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [30]:
#Calculating the weighted score for each movie
movie_score['weighted_score'] = movie_score.apply(weighted_rating, axis=1)
movie_score.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,mean,count,movieId,weighted_score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3.959323,2569,1,3.954359
2,3.268398,1155,2,3.275003
3,3.186861,685,3,3.201228
4,3.0,138,4,3.094513
5,3.143836,657,5,3.160667


In [31]:
#join movie details to movie ratings
movie_score = pd.merge(movie_score,movies_with_genres,on='movieId')
#join movie links to movie ratings
#movie_score = pd.merge(movie_score,links,on='movieId')
movie_score.head()

Defaulting to column, but this will raise an ambiguity error in a future version
  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,mean,count,movieId,weighted_score,title,genres,Children,IMAX,Documentary,Animation,...,Crime,Horror,Romance,Sci-Fi,Musical,Adventure,Western,Fantasy,War,Comedy
0,3.959323,2569,1,3.954359,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,0,0,1,...,0,0,0,0,0,1,0,1,0,1
1,3.268398,1155,2,3.275003,Jumanji (1995),Adventure|Children|Fantasy,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,3.186861,685,3,3.201228,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,3.0,138,4,3.094513,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,3.143836,657,5,3.160667,Father of the Bride Part II (1995),Comedy,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [32]:
#list top scored movies over the whole range of movies
pd.DataFrame(movie_score.sort_values(['weighted_score'],ascending=False)[['title','count','mean','weighted_score','genres']][:10])

Unnamed: 0,title,count,mean,weighted_score,genres
242,"Shawshank Redemption, The (1994)",3216,4.469994,4.461299,Crime|Drama
565,"Godfather, The (1972)",2137,4.388161,4.37627,Crime|Drama
46,"Usual Suspects, The (1995)",2490,4.370482,4.360468,Crime|Mystery|Thriller
403,Schindler's List (1993),2598,4.295612,4.286864,Drama|War
791,"Godfather: Part II, The (1974)",1418,4.278561,4.263037,Crime|Drama
3230,Band of Brothers (2001),228,4.35307,4.25728,Action|Drama|War
593,Casablanca (1942),1282,4.268721,4.251813,Drama|Romance
2937,City of God (Cidade de Deus) (2002),646,4.258514,4.226151,Action|Adventure|Crime|Drama|Thriller
589,North by Northwest (1959),817,4.250306,4.224768,Action|Adventure|Mystery|Romance|Thriller
3814,"Dark Knight, The (2008)",1031,4.242968,4.222788,Action|Crime|Drama|IMAX


In [33]:
def best_movies_by_genre(genre,top_n):
    return pd.DataFrame(movie_score.loc[(movie_score[genre]==1)].sort_values(['weighted_score'],ascending=False)[['title','count','mean','weighted_score']][:top_n])

In [34]:
best_movies_by_genre('Musical',10)  

Unnamed: 0,title,count,mean,weighted_score
824,Duck Soup (1933),280,4.217857,4.15122
580,Singin' in the Rain (1952),542,4.097786,4.067969
3909,Dr. Horrible's Sing-Along Blog (2008),185,4.148649,4.062224
1777,Stop Making Sense (1984),119,4.142857,4.019316
2647,Fiddler on the Roof (1971),165,4.048485,3.968606
600,"Wizard of Oz, The (1939)",1229,3.947518,3.937552
588,"Gay Divorcee, The (1934)",57,4.149123,3.935381
1438,Nashville (1975),128,4.015625,3.923279
624,Top Hat (1935),120,4.004167,3.909188
3297,"Day at the Races, A (1937)",51,4.117647,3.89973


In [35]:
best_movies_by_genre('Action',10)  

Unnamed: 0,title,count,mean,weighted_score
3230,Band of Brothers (2001),228,4.35307,4.25728
2937,City of God (Cidade de Deus) (2002),646,4.258514,4.226151
589,North by Northwest (1959),817,4.250306,4.224768
3814,"Dark Knight, The (2008)",1031,4.242968,4.222788
1838,Fight Club (1999),2085,4.219185,4.209399
769,Raiders of the Lost Ark (Indiana Jones and the...,2289,4.206204,4.197446
1256,Seven Samurai (Shichinin no samurai) (1954),594,4.223906,4.19051
199,Star Wars: Episode IV - A New Hope (1977),2874,4.189457,4.182637
1629,"Matrix, The (1999)",2705,4.17634,4.169242
768,"Princess Bride, The (1987)",1725,4.177391,4.166312


In [36]:
best_movies_by_genre('Children',10)  

Unnamed: 0,title,count,mean,weighted_score
518,Wallace & Gromit: A Close Shave (1995),622,4.176045,4.146286
741,Wallace & Gromit: The Wrong Trousers (1993),798,4.133459,4.111568
793,"Grand Day Out with Wallace and Gromit, A (1989)",379,4.100264,4.058382
3833,WALL·E (2008),654,4.074159,4.05026
2927,My Neighbor Totoro (Tonari no Totoro) (1988),262,4.104962,4.045816
1753,"Christmas Story, A (1983)",686,4.035714,4.014495
3991,How to Train Your Dragon (2010),234,4.064103,4.003326
3931,Up (2009),502,4.030876,4.002591
4004,Toy Story 3 (2010),306,4.042484,3.996661
2989,Laputa: Castle in the Sky (Tenkû no shiro Rapy...,183,4.046448,3.973606


In [37]:
best_movies_by_genre('Drama',10)  

Unnamed: 0,title,count,mean,weighted_score
242,"Shawshank Redemption, The (1994)",3216,4.469994,4.461299
565,"Godfather, The (1972)",2137,4.388161,4.37627
403,Schindler's List (1993),2598,4.295612,4.286864
791,"Godfather: Part II, The (1974)",1418,4.278561,4.263037
3230,Band of Brothers (2001),228,4.35307,4.25728
593,Casablanca (1942),1282,4.268721,4.251813
2937,City of God (Cidade de Deus) (2002),646,4.258514,4.226151
3814,"Dark Knight, The (2008)",1031,4.242968,4.222788
765,One Flew Over the Cuckoo's Nest (1975),1538,4.235696,4.22218
1838,Fight Club (1999),2085,4.219185,4.209399


In [38]:
ratings_df = pd.pivot_table(ratings, index='userId', columns='movieId', aggfunc=np.max)

In [39]:
ratings_df.head()

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movieId,1,2,3,4,5,6,7,8,9,10,...,129350,129354,129428,129707,130052,130073,130219,130462,130490,130642
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,3.5,,,,,,,,,...,,,,,,,,,,
2,,,4.0,,,,,,,,...,,,,,,,,,,
3,4.0,,,,,,,,,,...,,,,,,,,,,
4,,,,,,3.0,,,,4.0,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,


In [40]:
ratings_df

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
movieId,1,2,3,4,5,6,7,8,9,10,...,129350,129354,129428,129707,130052,130073,130219,130462,130490,130642
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,3.5,,,,,,,,,...,,,,,,,,,,
2,,,4.0,,,,,,,,...,,,,,,,,,,
3,4.0,,,,,,,,,,...,,,,,,,,,,
4,,,,,,3.0,,,,4.0,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,
6,5.0,,3.0,,,,5.0,,,,...,,,,,,,,,,
7,,,3.0,,,,3.0,,,,...,,,,,,,,,,
8,4.0,,5.0,,,3.0,,,,4.0,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,4.0,,,,,,,,,,...,,,,,,,,,,


In [41]:
ratings_movies = pd.merge(ratings,movie_list, on = 'movieId')

In [42]:
ratings_movies.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji (1995),Adventure|Children|Fantasy
1,5,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
2,13,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
3,29,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
4,34,2,3.0,Jumanji (1995),Adventure|Children|Fantasy


In [43]:
ratings_movies

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji (1995),Adventure|Children|Fantasy
1,5,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
2,13,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
3,29,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
4,34,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
5,54,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
6,88,2,1.0,Jumanji (1995),Adventure|Children|Fantasy
7,91,2,3.5,Jumanji (1995),Adventure|Children|Fantasy
8,116,2,2.0,Jumanji (1995),Adventure|Children|Fantasy
9,119,2,4.0,Jumanji (1995),Adventure|Children|Fantasy


In [44]:
def get_other_movies(movie_name):
    #get all users who watched a specific movie
    df_movie_users_series = ratings_movies.loc[ratings_movies['title']==movie_name]['userId']
    #convert to a data frame
    df_movie_users = pd.DataFrame(df_movie_users_series,columns=['userId'])
    #get a list of all other movies watched by these users
    other_movies = pd.merge(df_movie_users,ratings_movies,on='userId')
    #get a list of the most commonly watched movies by these other user
    other_users_watched = pd.DataFrame(other_movies.groupby('title')['userId'].count()).sort_values('userId',ascending=False)
    other_users_watched['perc_who_watched'] = round(other_users_watched['userId']*100/other_users_watched['userId'][0],1)
    return other_users_watched[:10]

In [45]:
# Getting other top 10 movies which are watched by the people who saw 'Gone Girl'
get_other_movies('Gone Girl (2014)')

Unnamed: 0_level_0,userId,perc_who_watched
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Gone Girl (2014),61,100.0
"Matrix, The (1999)",54,88.5
Inception (2010),53,86.9
Fight Club (1999),52,85.2
"Shawshank Redemption, The (1994)",52,85.2
"Dark Knight, The (2008)",52,85.2
"Lord of the Rings: The Fellowship of the Ring, The (2001)",51,83.6
"Lord of the Rings: The Return of the King, The (2003)",50,82.0
Pulp Fiction (1994),48,78.7
"Silence of the Lambs, The (1991)",47,77.0


In [46]:
from sklearn.neighbors import NearestNeighbors


In [47]:
avg_movie_rating.head()

Unnamed: 0_level_0,mean,count,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.959323,2569,1
2,3.268398,1155,2
3,3.186861,685,3
4,3.0,138,4
5,3.143836,657,5


In [48]:
movie_plus_10_ratings = avg_movie_rating.loc[avg_movie_rating['count']>=10]
print(len(movie_plus_10_ratings))

6870


In [49]:
movie_plus_10_ratings

Unnamed: 0_level_0,mean,count,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.959323,2569,1
2,3.268398,1155,2
3,3.186861,685,3
4,3.000000,138,4
5,3.143836,657,5
6,3.836508,1260,6
7,3.381429,700,7
8,3.352564,78,8
9,3.017327,202,9
10,3.427003,1548,10


In [50]:
filtered_ratings = pd.merge(movie_plus_10_ratings, ratings, on="movieId")
len(filtered_ratings)

Defaulting to column, but this will raise an ambiguity error in a future version
  exec(code_obj, self.user_global_ns, self.user_ns)


1025973

In [51]:
filtered_ratings.head()

Unnamed: 0,mean,count,movieId,userId,rating
0,3.959323,2569,1,3,4.0
1,3.959323,2569,1,6,5.0
2,3.959323,2569,1,8,4.0
3,3.959323,2569,1,10,4.0
4,3.959323,2569,1,11,4.5


In [52]:
movie_wide = filtered_ratings.pivot(index = 'movieId', columns = 'userId', values = 'rating').fillna(0)
movie_wide.head()

userId,1,2,3,4,5,6,7,8,9,10,...,7111,7112,7113,7114,7115,7116,7117,7118,7119,7120
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,4.0,0.0,0.0,5.0,0.0,4.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,5.0,4.5
2,3.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
3,0.0,4.0,0.0,0.0,0.0,3.0,3.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,3.5,0.0,0.0,0.0,0.0


In [53]:
#specify model parameters
model_knn = NearestNeighbors(metric='cosine',algorithm='brute')
#fit model to the data set
model_knn.fit(movie_wide)


NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [54]:
#Gets the top 10 nearest neighbours got the movie
def print_similar_movies(query_index) :
    #get the list of user ratings for a specific query_index
    query_index_movie_ratings = movie_wide.loc[query_index,:].values.reshape(1,-1)
    #get the closest 10 movies and their distances from the movie specified
    distances,indices = model_knn.kneighbors(query_index_movie_ratings,n_neighbors = 11) 
    #write a loop that prints the similar movies for a specified movie.
    for i in range(0,len(distances.flatten())):
        #get the title of the random movie that was chosen
        get_movie = movie_list.loc[movie_list['movieId']==query_index]['title']
        #for the first movie in the list i.e closest print the title
        if i==0:
            print('Recommendations for {0}:\n'.format(get_movie))
        else :
            #get the indiciees for the closest movies
            indices_flat = indices.flatten()[i]
            #get the title of the movie
            get_movie = movie_list.loc[movie_list['movieId']==movie_wide.iloc[indices_flat,:].name]['title']
            #print the movie
            print('{0}: {1}, with distance of {2}:'.format(i,get_movie,distances.flatten()[i]))

In [55]:
print_similar_movies(112552)

Recommendations for 23663    Whiplash (2014)
Name: title, dtype: object:

1: 24693    The Imitation Game (2014)
Name: title, dtype: object, with distance of 0.5315254432260329:
2: 23561    Birdman (2014)
Name: title, dtype: object, with distance of 0.6297852813391156:
3: 23665    Gone Girl (2014)
Name: title, dtype: object, with distance of 0.6333635271762286:
4: 24387    Nightcrawler (2014)
Name: title, dtype: object, with distance of 0.6489868540544907:
5: 23401    Chef (2014)
Name: title, dtype: object, with distance of 0.6553093708289104:
6: 22189    Inside Llewyn Davis (2013)
Name: title, dtype: object, with distance of 0.6673366942887282:
7: 21749    Nebraska (2013)
Name: title, dtype: object, with distance of 0.6764847438525627:
8: 22919    Interstellar (2014)
Name: title, dtype: object, with distance of 0.687531270800998:
9: 18064    Shame (2011)
Name: title, dtype: object, with distance of 0.7232374343153795:
10: 24308    Fury (2014)
Name: title, dtype: object, with distance o

In [56]:
print_similar_movies(1)

Recommendations for 0    Toy Story (1995)
Name: title, dtype: object:

1: 257    Star Wars: Episode IV - A New Hope (1977)
Name: title, dtype: object, with distance of 0.4203702182940916:
2: 767    Independence Day (a.k.a. ID4) (1996)
Name: title, dtype: object, with distance of 0.4264775268701718:
3: 1184    Star Wars: Episode VI - Return of the Jedi (1983)
Name: title, dtype: object, with distance of 0.44730365269962324:
4: 352    Forrest Gump (1994)
Name: title, dtype: object, with distance of 0.45689872537724374:
5: 640    Mission: Impossible (1996)
Name: title, dtype: object, with distance of 0.457736564895886:
6: 3027    Toy Story 2 (1999)
Name: title, dtype: object, with distance of 0.4603017587650964:
7: 1242    Back to the Future (1985)
Name: title, dtype: object, with distance of 0.46115899489577405:
8: 582    Aladdin (1992)
Name: title, dtype: object, with distance of 0.46720198545017855:
9: 476    Jurassic Park (1993)
Name: title, dtype: object, with distance of 0.468007284

In [57]:
print_similar_movies(96079)

Recommendations for 19338    Skyfall (2012)
Name: title, dtype: object:

1: 18312    Dark Knight Rises, The (2012)
Name: title, dtype: object, with distance of 0.38339118524349736:
2: 19473    Looper (2012)
Name: title, dtype: object, with distance of 0.4655465783348146:
3: 20046    Hobbit: An Unexpected Journey, The (2012)
Name: title, dtype: object, with distance of 0.4791141110494047:
4: 20906    Iron Man 3 (2013)
Name: title, dtype: object, with distance of 0.4900484774282041:
5: 20994    Star Trek Into Darkness (2013)
Name: title, dtype: object, with distance of 0.4909417544965664:
6: 20124    Django Unchained (2012)
Name: title, dtype: object, with distance of 0.495007359587433:
7: 18318    Sherlock Holmes: A Game of Shadows (2011)
Name: title, dtype: object, with distance of 0.5006787501261142:
8: 18304    Hunger Games, The (2012)
Name: title, dtype: object, with distance of 0.500874607555973:
9: 18349    Mission: Impossible - Ghost Protocol (2011)
Name: title, dtype: object, wi

In [58]:
movies_with_genres.head()

Unnamed: 0,movieId,title,genres,Children,IMAX,Documentary,Animation,Mystery,Film-Noir,Action,...,Crime,Horror,Romance,Sci-Fi,Musical,Adventure,Western,Fantasy,War,Comedy
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [59]:
#Getting the movies list with only genres like Musical and other such columns
movie_content_df_temp = movies_with_genres.copy()
movie_content_df_temp.set_index('movieId')
movie_content_df = movie_content_df_temp.drop(columns = ['movieId','title','genres'])
movie_content_df = movie_content_df.as_matrix()
movie_content_df

  """


array([[1, 0, 0, ..., 1, 0, 1],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

In [60]:
#create a series of the movie id and title
indices = pd.Series(movie_content_df_temp.index, movie_content_df_temp['title'])
indices 

title
Toy Story (1995)                                                       0
Jumanji (1995)                                                         1
Grumpier Old Men (1995)                                                2
Waiting to Exhale (1995)                                               3
Father of the Bride Part II (1995)                                     4
Heat (1995)                                                            5
Sabrina (1995)                                                         6
Tom and Huck (1995)                                                    7
Sudden Death (1995)                                                    8
GoldenEye (1995)                                                       9
American President, The (1995)                                        10
Dracula: Dead and Loving It (1995)                                    11
Balto (1995)                                                          12
Nixon (1995)                                 

In [61]:
#get ordered list of movieIds
item_indices = pd.DataFrame(sorted(list(set(ratings['movieId']))),columns=['movieId'])
#add in data frame index value to data frame
item_indices['movie_index']=item_indices.index
#inspect data frame
item_indices.head()


Unnamed: 0,movieId,movie_index
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [62]:
#get ordered list of movieIds
user_indices = pd.DataFrame(sorted(list(set(ratings['userId']))),columns=['userId'])
#add in data frame index value to data frame
user_indices['user_index']=user_indices.index
#inspect data frame
user_indices.head()

Unnamed: 0,userId,user_index
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


In [63]:
#join the movie indices
df_with_index = pd.merge(ratings,item_indices,on='movieId')
#join the user indices
df_with_index=pd.merge(df_with_index,user_indices,on='userId')
#inspec the data frame
df_with_index.head()

Unnamed: 0,userId,movieId,rating,movie_index,user_index
0,1,2,3.5,1,0
1,1,29,3.5,28,0
2,1,32,3.5,31,0
3,1,47,3.5,46,0
4,1,50,3.5,49,0


In [64]:
#import train_test_split module
from sklearn.model_selection import train_test_split
#take 80% as the training set and 20% as the test set
df_train, df_test= train_test_split(df_with_index,test_size=0.2)
print(len(df_train))
print(len(df_test))

838860
209715


In [65]:
df_train.head()

Unnamed: 0,userId,movieId,rating,movie_index,user_index
182054,3284,3175,4.5,3004,3283
363578,6373,3681,4.0,3468,6372
391077,6976,4445,2.0,4210,6975
468521,6099,329,3.5,322,6098
748919,3451,1909,5.0,1769,3450


In [66]:
df_test.head()

Unnamed: 0,userId,movieId,rating,movie_index,user_index
709174,312,597,4.0,585,311
558574,2733,2000,2.0,1860,2732
183787,3293,288,3.0,282,3292
959749,4844,51662,5.0,9824,4843
101110,1849,4476,3.0,4239,1848


In [67]:
n_users = ratings.userId.unique().shape[0]
n_items = ratings.movieId.unique().shape[0]
print(n_users)
print(n_items)

7120
14026


In [68]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
    #for every line in the data
for line in df_train.itertuples():
    #set the value in the column and row to 
    #line[1] is userId, line[2] is movieId and line[3] is rating, line[4] is movie_index and line[5] is user_index
    train_data_matrix[line[5], line[4]] = line[3]
train_data_matrix.shape

(7120, 14026)

In [69]:
#Create two user-item matrices, one for training and another for testing
test_data_matrix = np.zeros((n_users, n_items))
    #for every line in the data
for line in df_test[:1].itertuples():
    #set the value in the column and row to 
    #line[1] is userId, line[2] is movieId and line[3] is rating, line[4] is movie_index and line[5] is user_index
    #print(line[2])
    test_data_matrix[line[5], line[4]] = line[3]
    #train_data_matrix[line['movieId'], line['userId']] = line['rating']
test_data_matrix.shape

(7120, 14026)

In [79]:
pd.DataFrame(train_data_matrix).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14016,14017,14018,14019,14020,14021,14022,14023,14024,14025
0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
df_train['rating'].max()

5.0

In [85]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [82]:
#Calculate the rmse sscore of SVD using different values of k (latent features)
rmse_list = []
for i in [1,2,5,20,40,60,100,200]:
    #apply svd to the test data
    u,s,vt = svds(train_data_matrix,k=i)
    #get diagonal matrix
    s_diag_matrix=np.diag(s)
    #predict x with dot product of u s_diag and vt
    X_pred = np.dot(np.dot(u,s_diag_matrix),vt)
    #calculate rmse score of matrix factorisation predictions
    rmse_score = rmse(X_pred,test_data_matrix)
    rmse_list.append(rmse_score)
    print("Matrix Factorisation with " + str(i) +" latent features has a RMSE of " + str(rmse_score))

Matrix Factorisation with 1 latent features has a RMSE of 4.489793893582221
Matrix Factorisation with 2 latent features has a RMSE of 4.490970525134155
Matrix Factorisation with 5 latent features has a RMSE of 4.471151948985039
Matrix Factorisation with 20 latent features has a RMSE of 4.447413436412684
Matrix Factorisation with 40 latent features has a RMSE of 4.468737463810905
Matrix Factorisation with 60 latent features has a RMSE of 4.464723702083002
Matrix Factorisation with 100 latent features has a RMSE of 4.462196857444757
Matrix Factorisation with 200 latent features has a RMSE of 4.393202007502313


In [83]:
#Convert predictions to a DataFrame
mf_pred = pd.DataFrame(X_pred)
mf_pred.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14016,14017,14018,14019,14020,14021,14022,14023,14024,14025
0,-0.312605,2.051824,0.109891,-0.184702,0.057702,-0.11617,-0.239341,-0.086637,-0.044397,-0.242954,...,-0.003127,0.026531,-0.005851,3e-05,0.0,0.008731,-0.014107,0.0,0.005129,-0.013597
1,0.53634,0.109913,0.628115,0.009178,0.344921,-0.255878,0.493251,0.064525,0.000413,0.540149,...,0.018497,0.012454,-0.013604,0.000951,0.0,-0.002187,-0.013114,0.0,0.000463,-0.014546
2,3.777909,0.159727,-0.161345,-0.070189,-0.426819,0.126388,0.214389,-0.111538,0.032325,-0.094633,...,-0.004263,0.018464,0.000122,0.00021,0.0,0.024228,-0.009818,0.0,0.006989,0.017813
3,-0.420936,0.375304,0.307641,0.03413,0.063456,2.266512,0.081596,0.126963,0.121576,2.397133,...,0.001216,-0.000194,-0.005442,0.000343,0.0,-0.001566,0.007922,0.0,0.008541,0.000708
4,0.08352,1.731199,0.15483,-0.100818,0.432238,0.233474,0.803544,0.080116,0.006299,0.401309,...,0.002388,0.019347,-0.021739,0.000618,0.0,0.011962,-0.010375,0.0,0.001627,0.001931


In [84]:
df_names = pd.merge(ratings,movie_list,on='movieId')
df_names.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji (1995),Adventure|Children|Fantasy
1,5,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
2,13,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
3,29,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
4,34,2,3.0,Jumanji (1995),Adventure|Children|Fantasy


In [85]:
#choose a user ID
user_id = 1
#get movies rated by this user id
users_movies = df_names.loc[df_names["userId"]==user_id]
#print how many ratings user has made 
print("User ID : " + str(user_id) + " has already rated " + str(len(users_movies)) + " movies")
#list movies that have been rated
users_movies

User ID : 1 has already rated 175 movies


Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji (1995),Adventure|Children|Fantasy
1155,1,29,3.5,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
1603,1,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3915,1,47,3.5,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
6156,1,50,3.5,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
8646,1,112,3.5,Rumble in the Bronx (Hont faan kui) (1995),Action|Adventure|Comedy|Crime
9273,1,151,4.0,Rob Roy (1995),Action|Drama|Romance|War
9968,1,223,4.0,Clerks (1994),Comedy
11227,1,253,4.0,Interview with the Vampire: The Vampire Chroni...,Drama|Horror
12682,1,260,4.0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi


In [87]:
user_index = df_train.loc[df_train["userId"]==user_id]['user_index'][:1].values[0]
#get movie ratings predicted for this user and sort by highest rating prediction
sorted_user_predictions = pd.DataFrame(mf_pred.iloc[user_index].sort_values(ascending=False))
#rename the columns
sorted_user_predictions.columns=['ratings']
#save the index values as movie id
sorted_user_predictions['movieId']=sorted_user_predictions.index
print("Top 10 predictions for User " + str(user_id))
#display the top 10 predictions for this user
pd.merge(sorted_user_predictions,movie_list, on = 'movieId')[:10]

NameError: name 'df_train' is not defined