In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


movies = pd.read_csv('/home/maria/Django-Onboarding/recommendation/movies.csv')
ratings = pd.read_csv('/home/maria/Django-Onboarding/recommendation/ratings.csv')


dataset = pd.merge(movies, ratings, how='left', on='movieId')


table = dataset.pivot_table(index='title', columns='userId', values='rating')


In [3]:
table.shape

(10323, 668)

In [4]:
table = table.fillna(0)

In [5]:
table.head(2)

userId,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,659.0,660.0,661.0,662.0,663.0,664.0,665.0,666.0,667.0,668.0
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
from scipy.sparse import csr_matrix

matrix = csr_matrix(table.values)

In [7]:
user_query_index = np.random.choice(table.shape[1])
user_query_index

184

In [8]:
from sklearn.metrics.pairwise import linear_kernel

cosine = linear_kernel(matrix, matrix)

## Normal Recommendation with Cosine Matrix

In [9]:
def recommendations(name, cosine = cosine):
    recommended_list = []
    idx = user_query_index
    score = pd.Series(cosine[idx]).sort_values(ascending=False)
    
    top_10 = list(score.iloc[0:11].index)
    for each in top_10:
        recommended_list.append(list(table.index)[each])
    return recommended_list

print('Recommendation for {0} :\n'.format(table.index[user_query_index]))
recommendations(table.index[user_query_index])

Recommendation for Above the Rim (1994) :



['Forrest Gump (1994)',
 'Dances with Wolves (1990)',
 'Dead Poets Society (1989)',
 'Back to the Future (1985)',
 'Field of Dreams (1989)',
 'Top Gun (1986)',
 'Platoon (1986)',
 'Die Hard (1988)',
 'Fugitive, The (1993)',
 'Silence of the Lambs, The (1991)',
 'Braveheart (1995)']

In [10]:
dataset = dataset.fillna(0)

## Movies rated by a user with ID 3

In [11]:
dataset.loc[dataset['userId'] == 3]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
394,5,Father of the Bride Part II (1995),Comedy,3.0,3.0,841483936.0
570,7,Sabrina (1995),Comedy|Romance,3.0,3.0,841484087.0
786,11,"American President, The (1995)",Comedy|Drama|Romance,3.0,4.0,841483689.0
1254,21,Get Shorty (1995),Comedy|Crime|Thriller,3.0,5.0,841483620.0
1893,34,Babe (1995),Children|Drama,3.0,5.0,841483604.0
...,...,...,...,...,...,...
19399,613,Jane Eyre (1996),Drama|Romance,3.0,4.0,848052408.0
19633,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,3.0,4.0,841483842.0
20686,736,Twister (1996),Action|Adventure|Romance|Thriller,3.0,3.0,841483822.0
22036,800,Lone Star (1996),Drama|Mystery|Western,3.0,5.0,841484158.0


# Predicting a user's rating

In [12]:
from surprise import SVD, Reader, Dataset
from surprise.model_selection import cross_validate
from sklearn.metrics.pairwise import linear_kernel

In [13]:
svd = SVD()
reader = Reader()
cosine = linear_kernel(matrix, matrix)

In [14]:
df = Dataset.load_from_df(dataset[['userId', 'movieId', 'rating']], reader)

In [15]:
cross_validate(svd, df, measures=['RMSE', 'MAE'], cv = 5, verbose= True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8701  0.8617  0.8713  0.8820  0.8629  0.8696  0.0073  
MAE (testset)     0.6693  0.6650  0.6710  0.6810  0.6659  0.6704  0.0057  
Fit time          4.05    4.33    4.21    3.98    3.93    4.10    0.15    
Test time         0.17    0.19    0.10    0.16    0.16    0.16    0.03    


{'test_rmse': array([0.870133  , 0.86172072, 0.87134575, 0.88195249, 0.86287192]),
 'test_mae': array([0.66931761, 0.66495364, 0.67098021, 0.6809685 , 0.66593514]),
 'fit_time': (4.045032739639282,
  4.32799506187439,
  4.205575942993164,
  3.97590970993042,
  3.9289863109588623),
 'test_time': (0.1657419204711914,
  0.19350194931030273,
  0.10323190689086914,
  0.15925145149230957,
  0.16277003288269043)}

In [16]:
trainset = df.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff71e3b2af0>

     Prediciting the rating of userId 1 for the movieId = user_query_index

In [17]:
svd.predict(1, user_query_index, 3)

Prediction(uid=1, iid=184, r_ui=3, est=3.6322938358894827, details={'was_impossible': False})