In [2]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import surprise

In [3]:
data = pd.read_csv('Amazon - Movies and TV Ratings.csv')
data.head(10)

Unnamed: 0,user_id,Movie1,Movie2,Movie3,Movie4,Movie5,Movie6,Movie7,Movie8,Movie9,...,Movie197,Movie198,Movie199,Movie200,Movie201,Movie202,Movie203,Movie204,Movie205,Movie206
0,A3R5OBKS7OM2IR,5.0,5.0,,,,,,,,...,,,,,,,,,,
1,AH3QC2PC1VTGP,,,2.0,,,,,,,...,,,,,,,,,,
2,A3LKP6WPMP9UKX,,,,5.0,,,,,,...,,,,,,,,,,
3,AVIY68KEPQ5ZD,,,,5.0,,,,,,...,,,,,,,,,,
4,A1CV1WROP5KTTW,,,,,5.0,,,,,...,,,,,,,,,,
5,AP57WZ2X4G0AA,,,,,2.0,,,,,...,,,,,,,,,,
6,A3NMBJ2LCRCATT,,,,,5.0,,,,,...,,,,,,,,,,
7,A5Y15SAOMX6XA,,,,,2.0,,,,,...,,,,,,,,,,
8,A3P671HJ32TCSF,,,,,5.0,,,,,...,,,,,,,,,,
9,A3VCKTRD24BG7K,,,,,5.0,,,,,...,,,,,,,,,,


In [4]:
data.shape

(4848, 207)

Exploratory Data Analysis:
Which movies have maximum views/ratings?

In [5]:
#calculate the statistical information of all movies:
desc = data.describe().T
desc

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Movie1,1.0,5.000000,,5.0,5.00,5.0,5.0,5.0
Movie2,1.0,5.000000,,5.0,5.00,5.0,5.0,5.0
Movie3,1.0,2.000000,,2.0,2.00,2.0,2.0,2.0
Movie4,2.0,5.000000,0.000000,5.0,5.00,5.0,5.0,5.0
Movie5,29.0,4.103448,1.496301,1.0,4.00,5.0,5.0,5.0
...,...,...,...,...,...,...,...,...
Movie202,6.0,4.333333,1.632993,1.0,5.00,5.0,5.0,5.0
Movie203,1.0,3.000000,,3.0,3.00,3.0,3.0,3.0
Movie204,8.0,4.375000,1.407886,1.0,4.75,5.0,5.0,5.0
Movie205,35.0,4.628571,0.910259,1.0,5.00,5.0,5.0,5.0


In [6]:
# The movie with max. review/ratings
desc2 = desc['count'].sort_values(ascending=False).to_frame()
pop = desc2[:1]
pop

Unnamed: 0,count
Movie127,2313.0


What is the average rating of each movie? Define the top 5 movies with the maximum ratings

In [7]:
data2 = data.drop('user_id',axis=1)
rating_avr = data2.mean().sort_values(ascending=False).to_frame().rename(columns= {0: 'Average Rating'})
rating_avr.head(5)

Unnamed: 0,Average Rating
Movie1,5.0
Movie66,5.0
Movie76,5.0
Movie75,5.0
Movie74,5.0


We can see from the table above, that some top movies are not as popular as we know, the reason could be the number of ratings are less. Then we should count the rating number of each movie and filter the one with too few viewers.

In [8]:
rating_avr['count'] = desc2['count']
rating_avr.head(5)

Unnamed: 0,Average Rating,count
Movie1,5.0,1.0
Movie66,5.0,1.0
Movie76,5.0,2.0
Movie75,5.0,1.0
Movie74,5.0,1.0


The table above has proved our presumtion, a big part of the top 25 has very few viewer.

In [9]:
rating_avr['count'].describe()

count     206.000000
mean       24.271845
std       168.937841
min         1.000000
25%         1.000000
50%         2.000000
75%         5.000000
max      2313.000000
Name: count, dtype: float64

From the information above, the difference between mean value(24.3) and 50% QUANTILE(2) is quite huge. So we can try to set the threshold as 10 to filter the outliers out of list

In [10]:
rating_avr_filtered = rating_avr[rating_avr['count']>10]
rating_avr_filtered.head(5)

Unnamed: 0,Average Rating,count
Movie206,4.923077,13.0
Movie162,4.866667,15.0
Movie140,4.83391,578.0
Movie184,4.823529,17.0
Movie158,4.818182,66.0


Define the top 5 movies with the least audience

In [11]:
rating_count = rating_avr['count'].sort_values(ascending=True).to_frame()
rating_count.head(5)

Unnamed: 0,count
Movie1,1.0
Movie34,1.0
Movie35,1.0
Movie36,1.0
Movie37,1.0


It could be more than 5 movies with only 1 audience, so we can list all movies with only 1 audience

In [12]:
rating_count[rating_count['count']==1]

Unnamed: 0,count
Movie1,1.0
Movie34,1.0
Movie35,1.0
Movie36,1.0
Movie37,1.0
...,...
Movie54,1.0
Movie84,1.0
Movie72,1.0
Movie77,1.0


From the above information, there are 89 movies with only 1 audience

Recommendation Model:

In [28]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
#from sparsesvd import sparsesvd
import warnings; warnings.simplefilter('ignore')
%matplotlib inline

Divide the data into test set and train set

In [14]:
data_melt = data.melt(id_vars=data.columns[0], value_vars=data.columns[1:], var_name="movies", value_name="ratings")
data_melt

Unnamed: 0,user_id,movies,ratings
0,A3R5OBKS7OM2IR,Movie1,5.0
1,AH3QC2PC1VTGP,Movie1,
2,A3LKP6WPMP9UKX,Movie1,
3,AVIY68KEPQ5ZD,Movie1,
4,A1CV1WROP5KTTW,Movie1,
...,...,...,...
998683,A1IMQ9WMFYKWH5,Movie206,5.0
998684,A1KLIKPUF5E88I,Movie206,5.0
998685,A5HG6WFZLO10D,Movie206,5.0
998686,A3UU690TWXCG1X,Movie206,5.0


In [15]:
dataset = data_melt.fillna(0)
dataset.head()

Unnamed: 0,user_id,movies,ratings
0,A3R5OBKS7OM2IR,Movie1,5.0
1,AH3QC2PC1VTGP,Movie1,0.0
2,A3LKP6WPMP9UKX,Movie1,0.0
3,AVIY68KEPQ5ZD,Movie1,0.0
4,A1CV1WROP5KTTW,Movie1,0.0


In [16]:
ds_train, ds_test = train_test_split(dataset, test_size=0.3, random_state=0)
print("Shape of training data:",ds_train.shape)
print("Shape of testing data:",ds_test.shape)

Shape of training data: (699081, 3)
Shape of testing data: (299607, 3)


Buid a recommendation model on training data

In [17]:
ds_cf = pd.concat([ds_train, ds_test]).reset_index()
ds_cf.head()

Unnamed: 0,index,user_id,movies,ratings
0,338560,A9Q5O3PAC51MV,Movie70,0.0
1,471992,A1EI65WJC85U68,Movie98,0.0
2,185811,A379SAP75SPDHD,Movie39,0.0
3,817860,A1JIPFV4OL520T,Movie169,0.0
4,251599,A11YEGV0NPRF3H,Movie52,0.0


In [18]:
ds_pivot = ds_cf.pivot(index = 'user_id', columns = 'movies', values = 'ratings')
ds_pivot.head()

movies,Movie1,Movie10,Movie100,Movie101,Movie102,Movie103,Movie104,Movie105,Movie106,Movie107,...,Movie90,Movie91,Movie92,Movie93,Movie94,Movie95,Movie96,Movie97,Movie98,Movie99
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0047322388NOTO4N8SKD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A00473363TJ8YSZ3YAGG9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1004AX2J2HXGL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100CQXJ6D44T9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100Z2S0880G9A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
print('Shape of the pivot table:',ds_pivot.shape)

Shape of the pivot table: (4848, 206)


In [20]:
ds_pivot['user_index'] = np.arange(0,ds_pivot.shape[0],1)
ds_pivot.head()

movies,Movie1,Movie10,Movie100,Movie101,Movie102,Movie103,Movie104,Movie105,Movie106,Movie107,...,Movie91,Movie92,Movie93,Movie94,Movie95,Movie96,Movie97,Movie98,Movie99,user_index
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0047322388NOTO4N8SKD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
A00473363TJ8YSZ3YAGG9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
A1004AX2J2HXGL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
A100CQXJ6D44T9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
A100Z2S0880G9A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


In [21]:
ds_pivot.set_index(['user_index'],inplace=True)
ds_pivot.head()

movies,Movie1,Movie10,Movie100,Movie101,Movie102,Movie103,Movie104,Movie105,Movie106,Movie107,...,Movie90,Movie91,Movie92,Movie93,Movie94,Movie95,Movie96,Movie97,Movie98,Movie99
user_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


As the table above shows it is a sparse matrix, decided to use SVD to build the model
Singular Value Decomposition:

In [29]:
U, sigma, Vt = svds(ds_pivot.to_numpy(), k = 10)

In [30]:
print('Left Singular Matrix: \n', U)

Left Singular Matrix: 
 [[-1.41948014e-07 -7.12649421e-18 -3.70967613e-07 ...  1.50829005e-07
   4.26840615e-02  7.96633419e-05]
 [ 1.35316796e-05  9.85805250e-18  3.46952163e-05 ... -8.11610274e-06
  -1.14463559e-04  1.91174114e-02]
 [ 3.38291990e-06  1.18081781e-18  8.67380407e-06 ... -2.02902569e-06
  -2.86158897e-05  4.77935284e-03]
 ...
 [ 5.83560217e-17  5.12079788e-17  2.15057739e-18 ...  2.80729933e-18
  -1.12638706e-18  9.77882255e-19]
 [ 1.01487597e-05  3.54245342e-18  2.60214122e-05 ... -6.08707706e-06
  -8.58476690e-05  1.43380585e-02]
 [ 6.76583981e-06  2.36163561e-18  1.73476081e-05 ... -4.05805137e-06
  -5.72317793e-05  9.55870568e-03]]


In [31]:
print('Sigma: \n', sigma)

Sigma: 
 [ 34.72750225  39.54231658  41.16418185  47.74283509  53.94837751
  75.614235    77.65328486  82.08093229 117.1348772  209.22865684]


As sigma is not a diagonal matrix, it has to be converted into diagonal matrix

In [32]:
sigma = np.diag(sigma)
print("Diagonal Matrix: \n", sigma)

Diagonal Matrix: 
 [[ 34.72750225   0.           0.           0.           0.
    0.           0.           0.           0.           0.        ]
 [  0.          39.54231658   0.           0.           0.
    0.           0.           0.           0.           0.        ]
 [  0.           0.          41.16418185   0.           0.
    0.           0.           0.           0.           0.        ]
 [  0.           0.           0.          47.74283509   0.
    0.           0.           0.           0.           0.        ]
 [  0.           0.           0.           0.          53.94837751
    0.           0.           0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
   75.614235     0.           0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.          77.65328486   0.           0.           0.        ]
 [  0.           0.           0.           0.           0.
    0.           0.      

In [33]:
print("Right Singular Matrix: \n",Vt)

Right Singular Matrix: 
 [[ 8.80742461e-18  4.12188543e-20 -5.14725821e-17 ...  4.29876196e-17
   1.17053737e-17 -2.86577494e-17]
 [ 6.04778802e-18  2.69716801e-20 -3.28390103e-17 ...  4.90601570e-17
   9.26786201e-18 -2.06991882e-17]
 [ 1.54140315e-17 -3.01696910e-20  5.19119135e-17 ... -1.04465392e-16
  -8.13516843e-18  6.54065377e-17]
 ...
 [ 2.47848977e-18  7.59626632e-21 -7.28740593e-18 ... -4.24420858e-18
   1.80783071e-18 -5.70492039e-20]
 [-1.80479109e-17 -4.48892113e-20  5.02959782e-17 ...  3.18575876e-17
  -1.35694779e-17 -1.46018920e-17]
 [-5.50504088e-17 -1.18723560e-19  1.28576337e-16 ...  1.06599787e-16
  -3.57470449e-17 -4.97935872e-17]]


Make predictions on test data

In [34]:
user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)
ds_pred = pd.DataFrame(user_predicted_ratings, columns = ds_pivot.columns)
ds_pred.head()

movies,Movie1,Movie10,Movie100,Movie101,Movie102,Movie103,Movie104,Movie105,Movie106,Movie107,...,Movie90,Movie91,Movie92,Movie93,Movie94,Movie95,Movie96,Movie97,Movie98,Movie99
0,-9.128838000000001e-17,-2.2720349999999996e-19,2.54568e-16,1.817148e-16,6.498682e-07,1.093853e-06,4.345189e-12,-2.497853e-09,-2.506972e-09,-3e-06,...,5.399395e-07,2.204104e-07,1.330251e-06,1.288095e-16,3.5141010000000006e-17,1e-06,1.482448e-16,1.60666e-16,-6.867427000000001e-17,-7.357047000000001e-17
1,-2.2007e-16,-4.753754999999999e-19,5.150589e-16,5.005842e-16,-6.889186e-05,-2.419713e-06,-9.70943e-10,-3.222229e-07,-3.202302e-07,0.000316,...,-5.695961e-05,-2.481068e-06,7.615735e-06,2.828954e-16,8.567926e-17,-0.000137,3.61654e-16,4.250451e-16,-1.43123e-16,-1.983919e-16
2,-5.5017500000000005e-17,-1.1884389999999999e-19,1.287647e-16,1.25146e-16,-1.722296e-05,-6.049284e-07,-2.427357e-10,-8.055573e-08,-8.005755e-08,7.9e-05,...,-1.42399e-05,-6.202669e-07,1.903934e-06,7.072385e-17,2.1419810000000002e-17,-3.4e-05,9.041350000000001e-17,1.062613e-16,-3.5780750000000006e-17,-4.959797e-17
3,4.2742050000000006e-17,2.5544299999999996e-19,-3.106092e-16,1.845739e-16,1.932794e-06,7.188265e-08,-1.267691e-10,-3.464287e-08,-3.43938e-08,-1e-05,...,1.617642e-06,7.717333e-08,-2.259384e-07,-7.598565000000001e-17,1.776231e-18,4e-06,-1.359844e-17,1.322711e-16,7.579251000000001e-17,-8.904957e-17
4,1.147485e-17,3.1056069999999997e-20,-2.720744e-17,-1.8749180000000002e-17,6.549731e-05,-3.99504e-05,-4.219027e-08,-4.507963e-06,-4.425759e-06,-0.000409,...,5.393707e-05,-0.0001041088,-0.0001538701,-1.2218290000000001e-17,-3.362624e-18,8.9e-05,-1.9273970000000003e-17,-2.6086710000000003e-17,7.004777e-18,2.888517e-18


Recommend the items with the Highest predicted ratings:

In [35]:
def recommend_items(user_index, ds_pivot, ds_pred, num_recommendations):
    user_idx = user_index
    # Get and sort the user's ratings
    sorted_user_ratings = ds_pivot.iloc[user_idx].sort_values(ascending=False)
    #sorted_user_ratings
    sorted_user_predictions = ds_pred.iloc[user_idx].sort_values(ascending=False)
    #sorted_user_predictions
    temp = pd.concat([sorted_user_ratings,sorted_user_predictions],axis=1)
    temp.index.name = 'Recommended Movie'
    temp.columns = ['user_ratings','user_predictions']
    temp = temp.loc[temp.user_ratings==0]
    temp = temp.sort_values('user_predictions',ascending=False)
    print('\n Below are the recommemded movies for user(user_index = {}): \n'.format(user_index))
    print(temp.head(num_recommendations))

In [36]:
user_index = 4
num_recommendations = 5
recommend_items(user_index, ds_pivot, ds_pred, num_recommendations)


 Below are the recommemded movies for user(user_index = 4): 

                   user_ratings  user_predictions
Recommended Movie                                
Movie162                    0.0          0.019604
Movie86                     0.0          0.011711
Movie185                    0.0          0.000109
Movie163                    0.0          0.000104
Movie95                     0.0          0.000089


In [37]:
user_index = 123
num_recommendations = 5
recommend_items(user_index, ds_pivot, ds_pred, num_recommendations)


 Below are the recommemded movies for user(user_index = 123): 

                   user_ratings  user_predictions
Recommended Movie                                
Movie202                    0.0          0.002864
Movie132                    0.0          0.002861
Movie188                    0.0          0.002838
Movie189                    0.0          0.002836
Movie190                    0.0          0.002835


In [38]:
user_index = 2345
num_recommendations = 5
recommend_items(user_index, ds_pivot, ds_pred, num_recommendations)


 Below are the recommemded movies for user(user_index = 2345): 

                   user_ratings  user_predictions
Recommended Movie                                
Movie86                     0.0          0.000005
Movie95                     0.0          0.000004
Movie140                    0.0          0.000002
Movie102                    0.0          0.000002
Movie90                     0.0          0.000002


Above result shows, it is a collaborative recommemder model. So all the three users given different recommendations based on users past behaviour

Model Evaluation:
Average actual ratings for each movie:

In [39]:
ds_pivot.mean().head()

movies
Movie1      0.001031
Movie10     0.001031
Movie100    0.000825
Movie101    0.005157
Movie102    0.001650
dtype: float64

In [40]:
ds_pred.mean().head()

movies
Movie1     -1.143966e-16
Movie10    -2.371565e-19
Movie100    2.546512e-16
Movie101    2.818410e-16
Movie102    7.905065e-04
dtype: float64

In [42]:
ds_rmse = pd.concat([ds_pivot.mean(), ds_pred.mean()], axis=1)
ds_rmse.columns = ['Avg_actual_ratings','Avg_predicted_ratings']
print(ds_rmse.shape)
ds_rmse.head()

(206, 2)


Unnamed: 0_level_0,Avg_actual_ratings,Avg_predicted_ratings
movies,Unnamed: 1_level_1,Unnamed: 2_level_1
Movie1,0.001031,-1.143966e-16
Movie10,0.001031,-2.371565e-19
Movie100,0.000825,2.546512e-16
Movie101,0.005157,2.81841e-16
Movie102,0.00165,0.0007905065


In [43]:
RMSE = round((((ds_rmse.Avg_actual_ratings - ds_rmse.Avg_predicted_ratings) **2).mean() **0.5), 5)
print("\nRMSE SVD Model = {} \n".format(RMSE))


RMSE SVD Model = 0.00669 



Getting top K(K=5) recommendations:

In [44]:
# Enter 'user_index' and num_recommendations for the user
user_index = 2333
num_recommendations = 5
recommend_items(user_index, ds_pivot, ds_pred, num_recommendations)


 Below are the recommemded movies for user(user_index = 2333): 

                   user_ratings  user_predictions
Recommended Movie                                
Movie202                    0.0          0.002292
Movie132                    0.0          0.002289
Movie188                    0.0          0.002270
Movie189                    0.0          0.002269
Movie190                    0.0          0.002268


This user based collaborative filtering model is a personalised recommender system, the recommendations are based on the past behaviour of the selected users. 