# Project 6: Building User-Based Recommendation Model for Amazon

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings

In [2]:
# Import datasetAmazon - Movies and TV Ratings.csv
df_amz = pd.read_csv("Amazon - Movies and TV Ratings.csv")
df_amz.head()

Unnamed: 0,user_id,Movie1,Movie2,Movie3,Movie4,Movie5,Movie6,Movie7,Movie8,Movie9,...,Movie197,Movie198,Movie199,Movie200,Movie201,Movie202,Movie203,Movie204,Movie205,Movie206
0,A3R5OBKS7OM2IR,5.0,5.0,,,,,,,,...,,,,,,,,,,
1,AH3QC2PC1VTGP,,,2.0,,,,,,,...,,,,,,,,,,
2,A3LKP6WPMP9UKX,,,,5.0,,,,,,...,,,,,,,,,,
3,AVIY68KEPQ5ZD,,,,5.0,,,,,,...,,,,,,,,,,
4,A1CV1WROP5KTTW,,,,,5.0,,,,,...,,,,,,,,,,


In [3]:
df_amz.shape

(4848, 207)

In [4]:
df_amz.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4848 entries, 0 to 4847
Columns: 207 entries, user_id to Movie206
dtypes: float64(206), object(1)
memory usage: 7.7+ MB


In [5]:
# Finding null values
df_amz.isnull().sum()

user_id        0
Movie1      4847
Movie2      4847
Movie3      4847
Movie4      4846
            ... 
Movie202    4842
Movie203    4847
Movie204    4840
Movie205    4813
Movie206    4835
Length: 207, dtype: int64

# Exploratory Data Analysis:

## 1. Which movies have maximum views/ratings

In [6]:
# list of total rating count per movie
df_ratings=[df_amz[col].notna().sum()
for col in df_amz.columns] 
df_ratings_count=pd.DataFrame(data=df_ratings,columns=['Total rating count'],index=df_amz.columns)
df_ratings_count.drop('user_id',axis=0,inplace=True)
df_ratings_count.sort_values(by='Total rating count',axis=0,ascending=False).head()

Unnamed: 0,Total rating count
Movie127,2313
Movie140,578
Movie16,320
Movie103,272
Movie29,243


#### Movie 127 has the highest views/ratings by the users

## 2. What is the average rating for each movie? Define the top 5 movies with the maximum ratings

In [7]:
# Identifying average rating
avg_rating=[round(df_amz[col].mean(),2) 
for col in df_amz.drop('user_id',axis=1).columns]
avg_rating

[5.0,
 5.0,
 2.0,
 5.0,
 4.1,
 4.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 4.0,
 5.0,
 4.52,
 3.0,
 5.0,
 3.5,
 3.0,
 5.0,
 5.0,
 5.0,
 4.4,
 5.0,
 3.0,
 5.0,
 3.33,
 4.81,
 4.5,
 5.0,
 4.5,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 4.86,
 5.0,
 1.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 4.5,
 3.47,
 2.0,
 5.0,
 5.0,
 5.0,
 5.0,
 1.0,
 2.0,
 1.0,
 5.0,
 3.0,
 5.0,
 3.0,
 5.0,
 5.0,
 1.0,
 5.0,
 1.0,
 5.0,
 4.0,
 5.0,
 2.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 5.0,
 4.0,
 4.42,
 5.0,
 3.0,
 5.0,
 5.0,
 4.67,
 5.0,
 4.0,
 4.58,
 1.83,
 4.58,
 4.77,
 5.0,
 4.33,
 3.33,
 5.0,
 4.8,
 5.0,
 4.0,
 4.0,
 5.0,
 4.0,
 4.56,
 4.5,
 5.0,
 5.0,
 4.0,
 4.67,
 4.38,
 4.88,
 4.64,
 5.0,
 3.75,
 4.43,
 4.0,
 5.0,
 4.73,
 5.0,
 4.38,
 5.0,
 4.25,
 5.0,
 5.0,
 4.75,
 4.8,
 4.5,
 4.11,
 5.0,
 4.0,
 4.5,
 5.0,
 5.0,
 5.0,
 4.0,
 5.0,
 5.0,
 4.0,
 4.08,
 5.0,
 4.83,
 4.14,
 5.0,
 5.0,
 1.0,
 5.0,
 4.0,
 5.0,
 5.0,
 5.0,
 5.0,
 4.5,
 5.0,
 5.0,
 1.0,
 4.0,
 4.0,
 5.0,
 4.82,
 3.0,
 4.67,
 4.6,
 4.8

In [8]:
# Average rating of each movie
df_avg_rating=pd.DataFrame(data=avg_rating,index=df_amz.drop('user_id',axis=1).columns,columns=['Average ratings'])
df_avg_rating.sort_values(by='Average ratings',ascending=False).head()

Unnamed: 0,Average ratings
Movie1,5.0
Movie66,5.0
Movie76,5.0
Movie75,5.0
Movie74,5.0


#### Top 5 movies with best average ratings


## 3. Define the top 5 movies with the least audience

In [9]:
df_ratings_count.sort_values(by='Total rating count',ascending=True).head()

Unnamed: 0,Total rating count
Movie1,1
Movie71,1
Movie145,1
Movie69,1
Movie68,1


#### Top 5 movies with least audience

# Recommendation Model:

In [10]:
# Reshape Movies
melt_df=df_amz.melt(id_vars= df_amz.columns[0],value_vars=df_amz.columns[1:],var_name='Movie',value_name='rating')
melt_df

Unnamed: 0,user_id,Movie,rating
0,A3R5OBKS7OM2IR,Movie1,5.0
1,AH3QC2PC1VTGP,Movie1,
2,A3LKP6WPMP9UKX,Movie1,
3,AVIY68KEPQ5ZD,Movie1,
4,A1CV1WROP5KTTW,Movie1,
...,...,...,...
998683,A1IMQ9WMFYKWH5,Movie206,5.0
998684,A1KLIKPUF5E88I,Movie206,5.0
998685,A5HG6WFZLO10D,Movie206,5.0
998686,A3UU690TWXCG1X,Movie206,5.0


In [11]:
melt_df.shape

(998688, 3)

In [12]:
#Filling nan values with zero before applying SVD algorithm
melt_filtered = melt_df.fillna(0)
melt_filtered

Unnamed: 0,user_id,Movie,rating
0,A3R5OBKS7OM2IR,Movie1,5.0
1,AH3QC2PC1VTGP,Movie1,0.0
2,A3LKP6WPMP9UKX,Movie1,0.0
3,AVIY68KEPQ5ZD,Movie1,0.0
4,A1CV1WROP5KTTW,Movie1,0.0
...,...,...,...
998683,A1IMQ9WMFYKWH5,Movie206,5.0
998684,A1KLIKPUF5E88I,Movie206,5.0
998685,A5HG6WFZLO10D,Movie206,5.0
998686,A3UU690TWXCG1X,Movie206,5.0


In [13]:
# Import necessary libraries for model building
import surprise
from surprise import Reader
from surprise import Dataset
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

In [14]:
reader = Reader(rating_scale=(-1,10))
data = Dataset.load_from_df(melt_df.fillna(0), reader=reader)

## 4. Divide the data into training and test data

In [15]:
trainset, testset = train_test_split(data, test_size=0.25)

In [16]:
algo = SVD()

## 5. Build a recommendation model on training data

In [17]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x24443e1d460>

## 6. Make prediction on the Dataset

In [18]:
predict= algo.test(testset)

In [19]:
accuracy.rmse(predict)

RMSE: 0.2779


0.2778561702009678

In [20]:
accuracy.mae(predict)

MAE:  0.0412


0.04117377958961099

In [21]:
cross_validate(algo,data,measures=['RMSE','MAE'],cv=3,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.2866  0.2773  0.2821  0.2820  0.0038  
MAE (testset)     0.0429  0.0421  0.0428  0.0426  0.0004  
Fit time          66.12   65.81   66.03   65.99   0.13    
Test time         5.41    5.92    4.95    5.43    0.39    


{'test_rmse': array([0.28659903, 0.27729528, 0.28211307]),
 'test_mae': array([0.04291844, 0.04211121, 0.04284539]),
 'fit_time': (66.11824107170105, 65.81160616874695, 66.03335571289062),
 'test_time': (5.406068563461304, 5.917743444442749, 4.952959775924683)}

In [22]:
user_id='A1CV1WROP5KTTW'
Movie='Movie6'
rating='5'
algo.predict(user_id,Movie,r_ui=rating)
print(cross_validate(algo,data,measures=['RMSE','MAE'],cv=3,verbose=True))

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.2782  0.2883  0.2803  0.2823  0.0044  
MAE (testset)     0.0422  0.0435  0.0426  0.0428  0.0005  
Fit time          67.34   65.67   66.47   66.49   0.68    
Test time         5.83    5.37    5.34    5.52    0.22    
{'test_rmse': array([0.27817503, 0.2882884 , 0.28032356]), 'test_mae': array([0.04224814, 0.04349893, 0.04260549]), 'fit_time': (67.33540987968445, 65.66965985298157, 66.4742317199707), 'test_time': (5.827927827835083, 5.374822378158569, 5.343570709228516)}
