## Building User-Based Recommendation Model for Amazon .


In [2]:
# Import the libraries
import pandas as pd

In [3]:
## Understand the data
movie=pd.read_csv(r'data\Amazon - Movies and TV Ratings.csv')
movie

Unnamed: 0,user_id,Movie1,Movie2,Movie3,Movie4,Movie5,Movie6,Movie7,Movie8,Movie9,...,Movie197,Movie198,Movie199,Movie200,Movie201,Movie202,Movie203,Movie204,Movie205,Movie206
0,A3R5OBKS7OM2IR,5.0,5.0,,,,,,,,...,,,,,,,,,,
1,AH3QC2PC1VTGP,,,2.0,,,,,,,...,,,,,,,,,,
2,A3LKP6WPMP9UKX,,,,5.0,,,,,,...,,,,,,,,,,
3,AVIY68KEPQ5ZD,,,,5.0,,,,,,...,,,,,,,,,,
4,A1CV1WROP5KTTW,,,,,5.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4843,A1IMQ9WMFYKWH5,,,,,,,,,,...,,,,,,,,,,5.0
4844,A1KLIKPUF5E88I,,,,,,,,,,...,,,,,,,,,,5.0
4845,A5HG6WFZLO10D,,,,,,,,,,...,,,,,,,,,,5.0
4846,A3UU690TWXCG1X,,,,,,,,,,...,,,,,,,,,,5.0


In [4]:
movie.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4848 entries, 0 to 4847
Columns: 207 entries, user_id to Movie206
dtypes: float64(206), object(1)
memory usage: 7.7+ MB


In [5]:
movie.isnull().sum()

user_id        0
Movie1      4847
Movie2      4847
Movie3      4847
Movie4      4846
            ... 
Movie202    4842
Movie203    4847
Movie204    4840
Movie205    4813
Movie206    4835
Length: 207, dtype: int64

## Exploratory Data Analysis:

In [6]:
# Which movies have maximum views/ratings?
p = movie.describe().T
p

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Movie1,1.0,5.000000,,5.0,5.00,5.0,5.0,5.0
Movie2,1.0,5.000000,,5.0,5.00,5.0,5.0,5.0
Movie3,1.0,2.000000,,2.0,2.00,2.0,2.0,2.0
Movie4,2.0,5.000000,0.000000,5.0,5.00,5.0,5.0,5.0
Movie5,29.0,4.103448,1.496301,1.0,4.00,5.0,5.0,5.0
...,...,...,...,...,...,...,...,...
Movie202,6.0,4.333333,1.632993,1.0,5.00,5.0,5.0,5.0
Movie203,1.0,3.000000,,3.0,3.00,3.0,3.0,3.0
Movie204,8.0,4.375000,1.407886,1.0,4.75,5.0,5.0,5.0
Movie205,35.0,4.628571,0.910259,1.0,5.00,5.0,5.0,5.0


In [7]:
#Which movies have maximum views?
p['count'].sort_values(ascending=False).head(3).to_frame()

Unnamed: 0,count
Movie127,2313.0
Movie140,578.0
Movie16,320.0


In [8]:
# Which movies have maximum ratings?
movie.drop(['user_id'],axis=1).sum().sort_values(ascending=False).head(3).to_frame()

Unnamed: 0,0
Movie127,9511.0
Movie140,2794.0
Movie16,1446.0


In [9]:
# What is the average rating for each movie? Define the top 5 movies with the maximum ratings.
movie.drop(['user_id'],axis=1).mean().sort_values(ascending=False).head(5).to_frame()

Unnamed: 0,0
Movie1,5.0
Movie66,5.0
Movie76,5.0
Movie75,5.0
Movie74,5.0


In [10]:
# Define the top 5 movies with the least audience?
p['count'].sort_values(ascending=True).head(5).to_frame()

Unnamed: 0,count
Movie1,1.0
Movie71,1.0
Movie145,1.0
Movie69,1.0
Movie68,1.0


## Recommendation Model:

In [11]:
## Some of the movies hadn’t been watched and therefore, are not rated by the users. 
# Netflix would like to take this as an opportunity and build a machine learning recommendation algorithm 
# which provides the ratings for each of the users.
movie_data = movie.melt(id_vars = movie.columns[0],value_vars=movie.columns[1:],var_name="Movies",value_name="Rating")
movie_data

Unnamed: 0,user_id,Movies,Rating
0,A3R5OBKS7OM2IR,Movie1,5.0
1,AH3QC2PC1VTGP,Movie1,
2,A3LKP6WPMP9UKX,Movie1,
3,AVIY68KEPQ5ZD,Movie1,
4,A1CV1WROP5KTTW,Movie1,
...,...,...,...
998683,A1IMQ9WMFYKWH5,Movie206,5.0
998684,A1KLIKPUF5E88I,Movie206,5.0
998685,A5HG6WFZLO10D,Movie206,5.0
998686,A3UU690TWXCG1X,Movie206,5.0


In [12]:
movie_data=movie_data.fillna(0)
movie_data

Unnamed: 0,user_id,Movies,Rating
0,A3R5OBKS7OM2IR,Movie1,5.0
1,AH3QC2PC1VTGP,Movie1,0.0
2,A3LKP6WPMP9UKX,Movie1,0.0
3,AVIY68KEPQ5ZD,Movie1,0.0
4,A1CV1WROP5KTTW,Movie1,0.0
...,...,...,...
998683,A1IMQ9WMFYKWH5,Movie206,5.0
998684,A1KLIKPUF5E88I,Movie206,5.0
998685,A5HG6WFZLO10D,Movie206,5.0
998686,A3UU690TWXCG1X,Movie206,5.0


In [17]:
# Divide the data into training and test data
from surprise import Dataset, SVD,accuracy,reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split





In [18]:
rd = Reader()
data = Dataset.load_from_df(movie_data.fillna(0),reader=rd)
data
trainset, testset = train_test_split(data,test_size=0.25)
svd = SVD()
svd.fit(trainset)
pred = svd.test(testset)
accuracy.rmse(pred)
accuracy.mae(pred)
cross_validate(svd, data, measures = ['RMSE', 'MAE'], cv = 3, verbose = True)

NameError: name 'Reader' is not defined