# Movie recommendation system using Python

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the ratings table from disk: one row per user, one column per movie
df = pd.read_csv(filepath_or_buffer="Amazon - Movies and TV Ratings.csv")

In [3]:
# Preview the first five rows to see the user_id / MovieN layout
df.head(n=5)

Unnamed: 0,user_id,Movie1,Movie2,Movie3,Movie4,Movie5,Movie6,Movie7,Movie8,Movie9,...,Movie197,Movie198,Movie199,Movie200,Movie201,Movie202,Movie203,Movie204,Movie205,Movie206
0,A3R5OBKS7OM2IR,5.0,5.0,,,,,,,,...,,,,,,,,,,
1,AH3QC2PC1VTGP,,,2.0,,,,,,,...,,,,,,,,,,
2,A3LKP6WPMP9UKX,,,,5.0,,,,,,...,,,,,,,,,,
3,AVIY68KEPQ5ZD,,,,5.0,,,,,,...,,,,,,,,,,
4,A1CV1WROP5KTTW,,,,,5.0,,,,,...,,,,,,,,,,


In [4]:
# Check additional information (dtypes, memory usage) and the dimensions.
# df.info() prints its report and returns None, which is why the displayed
# tuple starts with None followed by the (rows, columns) shape.
df.info(), df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4848 entries, 0 to 4847
Columns: 207 entries, user_id to Movie206
dtypes: float64(206), object(1)
memory usage: 7.7+ MB


(None, (4848, 207))

In [5]:
# Summary statistics per movie (one row per movie after transposing)
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Movie1,1.0,5.000000,,5.0,5.00,5.0,5.0,5.0
Movie2,1.0,5.000000,,5.0,5.00,5.0,5.0,5.0
Movie3,1.0,2.000000,,2.0,2.00,2.0,2.0,2.0
Movie4,2.0,5.000000,0.000000,5.0,5.00,5.0,5.0,5.0
Movie5,29.0,4.103448,1.496301,1.0,4.00,5.0,5.0,5.0
...,...,...,...,...,...,...,...,...
Movie202,6.0,4.333333,1.632993,1.0,5.00,5.0,5.0,5.0
Movie203,1.0,3.000000,,3.0,3.00,3.0,3.0,3.0
Movie204,8.0,4.375000,1.407886,1.0,4.75,5.0,5.0,5.0
Movie205,35.0,4.628571,0.910259,1.0,5.00,5.0,5.0,5.0


### Task 1 - Which movies have the maximum views/ratings?

In [6]:
# Movie with the highest number of views (non-null ratings)
df.describe().loc["count"].sort_values(ascending=False).head(1)

Movie127    2313.0
Name: count, dtype: float64

In [7]:
# Movie with the highest total rating (sum of every rating it received)
(
    df.iloc[:, 1:]
    .sum()
    .sort_values(ascending=False)
    .head(1)
)

Movie127    9511.0
dtype: float64

### Task 2 - What is the average rating for each movie? Define the top 5 movies with the maximum ratings

In [8]:
# Mean rating of every movie, shown as a one-column frame
df.describe().loc["mean"].to_frame()

Unnamed: 0,mean
Movie1,5.000000
Movie2,5.000000
Movie3,2.000000
Movie4,5.000000
Movie5,4.103448
...,...
Movie202,4.333333
Movie203,3.000000
Movie204,4.375000
Movie205,4.628571


In [9]:
# Five movies with the highest average rating
(
    df.iloc[:, 1:]
    .mean()
    .sort_values(ascending=False)
    .head(5)
    .to_frame(name="Average rating")
)

Unnamed: 0,Average rating
Movie1,5.0
Movie66,5.0
Movie76,5.0
Movie75,5.0
Movie74,5.0


### Task 3 - Define the top 5 movies with the least audience

In [10]:
# Five movies with the fewest ratings (smallest non-null count)
df.describe().loc["count"].sort_values().head(5).to_frame()

Unnamed: 0,count
Movie1,1.0
Movie71,1.0
Movie145,1.0
Movie69,1.0
Movie68,1.0


### Task 4 - Build the recommendation model

In [11]:
# Importing libraries for model building
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise import SVD
from surprise.model_selection import train_test_split

ModuleNotFoundError: No module named 'surprise'

In [None]:
# Reshape the wide user x movie table into long form: one row per
# (user, movie) pair with the rating (NaN where the user did not rate it).
movie_data = df.melt(
    id_vars=df.columns[0],
    value_vars=df.columns[1:],
    var_name="Movies",
    value_name="Rating",
)
movie_data

In [None]:
# Creating a dataset for training and testing.
# Only observed ratings are kept: filling the unrated (NaN) pairs with 0 would
# present them to the model as real 0-star ratings and swamp the genuine ones.
# The per-movie summary computed earlier shows ratings between 1 and 5, so that
# is the scale declared to the reader.
rd = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(movie_data.dropna(subset=["Rating"]), reader=rd)
data

In [None]:
# Hold out 20% of the ratings for evaluation. The fixed seed makes the split,
# and therefore the metrics reported below, repeatable across kernel restarts.
train_data, test_data = train_test_split(data, test_size=0.20, random_state=42)

In [None]:
# Using SVD (Singular Value Decomposition).
# Seeded so that repeated runs start from the same random initialisation.
svd = SVD(random_state=42)
svd.fit(train_data)

In [None]:
# Predict every held-out rating, then preview only the first few predictions;
# echoing the full list would print one entry per test rating.
pred = svd.test(test_data)
pred[:5]

In [None]:
# Evaluate the held-out predictions: RMSE first, then MAE
(
    accuracy.rmse(predictions=pred),
    accuracy.mae(predictions=pred),
)

In [None]:
# Spot-check one (user, movie) pair; r_id is the rating passed as the
# reference value (r_ui) alongside the model's estimate.
u_id = 'AH3QC2PC1VTGP'
mv = 'Movie206'
r_id = 5.0
svd.predict(uid=u_id, iid=mv, r_ui=r_id, verbose=True)

In [None]:
from surprise.model_selection import cross_validate

# 3-fold cross-validation of the SVD model, reporting RMSE and MAE per fold
cross_validate(
    svd,
    data,
    measures=["RMSE", "MAE"],
    cv=3,
    verbose=True,
)

In [None]:
def repeat(ml_type,dframe,min_,max_):
    """Cross-validate an algorithm on a ratings frame and print a sample prediction.

    Parameters
    ----------
    ml_type
        Algorithm instance to evaluate (the notebook passes ``SVD()``).
    dframe
        Long-format ratings frame handed to ``Dataset.load_from_df``; the
        notebook passes the melted user / movie / rating table.
    min_, max_
        Lower and upper bound of the rating scale given to ``Reader``.

    Returns
    -------
    None
        Results are printed: the 3-fold cross-validation output (RMSE, MAE),
        a separator, and the prediction for one fixed user/movie pair.
    """
    # Honour the caller's rating scale instead of Reader's default.
    rd = Reader(rating_scale=(min_, max_))
    data = Dataset.load_from_df(dframe, reader=rd)
    print(cross_validate(ml_type, data, measures=['RMSE', 'MAE'], cv=3, verbose=True))
    print("#"*10)
    u_id = 'AH3QC2PC1VTGP'
    m_id = 'Movie206'
    ra_u = 5.0
    # Use the local movie id; previously this read a notebook-level global
    # (`mv`), so the function only worked if an earlier cell had been run.
    print(ml_type.predict(u_id, m_id, r_ui=ra_u, verbose=True))
    print("#"*10)
    print()

In [None]:
# Work on a smaller slice (first 3000 rows, first 50 columns) for the
# imputation experiments. The slice gets its own name so the full `df` loaded
# at the top of the notebook is not overwritten and earlier cells still give
# the same results when re-run.
df_subset = df.iloc[:3000, :50]
movie_data = df_subset.melt(
    id_vars=df_subset.columns[0],
    value_vars=df_subset.columns[1:],
    var_name="Movies",
    value_name="Rating",
)

In [None]:
# Compare three ways of filling the unrated entries: 0, the mean rating and
# the median rating. The statistics are computed on the numeric "Rating"
# column only; mean()/median() over the whole frame would also touch the
# string user/movie columns, which raises a TypeError on pandas >= 2.0.
repeat(SVD(),movie_data.fillna(0),-1,10)
repeat(SVD(),movie_data.fillna({"Rating": movie_data["Rating"].mean()}),-1,10)
repeat(SVD(),movie_data.fillna({"Rating": movie_data["Rating"].median()}),-1,10)