In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("CapstoneDataSet.csv")
data

Unnamed: 0,Cust_Id,Rating,Movie_Id,Genre,MovieName
0,1488844,3,1,Action,Dinosaur Planet
1,822109,5,1,Action,Dinosaur Planet
2,885013,4,1,Action,Dinosaur Planet
3,30878,4,1,Action,Dinosaur Planet
4,823519,3,1,Action,Dinosaur Planet
...,...,...,...,...,...
1048569,196625,4,241,Horror,North by NorthWest
1048570,138835,3,241,Horror,North by NorthWest
1048571,1400154,5,241,Horror,North by NorthWest
1048572,1018992,3,241,Horror,North by NorthWest


In [3]:
df_title = data[['Movie_Id','Genre','MovieName']]
df_title = df_title.drop_duplicates()

In [4]:
data['Rating'] = data['Rating'].astype('float')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048574 entries, 0 to 1048573
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   Cust_Id    1048574 non-null  int64  
 1   Rating     1048574 non-null  float64
 2   Movie_Id   1048574 non-null  int64  
 3   Genre      1047488 non-null  object 
 4   MovieName  1048574 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 40.0+ MB


In [6]:
no_of_movies = data['MovieName'].nunique()
no_of_movies

241

In [7]:
rating_dataset = data.groupby('Rating')['Rating'].agg('count')
rating_dataset

Rating
1.0     46054
2.0     99398
3.0    287967
4.0    370551
5.0    244604
Name: Rating, dtype: int64

In [8]:
#most liked genre is historical
list = data.groupby('Genre')['Movie_Id'].agg('count')
a=list.sort_values(ascending=False)
list_of_popular_movies = a.head()

list_of_popular_movies

Genre
Historical     240327
Animation      123898
Educational    111976
Mystery        100898
Crime           97323
Name: Movie_Id, dtype: int64

In [9]:
data_movie_Summary = data.groupby('Movie_Id')['Rating'].agg(['count','mean'])
movie_benchmark = round(data_movie_Summary['count'].quantile(0.7),0)
movie_drop = data_movie_Summary[data_movie_Summary['count']<movie_benchmark].index

print(f'Movie minimum times of review: {movie_benchmark}')

Movie minimum times of review: 1393.0


In [10]:
movie_drop

Int64Index([  1,   2,   4,   5,   6,   7,   9,  10,  11,  12,
            ...
            227, 228, 229, 230, 231, 233, 234, 235, 236, 237],
           dtype='int64', name='Movie_Id', length=168)

In [11]:
data_customer_summary = data.groupby('Cust_Id')['Rating'].agg('count')
benchmark = data_customer_summary.quantile(0.7)
customer_drop = data_customer_summary[data_customer_summary < benchmark].index

print(f'Customer minimum times of review: {benchmark}')

Customer minimum times of review: 4.0


In [12]:
customer_drop

Int64Index([     10,      25,      33,      42,      59,      94,      97,
                116,     131,     158,
            ...
            2649328, 2649331, 2649336, 2649351, 2649370, 2649376, 2649388,
            2649401, 2649404, 2649409],
           dtype='int64', name='Cust_Id', length=186773)

In [13]:
print(f'Original data shape :{data.shape}')

Original data shape :(1048574, 5)


In [14]:
data = data[~data['Movie_Id'].isin(movie_drop)]

data = data[~data["Cust_Id"].isin(customer_drop)]

In [15]:
print(f'Cleaned data shape :{data.shape}')

Cleaned data shape :(682197, 5)


In [16]:
data.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id,Genre,MovieName
693,712664,5.0,3,Horror,Character
694,1331154,4.0,3,Horror,Character
696,44937,5.0,3,Horror,Character
697,656399,4.0,3,Horror,Character
698,439011,1.0,3,Horror,Character


In [17]:
data_p = pd.pivot_table(data, index = 'Cust_Id', columns = 'Movie_Id')
data_p

Unnamed: 0_level_0,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating
Movie_Id,3,8,16,17,18,26,28,30,32,33,...,213,215,216,223,225,232,238,239,240,241
Cust_Id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
6,,,,,,,,3.0,,,...,,,,,,,,,,
7,,5.0,,,,,4.0,5.0,,,...,,,,,,,,,,
79,,,,,,,,3.0,,,...,,,,,2.0,,,,,
87,,,,,,,,,,,...,,,,,,,,,,
134,,,,,,,5.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649335,,,,,,,,,,,...,,,,,,,,,,
2649375,,,,,,,4.0,,,,...,,,,,,,,,,
2649378,,,,,,,3.0,3.0,,,...,,,,,,,,,,
2649426,,,,4.0,,,4.0,4.0,,,...,,,4.0,,,,,,,


In [18]:
conda install -c conda-forge scikit-surprise

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [19]:
import math


from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [20]:
reader = Reader()

data1 = Dataset.load_from_df(data[['Cust_Id','Rating','Movie_Id']][:65000], reader)

svd = SVD()

cross_validate(svd, data1, measures=['RMSE','MAE'], cv=4, verbose = True)

Evaluating RMSE, MAE of algorithm SVD on 4 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Mean    Std     
RMSE (testset)    18.9854 19.0494 18.9781 18.9704 18.9958 0.0314  
MAE (testset)     17.3364 17.4096 17.3285 17.3153 17.3475 0.0366  
Fit time          3.21    3.22    3.22    3.20    3.21    0.01    
Test time         0.09    0.16    0.14    0.16    0.14    0.03    


{'test_rmse': array([18.98535847, 19.04936101, 18.97814415, 18.97042391]),
 'test_mae': array([17.33640156, 17.40957741, 17.32852457, 17.31534807]),
 'fit_time': (3.2112350463867188,
  3.217993974685669,
  3.217991828918457,
  3.2023682594299316),
 'test_time': (0.09372878074645996,
  0.15625357627868652,
  0.14056158065795898,
  0.1562182903289795)}

In [21]:
data

Unnamed: 0,Cust_Id,Rating,Movie_Id,Genre,MovieName
693,712664,5.0,3,Horror,Character
694,1331154,4.0,3,Horror,Character
696,44937,5.0,3,Horror,Character
697,656399,4.0,3,Horror,Character
698,439011,1.0,3,Horror,Character
...,...,...,...,...,...
1048568,1769515,5.0,241,Horror,North by NorthWest
1048570,138835,3.0,241,Horror,North by NorthWest
1048571,1400154,5.0,241,Horror,North by NorthWest
1048572,1018992,3.0,241,Horror,North by NorthWest


In [22]:
data_712664 = data[(data['Cust_Id']==712664 ) & ( data['Rating']==5.0)]
data_712664 = data_712664.set_index('Movie_Id')
data_712664

Unnamed: 0_level_0,Cust_Id,Rating,Genre,MovieName
Movie_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,712664,5.0,Horror,Character
79,712664,5.0,Historical,The Killing
175,712664,5.0,Historical,Reservoir Dogs
199,712664,5.0,Crime,The Deer Hunter
241,712664,5.0,Horror,North by NorthWest


In [23]:
user_712664 = df_title.copy()

user_712664 = user_712664.reset_index()

user_712664 = user_712664[~user_712664['Movie_Id'].isin(movie_drop)]

data1 = Dataset.load_from_df(data[['Cust_Id','Rating','Movie_Id']], reader)

train = data1.build_full_trainset()
svd.fit(train)

user_712664['Estimate_Score'] = user_712664['Movie_Id'].apply(lambda x: svd.predict(712664, x).est)

user_712664

Unnamed: 0,index,Movie_Id,Genre,MovieName,Estimate_Score
2,692,3,Horror,Character,5
7,5098,8,Animation,What the #$*! Do We Know!?,5
15,21629,16,Animation,Screamers,5
16,24328,17,Thriller,7 Seconds,5
17,31436,18,Animation,Immortal Beloved,5
...,...,...,...,...,...
231,1003200,232,Documentary,Gross Anatomy,5
237,1009606,238,Other,Led Zeppelin: The Song Remains the Same,5
238,1011941,239,Mystery,Winnie the Pooh: Springtime with Roo,5
239,1014131,240,Drama,Woman of the Year,5


In [24]:
user_712664 = user_712664.drop(columns = ['index','Movie_Id'])
user_712664

Unnamed: 0,Genre,MovieName,Estimate_Score
2,Horror,Character,5
7,Animation,What the #$*! Do We Know!?,5
15,Animation,Screamers,5
16,Thriller,7 Seconds,5
17,Animation,Immortal Beloved,5
...,...,...,...
231,Documentary,Gross Anatomy,5
237,Other,Led Zeppelin: The Song Remains the Same,5
238,Mystery,Winnie the Pooh: Springtime with Roo,5
239,Drama,Woman of the Year,5


In [28]:
user_712664 = user_712664.sort_values('Estimate_Score', ascending=False)
user_712664=user_712664.head(10).reset_index()
user_712664 = user_712664.drop('index',axis=1)

print("The top 10 movie suggestions for user_712664 are:")
print(user_712664)

The top 10 movie suggestions for user_712664 are:
        Genre                                          MovieName  \
0      Horror                                          Character   
1     Mystery                                       Fatal Beauty   
2       Crime  Missing in Action 2: The Beginning / Missing i...   
3   Animation                                      The Last Shot   
4        Gang                                            Scratch   
5      RomCom                                       Regular Guys   
6  Historical                                     Reservoir Dogs   
7   Biography                                The Devil's Brigade   
8       Crime                                         Funny Face   
9   Animation                                         The Chorus   

   Estimate_Score  
0               5  
1               5  
2               5  
3               5  
4               5  
5               5  
6               5  
7               5  
8               5  
9