#  Recommendation System for Netflix Prize Dataset

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# To load the 'combined_data_1' dataset after uploading it to Jupyter notebook

In [3]:
# Read the raw ratings file. It has no header row; only the first two comma-separated
# fields are kept. Rows such as "1:" mark the start of a movie's block and therefore
# come through with a missing Rating.
dataset = pd.read_csv(
    'combined_data_1.txt',
    header=None,
    names=['Cust_Id', 'Rating'],
    usecols=[0, 1],
)
dataset.head()

Unnamed: 0,Cust_Id,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [3]:
# Convert Ratings column to a float
dataset['Rating'] = dataset['Rating'].astype(float)

In [5]:
#To print the datatype of columns
dataset.dtypes

Cust_Id     object
Rating     float64
dtype: object

In [6]:
# To inspect the shape of the dataset
dataset.shape

(24058263, 2)

In [7]:
#To print the head of dataset
dataset.head()

Unnamed: 0,Cust_Id,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [8]:
# To find the distribution of the different ratings in the dataset
p = dataset.groupby('Rating')['Rating'].count()
p

Rating
1.0    1118186
2.0    2439073
3.0    6904181
4.0    8085741
5.0    5506583
Name: Rating, dtype: int64

In [9]:
p=pd.DataFrame(p)
p

Unnamed: 0_level_0,Rating
Rating,Unnamed: 1_level_1
1.0,1118186
2.0,2439073
3.0,6904181
4.0,8085741
5.0,5506583


In [10]:
#p=p.rename(columns = {'Rating':'Count'})

In [11]:
p.rename(columns = {'Rating':'Count'}, inplace = True)

In [12]:
#p.reset_index(inplace=True)

In [13]:
p.sort_values(by='Count', ascending=False)

Unnamed: 0_level_0,Count
Rating,Unnamed: 1_level_1
4.0,8085741
3.0,6904181
5.0,5506583
2.0,2439073
1.0,1118186


In [14]:
dataset.shape

(24058263, 2)

In [15]:
dataset.isna()

Unnamed: 0,Cust_Id,Rating
0,False,True
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
24058258,False,False
24058259,False,False
24058260,False,False
24058261,False,False


In [17]:
#dataset.isna().count() #This will count whole data in the dataset.

Cust_Id    24058263
Rating     24058263
dtype: int64

In [18]:
dataset.isna().sum()

Cust_Id       0
Rating     4499
dtype: int64

In [19]:
dataset.isnull().sum()

Cust_Id       0
Rating     4499
dtype: int64

In [17]:
#dir(pd)

In [18]:
#dir(np)

In [26]:
# Get the movie count by counting NaN ratings: every "<movie_id>:" marker row has a
# missing Rating, so the number of NaN ratings is the number of movies in the file.
# The column is selected by label; the previous positional lookup (`.sum()[1]`) on a
# label-indexed Series is deprecated in pandas 2.x.
movie_count = dataset['Rating'].isnull().sum()
movie_count

4499

In [27]:
dataset['Cust_Id'].nunique()#will display number of unique values in the column

475257

In [21]:
dataset['Cust_Id'].unique()# will display unique values of the column

array(['1:', '1488844', '822109', ..., '72311', '4499:', '594210'],
      dtype=object)

In [22]:
#dataset['Cust_Id'].value_counts()

In [23]:
# get customer count
cust_count = dataset['Cust_Id'].nunique()-movie_count
cust_count

470758

In [24]:
# get rating count

rating_count = dataset['Cust_Id'].count() - movie_count

rating_count

24053764

In [25]:
p

Unnamed: 0_level_0,Count
Rating,Unnamed: 1_level_1
1.0,1118186
2.0,2439073
3.0,6904181
4.0,8085741
5.0,5506583


In [26]:
p.reset_index(inplace=True)# reset_index() will reset the index, the previous index will reset 
#as a column and in place of the index, we will be having 0,1,2....
#inplace=True--- will be updating the values in the same variable

In [27]:
p

Unnamed: 0,Rating,Count
0,1.0,1118186
1,2.0,2439073
2,3.0,6904181
3,4.0,8085741
4,5.0,5506583


## To plot the distribution of the ratings as a bar plot

In [28]:
y=p['Count']
y

0    1118186
1    2439073
2    6904181
3    8085741
4    5506583
Name: Count, dtype: int64

In [29]:
dataset.head()

Unnamed: 0,Cust_Id,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [30]:
#cust_id    rating    movie_id
#1	1488844	3.0     1
#2	822109	5.0     1
#3	885013	4.0     1
#4	30878	4.0     1

#638248  4.0       2

# To create a numpy array containing movie ids corresponding to the rows in the 'ratings' dataset

In [31]:
# To flag the rows whose Rating is 'nan' (True marks a "<movie_id>:" marker row)
df_nan = pd.DataFrame(pd.isnull(dataset.Rating) )
df_nan.head()

Unnamed: 0,Rating
0,True
1,False
2,False
3,False
4,False


In [32]:
# To store the index of all the rows containing 'nan' values
df_nan = df_nan[df_nan['Rating'] == True]
df_nan.shape

(4499, 1)

In [33]:
df_nan

Unnamed: 0,Rating
0,True
548,True
694,True
2707,True
2850,True
...,...
24046714,True
24047329,True
24056849,True
24057564,True


In [34]:
# To reset the index of the dataframe
df_nan=df_nan.reset_index()
df_nan.head()

Unnamed: 0,index,Rating
0,0,True
1,548,True
2,694,True
3,2707,True
4,2850,True


In [35]:
#cust_id    rating    movie_id
#1488844	3.0     1
#2	822109	5.0     1
#3	885013	4.0     1
#4	30878	4.0     1

#638248  4       2

In [36]:
df_nan.head(5)

Unnamed: 0,index,Rating
0,0,True
1,548,True
2,694,True
3,2707,True
4,2850,True


In [37]:
df_nan.shape# 0-4498 #

(4499, 2)

In [38]:
df_nan['index'][:-1]#will extract all the records from the index column except for the last index-- 4498

0              0
1            548
2            694
3           2707
4           2850
          ...   
4493    24046583
4494    24046714
4495    24047329
4496    24056849
4497    24057564
Name: index, Length: 4498, dtype: int64

In [39]:
df_nan['index'][1:]#this sytax will extract records from the index column from the 1st index

1            548
2            694
3           2707
4           2850
5           3991
          ...   
4494    24046714
4495    24047329
4496    24056849
4497    24057564
4498    24057834
Name: index, Length: 4498, dtype: int64

In [40]:
np.full((2,4), 'python')

array([['python', 'python', 'python', 'python'],
       ['python', 'python', 'python', 'python']], dtype='<U6')

In [41]:
dataset.head()

Unnamed: 0,Cust_Id,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [42]:
dataset.shape

(24058263, 2)

In [43]:
# Create a numpy array holding the movie id of every rating row in `dataset`
# (the "<movie_id>:" marker rows themselves are not represented).

# Row positions of the marker rows, in file order.
marker_rows = df_nan['index'].to_numpy()

# A movie's ratings sit strictly between its own marker row and the next marker row
# (or the end of the file for the last movie), hence the "- 1".
block_ends = np.append(marker_rows[1:], len(dataset))
ratings_per_movie = block_ends - marker_rows - 1

# Movie ids are numbered consecutively from 1 in marker order.
# np.repeat builds the whole array in one pass; calling np.append inside a loop
# re-copies the growing array on every iteration.
# dtype=float keeps the float64 array that appending to an empty list produced before;
# the ids are cast to int when they are attached to the dataframe.
movie_ids = np.arange(1, len(marker_rows) + 1, dtype=float)
movie_np = np.repeat(movie_ids, ratings_per_movie)

print(f'Movie numpy: {movie_np}')
print(f'Length: {len(movie_np)}')

Movie numpy: [1.000e+00 1.000e+00 1.000e+00 ... 4.499e+03 4.499e+03 4.499e+03]
Length: 24053764


In [178]:
print('monvie numpy:', movie_np)

monvie numpy: [1.000e+00 1.000e+00 1.000e+00 ... 4.499e+03 4.499e+03 4.499e+03]


In [172]:
l=[3,4,5,3,2]

In [173]:
l[-1]

2

In [170]:
len(dataset)

24058263

In [171]:
df_nan.iloc[-1, 0]

24057834

In [176]:
np.full((1,len(dataset) - df_nan.iloc[-1, 0] - 1),4499)

array([[4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
        4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
        4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
        4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
        4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
        4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
        4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
        4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
        4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
        4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
        4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
        4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
        4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499, 4499,
        4499, 4499, 4499, 4499, 4499, 

In [169]:
df_nan.head()

Unnamed: 0,index,Rating
0,0,True
1,548,True
2,694,True
3,2707,True
4,2850,True


In [167]:
694-548-1

145

In [None]:
#for the 1st loop, i=548, j=0
#np.full((1,i-j-1), movie_id)-- np.full((1,547),1)

#for the 2nd loop, i=694, j=548
#np.full((1,145),2)

In [168]:
np.full((1,145),2)

array([[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]])

In [163]:
r=zip(df_nan['index'][1:],df_nan['index'][:-1])

In [164]:
r

<zip at 0x27e0dc4d140>

In [165]:
tuple(r)

((548, 0),
 (694, 548),
 (2707, 694),
 (2850, 2707),
 (3991, 2850),
 (5011, 3991),
 (5105, 5011),
 (20016, 5105),
 (20112, 20016),
 (20362, 20112),
 (20561, 20362),
 (21108, 20561),
 (21234, 21108),
 (21353, 21234),
 (21644, 21353),
 (24344, 21644),
 (31453, 24344),
 (42176, 31453),
 (42716, 42176),
 (42833, 42716),
 (43052, 42833),
 (43256, 43052),
 (43872, 43256),
 (45206, 43872),
 (46414, 45206),
 (52276, 46414),
 (52550, 52276),
 (92303, 52550),
 (92827, 92303),
 (211241, 92827),
 (211463, 211241),
 (213318, 211463),
 (220209, 213318),
 (220318, 220209),
 (221158, 220318),
 (222098, 221158),
 (222380, 222098),
 (223183, 222380),
 (223936, 223183),
 (224428, 223936),
 (224522, 224428),
 (224651, 224522),
 (224757, 224651),
 (233259, 224757),
 (235879, 233259),
 (242438, 235879),
 (244789, 242438),
 (248381, 244789),
 (248613, 248381),
 (248942, 248613),
 (249033, 248942),
 (254181, 249033),
 (254296, 254181),
 (254456, 254296),
 (256863, 254456),
 (258697, 256863),
 (262260, 258697)

In [44]:
len(dataset)

24058263

In [43]:
df_nan.iloc[-1, 0] - 1

24057833

In [42]:
movie_id = 1
np.full((1,len(dataset) - df_nan.iloc[-1, 0] - 1),movie_id)

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [35]:
df_nan['index'][1:]

1            548
2            694
3           2707
4           2850
5           3991
          ...   
4494    24046714
4495    24047329
4496    24056849
4497    24057564
4498    24057834
Name: index, Length: 4498, dtype: int64

In [36]:
df_nan['index'][:-1]

0              0
1            548
2            694
3           2707
4           2850
          ...   
4493    24046583
4494    24046714
4495    24047329
4496    24056849
4497    24057564
Name: index, Length: 4498, dtype: int64

In [33]:
x =zip(df_nan['index'][1:],df_nan['index'][:-1])
x

<zip at 0x27e00ec5d80>

In [34]:
tuple(x)

((548, 0),
 (694, 548),
 (2707, 694),
 (2850, 2707),
 (3991, 2850),
 (5011, 3991),
 (5105, 5011),
 (20016, 5105),
 (20112, 20016),
 (20362, 20112),
 (20561, 20362),
 (21108, 20561),
 (21234, 21108),
 (21353, 21234),
 (21644, 21353),
 (24344, 21644),
 (31453, 24344),
 (42176, 31453),
 (42716, 42176),
 (42833, 42716),
 (43052, 42833),
 (43256, 43052),
 (43872, 43256),
 (45206, 43872),
 (46414, 45206),
 (52276, 46414),
 (52550, 52276),
 (92303, 52550),
 (92827, 92303),
 (211241, 92827),
 (211463, 211241),
 (213318, 211463),
 (220209, 213318),
 (220318, 220209),
 (221158, 220318),
 (222098, 221158),
 (222380, 222098),
 (223183, 222380),
 (223936, 223183),
 (224428, 223936),
 (224522, 224428),
 (224651, 224522),
 (224757, 224651),
 (233259, 224757),
 (235879, 233259),
 (242438, 235879),
 (244789, 242438),
 (248381, 244789),
 (248613, 248381),
 (248942, 248613),
 (249033, 248942),
 (254181, 249033),
 (254296, 254181),
 (254456, 254296),
 (256863, 254456),
 (258697, 256863),
 (262260, 258697)

In [112]:
694-548-1

145

In [None]:
1,(i-j-1)
1, (694-548-1)
1,145

In [113]:
temp = np.full((1,145),2 )

In [114]:
print(temp)

[[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
  2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
  2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
  2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
  2]]


In [180]:
dataset.head()

Unnamed: 0,Cust_Id,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [182]:
dataset[pd.notnull(dataset['Rating'])]

Unnamed: 0,Cust_Id,Rating
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0
5,823519,3.0
...,...,...
24058258,2591364,2.0
24058259,1791000,2.0
24058260,512536,5.0
24058261,988963,3.0


In [44]:
# Append the movie-id array to the dataset after removing the 'nan' (marker) rows.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise SettingWithCopyWarning.
dataset = dataset[pd.notnull(dataset['Rating'])].copy()

# movie_np must line up one-to-one with the remaining rating rows.
assert len(movie_np) == len(dataset), 'movie_np length does not match the number of rating rows'

dataset['Movie_Id'] = movie_np.astype(int)
dataset['Cust_Id'] = dataset['Cust_Id'].astype(int)
print('-Dataset examples-')
dataset.head()

-Dataset examples-


Unnamed: 0,Cust_Id,Rating,Movie_Id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1


In [45]:
dataset.head(20)

Unnamed: 0,Cust_Id,Rating,Movie_Id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1
6,893988,3.0,1
7,124105,4.0,1
8,1248029,3.0,1
9,1842128,4.0,1
10,2238063,3.0,1


In [46]:
dataset.shape

(24053764, 3)

# Data Cleaning

In [28]:
f = ['count','mean']

In [48]:
dataset.groupby('Movie_Id').agg(f)

Unnamed: 0_level_0,Cust_Id,Cust_Id,Rating,Rating
Unnamed: 0_level_1,count,mean,count,mean
Movie_Id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,547,1.249876e+06,547,3.749543
2,145,1.341926e+06,145,3.558621
3,2012,1.324290e+06,2012,3.641153
4,142,1.402854e+06,142,2.739437
5,1140,1.302384e+06,1140,3.919298
...,...,...,...,...
4495,614,1.321093e+06,614,3.478827
4496,9519,1.332006e+06,9519,3.763000
4497,714,1.330543e+06,714,2.715686
4498,269,1.262035e+06,269,2.464684


In [29]:
f

['count', 'mean']

In [49]:
dataset.groupby('Movie_Id')['Rating'].agg(f)

Unnamed: 0_level_0,count,mean
Movie_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,547,3.749543
2,145,3.558621
3,2012,3.641153
4,142,2.739437
5,1140,3.919298
...,...,...
4495,614,3.478827
4496,9519,3.763000
4497,714,2.715686
4498,269,2.464684


In [50]:
# Create the list of rarely rated movies to drop. The cut-off is the 0.75 quantile of
# the per-movie rating counts, so only movies rated more often than the cut-off
# (roughly the top 25%) are kept.
dataset_movie_summary = dataset.groupby('Movie_Id')['Rating'].agg(f)  # f = ['count', 'mean']

dataset_movie_summary.index = dataset_movie_summary.index.map(int)

movie_benchmark = round(dataset_movie_summary['count'].quantile(0.75),2)

# NOTE(review): a movie sitting exactly on the benchmark is dropped here (<=), while the
# customer filter keeps customers sitting exactly on theirs (<) - confirm which is intended.
drop_movie_list = dataset_movie_summary[dataset_movie_summary['count'] <= movie_benchmark].index

print(f'Movie minimum times of review: {movie_benchmark}')

Movie minimum times of review: 2538.0


In [None]:
2538.0

In [51]:
# Calculate the threshold for customers: per-customer rating count and mean, with the
# 0.75 quantile of the counts as the cut-off. Customers below it are listed for removal.
dataset_cust_summary = dataset.groupby('Cust_Id')['Rating'].agg(f)  # f = ['count', 'mean']
dataset_cust_summary.index = dataset_cust_summary.index.map(int)

ratings_per_cust = dataset_cust_summary['count']
cust_benchmark = round(ratings_per_cust.quantile(0.75), 0)
is_infrequent = ratings_per_cust < cust_benchmark
drop_cust_list = dataset_cust_summary[is_infrequent].index

print(f'Customer minimum times of review: {cust_benchmark}')

Customer minimum times of review: 64.0


In [52]:
print(f'Original Shape: {dataset.shape}')


Original Shape: (24053764, 3)


In [53]:
drop_movie_list

Int64Index([   1,    2,    3,    4,    5,    6,    7,    9,   10,   11,
            ...
            4484, 4486, 4487, 4489, 4491, 4494, 4495, 4497, 4498, 4499],
           dtype='int64', name='Movie_Id', length=3374)

In [54]:
drop_cust_list

Int64Index([      8,      10,      25,      33,      42,      59,      83,
                 87,      94,     116,
            ...
            2649351, 2649375, 2649376, 2649379, 2649384, 2649401, 2649404,
            2649409, 2649421, 2649429],
           dtype='int64', name='Cust_Id', length=351840)

In [30]:
dataset.head()

Unnamed: 0,Cust_Id,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [None]:
#isna(), isin()

In [31]:
print('The value of a is:', 34)

The value of a is: 34


In [32]:
print('The value of a is {}'.format(60))

The value of a is 60


In [55]:
# Keep a row only if neither its movie nor its customer is on a drop list.
# isin() is True for listed ids; "~" inverts the mask so those rows are excluded.
keep_movie = ~dataset['Movie_Id'].isin(drop_movie_list)
keep_cust = ~dataset['Cust_Id'].isin(drop_cust_list)
dataset = dataset[keep_movie & keep_cust]
print(f'After Trim Shape: {dataset.shape}')

After Trim Shape: (15622656, 3)


In [56]:
print('-Data Examples-')
dataset.head()

-Data Examples-


Unnamed: 0,Cust_Id,Rating,Movie_Id
5106,824097,2.0,8
5109,785314,1.0,8
5110,243963,3.0,8
5112,1447783,4.0,8
5116,1912665,1.0,8


# Create ratings matrix for 'ratings' matrix with Rows = userId, Columns = movieId

In [None]:
# Sparse matrix -- most customer/movie cells have no rating (NaN)

In [57]:
# Ratings matrix: one row per customer, one column per movie, Rating as the cell value.
# Customer/movie pairs with no rating show up as NaN.
df_p = dataset.pivot_table(index='Cust_Id', columns='Movie_Id', values='Rating')
print(df_p.shape)

(118918, 1125)


In [58]:
df_p.head()

Movie_Id,8,16,17,18,26,28,30,33,44,45,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
Cust_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,3.0,,,,...,3.0,,,,,,,,,
7,5.0,,,,,4.0,5.0,,,,...,3.0,,,5.0,,,,,,
79,,,,,,,3.0,,,,...,4.0,,,,,,4.0,,,
97,,,,,,,,,,,...,,,,,,,,,,
134,,,,,,5.0,,,,,...,,,,,,,,,,


### To load the movie_titles dataset

In [None]:
#df_title['Year']=df_title['Year'].astype('int')

In [None]:
#utf-8' codec can't decode byte 0xe9 in position 207672: invalid continuation byte
#encoding="ISO-8859-1"
#

In [26]:
# Load the movie titles, indexed by Movie_Id.
# The file is not valid UTF-8 (decoding fails on byte 0xe9), so it is read as ISO-8859-1.
# The path is relative to the notebook's working directory, like 'combined_data_1.txt',
# rather than an absolute path that only exists on one machine.
# pandas is already imported as `pd` in the first cell.
df_title = pd.read_csv(
    'movie_titles.csv',
    encoding='ISO-8859-1',
    header=None,
    names=['Movie_Id', 'Year', 'Name'],
).set_index('Movie_Id')

print(df_title.head(10))

            Year                          Name
Movie_Id                                      
1         2003.0               Dinosaur Planet
2         2004.0    Isle of Man TT 2004 Review
3         1997.0                     Character
4         1994.0  Paula Abdul's Get Up & Dance
5         2004.0      The Rise and Fall of ECW
6         1997.0                          Sick
7         1992.0                         8 Man
8         2004.0    What the #$*! Do We Know!?
9         1991.0      Class of Nuke 'Em High 2
10        2001.0                       Fighter


In [None]:
#df_title=df_title.loc[:,['Year','Name']]

# To install the scikit-surprise library for implementing SVD

### Run the following command in the Anaconda Prompt to install surprise package

In [None]:
#!pip install scikit-surprise-- works in colab

In [None]:
#conda install -c conda-forge scikit-surprise-- anaconda prompt
#conda update -n base -c defaults conda-- update your conda in anaconda prompt
#!pip install scikit-surprise config --global http.sslVerify false -- for jupyter users--  


In [None]:
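# Overfitting means the model fits the training data much better than unseen data.
# Example: train the model on the train dataset, then evaluate it on both sets:
#   accuracy on the test dataset  == 70%
#   accuracy on the train dataset == 100%
# The large gap between the two indicates overfitting.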

In [60]:
# Import required libraries
import math
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate#k-fold 

In [None]:
# MAE -- mean absolute error
# the mean of |prediction - actual| over all test ratings

#0.56
#1.7


In [61]:
# Seed NumPy's global RNG so the fold split and the SVD initialisation are repeatable
# between runs (the approach described in the Surprise FAQ on reproducible experiments).
# NOTE(review): the FAQ also seeds Python's `random` module - add that if results still vary.
SEED = 42
np.random.seed(SEED)

# Reader with its default settings
reader = Reader()

# Use only the first 100K rows for faster run time.
# NOTE(review): the rows are in file order (grouped by movie), so this slice covers only
# the first few movies - consider a random sample for a more representative estimate.
data = Dataset.load_from_df(dataset[['Cust_Id', 'Movie_Id', 'Rating']][:100000], reader)

# Use the SVD algorithm.
svd = SVD()

# Compute the RMSE and MAE of the SVD algorithm with 3-fold cross-validation
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9902  0.9906  0.9949  0.9919  0.0021  
MAE (testset)     0.7924  0.7862  0.8042  0.7943  0.0075  
Fit time          6.10    6.13    5.96    6.07    0.08    
Test time         0.41    0.41    0.37    0.40    0.02    


{'test_rmse': array([0.99021928, 0.99055512, 0.99485526]),
 'test_mae': array([0.79243484, 0.78619932, 0.80421303]),
 'fit_time': (6.10461163520813, 6.133504629135132, 5.958886384963989),
 'test_time': (0.41113877296447754, 0.408764123916626, 0.37480711936950684)}

In [62]:
data

<surprise.dataset.DatasetAutoFolds at 0x21b204f5fd0>

In [66]:
dataset.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id
5106,824097,2.0,8
5109,785314,1.0,8
5110,243963,3.0,8
5112,1447783,4.0,8
5116,1912665,1.0,8


In [33]:
df_title

Unnamed: 0_level_0,Year,Name
Movie_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW
...,...,...
17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17767,2004.0,Fidel Castro: American Experience
17768,2000.0,Epoch
17769,2003.0,The Company


In [None]:
'''
  movie id  Cust_Id	Rating Name
    90	824097	5- movie_name1
    8	785314	5- movie_name 2
    56	243963	5
    100	1447783	5
    290	1912665	5
'''

## To find all the movies rated as 5 stars by user with userId = 712664

In [67]:
# Names of the movies customer 712664 rated 5 stars: filter the ratings, index them by
# Movie_Id and look the titles up in df_title (which is indexed by Movie_Id as well).
is_user = dataset['Cust_Id'] == 712664
is_five_star = dataset['Rating'] == 5
dataset_712664 = (
    dataset[is_user & is_five_star]
    .set_index('Movie_Id')
    .join(df_title)['Name']
)
dataset_712664.head(10)

Movie_Id
79                           The Killing
175                       Reservoir Dogs
199                      The Deer Hunter
241                   North by Northwest
256    Ghost Dog: The Way of the Samurai
348        The Last Temptation of Christ
357                House of Sand and Fog
416                             Elephant
442                  Mississippi Burning
457                    Kill Bill: Vol. 2
Name: Name, dtype: object

In [None]:
user_712664=df_title

In [35]:
df_title

Unnamed: 0_level_0,Year,Name
Movie_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW
...,...,...
17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17767,2004.0,Fidel Castro: American Experience
17768,2000.0,Epoch
17769,2003.0,The Company


# Train an SVD to predict ratings for user with userId = 712664

In [69]:
# Customer to generate recommendations for.
USER_ID = 712664

# Start from an independent copy of the titles table (DataFrame.copy() is a deep copy by
# default, so df_title is left untouched) with Movie_Id back as a regular column.
user_712664 = df_title.copy().reset_index()

# Keep only movies the model is actually trained on. `dataset` has already had the rarely
# rated movies removed, and df_title also lists titles that do not occur in `dataset` at
# all; the model has no ratings to rank those by.
# .copy() so the Estimate_Score assignment below does not raise SettingWithCopyWarning.
trained_movie_ids = dataset['Movie_Id'].unique()
user_712664 = user_712664[user_712664['Movie_Id'].isin(trained_movie_ids)].copy()

# Fit on the full (trimmed) dataset instead of the 100K-row slice used for cross-validation.
data = Dataset.load_from_df(dataset[['Cust_Id', 'Movie_Id', 'Rating']], reader)
trainset = data.build_full_trainset()
svd.fit(trainset)

# Predict the rating USER_ID would give each candidate movie.
# NOTE(review): movies the user has already rated are not excluded, so titles they rated
# 5 stars (e.g. Reservoir Dogs) can reappear as recommendations - confirm this is wanted.
user_712664['Estimate_Score'] = user_712664['Movie_Id'].apply(lambda x: svd.predict(USER_ID, x).est)

# Drop the id column and sort the predicted ratings in descending order.
user_712664 = (
    user_712664
    .drop('Movie_Id', axis=1)
    .sort_values('Estimate_Score', ascending=False)
)

# Print top 10 recommendations
print(user_712664.head(10))

        Year                                   Name  Estimate_Score
995   1961.0                                Yojimbo        5.000000
871   1954.0                          Seven Samurai        5.000000
174   1992.0                         Reservoir Dogs        5.000000
3289  1974.0                          The Godfather        5.000000
560   1963.0                           High and Low        4.993983
2101  1994.0                 The Simpsons: Season 6        4.990440
721   2003.0                     The Wire: Season 1        4.921276
222   2003.0             Chappelle's Show: Season 1        4.919926
1707  1936.0                           Modern Times        4.913572
3443  2004.0  Family Guy: Freakin' Sweet Collection        4.908444
