

# Netflix Recommendation Engine


## Importing Necessary Libraries

In [1]:
!pip install scikit-surprise --quiet

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Mounting Google Drive for dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Dataframe of the dataset

In [4]:
netflix_dataset = pd.read_csv('/content/drive/MyDrive/Netflix/Copy of combined_data_1.txt.zip', header=None,names=['Cust_Id','Ratings'],usecols=[0,1])

## Exploring Data

In [5]:
netflix_dataset.head(5)

Unnamed: 0,Cust_Id,Ratings
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [6]:
netflix_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24058263 entries, 0 to 24058262
Data columns (total 2 columns):
 #   Column   Dtype  
---  ------   -----  
 0   Cust_Id  object 
 1   Ratings  float64
dtypes: float64(1), object(1)
memory usage: 367.1+ MB


## Structuring the Data

### total count of movies

- using  is null because the movie ID does not have any Ratings column value

In [7]:
movie_count = netflix_dataset.isnull().sum()

In [8]:
movie_count = movie_count['Ratings']

In [9]:
movie_count

4499

### totl unique values of Cust_Id column

In [10]:
total_count = netflix_dataset['Cust_Id'].nunique()

In [11]:
total_count

475257

In [12]:
customer_count=total_count-movie_count

In [13]:
customer_count

470758

### value counts of Rating Column

In [14]:
review_count = netflix_dataset['Ratings'].value_counts()

In [15]:
review_count

Unnamed: 0_level_0,count
Ratings,Unnamed: 1_level_1
4.0,8085741
3.0,6904181
5.0,5506583
2.0,2439073
1.0,1118186


In [16]:
review_count = netflix_dataset['Ratings'].value_counts().sum()

In [17]:
review_count

24053764

### to make a new column for movie_ids,

- creating new array of movie id of each rows by replacing

In [18]:
movie_np = []

for customer in netflix_dataset['Cust_Id']:
  if ":" in customer:
    movie_id = int(customer.replace(":",""))
  movie_np.append(movie_id)

In [19]:
movie_np

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


### adding new column to the dataframe

In [20]:
netflix_dataset['Movie_Id']=movie_np

In [21]:
netflix_dataset

Unnamed: 0,Cust_Id,Ratings,Movie_Id
0,1:,,1
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
...,...,...,...
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499


### remove the movie ID at the Cust_Id column


In [22]:
netflix_dataset.dropna(inplace=True)

In [23]:
netflix_dataset

Unnamed: 0,Cust_Id,Ratings,Movie_Id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1
...,...,...,...
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499


In [24]:
netflix_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24053764 entries, 1 to 24058262
Data columns (total 3 columns):
 #   Column    Dtype  
---  ------    -----  
 0   Cust_Id   object 
 1   Ratings   float64
 2   Movie_Id  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 734.1+ MB


In [25]:
netflix_dataset['Cust_Id'] = netflix_dataset['Cust_Id'].astype(int)

In [26]:
netflix_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24053764 entries, 1 to 24058262
Data columns (total 3 columns):
 #   Column    Dtype  
---  ------    -----  
 0   Cust_Id   int64  
 1   Ratings   float64
 2   Movie_Id  int64  
dtypes: float64(1), int64(2)
memory usage: 734.1 MB


In [27]:
movie_re_count=netflix_dataset['Movie_Id'].value_counts()

In [28]:
movie_re_count

Unnamed: 0_level_0,count
Movie_Id,Unnamed: 1_level_1
1905,193941
2152,162597
3860,160454
4432,156183
571,154832
...,...
4294,44
915,43
3656,42
4338,39


In [29]:
bench_mark=round(movie_re_count.quantile(0.6),0)

In [30]:
drop_movie_index = movie_re_count[movie_re_count<bench_mark].index

In [31]:
drop_movie_index

Index([1598, 1733, 1647, 4099, 1616, 1446,  263, 4259,  160, 1988,
       ...
       1858, 4035, 3693, 2805,  820, 4294,  915, 3656, 4338, 4362],
      dtype='int64', name='Movie_Id', length=2699)

In [32]:
len(drop_movie_index)

2699

In [33]:
cust_rew_count=netflix_dataset['Cust_Id'].value_counts()

In [34]:
cust_rew_count

Unnamed: 0_level_0,count
Cust_Id,Unnamed: 1_level_1
305344,4467
387418,4422
2439493,4195
1664010,4019
2118461,3769
...,...
1300341,1
2550360,1
11848,1
930788,1


In [35]:
bench_mark_cus=round(cust_rew_count.quantile(0.6),0)

In [36]:
bench_mark_cus

36.0

In [37]:
drop_cust_index=cust_rew_count[cust_rew_count<bench_mark_cus].index

In [38]:
drop_cust_index

Index([2194851,  600295, 1739398, 1157368,  532108, 2157249,  256134,  640441,
       1272324, 1346990,
       ...
       1969065,  899932,  611596, 2147176,  811650, 1300341, 2550360,   11848,
        930788,  594210],
      dtype='int64', name='Cust_Id', length=282042)

In [39]:
drop_movie_index,drop_cust_index

(Index([1598, 1733, 1647, 4099, 1616, 1446,  263, 4259,  160, 1988,
        ...
        1858, 4035, 3693, 2805,  820, 4294,  915, 3656, 4338, 4362],
       dtype='int64', name='Movie_Id', length=2699),
 Index([2194851,  600295, 1739398, 1157368,  532108, 2157249,  256134,  640441,
        1272324, 1346990,
        ...
        1969065,  899932,  611596, 2147176,  811650, 1300341, 2550360,   11848,
         930788,  594210],
       dtype='int64', name='Cust_Id', length=282042))

In [40]:
netflix_dataset=netflix_dataset[~netflix_dataset['Movie_Id'].isin(drop_movie_index)]
netflix_dataset=netflix_dataset[~netflix_dataset['Cust_Id'].isin(drop_cust_index)]

In [41]:
netflix_dataset.shape

(19695836, 3)

In [42]:
netflix_dataset

Unnamed: 0,Cust_Id,Ratings,Movie_Id
696,712664,5.0,3
697,1331154,4.0,3
698,2632461,3.0,3
699,44937,5.0,3
700,656399,4.0,3
...,...,...,...
24056842,1055714,5.0,4496
24056843,2643029,4.0,4496
24056844,267802,4.0,4496
24056845,1559566,3.0,4496


In [43]:
movie_title=pd.read_csv('/content/drive/MyDrive/Netflix/Copy of movie_titles.csv',encoding='ISO-8859-1',header=None,names=['Movie_Id','Year','Name'],usecols=[0,1,2])

In [44]:
movie_title

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [45]:
!pip install numpy==1.23.5 --quiet

In [46]:
!pip install scikit-surprise --quiet

In [47]:
from surprise import Reader, Dataset, SVD # Now import the necessary modules


In [48]:
from surprise.model_selection import cross_validate

In [49]:
reader=Reader()

In [50]:
data=Dataset.load_from_df(netflix_dataset[['Movie_Id','Cust_Id','Ratings']][:100000],reader)

In [51]:
data

<surprise.dataset.DatasetAutoFolds at 0x7e208a0872d0>

In [52]:
model=SVD()

In [53]:
cross_validate(model,data,measures=['RMSE'],cv=3)

{'test_rmse': array([1.0177564 , 1.01765568, 1.02173265]),
 'fit_time': (1.602170705795288, 1.6650099754333496, 3.8817574977874756),
 'test_time': (0.2205522060394287, 0.40613627433776855, 1.6507842540740967)}

In [54]:
user_rating=netflix_dataset[netflix_dataset['Cust_Id']==1331154]

In [55]:
user_rating

Unnamed: 0,Cust_Id,Ratings,Movie_Id
697,1331154,4.0,3
5178,1331154,4.0,8
31460,1331154,3.0,18
92840,1331154,4.0,30
224761,1331154,3.0,44
...,...,...,...
23439584,1331154,4.0,4389
23546489,1331154,2.0,4402
23649431,1331154,4.0,4432
23844441,1331154,3.0,4472


In [56]:
user_1331154=movie_title.copy()
user_1331154

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [57]:
user_1331154=user_1331154[~user_1331154['Movie_Id'].isin(drop_movie_index)]
user_1331154

Unnamed: 0,Movie_Id,Year,Name
2,3,1997.0,Character
4,5,2004.0,The Rise and Fall of ECW
5,6,1997.0,Sick
7,8,2004.0,What the #$*! Do We Know!?
15,16,1996.0,Screamers
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [58]:
est=[]
for x in user_1331154['Movie_Id']:
  temp=model.predict(1331154,x).est
  est.append(temp)

user_1331154['Estimated']=est

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_1331154['Estimated']=est


In [59]:
user_1331154

Unnamed: 0,Movie_Id,Year,Name,Estimated
2,3,1997.0,Character,3.585342
4,5,2004.0,The Rise and Fall of ECW,3.585342
5,6,1997.0,Sick,3.585342
7,8,2004.0,What the #$*! Do We Know!?,3.585342
15,16,1996.0,Screamers,3.585342
...,...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...,3.585342
17766,17767,2004.0,Fidel Castro: American Experience,3.585342
17767,17768,2000.0,Epoch,3.585342
17768,17769,2003.0,The Company,3.585342


In [60]:
user_1331154=user_1331154.sort_values('Estimated',ascending=False)

In [61]:
user_1331154.head()

Unnamed: 0,Movie_Id,Year,Name,Estimated
13874,13875,1982.0,Gilbert and Sullivan: The Mikado,3.908033
17450,17451,2000.0,Along for the Ride,3.869362
13510,13511,1993.0,Much Ado About Nothing,3.85346
15673,15674,1999.0,Arlington Road,3.80771
14034,14035,1993.0,Italian Movie,3.800759
