In [2]:
# netflix movie recommender system using SVD(singluar value decomposition)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

# scikit learn's 'surprise' package is used for recommendation engines.

# installation:

# With pip (you'll need numpy, and a C compiler. Windows users might prefer using conda):
# $ pip install numpy
# $ pip install scikit-surprise

# With conda:
# $ conda install -c conda-forge scikit-surprise

# THIS IS THE BEST METHOD:
# For the latest version, you can also clone the repo and build the source (you'll first need Cython and numpy):
# $ pip install numpy cython
# $ git clone https://github.com/NicolasHug/surprise.git
# $ cd surprise
# $ python setup.py install
from surprise import SVD, Reader, Dataset
from surprise.model_selection import cross_validate

In [4]:
# importing the dataset

df = pd.read_csv('combined_data_1.txt', header = None, names = ['user_id', 'rating'], usecols = [0, 1])
df.head()

Unnamed: 0,user_id,rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


in the dataset, user_id column has '1:' which represents movie number and below that is the 'user_id' who has rated that movie. you can notice that the 'rating' against the 'user_id' is 'NaN'.  

In [5]:
df.shape

(24058263, 2)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24058263 entries, 0 to 24058262
Data columns (total 2 columns):
 #   Column   Dtype  
---  ------   -----  
 0   user_id  object 
 1   rating   float64
dtypes: float64(1), object(1)
memory usage: 367.1+ MB


In [7]:
df.isnull().sum()

user_id       0
rating     4499
dtype: int64

In [8]:
# calculating total number of ratings in each category(1,2 , 3, 4, 5)

ratings = df['rating'].value_counts()
ratings = pd.DataFrame(ratings)
ratings.columns = ['total_ratings']
ratings.sort_values(by = 'rating', ascending = False)

Unnamed: 0_level_0,total_ratings
rating,Unnamed: 1_level_1
5.0,5506583
4.0,8085741
3.0,6904181
2.0,2439073
1.0,1118186


In [9]:
# calculating total number of movies
# earlier we noticed that the value of 'rating' against the movies in df is null. so we can just calculate the total null values in 'rating' column which will give us the total number of movies.
total_movies = df['rating'].isnull().sum()
total_movies

4499

In [10]:
# total number of customers will be {unique_user_id - total_movies} as the 'user_id' column contains both 'movie_id' as well as 'user_id' 

total_users = df['user_id'].nunique() - total_movies
total_users

470758

In [11]:
# adding 'movie_id' column to the dataset which will contain the movie numbers.
df_nan = df['rating'].isnull()
df_nan = pd.DataFrame(df_nan)
df_nan = df_nan[df_nan.rating == True]
df_nan = df_nan.reset_index()
df_nan

Unnamed: 0,index,rating
0,0,True
1,548,True
2,694,True
3,2707,True
4,2850,True
...,...,...
4494,24046714,True
4495,24047329,True
4496,24056849,True
4497,24057564,True


In [12]:
df_nan['index']

0              0
1            548
2            694
3           2707
4           2850
          ...   
4494    24046714
4495    24047329
4496    24056849
4497    24057564
4498    24057834
Name: index, Length: 4499, dtype: int64

In [13]:
index_values = zip(df_nan['index'][:-1], df_nan['index'][1:], [1 for i in range(1, 4499)])
temp_movie_id = []
ini_id = 1
for i, j, k in index_values:
    temp = np.full((1, j-i-k), ini_id)
    # temp_movie_id = np.append(temp_movie_id, temp)
    temp_movie_id.append(temp.tolist()[0])
    ini_id += 1

# for last movie
temp = np.full((1, (len(df) - df_nan.iloc[-1,0] - 1)), ini_id)
temp_movie_id.append(temp.tolist()[0])
temp_movie_id = [item for sublist in temp_movie_id for item in sublist] 

# merging 'temp_movie_id' to 'df' dataframe
df.drop(df_nan['index'], inplace = True) #dropping all the movie_id from user_id column
df['movie'] = temp_movie_id # merging movie_id into df

In [14]:
df.tail()

Unnamed: 0,user_id,rating,movie
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499
24058262,1704416,3.0,4499


In [19]:
# since the data is very large, we will take only ratings of 500 movies(from 1 to 500 movie_id)
df_trimmed = df[df['movie'] < 501]

In [20]:
df_trimmed

Unnamed: 0,user_id,rating,movie
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1
...,...,...,...
2799199,651950,4.0,500
2799200,924510,3.0,500
2799201,965381,3.0,500
2799202,822391,1.0,500


# NOTE: 
    you have to keep in mind that the data passed in SVD() should be in the following format:
            
             user:item:rating

In [26]:
# training SVD model
reader = Reader(rating_scale=(1, 5)) # reader()  helps SVD to read the dataset and tells that the rating is on the scale (1, 5)
data = Dataset.load_from_df(df_trimmed[['user_id', 'movie', 'rating']][:10000], reader) # converting our dataset for cross_validation. we take only first 10000 entries to speedup the execution

In [27]:
svd = SVD()
cross_validate(svd, data, cv = 3, verbose= True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2244  1.2318  1.2362  1.2308  0.0049  
MAE (testset)     1.0053  1.0116  1.0126  1.0098  0.0032  
Fit time          0.24    0.21    0.61    0.35    0.18    
Test time         0.04    0.03    0.07    0.04    0.02    


{'test_rmse': array([1.22438613, 1.23179222, 1.23615091]),
 'test_mae': array([1.00532599, 1.01159357, 1.01261564]),
 'fit_time': (0.23670172691345215, 0.20729899406433105, 0.6095578670501709),
 'test_time': (0.03792619705200195, 0.025056838989257812, 0.0665736198425293)}

In [28]:
# fitting the whole dataset to SVD
data = Dataset.load_from_df(df_trimmed[['user_id', 'movie', 'rating']], reader) # we will use all the data to train the model
trainset = data.build_full_trainset() # transforming the data to fit using SVD
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a77b532f90>

In [47]:
svd.predict(1488844, 1)# predicting rating for user = 1488844, movie = 1
# """
#         svd.predict(1488844, 1).est -> returns predicted rating
#         svd.predict(1488844, 1).uid -> returns user id
#         svd.predict(1488844, 1).iid -> returns movie id
# """

Prediction(uid=1488844, iid=1, r_ui=None, est=3.8835583715155155, details={'was_impossible': False})

In [55]:
# recommending 10 movies for user 1488844
pred = [[x, svd.predict('1488844', x).est] for x in range(1, 501)]
pred = pd.DataFrame(pred, columns=['movie_id', 'predicted_rating'])

# top 10 movie recommendation
pred = pred.sort_values(by = 'predicted_rating',ascending=False)

pred.head(10)

Unnamed: 0,movie_id,predicted_rating
208,209,4.397441
12,13,4.037695
344,345,4.003365
84,85,3.996612
312,313,3.988527
31,32,3.964655
324,325,3.948617
137,138,3.923141
134,135,3.903675
341,342,3.889223


these are the top 10 movies that he is most likely to watch.