SVD (Singular Value Decompostiton):  is a method of decomposing a matrix into 3 other matrices.

#### Import libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
# ratings file

ratings = pd.read_csv('D:/Data/new-movie-dataset/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# movies file

movies = pd.read_csv('D:/Data/new-movie-dataset/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# unique users 

n_users = ratings.userId.unique().shape[0]
n_users

610

In [5]:
# unique movies

n_movies = ratings.movieId.unique().shape[0]
n_movies

9724

In [6]:
print(f'Number of users {n_users} and Number of movies {n_movies}')

Number of users 610 and Number of movies 9724


In [7]:
# SVD takes matrix so we convert into matrix
Ratings = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# conda install -c conda-forge scikit-surprise

In [12]:
from surprise import Reader, Dataset, SVD 
from surprise.model_selection import cross_validate

# Initialize reader library
reader = Reader()   

# load ratings dataset with Dataset library
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# SVD algorithm
svd = SVD()

In [13]:
# compute the efficiency of SVD algorithm - RMSE(root mean square error) and MAE(Mean Absolute error)

cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)    # verbose=True - prints the results with process

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8752  0.8797  0.8828  0.8792  0.0031  
MAE (testset)     0.6723  0.6758  0.6793  0.6758  0.0029  
Fit time          8.59    8.25    8.33    8.39    0.15    
Test time         0.59    0.55    0.58    0.57    0.02    


{'test_rmse': array([0.87524852, 0.87968765, 0.88275449]),
 'test_mae': array([0.67226869, 0.67576402, 0.67930959]),
 'fit_time': (8.590767860412598, 8.251573324203491, 8.328555345535278),
 'test_time': (0.5937812328338623, 0.5469067096710205, 0.5781586170196533)}

In [17]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [18]:
# Now we take userId and rating like 5, given for movies by user 1

ratings_1 = ratings[(ratings['userId']==1) & (ratings['rating']==5)]
ratings_1 = ratings_1.set_index('movieId')   # sets index 'movieId'
ratings_1 = ratings_1.join(movies)['title']  # joins the movie df 'title'
ratings_1.head()

movieId
47          Mighty Aphrodite (1995)
50                   Georgia (1995)
101    Anne Frank Remembered (1995)
151                 Mad Love (1995)
157              Nine Months (1995)
Name: title, dtype: object

In [19]:
# Copying the movies data into user_1 using copy() method, it will not affetc the original data if we make some changes

user_1 = movies.copy()
user_1 = user_1.reset_index()

In [20]:
# get the full dataset and create training set, then fit using svd

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)    # dataset

trainset = data.build_full_trainset()    # training set

# fit the train dataset
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1c91590ba8>

In [21]:
# estimate the score of ratings by particular user for different movies
# lamda takes movieId for movies and predict the estimated ratings score

user_1['Estimate_Score'] = user_1['movieId'].apply(lambda x : svd.predict(1,x).est)

# Now we not need these columns only, needed title and esimate score
user_1 = user_1.drop(['movieId', 'genres', 'index'], axis=1)

user_1 = user_1.sort_values('Estimate_Score', ascending=False)
print(user_1.head(10))

                                                  title  Estimate_Score
946                                Graduate, The (1967)             5.0
922                      Godfather: Part II, The (1974)             5.0
975                               Cool Hand Luke (1967)             5.0
596          Ghost in the Shell (Kôkaku kidôtai) (1995)             5.0
314                                 Forrest Gump (1994)             5.0
46                           Usual Suspects, The (1995)             5.0
2743  For a Few Dollars More (Per qualche dollaro in...             5.0
602   Dr. Strangelove or: How I Learned to Stop Worr...             5.0
1701                                  Thing, The (1982)             5.0
659                               Godfather, The (1972)             5.0
