# Recommendation System for MoveLens Dataset using SVD

In [1]:
# Import libraries
import numpy as np
import pandas as pd

# To load the 'ratings' and 'movies' dataset after uploading them to Jupyter notebook

In [3]:
# Reading ratings file

ratings = pd.read_csv('ratings.csv', usecols=['userId','movieId','rating','timestamp'])

In [4]:
# Reading movies file

movies = pd.read_csv('movies.csv', usecols=['movieId','title','genres'])

In [5]:
# Print first five rows of movies datset

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Print first five rows of ratings datset

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2.0,3.5,1112486000.0
1,1,29.0,3.5,1112485000.0
2,1,32.0,3.5,1112485000.0
3,1,47.0,3.5,1112485000.0
4,1,50.0,3.5,1112485000.0


# To find the unique number of users and movies in the 'ratings' dataset

In [7]:

n_users = ratings.userId.unique().shape[0]

n_movies = ratings.movieId.unique().shape[0]

print(f'Number of users = {n_users} and Number of movies = {n_movies}')

Number of users = 969 and Number of movies = 9645


# To create a rating matrix for the 'ratings' dataset

In [8]:
Ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
Ratings.head()

movieId,NaN,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,120813.0,120815.0,120821.0,120823.0,121117.0,121135.0,125916.0,128488.0,128594.0,128715.0
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# To install the scikit-surprise library for implementing SVD

### Run the following command in the Anaconda Prompt to install surprise package

In [12]:
!pip install scikit-surprise


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 KB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp39-cp39-linux_x86_64.whl size=3193648 sha256=fda7a60ba7490ea980919ecd4ed67eae107955c1683d49b6dceb4397c8509e14
  Stored in directory: /root/.cache/pip/wheels/c6/3a/46/9b17b3512bdf283c6cb84f59929cdd5199d4e754d596d22784
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [13]:
# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# Load Reader library
reader = Reader()

# Load ratings dataset with Dataset library
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Use the SVD algorithm.
svd = SVD()

# Compute the RMSE of the SVD algorithm.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.8798  nan     1.8741  nan     nan     
MAE (testset)     1.5241  nan     1.5196  nan     nan     
Fit time          4.48    3.32    2.54    3.45    0.80    
Test time         0.85    1.00    0.59    0.81    0.17    


{'test_rmse': array([1.8797989 ,        nan, 1.87406202]),
 'test_mae': array([1.5241439 ,        nan, 1.51963649]),
 'fit_time': (4.478881597518921, 3.3237030506134033, 2.539552927017212),
 'test_time': (0.8468966484069824, 0.9983129501342773, 0.5856435298919678)}

In [14]:
# Print the head of ratings dataset
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2.0,3.5,1112486000.0
1,1,29.0,3.5,1112485000.0
2,1,32.0,3.5,1112485000.0
3,1,47.0,3.5,1112485000.0
4,1,50.0,3.5,1112485000.0


# To find all the movies rated as more than 4 stars by user with userId = 1

In [15]:
ratings_1 = ratings[(ratings['userId'] == 1) & (ratings['rating'] >= 4)]
ratings_1 = ratings_1.set_index('movieId')
ratings_1 = ratings_1.join(movies)['title']
ratings_1.head(10)

movieId
151.0                                 Batman Forever (1995)
223.0                                      Dream Man (1995)
253.0                                         Junior (1994)
260.0                              Ladybird Ladybird (1994)
293.0                                   Pulp Fiction (1994)
296.0                                         Priest (1994)
318.0     Strawberry and Chocolate (Fresa y chocolate) (...
541.0                                          Harem (1985)
1036.0                                          Jude (1996)
1079.0                                       Top Gun (1986)
Name: title, dtype: object

# Train an SVD to predict ratings for user with userId = 1

In [16]:
# Create a shallow copy for the movies dataset
user_1 = movies.copy()

#Reset the index for user_1 dataset
user_1 = user_1.reset_index()



# getting full dataset
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)


#create a training set for svd
trainset = data.build_full_trainset()
svd.fit(trainset)

#Predict the ratings for user1
user_1['Estimate_Score'] = user_1['movieId'].apply(lambda x: svd.predict(1, x).est)

#Drop extra columns from the user1 data frame
user_1 = user_1.drop(['movieId','genres','index'], axis = 1)

# Sort predicted ratings for user1 in descending order
user_1 = user_1.sort_values('Estimate_Score', ascending=False)

#Print top 10 recommendations
print(user_1.head(10))

                                  title  Estimate_Score
0                      Toy Story (1995)               5
18193                   LennoNYC (2010)               5
18191          Wuthering Heights (2011)               5
18190                    Wichita (1955)               5
18189      Story of Mankind, The (1957)               5
18188                     Fright (1972)               5
18187  Shaolin (Xin shao lin si) (2011)               5
18186          Big Hangover, The (1950)               5
18185          Another Happy Day (2011)               5
18184     I Want to Be a Soldier (2011)               5
