## Imports

In [1]:
import pandas as pd
import numpy as np

from surprise import SVD,Reader,Dataset, accuracy,dump
from surprise.model_selection import train_test_split, cross_validate

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px



import os
import time
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

## load data

In [2]:
#list of movies and there genre
movies = pd.read_csv("../../Data/ml-25m/movies.csv") #load the mushrooms data
#rating given to a movie by a user
ratings = pd.read_csv("../../Data/ml-25m/ratings.csv") #load the mushrooms data
#links to the movies on imdb and tmdb(not really used for recomender system but will be used in final product to show additional info)
links = pd.read_csv("../../Data/ml-25m/links.csv") #load the mushrooms data

## Preprocessing

In [3]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [4]:
ratings = ratings.drop(['timestamp'],axis=1)

In [5]:
ratings

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0


## User-User Collaborative filtering using SVD

In [6]:
#used to parse ratings file
# the rating scale by default is (1,5) which is consistent with our movies dataset ratings
ratingsReader = Reader()

In [7]:
#used to load data from pandas datframe
#must be in format user ids, the item ids, and the ratings
#The movies data was previously formated to be in this formt
surpriseData = Dataset.load_from_df(ratings,ratingsReader)

In [8]:
surpriseData.raw_ratings

[(1, 296, 5.0, None),
 (1, 306, 3.5, None),
 (1, 307, 5.0, None),
 (1, 665, 5.0, None),
 (1, 899, 3.5, None),
 (1, 1088, 4.0, None),
 (1, 1175, 3.5, None),
 (1, 1217, 3.5, None),
 (1, 1237, 5.0, None),
 (1, 1250, 4.0, None),
 (1, 1260, 3.5, None),
 (1, 1653, 4.0, None),
 (1, 2011, 2.5, None),
 (1, 2012, 2.5, None),
 (1, 2068, 2.5, None),
 (1, 2161, 3.5, None),
 (1, 2351, 4.5, None),
 (1, 2573, 4.0, None),
 (1, 2632, 5.0, None),
 (1, 2692, 5.0, None),
 (1, 2843, 4.5, None),
 (1, 3448, 4.0, None),
 (1, 3569, 5.0, None),
 (1, 3949, 5.0, None),
 (1, 4144, 5.0, None),
 (1, 4308, 3.0, None),
 (1, 4325, 5.0, None),
 (1, 4422, 3.0, None),
 (1, 4703, 4.0, None),
 (1, 4973, 4.5, None),
 (1, 5147, 4.0, None),
 (1, 5269, 0.5, None),
 (1, 5684, 2.0, None),
 (1, 5767, 5.0, None),
 (1, 5878, 4.0, None),
 (1, 5912, 3.0, None),
 (1, 5952, 4.0, None),
 (1, 6016, 5.0, None),
 (1, 6370, 4.5, None),
 (1, 6377, 4.0, None),
 (1, 6539, 3.5, None),
 (1, 6711, 5.0, None),
 (1, 6954, 3.5, None),
 (1, 7209, 4.0, 

In [9]:
# crossVal = cross_validate(svd,surpriseData,measures=['RMSE'],cv=10,verbose=True)

In [10]:
#split the data into train test split
trainSet,testSet = train_test_split(surpriseData,test_size=0.3)

In [11]:
startTime = time.time()
svd = SVD()

In [12]:
svd.fit(trainSet)
print("SVD train time:",time.time() - startTime)

SVD train time: 998.1731200218201


### save the model

In [13]:
file_name = os.path.expanduser('modelMovieLens')

In [14]:
dump.dump(file_name, algo=svd)

### accuracy 

In [16]:
predict = svd.test(testSet)

In [17]:
accuracy.rmse(predict)

RMSE: 0.7849


0.7849399178972604

In [None]:
_, svd2Test = dump.load(file_name)

In [None]:
svd2Test.predict(1,296)

In [None]:
predict = svd.test(testSet)

In [None]:
accuracy.rmse(predict)

In [None]:
svd.qi.shape

In [None]:
svd.predict(1,306)

In [None]:
svd.trainset