In [20]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate, GridSearchCV, KFold

In [2]:
# Load the raw rating records. The file is tab-separated and the column names
# are supplied here rather than read from the file.
ratings_path = "/home/dmitrii/vscode_projects/PMLDL/Assignment2/data/raw/ml-100k/u.data"
ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
user_data = pd.read_csv(ratings_path, sep='\t', names=ratings_columns)
user_data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# Genre lookup table: one "<genre>|<index>" row per genre.
genres = pd.read_csv(
    '/home/dmitrii/vscode_projects/PMLDL/Assignment2/data/raw/ml-100k/u.genre',
    names=["genre", "index"],
    sep='|',
)
genres_list = genres['genre'].to_list()

# Column layout of u.item: five descriptive fields followed by one column per
# genre, in the same order as the genre lookup table.
user_item_names = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url']
user_item_names.extend(genres_list)

user_item = pd.read_csv(
    "/home/dmitrii/vscode_projects/PMLDL/Assignment2/data/raw/ml-100k/u.item",
    sep='|',
    encoding='latin-1',  # the file is not valid UTF-8
    index_col=False,     # never promote the first column to the index
    names=user_item_names,
)

user_item.head()

Unnamed: 0,movie_id,movie_title,release_date,"video_release_date,",imdb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Load the per-user records. The file is pipe-separated and the column names
# are supplied here rather than read from the file.
users_path = "/home/dmitrii/vscode_projects/PMLDL/Assignment2/data/raw/ml-100k/u.user"
users_columns = ['user_id', 'age', 'gender', 'occupation', 'zipcode']
user_user = pd.read_csv(users_path, sep='|', names=users_columns)

user_user.head()

Unnamed: 0,id,age,gender,occupation,zipcode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [7]:
# Reader describing the rating scale; (1, 5) is Surprise's default, spelled
# out here so the assumption about the data is visible.
reader = Reader(rating_scale=(1, 5))

In [11]:
# Surprise expects exactly three columns, in (user, item, rating) order.
ratings_frame = user_data[['user_id', 'movie_id', 'rating']]
data = Dataset.load_from_df(df=ratings_frame, reader=reader)

In [12]:
# Fixed seed so that both the fold assignment and the SVD factor
# initialisation are repeatable; otherwise every run reports slightly
# different RMSE/MAE values.
SEED = 42

# Baseline: SVD with default hyper-parameters, scored with 5-fold CV.
model = SVD(random_state=SEED)
cross_validate(
    model,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=SEED),
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9396  0.9326  0.9374  0.9355  0.9431  0.9376  0.0036  
MAE (testset)     0.7407  0.7348  0.7409  0.7391  0.7419  0.7395  0.0025  
Fit time          0.78    0.76    0.79    0.86    0.94    0.83    0.07    
Test time         0.07    0.07    0.14    0.09    0.07    0.08    0.03    


{'test_rmse': array([0.93964606, 0.93258681, 0.93735012, 0.93551247, 0.94311493]),
 'test_mae': array([0.74065961, 0.73483851, 0.74085641, 0.73909173, 0.74194527]),
 'fit_time': (0.783341646194458,
  0.7636086940765381,
  0.7886347770690918,
  0.8598840236663818,
  0.9434778690338135),
 'test_time': (0.06825017929077148,
  0.06569075584411621,
  0.1351466178894043,
  0.08742403984069824,
  0.06634974479675293)}

In [14]:
# Refit the baseline model on every available rating (no held-out fold).
trainset = data.build_full_trainset()
model.fit(trainset=trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9f927c7b50>

In [17]:
# All ratings submitted by user 1, for comparison with model predictions.
user_data.loc[user_data['user_id'] == 1]

Unnamed: 0,user_id,movie_id,rating,timestamp
202,1,61,4,878542420
305,1,189,3,888732928
333,1,33,4,878542699
334,1,160,4,875072547
478,1,20,4,887431883
...,...,...,...,...
92049,1,28,4,875072173
92487,1,172,5,874965478
94019,1,122,3,875241498
96699,1,152,5,878542589


In [19]:
# Estimate user 1's rating for movie 302; r_ui is only echoed back in the
# returned Prediction and does not influence the estimate.
model.predict(uid=1, iid=302, r_ui=3)

Prediction(uid=1, iid=31, r_ui=3, est=3.425961973435509, details={'was_impossible': False})

In [28]:
# Hyper-parameter grid: every combination of the epoch counts and learning
# rates below is a candidate (4 x 3 = 12 combinations).
param_grid = dict(
    n_epochs=[5, 10, 20, 40],
    lr_all=[0.002, 0.005, 0.007],
)


In [43]:
# Exhaustive search over param_grid: each candidate is scored with 3-fold CV
# on RMSE and MAE, using all available CPU cores.
gs = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1,
    joblib_verbose=True,
)

In [44]:
# Run the grid search; results are stored on the `gs` object.
gs.fit(data=data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    6.4s finished


In [45]:
# Report the best mean RMSE found and the parameter combination behind it.
best_rmse = gs.best_score['rmse']
best_rmse_params = gs.best_params['rmse']
print(f"RMSE: {best_rmse}")
print(best_rmse_params)

RMSE: 0.9449955876745197
{'n_epochs': 20, 'lr_all': 0.005}


In [46]:
# Train the final model on every available rating, using the parameter
# combination that achieved the best RMSE in the grid search.
train = data.build_full_trainset()
tuned_params = gs.best_params['rmse']
svd_model = SVD(**tuned_params)
svd_model.fit(trainset=train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9fae1db0d0>

In [50]:
# Estimate user 1's rating for movie 189 with the tuned model.
# The raw user id is passed as an int, matching the integer ids the dataset
# was built from (a float 1.0 only resolves because it hashes equal to 1).
svd_model.predict(uid=1, iid=189, verbose=True)

user: 1.0        item: 189        r_ui = None   est = 4.21   {'was_impossible': False}


Prediction(uid=1.0, iid=189, r_ui=None, est=4.212657822256764, details={'was_impossible': False})

In [48]:
# User 1's actual ratings, to compare against the tuned model's estimate.
user_data.loc[user_data['user_id'] == 1]

Unnamed: 0,user_id,movie_id,rating,timestamp
202,1,61,4,878542420
305,1,189,3,888732928
333,1,33,4,878542699
334,1,160,4,875072547
478,1,20,4,887431883
...,...,...,...,...
92049,1,28,4,875072173
92487,1,172,5,874965478
94019,1,122,3,875241498
96699,1,152,5,878542589
