In [37]:
import os
from collections import defaultdict

import pandas as pd
from surprise import Dataset, Reader, SVD, NMF, NormalPredictor, KNNBasic
from surprise.accuracy import rmse
from surprise.model_selection import cross_validate, GridSearchCV, train_test_split, KFold

In [4]:
# Directory holding the MovieLens 100k files (data.csv, movies.csv).
# Set the ML100K_DIR environment variable to point the notebook at another
# location; the original local directory remains the fallback.
path = os.environ.get('ML100K_DIR') or 'E:\\181255_RNMP_HW2\\ml-100k\\'
# Other cells build file names with `path + "name.csv"`, so `path` must stay a
# plain string that ends with a directory separator.
if not path.endswith(('/', '\\')):
    path += os.sep

In [7]:
# Number of cross-validation folds: used for the score labels, the fold
# averaging and the explicit KFold split further down.
k = 5

# Read datasets

In [5]:
# Ratings file: comma-separated "user item rating timestamp" rows with one
# header line to skip. Note that `df` is a surprise Dataset, not a DataFrame.
reader = Reader(sep=",", skip_lines=1, line_format="user item rating timestamp")
df = Dataset.load_from_file(f"{path}data.csv", reader)
df

<surprise.dataset.DatasetAutoFolds at 0x27c2db0aa30>

In [35]:
# Movie metadata, joined onto the recommendations later via `movie_id`.
movies_csv = f'{path}movies.csv'
df_movies = pd.read_csv(movies_csv, encoding='latin')
df_movies

Unnamed: 0,movie_id,title,release_date,IMDb URL,genre
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,"['Animation', ""Children's"", 'Comedy']"
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,"['Action', 'Adventure', 'Thriller']"
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,['Thriller']
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,"['Action', 'Comedy', 'Drama']"
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),"['Crime', 'Drama', 'Thriller']"
...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,['Drama']
1678,1679,B. Monkey (1998),06-Feb-1998,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,"['Romance', 'Thriller']"
1679,1680,Sliding Doors (1998),01-Jan-1998,http://us.imdb.com/Title?Sliding+Doors+(1998),"['Drama', 'Romance']"
1680,1681,You So Crazy (1994),01-Jan-1994,http://us.imdb.com/M/title-exact?You%20So%20Cr...,['Comedy']


# Model selection

In [6]:
def get_average_cv_rmse(folds: list, k: 'int | None' = None) -> float:
    """Return the mean of the per-fold RMSE scores.

    Args:
        folds: Sequence of per-fold RMSE values (e.g. ``cv['test_rmse']``).
        k: Divisor to use. When omitted, the number of scores in ``folds`` is
            used, so the average cannot drift out of sync with the number of
            folds that were actually evaluated. An explicitly passed ``k`` is
            honoured exactly as before.

    Returns:
        ``sum(folds) / k``.

    Raises:
        ValueError: If ``k`` is omitted and ``folds`` is empty.
    """
    if k is None:
        if len(folds) == 0:
            raise ValueError('cannot average an empty sequence of fold scores')
        k = len(folds)
    return sum(folds) / k

In [8]:
# Candidate algorithms. Each entry holds a label, a fresh default-configured
# instance and a placeholder score (infinity until cross-validation fills it).
# Key order is name, instance, score.
models = [
    {
        'name': label,
        'instance': algorithm(),
        f'{k}-fold cv rmse score': float('inf'),
    }
    for label, algorithm in (
        ('svd', SVD),
        ('nmf', NMF),
        ('normal_predictor', NormalPredictor),
        ('knn_basic', KNNBasic),
    )
]

In [10]:
# Key under which every candidate stores its mean cross-validated RMSE.
score_key = f'{k}-fold cv rmse score'

for model in models:
    # Cross-validate with the configured fold count `k` (previously a literal
    # 5), so the score label, the averaging and the evaluation always agree.
    cv_results = cross_validate(model['instance'], df, measures=['RMSE'], cv=k, verbose=True)
    model[score_key] = get_average_cv_rmse(cv_results['test_rmse'], k)

# Lowest mean RMSE wins. The score is looked up by key rather than by position
# in the dict, so adding or reordering fields cannot change the result.
best_model = min(models, key=lambda candidate: candidate[score_key])

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9361  0.9332  0.9413  0.9331  0.9358  0.9359  0.0030  
Fit time          0.88    0.85    0.84    0.83    0.87    0.85    0.02    
Test time         0.33    0.15    0.10    0.15    0.10    0.17    0.09    
Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9644  0.9541  0.9634  0.9690  0.9660  0.9634  0.0050  
Fit time          1.28    1.56    1.39    1.35    1.28    1.37    0.10    
Test time         0.16    0.11    0.15    0.10    0.15    0.13    0.02    
Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5134  1.5213  1.5144  1.5179  1.5134  1.5161  0.0031  
Fit time          0.08    0.11    0.10    0.10    0.11    0.10    0.01    
Test time       

In [11]:
# Show the candidate with the lowest mean cross-validated RMSE.
best_model

{'name': 'svd',
 'instance': <surprise.prediction_algorithms.matrix_factorization.SVD at 0x27c2d9f3910>,
 '5-fold cv rmse score': 0.9358994209691712}

# SVD Hyperparameter tuning

## Define parameter search space

In [12]:
# Search space for the SVD grid search: 5 x 5 x 3 x 3 = 225 combinations.
param_grid = dict(
    n_factors=[10, 20, 50, 100, 200],
    n_epochs=[10, 20, 50, 100, 200],
    lr_all=[0.001, 0.005, 0.01],
    reg_all=[0.01, 0.05, 0.1],
)

## Estimate best parameters using K-fold cross validation

In [16]:
# Exhaustive search over `param_grid`, scored by RMSE and MAE on all cores.
# The fold count comes from `k` (previously a literal 5) so it stays
# consistent with the rest of the notebook.
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=k, n_jobs=-1)
grid_search.fit(df)

## Estimated parameter values of the best model

In [17]:
# Print the best mean RMSE first, then the parameter combination behind it.
for report in (grid_search.best_score, grid_search.best_params):
    print(report['rmse'])

0.9085883039094661
{'n_factors': 200, 'n_epochs': 100, 'lr_all': 0.005, 'reg_all': 0.1}


## Compute CV error on u_i train/test datasets

In [18]:
# Build the tuned model straight from the grid-search result instead of
# hand-copying the printed values, so it cannot drift from the search output
# when the search is re-run.
model = SVD(**grid_search.best_params['rmse'])

In [21]:
# Compare train vs. test RMSE of the tuned model on each of the k folds.
kf = KFold(n_splits=k)
for fold, (train, test) in enumerate(kf.split(df)):
    # Fit once per fold and score both sets with that same fit. The earlier
    # version fitted twice, which doubled training time and compared train and
    # test errors from two differently initialised models.
    model.fit(train)
    train_predictions = model.test(train.build_testset())
    test_predictions = model.test(test)
    # verbose=False: the values are printed below, so rmse() need not echo them.
    train_rmse = rmse(train_predictions, verbose=False)
    test_rmse = rmse(test_predictions, verbose=False)
    print(f'Fold {fold+1} train set RMSE: {train_rmse}')
    print(f'Fold {fold+1} test set RMSE: {test_rmse}')

RMSE: 0.6376
RMSE: 0.9096
Fold 1 train set RMSE: 0.63755752657926
Fold 1 test set RMSE: 0.9095874540304604
RMSE: 0.6380
RMSE: 0.9119
Fold 2 train set RMSE: 0.6379685827474141
Fold 2 test set RMSE: 0.9118767658864118
RMSE: 0.6388
RMSE: 0.9057
Fold 3 train set RMSE: 0.6387638700434524
Fold 3 test set RMSE: 0.9057354578375343
RMSE: 0.6380
RMSE: 0.9107
Fold 4 train set RMSE: 0.637986904895005
Fold 4 test set RMSE: 0.910724556423682
RMSE: 0.6372
RMSE: 0.9073
Fold 5 train set RMSE: 0.637179003760379
Fold 5 test set RMSE: 0.907309692820498


## Train/Test split

In [23]:
# Hold out 20% of the ratings for testing; the fixed random_state makes the
# split repeatable. This rebinds `train`/`test` left over from the KFold loop.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Predictions

In [24]:
# Refit the tuned model on the training split, then predict the held-out ratings.
model.fit(train)
predictions = model.test(test)

In [25]:
# RMSE of the tuned model on the held-out test split.
rmse(predictions)

RMSE: 0.9104


0.9103796517833047

# Recommendations

In [29]:
# Refit the tuned model on the full trainset built from the whole dataset
# before generating recommendations.
train_set = df.build_full_trainset()
model.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x27c38bbe670>

In [28]:
# Predict ratings for the anti-testset, i.e. (user, item) pairs that are not in
# the training set. This rebinds `predictions` (previously the hold-out
# predictions) and must run after the model has been fitted on `train_set`.
predictions = model.test(train_set.build_anti_testset())

In [30]:
def get_predictions(predictions):
    """Group predictions by user and rank each user's items by estimate.

    Args:
        predictions: Iterable of 5-field records unpacked as
            ``(user id, item id, true rating, estimated rating, details)``.

    Returns:
        A ``defaultdict(list)`` mapping each user id to a list of
        ``(item id, estimated rating)`` pairs, highest estimate first. Items
        with equal estimates keep their input order.
    """
    ranked = defaultdict(list)
    for prediction in predictions:
        user, item, _, estimate, _ = prediction
        ranked[user].append((item, estimate))

    # Sort every user's list in place, best estimate first.
    for item_estimates in ranked.values():
        item_estimates.sort(key=lambda pair: pair[1], reverse=True)

    return ranked

In [31]:
def get_n_predictions(n, preds=None):
    """Return each user's ``n`` highest-estimated items.

    Args:
        n: Number of items to keep per user (applied as the slice ``[:n]``).
        preds: Predictions to rank, in the form accepted by
            ``get_predictions``. When omitted, the notebook-level
            ``predictions`` variable is used, which keeps existing
            ``get_n_predictions(n)`` calls working. Passing it explicitly
            avoids depending on whichever cell last rebound that global.

    Returns:
        A ``defaultdict(list)`` mapping each user id to at most ``n``
        ``(item id, estimated rating)`` pairs, highest estimate first.
    """
    if preds is None:
        preds = predictions
    result = defaultdict(list)
    for uid, user_ratings in get_predictions(preds).items():
        result[uid] = user_ratings[:n]
    return result

In [42]:
# Number of recommendations kept per user.
n_recommendations = 5
top_n = get_n_predictions(n_recommendations)

# Build one (user_id, movie_id) row per recommendation directly from the
# ranked lists. Rows are ordered by rank first (every user's best pick, then
# every user's second pick, ...), the same order the earlier
# explode/iloc/melt pipeline produced. Unlike that pipeline, this also works
# when a user has fewer than `n_recommendations` candidates.
recommendation_rows = [
    (uid, int(user_ratings[rank][0]))
    for rank in range(n_recommendations)
    for uid, user_ratings in top_n.items()
    if rank < len(user_ratings)
]

# Attach the movie metadata; `movie_id` is cast to int64 to match `df_movies`.
recommendations = (
    pd.DataFrame(recommendation_rows, columns=['user_id', 'movie_id'])
    .astype({'movie_id': 'int64'})
    .join(df_movies.set_index('movie_id'), on='movie_id')
)

recommendations


Unnamed: 0,user_id,movie_id,title,release_date,IMDb URL,genre
0,196,1449,Pather Panchali (1955),22-Mar-1996,http://us.imdb.com/M/title-exact?Pather%20Panc...,['Drama']
1,186,318,Schindler's List (1993),01-Jan-1993,http://us.imdb.com/M/title-exact?Schindler's%2...,"['Drama', 'War']"
2,22,408,"Close Shave, A (1995)",28-Apr-1996,http://us.imdb.com/M/title-exact?Close%20Shave...,"['Animation', 'Comedy', 'Thriller']"
3,244,1449,Pather Panchali (1955),22-Mar-1996,http://us.imdb.com/M/title-exact?Pather%20Panc...,['Drama']
4,166,169,"Wrong Trousers, The (1993)",01-Jan-1993,http://us.imdb.com/M/title-exact?Wrong%20Trous...,"['Animation', 'Comedy']"
...,...,...,...,...,...,...
4710,939,318,Schindler's List (1993),01-Jan-1993,http://us.imdb.com/M/title-exact?Schindler's%2...,"['Drama', 'War']"
4711,936,603,Rear Window (1954),01-Jan-1954,http://us.imdb.com/M/title-exact?Rear%20Window...,"['Mystery', 'Thriller']"
4712,930,513,"Third Man, The (1949)",01-Jan-1949,"http://us.imdb.com/M/title-exact?Third%20Man,%...","['Mystery', 'Thriller']"
4713,920,318,Schindler's List (1993),01-Jan-1993,http://us.imdb.com/M/title-exact?Schindler's%2...,"['Drama', 'War']"


In [None]:
# Save the recommendations next to the input data. The earlier call passed
# `path + ''`, i.e. the directory itself with no file name. index=False leaves
# out the meaningless 0..N row index.
recommendations.to_csv(path+'recommendations.csv', index=False)