# Matrix Factorization for Recommendation Systems

In [None]:
# import libraries
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.colors import ListedColormap

from collections import defaultdict #data collector

#Surprise: https://surprise.readthedocs.io/en/stable/
import surprise

from surprise.reader import Reader
from surprise import Dataset, accuracy
from surprise.model_selection import GridSearchCV,PredefinedKFold

##Matrix Factorization Algorithms
from surprise import SVD
from surprise import NMF

## Import data
source: https://grouplens.org/datasets/movielens/100k/

In [None]:
# The full u data set: 100000 ratings by 943 users on 1682 items.
# u.data is tab-separated and has no header row; per the MovieLens 100k README
# each line holds: user id, item id, rating, timestamp.
ratings = pd.read_csv('./ml-100k/u.data', sep='\t', header=None,
                      names=['user id', 'item id', 'rating', 'timestamp'])
ratings.head()

In [None]:
# Information about the items (movies).
# Column names follow the u.item layout: five descriptive fields followed by
# one 0/1 indicator column per genre.
movie_columns  = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
                  'unknown','Action','Adventure', 'Animation',"Children's", 'Comedy', 'Crime',
                  'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                  'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
# u.item is pipe-separated with no header row and is not UTF-8 encoded
# (titles contain accented characters), hence the explicit latin-1 encoding.
movies = pd.read_csv('./ml-100k/u.item', sep='|', header=None,
                     names=movie_columns, encoding='latin-1')
movies.head()

## Explore the datasets

In [None]:
# YOUR CODE HERE

#### Plots

In [None]:
# Histogram of ratings
# YOUR CODE HERE

In [None]:
# Histogram of average ratings by user
# YOUR CODE HERE

In [None]:
# Histogram of number of ratings by user
# YOUR CODE HERE

#### Create R matrix: user x items

In [None]:
# Pivot the long ratings table into a dense (users x items) float array.
# Columns are taken by position -- assumes the first three columns of `ratings`
# are user, item and rating (the u.data layout); confirm if it was loaded differently.
# (user, item) pairs without a rating come out as NaN.
user_col, item_col, rating_col = ratings.columns[:3]
rating_matrix = ratings.pivot(index=user_col, columns=item_col, values=rating_col).to_numpy()

In [None]:
# remove nan values
# Build a boolean mask of the NaN cells and overwrite those cells in place with 0.
# NOTE(review): presumably NaN marks user/item pairs without a rating, so 0 acts
# as the "no rating" value in the plot below -- confirm against how rating_matrix is built.
nan_mask = np.isnan(rating_matrix)
rating_matrix[nan_mask] = 0
# Bare expression: displays the updated matrix as the cell output.
rating_matrix

In [None]:
# Preview of the rating matrix: first 50 rows x first 100 columns.
maxA = int(np.max(rating_matrix))

# Discrete palette truncated to maxA+1 colours (one per integer value 0..maxA).
palette = ['white', 'paleturquoise','turquoise',  'mediumturquoise','darkturquoise', 'royalblue']
cmap1 = ListedColormap(palette[:maxA+1])

plt.figure(figsize=(12,6))
snapshot = rating_matrix[:50,:100]
plt.imshow(snapshot, cmap=cmap1)

# Colour bar with one tick per integer value.
colorbar_ticks = list(np.arange(maxA+1))
cb = plt.colorbar(fraction=0.046, ticks=colorbar_ticks)
cb.ax.tick_params(labelsize=20)

plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

# Remove the tick marks on both axes of the image.
ax = plt.gca()
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticks([])
plt.tight_layout()

## Model with surprise package

In [None]:
# Folder that holds the MovieLens 100k files.
# expanduser() only rewrites a leading '~', so this relative path is returned as-is.
files_dir = os.path.expanduser('./ml-100k/')

# Use Surprise's built-in 'ml-100k' reader configuration rather than
# describing the line format by hand.
reader = Reader(name='ml-100k')

### GridSearchCV to choose the best number of factors

In [None]:
# Load the complete ratings file (u.data) from the dataset folder, parsed with
# the reader configured above.
data_full = Dataset.load_from_file(os.path.join(files_dir, 'u.data'), reader=reader)
data_full

In [None]:
# Candidate numbers of latent factors to compare.
param_grid = {'n_factors': [5,7,9,11,14,16,18,20]}
# 5-fold cross-validated grid search, scoring every setting by RMSE and MAE.
# NOTE(review): SVD is used as the searched algorithm -- swap in NMF to tune that model instead.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data_full)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

### Cross Validation to choose the best model among SVD, SVD biased, NMF, and NMF biased

In [None]:
# Let's fix the hyperparameters shared by the models compared below.
# K: presumably the number of latent factors (16 is one of the n_factors
#    values in the grid above) -- confirm against the grid-search result.
# maxit: presumably the number of training epochs -- confirm where it is used.
K = 16
maxit = 20

In [None]:
# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

# Build a dataset whose train/test splits are exactly the files listed above,
# and the matching cross-validation iterator that replays those splits.
data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

In [None]:
# Per-model score collectors: model name -> list of per-fold scores.
RMSE = dict()
MAE = dict()

##### Using `SVD`

In [None]:
# Plain SVD: biased=False drops the baseline (global mean / user / item bias) terms.
algo = SVD(n_factors=K, n_epochs=maxit, biased=False)
RMSE['SVD'] = []
MAE['SVD'] = []
# Train and evaluate once per predefined fold, keeping one score per fold.
for fold_train, fold_test in pkf.split(data):
    algo.fit(fold_train)
    fold_predictions = algo.test(fold_test)
    RMSE['SVD'].append(accuracy.rmse(fold_predictions, verbose=False))
    MAE['SVD'].append(accuracy.mae(fold_predictions, verbose=False))

##### Using `SVD biased`

In [None]:
# SVD with baseline terms (biased=True, which is also Surprise's default for SVD).
algo = SVD(n_factors=K, n_epochs=maxit, biased=True)
RMSE['SVDbiased'] = []
MAE['SVDbiased'] = []
# Train and evaluate once per predefined fold, keeping one score per fold.
for fold_train, fold_test in pkf.split(data):
    algo.fit(fold_train)
    fold_predictions = algo.test(fold_test)
    RMSE['SVDbiased'].append(accuracy.rmse(fold_predictions, verbose=False))
    MAE['SVDbiased'].append(accuracy.mae(fold_predictions, verbose=False))

##### Using `NMF`

In [None]:
# Non-negative matrix factorization without baseline terms (biased=False).
algo = NMF(n_factors=K, n_epochs=maxit, biased=False)
RMSE['NMF'] = []
MAE['NMF'] = []
# Train and evaluate once per predefined fold, keeping one score per fold.
for fold_train, fold_test in pkf.split(data):
    algo.fit(fold_train)
    fold_predictions = algo.test(fold_test)
    RMSE['NMF'].append(accuracy.rmse(fold_predictions, verbose=False))
    MAE['NMF'].append(accuracy.mae(fold_predictions, verbose=False))

##### Using `NMF biased`

In [None]:
# Non-negative matrix factorization with baseline terms (biased=True).
algo = NMF(n_factors=K, n_epochs=maxit, biased=True)
RMSE['NMFbiased'] = []
MAE['NMFbiased'] = []
# Train and evaluate once per predefined fold, keeping one score per fold.
for fold_train, fold_test in pkf.split(data):
    algo.fit(fold_train)
    fold_predictions = algo.test(fold_test)
    RMSE['NMFbiased'].append(accuracy.rmse(fold_predictions, verbose=False))
    MAE['NMFbiased'].append(accuracy.mae(fold_predictions, verbose=False))

In [None]:
# Side-by-side comparison of the models: mean score per model with a
# +/- one standard deviation error bar (left: RMSE, right: MAE).
fig, ax = plt.subplots(1,2,figsize=(16,4))

# The models are whatever keys were collected in RMSE; MAE is read with the same keys.
model_names = list(RMSE.keys())
for i,k in enumerate(model_names):
    ax[0].scatter(i,np.mean(RMSE[k]),label=k)
    ax[0].errorbar(i,np.mean(RMSE[k]),yerr= np.std(RMSE[k]))
    ax[1].scatter(i,np.mean(MAE[k]),label=k)
    ax[1].errorbar(i,np.mean(MAE[k]),yerr= np.std(MAE[k]))

# One tick per model. Deriving the positions from the number of models keeps
# ticks and labels in sync when models are added or removed (a fixed list of
# four positions makes set_xticklabels fail for any other model count).
tick_positions = list(range(len(model_names)))
ax[0].set_xticks(tick_positions)
ax[0].set_xticklabels(model_names, fontsize=15)
ax[0].set_ylabel('RMSE', fontsize=15)
ax[1].set_xticks(tick_positions)
ax[1].set_xticklabels(model_names, fontsize=15)
ax[1].set_ylabel('MAE', fontsize=15)

plt.show()

### Make predictions and recommendations

In [None]:
# Train the chosen model on every available rating.
trainset = data_full.build_full_trainset()
# NOTE(review): biased SVD is assumed to be the winner of the comparison above;
# replace this with whichever model actually scored the lowest RMSE/MAE.
best_algo = SVD(n_factors=K, n_epochs=maxit, biased=True)
best_algo.fit(trainset)

# Predict a rating for every (user, item) pair that is NOT in the training set,
# i.e. the candidates for recommendation.
testset = trainset.build_anti_testset()
predictions = best_algo.test(testset)

In [None]:
# Peek at the first two entries of the predictions list.
predictions[:2]

In [None]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm. Each entry must
            unpack into five fields: (user id, item id, true rating,
            estimated rating, details).
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n (fewer when the user
        has fewer than n predictions), ordered from highest to lowest
        estimation.

    Raises:
        ValueError: If n is negative.
    """
    if n < 0:
        # A negative n would silently slice items off the END of each list.
        raise ValueError("n must be non-negative")

    # Group every (item, estimated rating) pair under its user.
    per_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        per_user[uid].append((iid, est))

    # Keep each user's n highest estimates, best first. sorted() is stable,
    # so items with equal estimates keep their order of appearance.
    top_n = {
        uid: sorted(user_ratings, key=lambda pair: pair[1], reverse=True)[:n]
        for uid, user_ratings in per_user.items()
    }

    return top_n


In [None]:
# Five best-scored unseen items per user.
top_n = get_top_n(predictions, n=5)

# Show, for every user, only the ids of the recommended items
# (the estimated ratings are dropped).
for user_id, recommendations in top_n.items():
    item_ids = [item_id for item_id, _ in recommendations]
    print(user_id, item_ids)