# Lab 10: Recommender System

In this assignment, we will study how to do user-based collaborative filtering and item-based collaborative filtering. 

## 1. Dataset

In this assignment, we will use MovieLens-100K dataset. It includes about 100,000 ratings from 1000 users on 1700 movies.  

In [3]:
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors


# 1. load data
user_ratings_train = pd.read_csv('./ml-100k/u1.base',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

user_ratings_test = pd.read_csv('./ml-100k/u1.test',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

movie_info =  pd.read_csv('./ml-100k/u.item', 
                          sep='|', names=['movie_id','title'], usecols=[0,1],
                          encoding="ISO-8859-1")

user_ratings_train = pd.merge(movie_info, user_ratings_train)
user_ratings_test = pd.merge(movie_info, user_ratings_test)

# 2. get the rating matrix. Each row is a user, and each column is a movie.
user_ratings_train = user_ratings_train.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')

user_ratings_test = user_ratings_test.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')




user_ratings_train = user_ratings_train.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

user_ratings_test = user_ratings_test.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

print(user_ratings_train.shape)
print(user_ratings_test.shape)

(943, 1664)
(943, 1664)


## Task 1. User-based CF

* Use pearson correlation to get the similarity between different users.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [242]:
# your code
import math
user_ratings_train = user_ratings_train.fillna(0)
user_ratings_test = user_ratings_test.fillna(0)

train_mean = user_ratings_train.mean(axis=1)
test_mean = user_ratings_test.mean(axis=1) 

train_means_expanded = np.outer(train_mean, np.ones(1664))
test_means_expanded = np.outer(test_mean, np.ones(1664))

train_means_subtracted = user_ratings_train - train_means_expanded
test_means_subtracted = user_ratings_test - test_means_expanded
#print(test_means_subtracted)
similarity = []
similarities = np.zeros((942,942))


for i in range(1, 943):
    denominator_i = train_means_subtracted.loc[i]
    denominator_i = np.square(denominator_i)
    denominator_i = denominator_i.sum()
    denominator_i = math.sqrt(denominator_i)
    
    
    
    for j in range(1, 943):
        numerator = train_means_subtracted.loc[i] * train_means_subtracted.loc[j]
        numerator = numerator.sum()

        denominator_j = np.square(train_means_subtracted.loc[j])
        denominator_j = denominator_j.sum()
        denominator_j = math.sqrt(denominator_j)

        denominator = denominator_j * denominator_i

        similarity.append((numerator / denominator))
        
    similarities[i-1] = similarity
    similarity = []

print(similarities)



[[1.         0.06023134 0.02175852 ... 0.17022618 0.13475539 0.08277204]
 [0.06023134 1.         0.03406568 ... 0.15924813 0.11801274 0.09905043]
 [0.02175852 0.03406568 1.         ... 0.10870144 0.07108757 0.05824196]
 ...
 [0.17022618 0.15924813 0.10870144 ... 1.         0.12222343 0.22341614]
 [0.13475539 0.11801274 0.07108757 ... 0.12222343 1.         0.08091865]
 [0.08277204 0.09905043 0.05824196 ... 0.22341614 0.08091865 1.        ]]


In [None]:
temp = user_ratings_train.to_numpy()
numerator = 0
denominator = 0
predicted_scores = np.zeros((942,1664))
#train_means_subtracted = train_means_subtracted.to_numpy()



for i in range(0, 942):
    for j in range(0, 1664):
        if (temp[i][j] != 0):

            continue
        else:
            for k in range(0, 942):
                if (temp[k][j] == 0 or k == i):
                    continue
                else:
                    numerator = numerator + (train_means_subtracted[k][j] * similarities[i][k])
                    denominator = denominator + (similarities[i][k])
                    
            if (denominator != 0):
                score = numerator / denominator
                predicted_scores[i][j] = score
            else:
                predicted_scores[i][j] = 0
   
print(predicted_scores)


## Task 2. Item-based CF
* Use cosine similarity to get the similarity between different items.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [23]:
# your code