# Lab 8: Recommender System

In this assignment, we will study how to do user-based collaborative filtering and item-based collaborative filtering. 

## 1. Dataset

In this assignment, we will use the MovieLens-100K dataset. It includes about 100,000 ratings from roughly 1,000 users on roughly 1,700 movies.  

In [21]:
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors


def _read_ratings(path):
    """Read the first three tab-separated columns of a ratings file.

    The columns are named user_id, movie_id and rating.
    """
    return pd.read_csv(path,
                       sep='\t', names=['user_id','movie_id','rating'], usecols=[0,1,2])


def _to_user_title_matrix(ratings, titles):
    """Attach movie titles to ``ratings`` and pivot to a user x title matrix.

    Rows are user ids, columns are movie titles, cells are ratings; pairs
    with no rating come out as NaN.
    """
    titled = pd.merge(titles, ratings)
    return titled.pivot_table(index=['user_id'],
                              columns=['title'],
                              values='rating')


# 1. Load the raw train/test ratings and the movie_id -> title lookup.
user_ratings_train = _read_ratings('./ml-100k/u1.base')
user_ratings_test = _read_ratings('./ml-100k/u1.test')

movie_info = pd.read_csv('./ml-100k/u.item',
                         sep='|', names=['movie_id','title'], usecols=[0,1],
                         encoding="ISO-8859-1")

# 2. Build the rating matrices. Each row is a user, and each column is a movie.
user_ratings_train = _to_user_title_matrix(user_ratings_train, movie_info)
user_ratings_test = _to_user_title_matrix(user_ratings_test, movie_info)

# 3. Give both matrices the same rows and columns (the union of the users and
#    titles seen in either split) so they can be indexed interchangeably.
all_users = user_ratings_train.index.union(user_ratings_test.index)
all_titles = user_ratings_train.columns.union(user_ratings_test.columns)

user_ratings_train = user_ratings_train.reindex(index=all_users, columns=all_titles)
user_ratings_test = user_ratings_test.reindex(index=all_users, columns=all_titles)

print(user_ratings_train.shape)
print(user_ratings_test.shape)

(943, 1664)
(943, 1664)


## Task 1. User-based CF

* Use Pearson correlation to get the similarity between different users.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [None]:
from sklearn.metrics import mean_absolute_error

# Task 1: user-based collaborative filtering.
# Each test rating is predicted as the similarity-weighted average of the
# ratings that the most similar users (by Pearson correlation) hold for the
# same movie; the predictions are then scored with MAE.

# Number of most-similar users consulted for each prediction.
N_NEIGHBORS = 5

# Fill each user's missing ratings with that user's own mean (row average).
# NOTE(review): this rebinds user_ratings_train to the imputed matrix, so the
# neighbour lookups below see a user's mean wherever that user did not rate
# the movie; the NaN check in the loop only fires for rows that stayed NaN.
avg = user_ratings_train.mean(axis=1)
user_ratings_train = user_ratings_train.T.fillna(avg).T

# Pearson correlation between every pair of users. DataFrame.corr correlates
# columns, hence the transpose.
pearson_sim = user_ratings_train.T.corr(method='pearson')

# Distance matrix (1 - correlation) and a k-NN index built on it.
# NOTE(review): the prediction loop ranks neighbours straight from
# pearson_sim and never queries `knn`; it is kept so the names stay defined.
knn = NearestNeighbors(n_neighbors=N_NEIGHBORS, metric='precomputed')
distance_matrix = 1 - pearson_sim.fillna(0)
knn.fit(distance_matrix)

# Predicted and true ratings, appended in lockstep.
predictions = []
actual = []

for user_id, test_row in user_ratings_test.iterrows():
    # Only the movies this user actually rated in the test split are scored.
    observed = test_row.dropna()
    if observed.empty:
        continue

    # The neighbourhood depends on the user alone, so rank it once per user
    # instead of once per (user, movie) pair: drop the self-similarity and
    # undefined correlations, then keep the N_NEIGHBORS largest.
    top_users = (
        pearson_sim.loc[user_id]
        .drop(index=user_id)
        .dropna()
        .sort_values(ascending=False)
        .head(N_NEIGHBORS)
    )
    neighbor_ratings = user_ratings_train.loc[top_users.index]

    for movie, true_rating in observed.items():
        # Skip movies that have no column in the training matrix.
        if movie not in user_ratings_train.columns:
            continue

        # Similarity-weighted average over the neighbours with a rating.
        num = 0
        denom = 0
        for neighbor_id, sim_score in top_users.items():
            neighbor_rating = neighbor_ratings.at[neighbor_id, movie]
            if not np.isnan(neighbor_rating):
                num += sim_score * neighbor_rating
                denom += abs(sim_score)

        # No usable neighbour (or zero total weight): make no prediction.
        if denom > 0:
            predictions.append(num / denom)
            actual.append(true_rating)

# Mean absolute error over every test rating that received a prediction.
mae = mean_absolute_error(actual, predictions)
print("MAE: {:.4f}".format(mae))

MAE: 0.8935


## Task 2. Item-based CF
* Use cosine similarity to get the similarity between different items.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [66]:
# Task 2: item-based collaborative filtering.
# Each test rating is predicted as the similarity-weighted average of the
# user's own (imputed) ratings for the movies most similar, by cosine
# similarity, to the target movie; the predictions are then scored with MAE.
from sklearn.metrics import mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity

# Number of most-similar movies consulted for each prediction.
N_SIMILAR_ITEMS = 5

# Fill each user's missing ratings with that user's own mean (row average).
# cosine_similarity cannot handle NaN, so this must happen first.
avg = user_ratings_train.mean(axis=1)
user_ratings_train = user_ratings_train.T.fillna(avg).T

# Cosine similarity between every pair of movies (columns of the matrix).
cosine_sim = cosine_similarity(user_ratings_train.T)
item_sim = pd.DataFrame(cosine_sim, index=user_ratings_train.columns, columns=user_ratings_train.columns)

# Distance matrix (1 - similarity) and a k-NN index built on it.
# NOTE(review): `knn` is fitted without metric='precomputed', so it treats
# the rows of the distance matrix as feature vectors, and the prediction loop
# never queries it; it is kept so the names stay defined.
knn = NearestNeighbors(n_neighbors=N_SIMILAR_ITEMS)
distance_matrix = 1 - item_sim.fillna(0)
knn.fit(distance_matrix.to_numpy())

# Predicted and true ratings, appended in lockstep.
predictions = []
actual = []

# The most similar movies depend on the target movie alone, so rank them once
# per movie and reuse the result for every user who rated that movie.
top_items_by_movie = {}

for user_id, test_row in user_ratings_test.iterrows():
    # Only the movies this user actually rated in the test split are scored.
    for movie, true_rating in test_row.dropna().items():
        top_items = top_items_by_movie.get(movie)
        if top_items is None:
            # Drop the self-similarity and undefined values, keep the largest.
            top_items = (
                item_sim[movie]
                .drop(index=movie)
                .dropna()
                .sort_values(ascending=False)
                .head(N_SIMILAR_ITEMS)
            )
            top_items_by_movie[movie] = top_items

        # Similarity-weighted average of the user's ratings for those movies.
        # item_sim is indexed by user_ratings_train.columns, so every similar
        # movie is guaranteed to be a column of the training matrix.
        num = 0
        denom = 0
        for similar_movie, sim_score in top_items.items():
            rating = user_ratings_train.loc[user_id, similar_movie]
            if not np.isnan(rating):
                num += sim_score * rating
                denom += abs(sim_score)

        # No usable similar movie (or zero total weight): make no prediction.
        if denom > 0:
            predictions.append(num / denom)
            actual.append(true_rating)


# Mean absolute error over every test rating that received a prediction.
mae = mean_absolute_error(actual, predictions)
print("MAE: {:.4f}".format(mae))

MAE: 0.8483
