In [1]:
import pandas as pd
from sklearn.decomposition import NMF
import numpy as np

In [4]:
# Calculate the rmse using a modified version of the week 3 function
def rmse(pred, real, fill_value=3):
    """Return the root-mean-squared error between predictions and true values.

    Parameters
    ----------
    pred : array-like
        Predicted values; may contain NaN.
    real : array-like
        True values, matched to ``pred`` by position.
    fill_value : float, default 3
        Value substituted for NaN predictions before the error is computed.

    Returns
    -------
    numpy.float64
        sqrt(mean((real - pred) ** 2)) after NaN predictions are imputed.

    Notes
    -----
    The imputation is done on a copy, so the caller's ``pred`` is left
    untouched (the previous version overwrote NaN entries in place).
    """
    # Convert to float arrays so lists and integer arrays are handled uniformly.
    pred = np.asarray(pred, dtype=float)
    real = np.asarray(real, dtype=float)
    # np.where builds a new array, so the input is never modified.
    pred_filled = np.where(np.isnan(pred), fill_value, pred)
    return np.sqrt(((real - pred_filled) ** 2).mean())

In [2]:
# Read the four CSV inputs (users, movies, train ratings, test ratings),
# in that order, into their own DataFrames.
MV_users, MV_movies, train, test = map(
    pd.read_csv,
    ('data/users.csv', 'data/movies.csv', 'data/train.csv', 'data/test.csv'),
)

In [7]:
# Pivot the training ratings into a wide table (one row per uID, one column
# per mID) and count how many user/movie cells have no rating.
ratings_by_user_df = train.pivot(index='uID', columns='mID', values='rating')
print('there are', ratings_by_user_df.isna().sum().sum(), 'null values.')
# That is a lot of empty cells. Fill every one of them with the constant 0
# (note: a plain 0, not an average rating).
ratings_by_user_df = ratings_by_user_df.fillna(0)
# Confirm that no missing cells remain.
print('there are now', ratings_by_user_df.isna().sum().sum(), 'null values.')

there are 21430414 null values.
there are now 0 null values.


In [4]:
# Factorise the filled user x movie table with a 5-component NMF (seeded so the
# result is repeatable). w1 is what fit_transform returns for the table;
# h1 is the fitted model's components_ attribute.
nmf = NMF(n_components=5, random_state=0)
w1, h1 = nmf.fit_transform(ratings_by_user_df), nmf.components_

In [5]:
# Multiply the two factor matrices to get a predicted value for every
# user/movie cell.
pred = w1 @ h1
# Wrap the predictions in a DataFrame carrying the same row (uID) and
# column (mID) labels as the pivoted ratings table.
df_pred = pd.DataFrame(
    pred,
    index=ratings_by_user_df.index.values,
    columns=ratings_by_user_df.columns.values,
)
# Expose the user id as a regular column so it survives the reshape below.
df_pred['uID'] = df_pred.index.values
# Wide -> long: one row per (uID, mID) pair with its predicted rating.
df_pred_long = df_pred.melt(id_vars=['uID'], var_name='mID', value_name='predicted_rating')
# Join with the actual test ratings. This is merge's default inner join, so only
# (uID, mID) pairs present in both frames are kept.
df_pred_vs_test = pd.merge(df_pred_long, test, on=['uID', 'mID'])
df_pred_vs_test

Unnamed: 0,uID,mID,predicted_rating,rating
0,6,1,0.548484,4
1,8,1,0.935704,4
2,21,1,0.141737,3
3,23,1,1.976801,4
4,26,1,1.890181,3
...,...,...,...,...
300001,5333,3952,0.893497,4
300002,5359,3952,0.714152,5
300003,5682,3952,0.739560,3
300004,5812,3952,0.783215,4


In [6]:
# Score the predictions against the test ratings that were matched by the merge.
rmse(df_pred_vs_test['predicted_rating'].to_numpy(), df_pred_vs_test['rating'].to_numpy())

2.9914125525829425

In [5]:
# Repeat the experiment, this time filling the missing ratings with 3 instead of 0.
ratings_by_user_df = train.pivot(index='uID', columns='mID', values='rating').fillna(3)
# Refit NMF (same settings as before) on the re-filled table and rebuild the
# full prediction matrix from the two factors.
nmf = NMF(n_components=5, random_state=0)
w1, h1 = nmf.fit_transform(ratings_by_user_df), nmf.components_
pred = w1 @ h1

# Recompute the RMSE: label the predictions, reshape them to long format,
# inner-join them with the test ratings, and score the matched pairs.
df_pred = pd.DataFrame(
    pred,
    index=ratings_by_user_df.index.values,
    columns=ratings_by_user_df.columns.values,
)
df_pred['uID'] = df_pred.index.values
df_pred_long = df_pred.melt(id_vars=['uID'], var_name='mID', value_name='predicted_rating')
df_pred_vs_test = pd.merge(df_pred_long, test, on=['uID', 'mID'])
rmse(df_pred_vs_test['predicted_rating'].to_numpy(), df_pred_vs_test['rating'].to_numpy())

1.144331431727138