In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import *


In [2]:
data = pd.read_csv('ratings.csv', encoding="ISO-8859-1")

print('\n\nHead:')
print(data.head())
print('\n\nInfo:')
print(data.info())
print('\n\nDescribe:')
print(data.describe())
print('\n\nNulls:')
print(data.isnull().sum())
print('Unique values:')
for column in data.columns:
    unique_values = data[column].unique()
    if len(unique_values) < 20:
        unique_values_str = ', '.join([str(value) for value in unique_values])
    else:
        # Show first 3 and last 3 unique values
        unique_values_str = ', '.join([str(value) for value in unique_values[:3]]) + ', ..., ' + ', '.join([str(value) for value in unique_values[-3:]])
    print(f'{column}: {len(unique_values)} unique values: {unique_values_str}')



Head:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None


Describe:
              userId        movieId         rating     timestamp
count  100836.000000  100836.000000  100836.000000  1.008360e+05
mean      326.127564   19435.295718       3.501557  1.205946e+09
std       182.618491   35530.987199       1.042529  2.162610e+08
min         1.000000       1.000000       0.500000  8.281246e+08
25%     

In [17]:
# Train test split
from sklearn.model_selection import train_test_split

# One hot encoding for user_id and movie_id, i.e. each user_id and movie_id is a feature

userId_to_index = {userId: index for index, userId in enumerate(data['userId'].unique())}
movieId_to_index = {movieId: index for index, movieId in enumerate(data['movieId'].unique())}

data['userId'] = data['userId'].map(userId_to_index)
data['movieId'] = data['movieId'].map(movieId_to_index)

# X = user_id, movie_id
X = data.drop('rating', axis=1).drop('timestamp', axis=1)
Y = data['rating']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

'Train size:', len(X_train), 'Test size:', len(X_test)

('Train size:', 90752, 'Test size:', 10084)

In [15]:
def matrix_factorization(rating_matrix, latent_size, steps=5000, alpha=0.0002, beta=0.02):
    num_users, num_movies = rating_matrix.shape
    print('Factorizing matrix of shape', num_users, num_movies, 'into', latent_size, 'latent features')
    P = np.random.rand(num_users, latent_size)
    Q = np.random.rand(num_movies, latent_size)
    
    for step in range(steps):
        for i in range(num_users):
            for j in range(num_movies):
                if rating_matrix[i, j] > 0:
                    eij = rating_matrix[i, j] - np.dot(P[i, :], Q[j, :])
                    for k in range(latent_size):
                        P[i, k] = P[i, k] + alpha * (2 * eij * Q[j, k] - beta * P[i, k])
                        Q[j, k] = Q[j, k] + alpha * (2 * eij * P[i, k] - beta * Q[j, k])
        e = 0
        for i in range(num_users):
            for j in range(num_movies):
                if rating_matrix[i, j] > 0:
                    e = e + pow(rating_matrix[i, j] - np.dot(P[i, :], Q[j, :]), 2)
                    for k in range(latent_size):
                        e = e + (beta/2) * (pow(P[i, k], 2) + pow(Q[j, k], 2))
        print('Step', step, 'error:', e)
        if e < 0.001:
            break
    return P, Q

rating_matrix = data.pivot(index='userId', columns='movieId', values='rating').fillna(0).values
P, Q = matrix_factorization(rating_matrix, 5, steps=100)

Factorizing matrix of shape 610 9724 into 5 latent features
Step 0 error: 452357.68517562107
Step 1 error: 347493.4762258143
Step 2 error: 282754.4417922424
Step 3 error: 241155.84800813167
Step 4 error: 213208.04486387532
Step 5 error: 193395.30834678918
Step 6 error: 178603.5416157448
Step 7 error: 167079.7158458306
Step 8 error: 157806.34561946007
Step 9 error: 150161.26420485866
Step 10 error: 143741.15860407325
Step 11 error: 138270.264487537
Step 12 error: 133551.66433593602
Step 13 error: 129439.96041103097
Step 14 error: 125825.01842164177
Step 15 error: 122621.73760011667
Step 16 error: 119763.2946061895
Step 17 error: 117196.50512615671
Step 18 error: 114878.54151298106
Step 19 error: 112774.55453546366
Step 20 error: 110855.91754973293
Step 21 error: 109098.91011856106
Step 22 error: 107483.7181906737
Step 23 error: 105993.66608274305
Step 24 error: 104614.62054578305
Step 25 error: 103334.52410220925
Step 26 error: 102143.02651033837
Step 27 error: 101031.19141961927
Step 2

In [None]:
predicted_ratings = np.dot(P, Q.T)

accuracy = 0

for i in range(len(X_test)):
    user_id = X_test.iloc[i]['userId']
    movie_id = X_test.iloc[i]['movieId']
    rating = Y_test.iloc[i]
    predicted_rating = predicted_ratings[user_id, movie_id]
    accuracy += abs(predicted_rating - rating)
    
accuracy = accuracy / len(X_test)
accuracy

0.9550925819710891

In [None]:
# Matrix factorization
# Objective: min sum (i, j obs) (W_ij - dot(U_i, V_j))^2 + lambda * sum (i, j not obs) dot(U_i, V_j)^2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class MatrixFactorization(nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = nn.Linear(n_users, n_factors, bias=True)
        self.item_factors = nn.Linear(n_items, n_factors, bias=True)
        self.bias = nn.Parameter(torch.zeros(1))
        
    def forward(self, users, items):
        # Objective: min sum (i, j obs) (W_ij - dot(U_i, V_j))^2 + lambda * sum (i, j not obs) dot(U_i, V_j)^2
        # W_ij = rating
        # U_i = user_factors[users]
        # V_j = item_factors[items]
        # dot(U_i, V_j) = dot(user_factors[users], item_factors[items])
        
        user = self.user_factors(users)
        item = self.item_factors(items)
        dot = (user @ item.T).sum(1)
        return  dot + self.bias
    
def RMSE(preds, y):
    return np.sqrt(((preds-y)**2).mean())

def fit(model, X_train, Y_train, X_test, Y_test, epochs=50, lr=0.001, wd=2e-5):
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    for epoch in range(epochs):
        model.train()
        
        # Shuffle X_train and Y_train
        idxs = np.random.permutation(len(X_train))
        X_train = X_train.iloc[idxs]
        Y_train = Y_train.iloc[idxs]
        
        def calc_loss(users, items, ratings):
            # One hot encoding for user and item
            users_one_hot = torch.zeros(len(users), n_users)
            users_one_hot[range(len(users)), users] = 1
            
            items_one_hot = torch.zeros(len(items), n_items)
            items_one_hot[range(len(items)), items] = 1            
            
            users = torch.FloatTensor(users_one_hot)
            items = torch.FloatTensor(items_one_hot)
            ratings = torch.FloatTensor(ratings)
            
            preds = model(users, items)
            return F.l1_loss(preds, ratings)
        
        train_loss = 0
        batch_size = 64
        for idx in trange(0, len(X_train), batch_size):
            users = X_train['userId'].values[idx:idx+batch_size]
            items = X_train['movieId'].values[idx:idx+batch_size]
            ratings = Y_train.values[idx:idx+batch_size]
            
            loss = calc_loss(users, items, ratings)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        model.eval()
        users = X_test['userId'].values
        items = X_test['movieId'].values
        ratings = Y_test.values
        loss = calc_loss(users, items, ratings)
        print(f'Epoch {epoch} train loss: {train_loss/len(X_train)} test loss: {loss.item() / len(X_test)}')
        
        
n_users = X['userId'].nunique()
n_items = X['movieId'].nunique()

model = MatrixFactorization(n_users, n_items, n_factors=20)
fit(model, X_train, Y_train, X_test, Y_test, epochs=5, lr=0.01, wd=1e-5)
fit(model, X_train, Y_train, X_test, Y_test, epochs=5, lr=0.001, wd=1e-5)

100%|██████████| 1418/1418 [00:05<00:00, 263.40it/s]


Epoch 0 train loss: 0.01875025497773092 test loss: 0.02112835071144573


100%|██████████| 1418/1418 [00:05<00:00, 252.43it/s]


Epoch 1 train loss: 0.019785074442257244 test loss: 0.0083146509505699


100%|██████████| 1418/1418 [00:05<00:00, 256.19it/s]


Epoch 2 train loss: 0.019820128419083163 test loss: 0.007604046692596641


100%|██████████| 1418/1418 [00:06<00:00, 218.63it/s]


Epoch 3 train loss: 0.020574794640194505 test loss: 0.0075739157289703984


100%|██████████| 1418/1418 [00:06<00:00, 209.85it/s]


Epoch 4 train loss: 0.020674905662223887 test loss: 0.005219828746754799


100%|██████████| 1418/1418 [00:06<00:00, 207.91it/s]


Epoch 0 train loss: 0.015233184163667503 test loss: 0.00513040920138028


100%|██████████| 1418/1418 [00:05<00:00, 249.44it/s]


Epoch 1 train loss: 0.013161224408314284 test loss: 0.005546506649442534


100%|██████████| 1418/1418 [00:05<00:00, 242.42it/s]


Epoch 2 train loss: 0.012507105677037885 test loss: 0.006067094799073525


100%|██████████| 1418/1418 [00:06<00:00, 220.10it/s]


Epoch 3 train loss: 0.012164746459746899 test loss: 0.005343550682446163


100%|██████████| 1418/1418 [00:06<00:00, 228.15it/s]


Epoch 4 train loss: 0.011933462933357411 test loss: 0.006046398061459521
