# Matrix Factorization

In [1]:
import torch
import pandas as pd
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch import nn
from torch.utils.data import Dataset, DataLoader

## Load MovieLens Dataset

In [2]:
class MovieLensDataset(Dataset):
    """Ratings file exposed as a PyTorch ``Dataset`` of (user, item, rating).

    The file is read as tab-separated text without a header row; its four
    columns are named ``user``, ``movie``, ``rating`` and ``timestamp``.
    Ids are used as-is (no re-indexing), so they double as row indices into
    any table sized with :meth:`get_datasize`.
    """

    def __init__(self, datapath):
        """Load ``datapath`` and cache the id and rating columns as tensors.

        Args:
            datapath: location of the tab-separated ratings file, passed
                straight to ``pandas.read_csv``.
        """
        self.data_pd = pd.read_csv(datapath, sep="\t", names=["user", "movie", "rating", "timestamp"])
        # Convert through NumPy with an explicit dtype rather than handing a
        # pandas Series to the legacy typed constructors. torch.tensor copies,
        # so the tensors never alias the DataFrame's buffers.
        self.items = torch.tensor(self.data_pd["movie"].to_numpy(), dtype=torch.long)
        self.users = torch.tensor(self.data_pd["user"].to_numpy(), dtype=torch.long)
        self.ratings = torch.tensor(self.data_pd["rating"].to_numpy(), dtype=torch.float32)

    def __len__(self):
        """Return the number of ratings in the file."""
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return the ``(user, item, rating)`` triple stored at ``idx``."""
        return self.users[idx], self.items[idx], self.ratings[idx]

    def get_datasize(self):
        """Return ``(max user id + 1, max item id + 1, number of ratings)``.

        The first two values are plain Python ints, so they can be used
        directly as tensor dimensions; the ``+ 1`` makes the largest id a
        valid row index.
        """
        return int(self.users.max()) + 1, int(self.items.max()) + 1, len(self.ratings)

In [3]:
# Mini-batch size shared by both loaders.
batch_size = 128

# Training split: reshuffled on every pass over the loader.
train_data = MovieLensDataset("../datasets/MovieLens/ml-100k/ua.base")
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Test split: iterated in file order.
test_data = MovieLensDataset("../datasets/MovieLens/ml-100k/ua.test")
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

## Initialization

In [4]:
# Size the factor matrices from BOTH splits: an id that only occurs in the
# test file would otherwise index past the end of U/V during evaluation.
n_users_train, n_items_train, n_ratings = train_data.get_datasize()
n_users_test, n_items_test, n_ratings_test = test_data.get_datasize()
n_users = max(n_users_train, n_users_test)
n_items = max(n_items_train, n_items_test)

In [5]:
class MF(nn.Module):
    """Matrix-factorization rating model.

    Holds one ``rank``-dimensional factor row per user (``U``) and per item
    (``V``), both initialised from a standard normal distribution. The
    predicted rating is the dot product of the selected rows.
    """

    def __init__(self, num_users, num_items, rank = 10):
        """Allocate the ``(num_users, rank)`` and ``(num_items, rank)`` factors."""
        super().__init__()
        # User factors are drawn before item factors.
        user_factors = torch.randn(num_users, rank)
        item_factors = torch.randn(num_items, rank)
        self.U = nn.Parameter(user_factors)
        self.V = nn.Parameter(item_factors)

    def forward(self, users, items):
        """Return one predicted rating per ``(users[k], items[k])`` pair."""
        user_vecs = self.U[users]
        item_vecs = self.V[items]
        # Row-wise dot product: multiply element-wise, then sum the last axis.
        return (user_vecs * item_vecs).sum(dim=-1)

In [6]:
# Squared-error objective on the predicted ratings.
criterion = nn.MSELoss()

# Rank-16 factorization, optimised with Adam at a learning rate of 0.01.
mf_model = MF(num_users=n_users, num_items=n_items, rank=16)
optimizer = torch.optim.Adam(params=mf_model.parameters(), lr=1e-2)

## Training

In [7]:
# Number of full passes over the training split.
n_epochs = 20

for epoch in range(n_epochs):
    # ---- optimisation pass over the training split ----
    # train()/eval() are no-ops for a model made only of parameters, but keep
    # the loop correct if mode-dependent layers are added later.
    mf_model.train()
    cost = 0
    for users, items, ratings in train_loader:
        optimizer.zero_grad()
        ratings_pred = mf_model(users, items)
        loss = criterion(ratings_pred, ratings)
        loss.backward()
        optimizer.step()
        # loss.item() is the mean over the batch, so weight it by the batch
        # size to accumulate a sum of squared errors.
        cost += loss.item() * len(ratings)

    # Per-rating mean over the whole training split.
    cost /= n_ratings

    print(f"Epoch : {epoch}")
    print("train cost : {:.6f}".format(cost))

    # ---- evaluation pass over the test split ----
    mf_model.eval()
    cost_test = 0
    # No gradients are needed here; disabling autograd avoids building a
    # graph for every test batch.
    with torch.no_grad():
        for users, items, ratings in test_loader:
            ratings_pred = mf_model(users, items)
            loss = criterion(ratings_pred, ratings)
            cost_test += loss.item() * len(ratings)

    cost_test /= n_ratings_test
    print("test cost : {:.6f}".format(cost_test))

Epoch : 0
train cost : 20.449340
test cost : 15.940851
Epoch : 1
train cost : 5.265995
test cost : 5.084339
Epoch : 2
train cost : 1.514143
test cost : 2.958698
Epoch : 3
train cost : 1.038371
test cost : 2.336312
Epoch : 4
train cost : 0.897178
test cost : 2.090560
Epoch : 5
train cost : 0.836654
test cost : 1.972741
Epoch : 6
train cost : 0.804020
test cost : 1.896026
Epoch : 7
train cost : 0.778056
test cost : 1.903479
Epoch : 8
train cost : 0.758061
test cost : 1.847904
Epoch : 9
train cost : 0.738624
test cost : 1.856937
Epoch : 10
train cost : 0.717216
test cost : 1.784285
Epoch : 11
train cost : 0.698643
test cost : 1.847141
Epoch : 12
train cost : 0.682977
test cost : 1.798472
Epoch : 13
train cost : 0.668119
test cost : 1.804359
Epoch : 14
train cost : 0.655392
test cost : 1.823638
Epoch : 15
train cost : 0.645053
test cost : 1.792940
Epoch : 16
train cost : 0.635308
test cost : 1.818759
Epoch : 17
train cost : 0.629466
test cost : 1.810498
Epoch : 18
train cost : 0.619851
tes