In [100]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [101]:
ratings = pd.read_csv('../ml-25m/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [102]:
# 사용하지 않는 timestamp 열 drop
ratings = ratings.drop(columns='timestamp')

# embedding 구현, 접근의 편의성을 위해 label encoding 수행
movie_to_index = {movie_id: i for i, movie_id in enumerate(ratings['movieId'].unique())}
user_to_index = {user_id: i for i, user_id in enumerate(ratings['userId'].unique())}
ratings['movieId'] = ratings['movieId'].map(movie_to_index)
ratings['userId'] = ratings['userId'].map(user_to_index)

In [103]:
# train, test을 나눠줌
test_ratings = ratings.groupby('userId', group_keys=False).sample(5)
train_ratings = ratings.drop(test_ratings.index)

In [104]:
# 유저별로 5개의 정보를 사용하기 때문에 train과 test의 movie 종류가 달라질 수 있음
not_included_movies = set(test_ratings['movieId']) - set(train_ratings['movieId'])

# 적어도 train set에는 모든 movie가 존재해야하기 때문에 train에 없는 movie를 test에서 drop
indexes = []
for index, row in test_ratings.iterrows():
    if not not_included_movies:
        break
    if row['movieId'] in not_included_movies:
       indexes.append(index)
       not_included_movies.remove(row['movieId'])

test_ratings = test_ratings.drop(indexes)

In [105]:
train_ratings = train_ratings.reset_index(drop=True)
test_ratings = test_ratings.reset_index(drop=True)

In [106]:
class MFDataset(Dataset):
    def __init__(self, ratings):
        self.X = ratings[['userId', 'movieId']].to_numpy()
        self.y = ratings['rating'].to_numpy()
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        X = torch.IntTensor(self.X[index])
        y = torch.FloatTensor([self.y[index]])
        return X, y

In [108]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size = 64

train_dataset = MFDataset(train_ratings)
test_dataset = MFDataset(test_ratings)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [109]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        pred = model(X[:, 0], X[:, 1])
        loss = loss_fn(pred, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if batch % 100 == 0:
            print(f'Batch {batch}, Loss: {loss.item()}')

In [110]:
class MF(nn.Module):
    def __init__(self, num_user, num_movie, k):
        super().__init__()
        self.num_user = num_user
        self.num_movie = num_movie
        self.user_embed = nn.Embedding(num_user, k)
        self.movie_embed = nn.Embedding(num_movie, k)
        self.b_u = nn.Embedding(self.num_user, 1)
        self.b_i = nn.Embedding(self.num_movie, 1)
    
    def forward(self, user, movie):
        p = self.user_embed(user)
        q = self.movie_embed(movie)
        b_u = self.b_u(user)
        b_i = self.b_i(movie)
            
        # pred = p @ q.T# + self.b_u + self.b_i + self.b
        pred = (p * q).sum(axis=1) + np.squeeze(b_u) + np.squeeze(b_i)
        return pred.view(-1)

In [95]:
epochs = 1

num_user = train_ratings['userId'].nunique()
num_movie = train_ratings['movieId'].nunique()
model = MF(num_user, num_movie, 20)

loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train(train_dataloader, model, loss_fn, optimizer)

  return F.mse_loss(input, target, reduction=self.reduction)


Batch 0, Loss: 52.019561767578125
Batch 100, Loss: 39.3658447265625
Batch 200, Loss: 33.54312515258789
Batch 300, Loss: 45.118221282958984
Batch 400, Loss: 27.620771408081055
Batch 500, Loss: 32.43029022216797
Batch 600, Loss: 25.66817855834961
Batch 700, Loss: 40.27180099487305
Batch 800, Loss: 30.62614631652832
Batch 900, Loss: 53.554386138916016
Batch 1000, Loss: 30.346389770507812
Batch 1100, Loss: 25.20223617553711
Batch 1200, Loss: 30.897024154663086
Batch 1300, Loss: 24.994033813476562
Batch 1400, Loss: 19.414941787719727
Batch 1500, Loss: 27.988054275512695
Batch 1600, Loss: 31.421306610107422
Batch 1700, Loss: 20.330434799194336
Batch 1800, Loss: 18.336257934570312
Batch 1900, Loss: 30.416296005249023
Batch 2000, Loss: 31.884023666381836
Batch 2100, Loss: 40.88953399658203
Batch 2200, Loss: 38.25410842895508
Batch 2300, Loss: 38.44101333618164
Batch 2400, Loss: 24.9813232421875
Batch 2500, Loss: 44.39422607421875


[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 