In [1]:
import pandas as pd
import numpy as np

import os
import glob
from datetime import date
import logging

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:

log_model = 'nmf'

today = date.today()
log_formatter = logging.Formatter("%(asctime)s %(message)s")
logger = logging.getLogger()

log_file_name = "./logs/{}_{}".format(today, log_model)

file_handler = logging.FileHandler("{}.log".format(log_file_name))
file_handler.setFormatter(log_formatter)
logger.addHandler(file_handler)
logger.setLevel(logging.DEBUG)
pd.read_csv('./ml-1m/ml-1m/ratings.dat').head()


Unnamed: 0,1::1193::5::978300760
0,1::661::3::978302109
1,1::914::3::978301968
2,1::3408::4::978300275
3,1::2355::5::978824291
4,1::1197::3::978302268


In [3]:
ratings_cols = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings = pd.read_csv('./ml-1m/ml-1m/ratings.dat', sep='::', engine='python', names=ratings_cols)
ratings

num_users = ratings.UserID.unique().shape[0]
num_items = ratings.MovieID.unique().shape[0]
print('no. users: %d, no. items: %d' %(num_users, num_items))

no. users: 6040, no. items: 3706


In [4]:

from torch.utils.data import Dataset, TensorDataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, users, items, y):
        self.x = torch.cat([
            torch.LongTensor(users).unsqueeze(0).transpose(0, 1),
            torch.LongTensor(items).unsqueeze(0).transpose(0, 1)
        ], axis=1)
        self.y = torch.FloatTensor(y)

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.y)

In [5]:


from sklearn import preprocessing
le1 = preprocessing.LabelEncoder()
le2 = preprocessing.LabelEncoder()

batch_size = 256

train_dataset = CustomDataset(
    le1.fit_transform(ratings.UserID),
    le2.fit_transform(ratings.MovieID),
    ratings.Rating.values
)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

In [6]:

class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=128):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)
        
    def forward(self, data):
        x, y = data[:,:1], data[:,1:]
        u, v = self.user_emb(x), self.item_emb(y)
        return (u.squeeze(1) * v.squeeze(1)).sum(1)


In [7]:

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print('Device : %s' % device)

Device : cuda


In [8]:
model = MF(num_users, num_items, emb_size=128).to(device)
print('MODEL:\n\n', model)

## optimizer, learning rate,  loss function을 설정
batch_size = 128
learning_rate = 0.0005
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss()

model.train()

MODEL:

 MF(
  (user_emb): Embedding(6040, 128)
  (item_emb): Embedding(3706, 128)
)


MF(
  (user_emb): Embedding(6040, 128)
  (item_emb): Embedding(3706, 128)
)

In [101]:
EPOCHS = 20

for e in range(EPOCHS):
    print('start : ' + str(e) + ' epoch')
    avg_loss = 0
    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        y_hat = model(x)
        loss = loss_fn(y_hat, y)
        avg_loss += loss.item()
        loss.backward()
        optimizer.step()
    torch.save(model.state_dict(), "./weights/{}_{}.pt".format(log_model, e + 1))
    print("e{}: loss: {}".format(e + 1, loss / (batch_idx + 1)))
    logger.info("e{}: loss: {}".format(e + 1, loss / (batch_idx + 1)))
print('training complete')


start : 0 epoch
e1: loss: 8.820590301183984e-05
start : 1 epoch
e2: loss: 0.000125005841255188
start : 2 epoch
e3: loss: 7.351280510192737e-05
start : 3 epoch
e4: loss: 6.227856647456065e-05
start : 4 epoch
e5: loss: 2.1203024516580626e-05
start : 5 epoch
e6: loss: 1.811356924008578e-05
start : 6 epoch
e7: loss: 2.495392800483387e-05
start : 7 epoch
e8: loss: 1.6609546946710907e-05
start : 8 epoch
e9: loss: 2.148180647054687e-05
start : 9 epoch
e10: loss: 1.645660995563958e-05
start : 10 epoch
e11: loss: 1.1157949302287307e-05
start : 11 epoch
e12: loss: 2.0137691535637714e-05
start : 12 epoch
e13: loss: 1.1393317436159123e-05
start : 13 epoch
e14: loss: 1.1999186426692177e-05
start : 14 epoch
e15: loss: 1.1303659448458347e-05
start : 15 epoch
e16: loss: 2.3687256543780677e-05
start : 16 epoch
e17: loss: 1.3061499885225203e-05
start : 17 epoch
e18: loss: 1.1610450201260392e-05
start : 18 epoch
e19: loss: 1.4820739124843385e-05
start : 19 epoch
e20: loss: 2.697329910006374e-05
training 

In [11]:
model.eval()

MF(
  (user_emb): Embedding(6040, 128)
  (item_emb): Embedding(3706, 128)
)

In [12]:
ratings['le_UserID'] = le1.fit_transform(ratings.UserID)
ratings['le_MovieID'] = le1.fit_transform(ratings.MovieID)

In [13]:
val_tensor = torch.LongTensor(ratings[['le_UserID', 'le_MovieID']].values).to(device)
pred = model(torch.LongTensor(ratings[['le_UserID', 'le_MovieID']].values).to(device))

In [14]:
ratings['pred_Rating'] = pred.to('cpu').detach().numpy()