In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [2]:
names = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv(os.path.join('ml-100k/u.data'), '\t', names=names, engine='python')
print(data.head(5))

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


In [3]:
def read_data_ml100k():
    names = ['user_id', 'item_id', 'rating', 'timestamp']
    data = pd.read_csv(os.path.join('ml-100k/u.data'), '\t', names=names, engine='python')
    num_users = data.user_id.unique().shape[0]
    num_items = data.item_id.unique().shape[0]
    return data, num_users, num_items

In [4]:
def split_data_ml100k(data, num_users, num_items, split_mode='random', test_ratio=0.1):
    if split_mode == 'seq-aware':
        train_items, test_items, train_list = {}, {}, []
        for line in data.itertuples():
            u, i, rating, time = line[1], line[2], line[3], line[4]
            train_items.setdefault(u, []).append((u, i, rating, time))
            if u not in test_items or test_items[u][-1] < time:
                test_items[u] = (i, rating, time)
        for u in range(1, num_users + 1):
            train_list.extend(sorted(train_items[u], key=lambda k: k[3]))
        test_data = [(key, *value) for key, value in test_items.items()]
        train_data = [item for item in train_list if item not in test_data]
        train_data = pd.DataFrame(train_data)
        test_data = pd.DataFrame(test_data)
    else:
        mask = [True if x == 1 else False for x in np.random.uniform( 0, 1, (len(data))) < 1 - test_ratio]
        neg_mask = [not x for x in mask]
        train_data, test_data = data[mask], data[neg_mask]
    return train_data, test_data

In [5]:
def load_data_ml100k(data, num_users, num_items, feedback='explicit'):
    users, items, scores = [], [], []
    inter = np.zeros((num_items, num_users)) if feedback == 'explicit' else {}
    for line in data.itertuples():
        user_index, item_index = int(line[1] - 1), int(line[2] - 1)
        score = int(line[3]) if feedback == 'explicit' else 1
        users.append(user_index)
        items.append(item_index)
        scores.append(score)
        if feedback == 'implicit':
            inter.setdefault(user_index, []).append(item_index)
        else:
            inter[item_index, user_index] = score
    return users, items, scores, inter

In [6]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

In [7]:
class movieDataset(Dataset):
    def __init__(self, train_u, train_i, train_r):
        
        self.train_u = np.array(train_u)
        self.train_i = np.array(train_i)
        self.train_r = np.array(train_r)
        self.length = len(train_u)
        
    def __getitem__(self, index):
        
#         u = torch.LongTensor(self.train_u[index])
#         i = torch.LongTensor(self.train_i[index])
#         r = torch.LongTensor(self.train_r[index])
        
        u = self.train_u[index]
        i = self.train_i[index]
        r = self.train_r[index]
        
        return u, i, r

    def __len__(self):

        return self.length

In [8]:
split_mode='seq-aware'
feedback='explicit'
test_ratio=0.1
batch_size=256
    
data, num_users, num_items = read_data_ml100k()

train_data, test_data = split_data_ml100k(data, num_users, num_items, split_mode, test_ratio)

train_u, train_i, train_r, _ = load_data_ml100k(train_data, num_users, num_items, feedback)
test_u, test_i, test_r, _ = load_data_ml100k(test_data, num_users, num_items, feedback)

train_set = movieDataset(train_u, train_i, train_r)
test_set = movieDataset(test_u, test_i, test_r)

train_iter = DataLoader(train_set, shuffle=True, batch_size=batch_size)
test_iter = DataLoader(test_set, batch_size=batch_size)
    


In [9]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch

deepfm : https://d2l.ai/chapter_recommender-systems/deepfm.html

In [10]:
train_set.train_u

array([  0,   0,   0, ..., 942, 942, 942])

In [11]:
train_set.train_i

array([167, 171, 164, ..., 228, 229, 227])

In [12]:
train_set.train_r

array([5, 5, 5, ..., 2, 1, 3])

In [13]:
for i,j,k in train_iter:
    print(i.size(),j.size(),k.size())
    print(i,j,k)
    break

torch.Size([256]) torch.Size([256]) torch.Size([256])
tensor([797, 728, 787, 247, 760, 647, 570, 373, 534, 633,   0, 136, 357, 641,
        804, 174, 895, 484, 591, 598, 486, 390, 333, 505,  12, 266, 434,  12,
        542, 822, 281, 189, 202, 222, 185,  74,  57, 326, 268, 377, 502, 850,
         47, 839, 822, 835, 547, 267, 290,  39, 362, 822, 289, 391, 709, 660,
        804, 275, 592, 891, 176, 505, 465, 677,  12, 150, 312, 384, 738, 360,
        397, 547, 383,  22, 536, 683, 502, 377, 592,  12, 312, 762, 360, 710,
        587, 612, 652,  57, 542, 398, 421, 891, 415, 587, 233, 486, 275,  89,
        325, 226, 274,  82, 797,  50, 594, 485, 626, 164,  25, 346, 218, 250,
        811, 928, 863,  57, 384, 722, 129, 789,  94, 313, 565, 765, 416, 690,
        119,  49, 928, 489, 373, 689, 738, 129, 155, 550, 327, 877, 415, 261,
        488, 307, 305, 862, 935,   4,   0, 536, 473, 270, 405, 358, 307, 797,
        475, 765, 353, 311, 318, 268, 207, 455, 602, 896, 486,  40, 511, 381,
        47

In [61]:
class MF(nn.Module):
    def __init__(self, num_factors, num_users, num_items, **kwargs):
        super().__init__()
        self.P = nn.Embedding(num_users, num_factors)
        self.Q = nn.Embedding(num_items, num_factors)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)

    def forward(self, user_id, item_id):
        P_u = self.P(user_id)
        Q_i = self.Q(item_id)
        b_u = self.user_bias(user_id)
        b_i = self.item_bias(item_id)
        
        outputs = (P_u * Q_i).sum(axis=1) + np.squeeze(b_u) + np.squeeze(b_i)
        outputs =  outputs.flatten()

        return outputs
    
    def get_score(self, x, y):
        user_id = torch.Tensor(np.array([x])).type(torch.LongTensor).cuda()
        item_id = torch.Tensor(np.array([y])).type(torch.LongTensor).cuda()
        P_u = self.P(user_id)
        Q_i = self.Q(item_id)
        b_u = self.user_bias(user_id)
        b_i = self.item_bias(item_id)
        
        outputs = (P_u * Q_i).sum(axis=1) + np.squeeze(b_u) + np.squeeze(b_i)
        outputs =  outputs.flatten()

        return outputs

In [62]:
model = MF(5, num_users, num_items).cuda()
optimizer = optim.Adam(model.parameters(), lr=0.02)
loss_fn = torch.nn.MSELoss()

In [63]:
model

MF(
  (P): Embedding(943, 5)
  (Q): Embedding(1682, 5)
  (user_bias): Embedding(943, 1)
  (item_bias): Embedding(1682, 1)
)

In [64]:
epoch = 20

for e in range(epoch):

    for i,j,k in train_iter:
        
        optimizer.zero_grad()
        
        output = model(i.type(torch.LongTensor).cuda(), j.type(torch.LongTensor).cuda())

        loss = loss_fn(output,k.type(torch.FloatTensor).cuda())
        
        loss.backward()

        optimizer.step()
   
    print(e + 1, float(loss))

1 1.9258922338485718
2 1.0360578298568726
3 0.9582760334014893
4 0.9022576808929443
5 0.7426514029502869
6 0.8963865041732788
7 0.7419942617416382
8 0.7975097298622131
9 0.7570343017578125
10 0.688805341720581
11 0.8349044322967529
12 0.6888463497161865
13 0.8100711703300476
14 0.6979689598083496
15 0.8512730002403259
16 0.7314725518226624
17 0.8113701939582825
18 0.6908893585205078
19 0.8254406452178955
20 0.761676013469696


In [65]:
scores = model.get_score(20,30)
scores

tensor([3.3228], device='cuda:0', grad_fn=<AddBackward0>)

In [66]:
scores = model.get_score(20,50)
scores

tensor([2.7222], device='cuda:0', grad_fn=<AddBackward0>)

In [68]:
average_loss = 0.0
count = 0

for i,j,k in test_iter:
    output = model(i.type(torch.LongTensor).cuda(), j.type(torch.LongTensor).cuda())
    
    loss = loss_fn(output,k.type(torch.FloatTensor).cuda())
    count+=1
    average_loss+=float(loss)
    
print(average_loss/count)

1.2432507872581482
