In [1]:
import numpy as np
from collections import defaultdict

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [2]:
class CTRDataset(Dataset):
    def __init__(self, data_path, feat_mapper=None, defaults=None,
                 min_threshold=4, num_feat=34):
        self.NUM_FEATS, self.count, self.data = num_feat, 0, {}
        feat_cnts = defaultdict(lambda: defaultdict(int))
        self.feat_mapper, self.defaults = feat_mapper, defaults
        self.field_dims = np.zeros(self.NUM_FEATS, dtype=np.int64)
        with open(data_path) as f:
            for line in f:
                instance = {}
                values = line.rstrip('\n').split('\t')
                if len(values) != self.NUM_FEATS + 1:
                    continue
                label = np.float32([0, 0])
                label[int(values[0])] = 1
                instance['y'] = [np.float32(values[0])]
                for i in range(1, self.NUM_FEATS + 1): # 가장 첫번째는 0,1 값
                    feat_cnts[i][values[i]] += 1
                    instance.setdefault('x', []).append(values[i])
                self.data[self.count] = instance
                self.count = self.count + 1
        if self.feat_mapper is None and self.defaults is None:
            feat_mapper = {i: {feat for feat, c in cnt.items() if c >=
                               min_threshold} for i, cnt in feat_cnts.items()}
            self.feat_mapper = {i: {feat_v: idx for idx, feat_v in enumerate(feat_values)}
                                for i, feat_values in feat_mapper.items()}
            self.defaults = {i: len(feat_values) for i, feat_values in feat_mapper.items()}
        for i, fm in self.feat_mapper.items():
            self.field_dims[i - 1] = len(fm) + 1
        self.offsets = np.array((0, *np.asarray(np.cumsum(self.field_dims))
                                 [:-1]))

    def __len__(self):
        return self.count

    def __getitem__(self, idx):
        feat = np.array([self.feat_mapper[i + 1].get(v, self.defaults[i + 1])
                         for i, v in enumerate(self.data[idx]['x'])])
        return feat + self.offsets, self.data[idx]['y']

In [4]:
class FM(nn.Module):
    def __init__(self, field_dims, num_factors):
        super().__init__()
        num_inputs = int(sum(field_dims))
        self.embedding = nn.Embedding(num_inputs, num_factors)
        self.fc = nn.Embedding(num_inputs, 1)
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        square_of_sum = torch.sum(self.embedding(x), dim=1) ** 2
        sum_of_square = torch.sum(self.embedding(x) ** 2, dim=1)
        output = self.bias + torch.sum(self.fc(x), dim=1) \
            + 0.5 * (square_of_sum - sum_of_square).sum(dim=1, keepdims=True)
        output = torch.sigmoid(output)
        return output

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_factors = 20
lr = 0.02
batch_size = 2048
wd = 1e-5

num_epochs = 30

In [6]:
train_data = CTRDataset("/home/doyeon/yeons/d2l-recsys/d2l-RecSys/ctr/train.csv")
test_data = CTRDataset("/home/doyeon/yeons/d2l-recsys/d2l-RecSys/ctr/train.csv", 
                       feat_mapper=train_data.feat_mapper,
                       defaults=train_data.defaults)

train_iter = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_iter = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [7]:
model = FM(train_data.field_dims, num_factors).to(device)
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

In [8]:
for epoch in range(num_epochs):
    train_loss = []
    model.train()
    
    for i, values in enumerate(train_iter):
        optimizer.zero_grad()
        
        field_dims = values[0].long().to(device)
        label = values[1][0].float().to(device).unsqueeze(1)
        
        preds = model(field_dims)
        loss = loss_fn(preds, label)
        loss.backward()
        optimizer.step()
        
        train_loss.append(loss.detach().cpu().item())
        
    test_loss = []
    model.eval()
    with torch.no_grad():
        for i, values in enumerate(test_iter):
            field_dims = values[0].long().to(device)
            label = values[1][0].float().to(device).unsqueeze(1)
            
            preds = model(field_dims)
            loss = loss_fn(preds, label)
            test_loss.append(loss.detach().cpu().item())


    print(f"{epoch} Epochs")
    print(f"train_loss: {np.mean(train_loss):.3f}")
    print(f"test_loss: {np.mean(test_loss):.3f}")
    print("\n")

0 Epochs
train_loss: 43.055
test_loss: 47.506


1 Epochs
train_loss: 46.947
test_loss: 41.475


2 Epochs
train_loss: 28.742
test_loss: 20.317


3 Epochs
train_loss: 22.032
test_loss: 20.261


4 Epochs
train_loss: 13.866
test_loss: 11.644


5 Epochs
train_loss: 11.311
test_loss: 7.555


6 Epochs
train_loss: 7.149
test_loss: 6.992


7 Epochs
train_loss: 5.867
test_loss: 4.992


8 Epochs
train_loss: 4.934
test_loss: 4.349


9 Epochs
train_loss: 4.228
test_loss: 4.220


10 Epochs
train_loss: 3.940
test_loss: 3.795


11 Epochs
train_loss: 3.667
test_loss: 3.681


12 Epochs
train_loss: 3.568
test_loss: 3.485


13 Epochs
train_loss: 3.485
test_loss: 3.428


14 Epochs
train_loss: 3.283
test_loss: 3.313


15 Epochs
train_loss: 3.211
test_loss: 3.236


16 Epochs
train_loss: 3.203
test_loss: 3.162


17 Epochs
train_loss: 3.143
test_loss: 3.097


18 Epochs
train_loss: 3.006
test_loss: 3.057


19 Epochs
train_loss: 2.979
test_loss: 3.017


20 Epochs
train_loss: 2.972
test_loss: 2.993


21 Epochs
tr