# Import Slice of Data

In [79]:
import pandas as pd
import numpy as np
import pprint

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x241efba91d0>

In [80]:
data_path = 'mathorcup_recom_listwise/data/'

[pandas.read_csv](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html)

In [81]:
test_df = pd.read_csv(data_path + 'test_data.csv', dtype=str, nrows=2)
test_df

Unnamed: 0,requestID,userID,date,time,sequence
0,719708291_1635480753679_2960,1439416582,20211029,12,509057416;133681226775;509178914;509178914;508...
1,679978594_1635491281923_1780,1359957188,20211029,15,508829941;133686019323;508830405;133677444707;...


In [82]:
train_df = pd.read_csv(data_path + 'train_data.csv', dtype=str, nrows=2)
train_df

Unnamed: 0,userID,requestID,date,time,sequence
0,1000014754,500007377_1635422685108_3822,20211028,20,133669542676:1:148;133658378700:1:16;133650937...
1,1000019906,500009953_1635375063077_3893,20211028,6,133679233276:0:0;133658338671:0:0;133677846615...


# Embedding

[WORD EMBEDDINGS: ENCODING LEXICAL SEMANTICS](https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html)

``` python
word_to_ix = {"hello": 0, "world": 1}
embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)
hello_embed = embeds(lookup_tensor)
print(hello_embed)
```

Out:
```
tensor([[ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519]],
       grad_fn=<EmbeddingBackward0>)
```

## UserID Embedding

In [83]:
train_userID = set(train_df['userID'])
train_userID

{'1000014754', '1000019906'}

In [84]:
userID2idx = {_:i for i, _ in enumerate(train_userID)}
userID2idx

{'1000014754': 0, '1000019906': 1}

In [85]:
userID_embeds = nn.Embedding(len(userID2idx), 4)
userID_embeds

Embedding(2, 4)

## Item ID Embedding

In [86]:
def icd2i(icd):
    i, c, d = icd.split(':')
    return i


def seq2itemID(sequence):
    """
    :param sequence: {str}, e.g. '133679233276:0:0;133658338671:0:0;133677846615:0:0'
    :return:
    """
    return {icd2i(icd) for icd in sequence.split(';')}

[How can I find the union on a list of sets in Python? [duplicate]](https://stackoverflow.com/a/31253153/12224183)

In [87]:
train_itemID = set.union(*[seq2itemID(seq) for seq in train_df['sequence']])
train_itemID

{'133650937891',
 '133658338671',
 '133658378700',
 '133658512070',
 '133660220952',
 '133660292493',
 '133663959878',
 '133665337307',
 '133669542676',
 '133673154438',
 '133677667554',
 '133677842247',
 '133677846615',
 '133678000841',
 '133679233276',
 '506770575',
 '506898339',
 '507531461',
 '507570279',
 '507605194'}

In [92]:
itemID2idx = {_:i for i, _ in enumerate(train_itemID)}
itemID2idx

{'133658512070': 0,
 '506898339': 1,
 '133669542676': 2,
 '507570279': 3,
 '133660220952': 4,
 '133658378700': 5,
 '133665337307': 6,
 '133650937891': 7,
 '507531461': 8,
 '133677842247': 9,
 '133663959878': 10,
 '133677846615': 11,
 '133677667554': 12,
 '133678000841': 13,
 '506770575': 14,
 '133673154438': 15,
 '507605194': 16,
 '133660292493': 17,
 '133679233276': 18,
 '133658338671': 19}

In [93]:
itemID_embeds = nn.Embedding(len(itemID2idx), 4)
itemID_embeds

Embedding(20, 4)

# Dataset

[torch.utils.data.Dataset(*args, **kwds)](https://pytorch.org/docs/stable/data.html?highlight=dataset#torch.utils.data.Dataset)

In [94]:
from torch.utils.data import Dataset

In [360]:
def icd2dict(icd):
    i, c, d = icd.split(':')
    return {'itemID': i, 'clicked': bool(eval(c)), 'duration': eval(d)}


class Sequence:
    def __init__(self, sequence):
        """
        :param sequence: {str}, e.g. '133679233276:0:0;133658338671:0:0;133677846615:0:0'
        :return:
        """
        self.sequence = [icd2dict(icd) for icd in sequence.split(';')]
        self.length = len(self.sequence)
        self.avg_clicked = np.mean([_['clicked'] for _ in self.sequence])
        self.sum_duration = np.sum([_['duration'] for _ in self.sequence])
        self.avg_duration = self.sum_duration / self.length
    def seq_print(self):
        pprint.pprint(self.sequence)
        print('length of sequence:', self.len_sequence)
        print('average clicked:', self.avg_clicked)
        print('average duration:', self.avg_duration)
    def __len__(self):
        return self.length

In [410]:
class TrainDataset(Dataset):
    def __init__(self, df, userID2idx, itemID2idx):
        self.length = len(df)
        
        self.userID2idx = userID2idx
        self.itemID2idx = itemID2idx
    
        self.userLen = len(userID2idx)
        self.itemLen = len(itemID2idx)
        
        self.userID, self.requestID = df['userID'], df['requestID']  # string
        self.userIdx = torch.tensor([userID2idx[_] for _ in self.userID], dtype=torch.int32)  # {Tensor: (len(df),)}
        
        # self.date = torch.tensor(df.astype({'date': 'int32'})['date'])  # e.g. 20220106
        # self.time = torch.tensor(df.astype({'time': 'int8'})['time'])  # range in [00, 23]
        
        self.date = torch.zeros([len(df), 6], dtype=torch.int16)
        for _ in range(len(df)):
            self.date[_, 0] = int(df.loc[_, 'time'])  # hour
            date = df.loc[_, 'date']
            self.date[_, 3] = int(date[:4])  # year
            self.date[_, 4] = int(date[4:6])  # month
            self.date[_, 5] = int(date[6:8])  # day
            
        self.sequence = [Sequence(_) for _ in df['sequence']]
        self.max_sum_duration = max([_.sum_duration for _ in self.sequence])
    def __len__(self):
        return self.length
    def __getitem__(self, idx):
        # udt = torch.tensor([self.userIdx[idx], self.date[idx], self.time[idx]], dtype=torch.int32)
        userIdx = self.userIdx[idx]
        date = self.date[idx]
        sequence = self.sequence[idx]
        itemID = torch.tensor([self.itemID2idx[_['itemID']] for _ in self.sequence[idx].sequence], dtype=torch.int32)
        duration = torch.tensor([_['duration'] for _ in self.sequence[idx].sequence], dtype=torch.int32)
        return userIdx, date, itemID, duration, torch.tensor(len(sequence))

In [413]:
train_df

Unnamed: 0,userID,requestID,date,time,sequence
0,1000014754,500007377_1635422685108_3822,20211028,20,133669542676:1:148;133658378700:1:16;133650937...
1,1000019906,500009953_1635375063077_3893,20211028,6,133679233276:0:0;133658338671:0:0;133677846615...


In [415]:
train_dataset = TrainDataset(train_df, userID2idx, itemID2idx)

In [416]:
train_dataset.max_sum_duration

853

In [364]:
train_dataset[0]

(tensor(0, dtype=torch.int32),
 tensor([  20,    0,    0, 2021,   10,   28], dtype=torch.int16),
 tensor([ 2,  5,  7, 10,  1,  0,  4, 14, 15, 17, 17], dtype=torch.int32),
 tensor([148,  16,  85, 221,   0, 101,  60,   0, 102,   0, 120],
        dtype=torch.int32),
 tensor(11))

In [365]:
train_dataset[1]

(tensor(1, dtype=torch.int32),
 tensor([   6,    0,    0, 2021,   10,   28], dtype=torch.int16),
 tensor([18, 19, 11, 12, 16,  6,  9,  8,  3, 13], dtype=torch.int32),
 tensor([  0,   0,   0,   0, 113,   0, 251,   0,   0,   0], dtype=torch.int32),
 tensor(10))

In [366]:
train_dataset.__dict__

{'length': 2,
 'userID2idx': {'1000014754': 0, '1000019906': 1},
 'itemID2idx': {'133658512070': 0,
  '506898339': 1,
  '133669542676': 2,
  '507570279': 3,
  '133660220952': 4,
  '133658378700': 5,
  '133665337307': 6,
  '133650937891': 7,
  '507531461': 8,
  '133677842247': 9,
  '133663959878': 10,
  '133677846615': 11,
  '133677667554': 12,
  '133678000841': 13,
  '506770575': 14,
  '133673154438': 15,
  '507605194': 16,
  '133660292493': 17,
  '133679233276': 18,
  '133658338671': 19},
 'userLen': 2,
 'itemLen': 20,
 'userID': 0    1000014754
 1    1000019906
 Name: userID, dtype: object,
 'requestID': 0    500007377_1635422685108_3822
 1    500009953_1635375063077_3893
 Name: requestID, dtype: object,
 'userIdx': tensor([0, 1], dtype=torch.int32),
 'date': tensor([[  20,    0,    0, 2021,   10,   28],
         [   6,    0,    0, 2021,   10,   28]], dtype=torch.int16),
 'sequence': [<__main__.Sequence at 0x24181a4af70>,
  <__main__.Sequence at 0x24181a2a130>]}

# Dataloader

[torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None, num_workers=0, collate_fn=None, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None, multiprocessing_context=None, generator=None, *, prefetch_factor=2, persistent_workers=False)](https://pytorch.org/docs/stable/data.html?highlight=dataset#torch.utils.data.DataLoader)

[Guidelines for assigning num_workers to DataLoader](https://discuss.pytorch.org/t/guidelines-for-assigning-num-workers-to-dataloader/813)

[How to use 'collate_fn' with dataloaders?](https://stackoverflow.com/a/65875359/12224183)
collate
英 [kəˈleɪt]  美 [kəˈleɪt] 
vt. 核对，校对；校勘

[reshaping a tensor with padding in pytorch](https://stackoverflow.com/a/53126241/12224183)

Examples:
1. [Custom datasets in Pytorch — Part 2. Text (Machine Translation)
](https://towardsdatascience.com/custom-datasets-in-pytorch-part-2-text-machine-translation-71c41a3e994e)

In [367]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [368]:
source = torch.rand((5))
source

tensor([0.2780, 0.8161, 0.5097, 0.2991, 0.7889])

In [369]:
result = F.pad(input=source, pad=(0, 2), mode='constant', value=0)
result

tensor([0.2780, 0.8161, 0.5097, 0.2991, 0.7889, 0.0000, 0.0000])

In [370]:
itemLen = train_dataset.itemLen
itemLen

20

In [371]:
# [How to use 'collate_fn' with dataloaders?]
# (https://stackoverflow.com/a/65875359/12224183)
def collate_fn(data):
    """
    :param data: list of tuples with (utd, sequence idx, labels, len(labels))
    :return:
    """
    userIdxes, dates, itemIdxes, durations, lengths = list(zip(*data))
    return (
        torch.stack(userIdxes),
        torch.stack(dates),
        pad_sequence(itemIdxes, batch_first=True, padding_value=itemLen),  # if -1, index out of range in Embedding()!!  
        pad_sequence(durations, batch_first=True, padding_value=-1), 
        torch.stack(lengths)
    )

In [372]:
train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=64, shuffle=True)

In [373]:
next(iter(train_dataloader))[0]  # user index with shape: torch.Size([2])

tensor([0, 1], dtype=torch.int32)

In [374]:
next(iter(train_dataloader))[1]  # date with shape: torch.Size([2, 6])

tensor([[   6,    0,    0, 2021,   10,   28],
        [  20,    0,    0, 2021,   10,   28]], dtype=torch.int16)

In [375]:
next(iter(train_dataloader))[2]  # item index

tensor([[ 2,  5,  7, 10,  1,  0,  4, 14, 15, 17, 17],
        [18, 19, 11, 12, 16,  6,  9,  8,  3, 13, 20]], dtype=torch.int32)

In [376]:
next(iter(train_dataloader))[3]  # duration

tensor([[  0,   0,   0,   0, 113,   0, 251,   0,   0,   0,  -1],
        [148,  16,  85, 221,   0, 101,  60,   0, 102,   0, 120]],
       dtype=torch.int32)

In [377]:
next(iter(train_dataloader))[4]  # length

tensor([10, 11])

In [378]:
for i, batch in enumerate(train_dataloader):
    print(i, batch[0].shape)

0 torch.Size([2])


# Model

Examples: 
1. [TRAINING WITH PYTORCH](https://pytorch.org/tutorials/beginner/introyt/trainingyt.html#training-with-pytorch)
2. [LANGUAGE MODELING WITH NN.TRANSFORMER AND TORCHTEXT](https://pytorch.org/tutorials/beginner/transformer_tutorial.html)
3. [SEQUENCE MODELS AND LONG SHORT-TERM MEMORY NETWORKS](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#sphx-glr-beginner-nlp-sequence-models-tutorial-py)
4. [WORD EMBEDDINGS: ENCODING LEXICAL SEMANTICS](https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#word-embeddings-encoding-lexical-semantics)
5. [A detailed guide to PyTorch’s nn.Transformer() module.](https://towardsdatascience.com/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1)

**Transformer to deal with sequences of inconsistent length?**

[Issues using pack_padded_sequence #1522](https://github.com/pytorch/xla/issues/1522#issuecomment-606300555)
> For transformer based models, we don't do packing (doesn't make sense for attention). AFAIU pad/pack produces a different shape tensor every time, one dimension of the tensor is number of non-pad tokens in the batch, which is likely highly variable from batch to batch. The reasoning behind it is, it saves flops that way. However, this must be causing a ton of compiles on TPUs due to shapes varying all over the place.

[How to create batches of a list of varying dimension tensors?](https://discuss.pytorch.org/t/how-to-create-batches-of-a-list-of-varying-dimension-tensors/50773)

**Meaning of Transformer parameters?**

[TRANSFORMER](https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html#transformer)

[Difference between src_mask and src_key_padding_mask](https://stackoverflow.com/questions/62170439/difference-between-src-mask-and-src-key-padding-mask)

**nn.transformer with embedding example**

[nn.Transformer explaination](https://discuss.pytorch.org/t/nn-transformer-explaination/53175/3)
> I’m having the same problem, but for the example part i guess it is a mistake from their side
nn.transformer doesn’t take source and target vocab size as it is only implementing the transformer part without the embeddings layer on the input data and without the linear layer on the output of the decoder,
in order to make it work d_model will be your embedding size and call an embedding layer on the source and on the target and the output of the transformer should pass through a linear that gets you the target vocab size
> ```python
self.embed_src = nn.Embedding(src_vocab, emb_dim)
self.embed_trg = nn.Embedding(trg_vocab, emb_dim)
self.model = nn.Transformer( d_model = emb_dim,nhead=heads, self.num_encoder_layers=N, num_decoder_layers=N)
self.out_linear = nn.Linear(emb_dim, trg_vocab)
> ```
> for the forward function it should be
> ```python
src = self.embed_src(src) 
trg = self.embed_trg(trg)
output = self.model(src, trg)
output = self.out(output)
> ```

[ojus1/Date2Vec - GitHub](https://github.com/ojus1/Date2Vec)

In [379]:
import torch.nn as nn
import torch.nn.functional as F
from models.date2vec import Date2Vec

## Embedding Test

In [380]:
userID2idx

{'1000014754': 0, '1000019906': 1}

In [381]:
itemID2idx

{'133658512070': 0,
 '506898339': 1,
 '133669542676': 2,
 '507570279': 3,
 '133660220952': 4,
 '133658378700': 5,
 '133665337307': 6,
 '133650937891': 7,
 '507531461': 8,
 '133677842247': 9,
 '133663959878': 10,
 '133677846615': 11,
 '133677667554': 12,
 '133678000841': 13,
 '506770575': 14,
 '133673154438': 15,
 '507605194': 16,
 '133660292493': 17,
 '133679233276': 18,
 '133658338671': 19}

In [382]:
itemID_embeds = nn.Embedding(len(itemID2idx), 4)
itemID_embeds

Embedding(20, 4)

In [383]:
itemID_embeds(torch.zeros([2,3], dtype=torch.int)).shape

torch.Size([2, 3, 4])

## Src Dim Test

In [384]:
N , dim_user, dim_date = 4, 16, 16
u = torch.zeros([N, dim_user])  # (N, dim_user)
d = torch.zeros([N, dim_date])  # (N, dim_date)
torch.transpose(torch.stack([u,d]), 1, 0).shape  # (N, S, E`)

torch.Size([4, 2, 16])

## Tranformer Output Dim Test

In [385]:
transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12)
src = torch.rand((10, 32, 512))  # (S, N, E)
tgt = torch.rand((20, 32, 512))  # (S, N, E)
out = transformer_model(src, tgt)
out.shape
# RuntimeError: the feature number of src and tgt must be equal to d_model
# RuntimeError: the batch number of src and tgt must be equal

torch.Size([20, 32, 512])

## Linear Output Dim Test

In [386]:
N, T, d_model = 4, 16, 512
linear = nn.Linear(d_model, 1)
out = torch.rand([N, T, d_model])
out = linear(out)
out.shape

torch.Size([4, 16, 1])

In [387]:
class Model(nn.Module):
    def __init__(self, user_len, item_len, d_model=6):
        super(Model, self).__init__()
        self.userID_embeddings = nn.Embedding(user_len, d_model)
        self.itemID_embeddings = nn.Embedding(item_len + 1, d_model)  # if not +1, index out of range in Embedding()!!
        self.date2vec = Date2Vec(k=32, act='sin')  # (N, 6) -> (N, dim_date)
        self.transformer = nn.Transformer(
            d_model=d_model,  # default is 512
            nhead=3,  # default is 4
            num_encoder_layers=6, 
            num_decoder_layers=6, 
            dim_feedforward=2048, 
            dropout=0.1, 
            activation='relu', 
            custom_encoder=None, 
            custom_decoder=None, 
            layer_norm_eps=1e-05, 
            batch_first=True, 
            device=None, 
            dtype=None
        )  # (N, T, E) -> (N, T, E)
        self.linear = nn.Linear(d_model, 1)  # (N, T, E) -> (N, T, 1)
        
    def forward(self, u, d, i, tgt_mask, tgt_key_padding_mask):
        """
        :param u: {Tensor: (N,)}, user indexes
        :param d: {Tensor: (N, 6)}, date
        :param i: {Tensor: (N, T)}, item indexes (target sentence)
        """
        # prepare src
        u = self.userID_embeddings(u)  # (N, dim_user)
        d = self.date2vec(d.float())  # (N, dim_date)
        ud = torch.transpose(torch.stack([u,d]), 1, 0)  # (N, S, E)
        # prepare tgt
        i = self.itemID_embeddings(i)  # (N, T, E)
        # calculate out
        out = self.transformer(src=ud, tgt=i, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_padding_mask)  # (N, T, E)
        out = self.linear(out)  # (N, T, 1)
        return out
    
    def get_tgt_mask(self, size) -> torch.tensor:
        # Generates a squeare matrix where the each row allows one word more to be seen
        mask = torch.tril(torch.ones(size, size) == 1)  # Lower triangular matrix
        mask = mask.float()
        mask = mask.masked_fill(mask == 0, float('-inf'))  # Convert zeros to -inf
        mask = mask.masked_fill(mask == 1, float(0.0))  # Convert ones to 0

        # EX for size=5:
        # [[0., -inf, -inf, -inf, -inf],
        #  [0.,   0., -inf, -inf, -inf],
        #  [0.,   0.,   0., -inf, -inf],
        #  [0.,   0.,   0.,   0., -inf],
        #  [0.,   0.,   0.,   0.,   0.]]

        return mask

    def create_pad_mask(self, matrix: torch.tensor, pad_token: int) -> torch.tensor:
        # If matrix = [1,2,3,0,0,0] where pad_token=0, the result mask is
        # [False, False, False, True, True, True]
        return (matrix == pad_token)


In [388]:
train_userID

{'1000014754', '1000019906'}

In [389]:
model = Model(user_len=len(train_userID), item_len=len(train_itemID))
model

Model(
  (userID_embeddings): Embedding(2, 6)
  (itemID_embeddings): Embedding(21, 6)
  (date2vec): Date2Vec(
    (fc1): Linear(in_features=6, out_features=16, bias=True)
    (fc2): Linear(in_features=6, out_features=16, bias=True)
    (d2): Dropout(p=0.3, inplace=False)
    (fc3): Linear(in_features=32, out_features=16, bias=True)
    (d3): Dropout(p=0.3, inplace=False)
    (fc4): Linear(in_features=16, out_features=6, bias=True)
    (fc5): Linear(in_features=6, out_features=6, bias=True)
  )
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=6, out_features=6, bias=True)
          )
          (linear1): Linear(in_features=6, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=6, bias=True)
          (norm1): L

In [390]:
for i, batch in enumerate(train_dataloader):
    userIdxes, dates, itemIdxes, durations, lengths = batch
    print(i, 'th batch')
    print('userIdxes:', userIdxes)
    print('dates:', dates)
    print('itemIdxes:', itemIdxes)
    print('durations:', durations, durations.shape)
    print('lengths:', lengths, lengths.shape)
    
    tgt_mask = model.get_tgt_mask(size=itemIdxes.size(1))
    tgt_key_padding_mask = model.create_pad_mask(matrix=itemIdxes, pad_token=itemLen)
    
    preds = model(
        userIdxes, 
        dates, 
        itemIdxes, 
        tgt_mask=tgt_mask, 
        tgt_key_padding_mask=tgt_key_padding_mask
    ).squeeze()
    print('preds:', preds)
    print('preds.shape:', preds.shape)  # (N, T)

0 th batch
userIdxes: tensor([0, 1], dtype=torch.int32)
dates: tensor([[  20,    0,    0, 2021,   10,   28],
        [   6,    0,    0, 2021,   10,   28]], dtype=torch.int16)
itemIdxes: tensor([[ 2,  5,  7, 10,  1,  0,  4, 14, 15, 17, 17],
        [18, 19, 11, 12, 16,  6,  9,  8,  3, 13, 20]], dtype=torch.int32)
durations: tensor([[148,  16,  85, 221,   0, 101,  60,   0, 102,   0, 120],
        [  0,   0,   0,   0, 113,   0, 251,   0,   0,   0,  -1]],
       dtype=torch.int32) torch.Size([2, 11])
lengths: tensor([11, 10]) torch.Size([2])
preds: tensor([[1.1449, 0.1882, 0.4989, 1.0057, 0.9262, 0.6131, 0.1845, 0.5120, 1.1680,
         1.1465, 0.5812],
        [0.9523, 0.8888, 0.8726, 1.0284, 0.6494, 0.8173, 0.6586, 0.8271, 0.5834,
         0.8427, 0.5406]], grad_fn=<SqueezeBackward0>)
preds.shape: torch.Size([2, 11])


In [391]:
tgt_mask

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [392]:
tgt_key_padding_mask

tensor([[False, False, False, False, False, False, False, False, False, False,
         False],
        [False, False, False, False, False, False, False, False, False, False,
          True]])

In [393]:
preds

tensor([[1.1449, 0.1882, 0.4989, 1.0057, 0.9262, 0.6131, 0.1845, 0.5120, 1.1680,
         1.1465, 0.5812],
        [0.9523, 0.8888, 0.8726, 1.0284, 0.6494, 0.8173, 0.6586, 0.8271, 0.5834,
         0.8427, 0.5406]], grad_fn=<SqueezeBackward0>)

# Loss Function

[Custom loss functions](https://discuss.pytorch.org/t/custom-loss-functions/29387/2)

In [394]:
durations = torch.tensor([[4, 0, 4, 0, -1],[0, 4, 0, 4, 0]])
durations

tensor([[ 4,  0,  4,  0, -1],
        [ 0,  4,  0,  4,  0]])

In [395]:
preds = torch.tensor([[1, -1, 1, -1, 1], [-1, 1, -1, 1, -1]])
preds

tensor([[ 1, -1,  1, -1,  1],
        [-1,  1, -1,  1, -1]])

## for Element  > 0

In [396]:
label.gt(0) * (durations - preds) ** 2

tensor([[9, 0, 9, 0, 0],
        [0, 9, 0, 9, 0]])

In [397]:
torch.sum(durations.gt(0) * (durations - preds) ** 2)

tensor(36)

## for Element == 0

In [398]:
durations.eq(0) * (torch.sign(preds) + 1)

tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]])

In [399]:
durations.eq(0) * (torch.tanh(preds) + 1)

tensor([[0.0000, 0.2384, 0.0000, 0.2384, 0.0000],
        [0.2384, 0.0000, 0.2384, 0.0000, 0.2384]])

In [400]:
torch.sum(durations.eq(0) * (torch.tanh(preds) + 1))

tensor(1.1920)

## for Terminal Energy

In [401]:
torch.sum(torch.abs(durations.gt(-1) * preds))

tensor(9)

In [484]:
def my_loss(preds, durations, total_energy):
    """
    :param output: (N, T)
    :param target: (N, T)
    :param target: (N)
    """
    ge0_loss = torch.sum(durations.gt(0) * (durations - preds * total_energy) ** 2)  # [0, 4*te^2)
    eq0_loss = torch.sum(durations.eq(0) * (torch.tanh(preds) + 1))  # range in [0, 2)
    term_loss = (1 - torch.sum(torch.abs(durations.gt(-1) * preds))) ** 2  # range in [0, (T-1)^2)
    print('ge0_loss, eq0_loss, term_loss:', ge0_loss, eq0_loss, term_loss)
    return ge0_loss / total_energy + 100 * eq0_loss + term_loss

In [485]:
preds

tensor([[ 1, -1,  1, -1,  1],
        [-1,  1, -1,  1, -1]])

In [486]:
durations

tensor([[ 4,  0,  4,  0, -1],
        [ 0,  4,  0,  4,  0]])

## Test 1

In [487]:
my_loss(preds, durations, 20)

ge0_loss, eq0_loss, term_loss: tensor(1024) tensor(1.1920) tensor(64)


tensor(1307.2291)

## Test 2

In [488]:
my_loss(durations/20, durations, 20)

ge0_loss, eq0_loss, term_loss: tensor(0.) tensor(5.) tensor(0.0400)


tensor(5000.0400)

In [489]:
durations.eq(0) * (torch.sign(durations/20) + 1)

tensor([[0., 1., 0., 1., 0.],
        [1., 0., 1., 0., 1.]])

# Train Settings

In [490]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device = 'cpu'
model = Model(user_len=len(train_userID), item_len=len(train_itemID))
opt = torch.optim.Adam(model.parameters())
loss_fn = my_loss

# Train

In [491]:
def train_loop(model, opt, loss_fn, dataloader, total_energy):
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_dataloader):
        userIdxes, dates, itemIdxes, durations, lengths = batch

        tgt_mask = model.get_tgt_mask(size=itemIdxes.size(1))
        tgt_key_padding_mask = model.create_pad_mask(matrix=itemIdxes, pad_token=itemLen)

        preds = model(
            userIdxes, 
            dates, 
            itemIdxes, 
            tgt_mask=tgt_mask, 
            tgt_key_padding_mask=tgt_key_padding_mask
        ).squeeze()  # (N, T)
        
        loss = loss_fn(preds, durations, total_energy)
        
        opt.zero_grad()
        loss.backward()
        opt.step()

        total_loss += loss.detach().item()

    return total_loss / len(dataloader)

In [492]:
train_dataset.max_sum_duration

853

In [494]:
epochs = 1000
train_loss_list, validation_loss_list = [], []

for epoch in range(1):
    if epoch % 100 == 0:
        print("-"*25, f"Epoch {epoch + 1}","-"*25)

    train_loss = train_loop(
        model=model, 
        opt=opt, 
        loss_fn=loss_fn, 
        dataloader=train_dataloader,
        total_energy=train_dataset.max_sum_duration
    )
    train_loss_list += [train_loss]

    print(f"Training loss: {train_loss:.4f}")
    print()

------------------------- Epoch 1 -------------------------
ge0_loss, eq0_loss, term_loss: tensor(350897.1875, grad_fn=<SumBackward0>) tensor(3.1237, grad_fn=<SumBackward0>) tensor(234.1014, grad_fn=<PowBackward0>)
Training loss: 3769.1638



# Tensor Board

In [497]:
from torch.utils.tensorboard import SummaryWriter

In [498]:
writer = SummaryWriter('runs/test')
epochs = 10

for epoch in range(1, epochs + 1):
    train_loss = train_loop(
        model=model, 
        opt=opt, 
        loss_fn=loss_fn, 
        dataloader=train_dataloader,
        total_energy=train_dataset.max_sum_duration
    )
    writer.add_scalar('train_loss', train_loss, epoch)

ge0_loss, eq0_loss, term_loss: tensor(212472.1250, grad_fn=<SumBackward0>) tensor(3.2651, grad_fn=<SumBackward0>) tensor(221.0641, grad_fn=<PowBackward0>)
ge0_loss, eq0_loss, term_loss: tensor(140877.7656, grad_fn=<SumBackward0>) tensor(6.4822, grad_fn=<SumBackward0>) tensor(92.5187, grad_fn=<PowBackward0>)
ge0_loss, eq0_loss, term_loss: tensor(218546.6406, grad_fn=<SumBackward0>) tensor(6.1948, grad_fn=<SumBackward0>) tensor(95.0685, grad_fn=<PowBackward0>)
ge0_loss, eq0_loss, term_loss: tensor(493922.3125, grad_fn=<SumBackward0>) tensor(3.0103, grad_fn=<SumBackward0>) tensor(240.8879, grad_fn=<PowBackward0>)
ge0_loss, eq0_loss, term_loss: tensor(765295.1875, grad_fn=<SumBackward0>) tensor(2.1402, grad_fn=<SumBackward0>) tensor(321.0149, grad_fn=<PowBackward0>)
ge0_loss, eq0_loss, term_loss: tensor(744933.3750, grad_fn=<SumBackward0>) tensor(1.8623, grad_fn=<SumBackward0>) tensor(325.3593, grad_fn=<PowBackward0>)
ge0_loss, eq0_loss, term_loss: tensor(665069.3125, grad_fn=<SumBackward0

In [499]:
%load_ext tensorboard
%tensorboard --logdir runs