In [23]:
from jflegDataset import JflegDataset
import pandas as pd
from torch.utils.data import DataLoader
from transformers import BertTokenizer, GPT2Tokenizer
import torch
from torch import nn

In [24]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
TRAIN_PATH = "src/dataset/train.csv"
VAL_PATH = "src/dataset/eval.csv"



In [25]:
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

sentences = ["It will rain in the",
            "I want to eat a big bowl of",
            "My dog is"]
a = tokenizer(sentences, return_tensors="pt", padding=True)

target = ["It will",
            "I want to",
            "My"]

b = tokenizer(target, return_tensors="pt", padding=True)

b

{'input_ids': tensor([[50256,  1026,   481],
        [   40,   765,   284],
        [50256, 50256,  3666]]), 'attention_mask': tensor([[0, 1, 1],
        [1, 1, 1],
        [0, 0, 1]])}

In [26]:
ds_train = JflegDataset(TRAIN_PATH, tokenizer)
ds_eval = JflegDataset(VAL_PATH, tokenizer)

dl_train = DataLoader(ds_train)
dl_eval = DataLoader(ds_eval)

tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [27]:
from torch import nn, Tensor
import math

# https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class TokenEmbedding(nn.Module):
    #  https://pytorch.org/tutorials/beginner/translation_transformer.html
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: torch.Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

In [28]:
import torch
from torch import nn, Tensor

class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, d_model: int, padding_token_id:int):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, 0.1)
        self.embedding = TokenEmbedding(ntoken, d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=4,
            num_encoder_layers=6,
            num_decoder_layers=6,
            dim_feedforward=1024,
            dropout=0.1,
            batch_first=True
        )

        self.padding_token_id = padding_token_id

        self.head = nn.Linear(d_model, ntoken)
        self.sm = nn.Softmax(dim=0)

    def generatePaddingMask(self, sentences):
        return sentences == self.padding_token_id

    def forward(self, input: Tensor, target: Tensor) -> Tensor:
        srcPaddingMask = self.generatePaddingMask(input)
        tgtPaddingMask = self.generatePaddingMask(target)

        src = self.embedding(input) 
        src = self.pos_encoder(src)

        target = self.embedding(target)
        target = self.pos_encoder(target)

        encoded = self.transformer.encoder(src, src_key_padding_mask=srcPaddingMask)
        decoded = self.transformer.decoder(target, memory=encoded, tgt_key_padding_mask=tgtPaddingMask)

        out = self.head(decoded)

        return self.sm(out)

In [29]:
model = TransformerModel(tokenizer.vocab_size, 768, tokenizer.eos_token_id)

for x,y in dl_train:
    src = x["input_ids"]
    target = y[0]["input_ids"]

    print(f"{src.shape=}")
    print(f"{target.shape=}")
    out = model(src,target)
    print(out.shape)
    break


src.shape=torch.Size([1, 4])
target.shape=torch.Size([1, 3])
torch.Size([1, 3, 50257])


In [30]:

if __name__ == "__main__":
    n_classes = 100

    # TODO: TRANSPONI E SMARMELLA TUTTO

    source = torch.randint(low=0, high=n_classes, size=(20, 16))
    target = torch.randint(low=0, high=n_classes, size=(20, 32))

    s2s = TransformerModel(n_classes,700,tokenizer.eos_token_id)

    out = s2s(source, target)
    print(out.size())
    print(out)

torch.Size([20, 32, 100])
tensor([[[0.0435, 0.0435, 0.0840,  ..., 0.0687, 0.0375, 0.0518],
         [0.0298, 0.0355, 0.0812,  ..., 0.0606, 0.0502, 0.0365],
         [0.0465, 0.0581, 0.0492,  ..., 0.0337, 0.0534, 0.0243],
         ...,
         [0.0388, 0.0604, 0.0804,  ..., 0.0616, 0.0770, 0.0341],
         [0.1181, 0.0326, 0.0505,  ..., 0.0560, 0.0529, 0.0653],
         [0.0535, 0.0569, 0.0713,  ..., 0.0627, 0.0299, 0.0522]],

        [[0.0611, 0.0680, 0.0745,  ..., 0.0450, 0.0445, 0.0654],
         [0.0570, 0.0394, 0.0573,  ..., 0.0584, 0.0492, 0.0561],
         [0.0540, 0.0607, 0.0766,  ..., 0.0359, 0.0323, 0.0622],
         ...,
         [0.0349, 0.0341, 0.0396,  ..., 0.0513, 0.0434, 0.0425],
         [0.0427, 0.0483, 0.0679,  ..., 0.0405, 0.0501, 0.0493],
         [0.0364, 0.0534, 0.0461,  ..., 0.0451, 0.0406, 0.0452]],

        [[0.0432, 0.0746, 0.0199,  ..., 0.0349, 0.0615, 0.0310],
         [0.0434, 0.0904, 0.0431,  ..., 0.0352, 0.0494, 0.0515],
         [0.0463, 0.0851, 0.0311