In [7]:
import math
import tqdm
import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset, DataLoader
import dataset
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first input, then apply dropout.

    A table ``pe`` of shape ``[max_len, 1, d_model]`` is precomputed and stored
    as a buffer: even feature columns hold ``sin(position * div_term)`` and odd
    feature columns hold ``cos(position * div_term)``.

    Arguments:
        d_model: size of the last (feature) dimension of the inputs. Both even
            and odd values are supported.
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions precomputed; inputs passed to ``forward``
            must not have more than ``max_len`` positions along dim 0.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even column, so the
        # last frequency is dropped for the cosine half. For even d_model the
        # slice is a no-op and the table is identical to the unsliced version.
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Registered as a buffer: stored on the module without being a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``

        Returns:
            Tensor of the same shape as ``x``: ``x`` plus the first ``seq_len``
            rows of the encoding table (broadcast over the batch dimension),
            passed through dropout.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Per-position classifier: embedding -> positional encoding -> Transformer encoder -> linear head."""

    def __init__(self, ntoken: int=128, d_model: int=512, nhead: int=8, d_hid: int=2048,
                 nlayers: int=6, n_label: int = 10, dropout: float = 0.5):
        """
        Arguments:
            ntoken: number of rows in the input embedding table
            d_model: embedding / encoder feature size
            nhead: number of attention heads per encoder layer
            d_hid: feed-forward width inside each encoder layer
            nlayers: number of stacked encoder layers
            n_label: size of the output (label) dimension
            dropout: dropout probability for the positional encoding and encoder layers
        """
        super().__init__()
        self.model_type = 'Transformer'
        # Submodules are built in this exact order so that parameter registration
        # and random initialisation happen in the same sequence as before.
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        layer_template = TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_hid, dropout=dropout
        )
        self.transformer_encoder = TransformerEncoder(layer_template, num_layers=nlayers)
        self.encoder = nn.Embedding(num_embeddings=ntoken, embedding_dim=d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(in_features=d_model, out_features=n_label)

        self.init_weights()

    def init_weights(self) -> None:
        """Re-initialise the embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
        bound = 0.1
        nn.init.uniform_(self.encoder.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src: Tensor, src_mask: Tensor=None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, n_label]``
        """
        # Embeddings are scaled by sqrt(d_model) before the positional encoding is added.
        embedded = self.encoder(src) * math.sqrt(self.d_model)
        encoded = self.transformer_encoder(self.pos_encoder(embedded), src_mask)
        return self.decoder(encoded)


def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Build an ``sz x sz`` mask that is ``-inf`` strictly above the diagonal and ``0`` elsewhere."""
    all_blocked = torch.full((sz, sz), float('-inf'))
    # Keeping only the part above the main diagonal zeroes the diagonal and everything below it.
    return all_blocked.triu(diagonal=1)

In [8]:
# Print tensors without eliding the middle, so the label dump below shows every value.
torch.set_printoptions(edgeitems=torch.inf)
train_dataset, test_dataset = dataset.GCDataset('Archive/train', 512), dataset.GCDataset('Archive/test', 512)
train_dataloader, test_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True), DataLoader(test_dataset, batch_size=2, shuffle=False)
model = TransformerModel().train()
opt = Adam(model.parameters(), 1e-4)
criterion = CrossEntropyLoss()
nepochs = 1
# NOTE(review): accuracy below only counts positions with label > 0, while the loss is
# computed over every position. If label 0 is padding, consider
# CrossEntropyLoss(ignore_index=0) -- confirm against GCDataset before changing.
for epoch in range(nepochs):
    # ---- Training pass: one optimiser step per batch ----
    tot_accnum = 0
    totnum = 0
    model.train()
    for (data, label) in tqdm.tqdm(train_dataloader):
        opt.zero_grad()
        # The batch is transposed to the sequence-first layout the model takes; the
        # output is then permuted so the class axis is dim 1, as the loss expects.
        output = model(data.permute(1,0))
        output = output.permute(1,2,0)
        # Debug aid: dump any batch that contains a negative label.
        if (label<0).sum() > 0:
            print(label)
        l = criterion(output, label)
        l.backward()
        opt.step()

        valid = label > 0
        accnum = (valid & (torch.argmax(output, 1)==label)).sum()
        tot_accnum += accnum
        totnum += valid.sum()

        # avg_acc is the running accuracy over all batches seen so far in this pass.
        print(f"train_loss:{l:.3f}  acc:{accnum/valid.sum():.3f}  avg_acc:{tot_accnum/totnum:.3f}\n")

    # ---- Evaluation pass: held-out data, no gradient tracking, no weight updates ----
    tot_accnum = 0
    totnum = 0
    model.eval()
    with torch.no_grad():
        for (data, label) in test_dataloader:
            output = model(data.permute(1,0))
            output = output.permute(1,2,0)
            if (label<0).sum() > 0:
                print(label)
            l = criterion(output, label)

            valid = label > 0
            accnum = (valid & (torch.argmax(output, 1)==label)).sum()
            tot_accnum += accnum
            totnum += valid.sum()

            print(f"test_loss:{l:.3f}  acc:{accnum/valid.sum():.3f}  avg_acc:{tot_accnum/totnum:.3f}\n")

    

  0%|          | 0/316 [00:00<?, ?it/s]

tensor([[67, 62, 64, 67, 65, 64, 67, 67, 65, 64, 62, 64, 62, 64, 60, 60, 62, 64,
         81, 67, 81, 67, 81, 67, 81, 67, 65, 64, 62, 60, 62, 60, 62, 64, 62, 64,
         67, 81, 67, 67, 64, 62, 60, 55, 60, 60, 62, 64, 62, 60, 81, 67, 65, 64,
         62, 64, 64, 67, 62, 62, 60, 62,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,

In [None]:
import math 

# Scratch check: print the position-by-frequency angle table that the sinusoidal
# positional encoding takes sin/cos of, for a small 10 x 10 configuration.
max_len = d_model = 10
position = torch.arange(max_len)[:, None]  # column vector, one row per position
freq_scale = -math.log(10000.0) / d_model
div_term = torch.exp(torch.arange(0, d_model, 2) * freq_scale)
angles = position * div_term  # broadcasts to (max_len, number of frequencies)
print(angles)

In [None]:
import torch 

# Scratch check: multiplying a (6, 3) tensor by a length-3 vector broadcasts the
# vector across every row.
a = torch.arange(18).reshape(6, 3)
b = torch.arange(3)
product = a * b
print(product)

In [None]:
import numpy as np
import torch

# Scratch check: display the size of a freshly created 3 x 2 zero tensor.
a = torch.zeros((3, 2))
a.shape

In [None]:
import torch.nn as nn
import torch

# Scratch check: an index tensor of shape (4, 1) looked up in a 4 x 10 embedding
# table yields one 10-dim vector per index; the cell displays the resulting size.
a = nn.Embedding(4, 10)
c = torch.arange(4)[:, None]
b = a(c)
b.shape