https://pytorch.org/tutorials/beginner/transformer_tutorial.html

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForCausalLM

import math

In [19]:
class SelfAttention(torch.nn.Module):
    """Single-head scaled dot-product attention with learned projections.

    Queries, keys and values are each projected from ``input_size`` to
    ``attention_size`` features, combined with scaled dot-product attention,
    and projected back to ``input_size``.

    Args:
        input_size: Feature width of the incoming tensors and of the output.
        attention_size: Feature width of the projected query/key/value.
    """

    def __init__(self, input_size, attention_size):
        super(SelfAttention, self).__init__()

        self.query_layer = torch.nn.Linear(input_size, attention_size)
        self.key_layer = torch.nn.Linear(input_size, attention_size)
        self.value_layer = torch.nn.Linear(input_size, attention_size)

        self.output_layer = torch.nn.Linear(attention_size, input_size)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions over ``key``/``value`` positions.

        Args:
            query: Tensor of shape (batch, query_len, input_size).
            key: Tensor of shape (batch, key_len, input_size).
            value: Tensor of shape (batch, key_len, input_size).
            mask: Optional tensor broadcastable to
                (batch, query_len, key_len). Positions equal to 0 are
                ignored; every other position is attended to.

        Returns:
            Tensor of shape (batch, query_len, input_size).
        """
        query = self.query_layer(query)
        key = self.key_layer(key)
        value = self.value_layer(value)

        scores = torch.bmm(query, key.transpose(1, 2))
        # Scale by the square root of the projected key width (not by the
        # sequence length) so the logits keep a comparable variance.
        scores = scores / (key.size(-1) ** 0.5)

        # The mask has to be applied to the raw scores *before* the softmax:
        # filling with a large negative value drives the corresponding
        # attention weights to ~0 while each row still sums to 1.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(scores, dim=-1)

        output = torch.bmm(attention, value)
        output = self.output_layer(output)

        return output


# a = SelfAttention(4, 10)

# # batch size, sequence length, input size
# x = torch.randn(1, 3, 4)
# m = torch.tensor([[1,0,0], [0,1,0], [0,0,1]])
# y = a(x, mask=m)

Here's what a transformer looks like:
<img src="transformer_architecture.png" style="width:30%; height:30%; display:block;"/>
And here's how we're implementing it:

In [20]:
class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is multiplied by sqrt(emb_size).

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Width of each embedding vector.
    """

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens):
        """Return the scaled embeddings for ``tokens`` (cast to int64 first)."""
        scale = math.sqrt(self.emb_size)
        indices = tokens.long()
        return self.embedding(indices) * scale


class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings and then applies dropout.

    The encoding table is stored as a buffer of shape (maxlen, 1, emb_size)
    and is sliced along the FIRST axis of the input, so position ``p`` of the
    table is added to ``token_embedding[p]``.

    Args:
        emb_size: Width of the embeddings (the sin/cos interleaving below
            fills even and odd columns from the same angle table).
        dropout: Dropout probability applied to the sum.
        maxlen: Number of positions precomputed in the table.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super().__init__()
        positions = torch.arange(0, maxlen).unsqueeze(1)
        inv_freq = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        angles = positions * inv_freq

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)  # even columns
        table[:, 1::2] = torch.cos(angles)  # odd columns

        self.dropout = nn.Dropout(dropout)
        # Extra middle axis of size 1 lets the table broadcast over dim 1 of
        # the input.
        self.register_buffer('pos_embedding', table.unsqueeze(-2))

    def forward(self, token_embedding):
        """Return ``dropout(token_embedding + encodings)``.

        The number of encodings used is ``token_embedding.size(0)``.
        """
        length = token_embedding.size(0)
        encoded = token_embedding + self.pos_embedding[:length, :]
        return self.dropout(encoded)

In [21]:
class EncoderBlock(nn.Module):
    """One encoder layer: attention and a feed-forward net, each followed by
    a residual connection, layer normalisation and dropout.

    Args:
        embed_size: Feature width of the inputs and outputs.
        heads: Forwarded to ``SelfAttention`` as its ``attention_size``.
            NOTE: despite the name, ``SelfAttention`` is single-head, so this
            value is the width of the projected query/key/value.
        forward_expansion: The hidden width of the feed-forward net is
            ``forward_expansion * embed_size``.
        dropout: Dropout probability applied after each normalisation.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout):
        super(EncoderBlock, self).__init__()

        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size)
        )
        self.norm2 = nn.LayerNorm(embed_size)

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Run the block.

        Args:
            value: Tensor supplying the attention values.
            key: Tensor supplying the attention keys.
            query: Tensor supplying the attention queries; also the residual
                input, so the output has the same shape as ``query``.
            mask: Optional attention mask passed through to ``SelfAttention``.

        Returns:
            Tensor with the same shape as ``query``.
        """
        # SelfAttention.forward takes (query, key, value, mask); this block's
        # own parameters are ordered (value, key, query), so reorder them
        # explicitly instead of passing them through positionally.
        attention = self.attention(query, key, value, mask)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out



class DecoderBlock(nn.Module):
    """One decoder layer: self-attention over the target, cross-attention
    over the encoder output, then a feed-forward net. Each stage is followed
    by a residual connection, layer normalisation and dropout.

    Args:
        embed_size: Feature width of the inputs and outputs.
        heads: Forwarded to ``SelfAttention`` as its ``attention_size``.
            NOTE: despite the name, ``SelfAttention`` is single-head, so this
            value is the width of the projected query/key/value.
        forward_expansion: The hidden width of the feed-forward net is
            ``forward_expansion * embed_size``.
        dropout: Dropout probability applied after each normalisation.
    """

    def __init__(self, embed_size, heads, forward_expansion, dropout):
        super(DecoderBlock, self).__init__()
        self.attention1 = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)

        self.attention2 = SelfAttention(embed_size, heads)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size)
        )
        self.norm3 = nn.LayerNorm(embed_size)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        """Run the block.

        Args:
            x: Decoder-side input (target sequence representation).
            value: Encoder output used as the cross-attention values.
            key: Encoder output used as the cross-attention keys.
            src_mask: Optional mask for the cross-attention.
            trg_mask: Optional mask for the target self-attention.

        Returns:
            Tensor with the same shape as ``x``.
        """
        attention = self.attention1(x, x, x, trg_mask)
        query = self.dropout(self.norm1(attention + x))

        # Cross-attention: the decoder state is the query, the encoder output
        # supplies keys and values. SelfAttention.forward takes
        # (query, key, value, mask); passing (value, key, query) would make
        # the result follow the source length and break the residual add
        # whenever source and target lengths differ.
        attention = self.attention2(query, key, value, src_mask)
        x = self.dropout(self.norm2(attention + query))

        forward = self.feed_forward(x)
        out = self.dropout(self.norm3(forward + x))

        return out

In [22]:
class Transformer(nn.Module):
    """Encoder-decoder model assembled from the blocks in this notebook.

    Token-id inputs are batch-first: ``src`` is (batch, src_len) and ``trg``
    is (batch, trg_len).

    Args:
        src_vocab_size: Size of the source embedding table.
        trg_vocab_size: Size of the target embedding table and of the output
            distribution.
        max_length: Number of positions precomputed by the positional
            encodings; sequences must not be longer than this.
        embed_size: Embedding / model width.
        num_layers: Number of encoder blocks and of decoder blocks.
        forward_expansion: Feed-forward expansion factor of each block.
        heads: Forwarded to every block (see the block classes for how it
            is used).
        dropout: Dropout probability used inside the blocks.
    """

    def __init__(
        self, 
        src_vocab_size,
        trg_vocab_size,
        max_length=100,
        embed_size=256,
        num_layers=6,
        forward_expansion=4,
        heads=8,
        dropout=0,
    ):
        super(Transformer, self).__init__()

        self.encoder_word_embedding = TokenEmbedding(src_vocab_size, embed_size)
        self.encoder_position_embedding = PositionalEncoding(embed_size, 0, max_length)

        self.encoder_blocks = nn.ModuleList(
            [
                EncoderBlock(
                    embed_size, 
                    heads, 
                    forward_expansion,
                    dropout, 
                ) for _ in range(num_layers)
            ]
        )

        self.decoder_word_embedding = TokenEmbedding(trg_vocab_size, embed_size)
        self.decoder_position_embedding = PositionalEncoding(embed_size, 0, max_length)
        
        self.decoder_blocks = nn.ModuleList(
            [
                DecoderBlock(
                    embed_size,
                    heads,
                    forward_expansion,
                    dropout,
                ) for _ in range(num_layers)
            ]
        )

        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def _add_positions(position_embedding, embedded):
        """Apply a sequence-first positional encoding to a batch-first tensor.

        ``PositionalEncoding`` picks positions along dim 0 of its input, while
        the embeddings here are (batch, seq_len, embed_size). Feeding them in
        directly would give every token of a sequence the same encoding
        (selected by its batch index), so swap to (seq_len, batch, embed_size)
        for the call and swap back afterwards.
        """
        return position_embedding(embedded.transpose(0, 1)).transpose(0, 1)

    def forward(self, src, trg, src_mask=None, trg_mask=None):
        """Encode ``src``, decode ``trg`` against it, and return probabilities.

        Args:
            src: Source token ids, shape (batch, src_len).
            trg: Target token ids, shape (batch, trg_len).
            src_mask: Optional mask handed to the encoder blocks and to the
                decoder blocks' cross-attention.
            trg_mask: Optional mask handed to the decoder blocks'
                self-attention.

        Returns:
            Tensor of shape (batch, trg_len, trg_vocab_size) holding a softmax
            distribution over the target vocabulary for each target position.
        """
        src = self.encoder_word_embedding(src)
        src = self._add_positions(self.encoder_position_embedding, src)

        for encoder in self.encoder_blocks:
            src = encoder(src, src, src, src_mask)

        trg = self.decoder_word_embedding(trg)
        trg = self._add_positions(self.decoder_position_embedding, trg)

        for decoder in self.decoder_blocks:
            trg = decoder(trg, src, src, src_mask, trg_mask)

        out = torch.softmax(self.fc_out(trg), dim=-1)
        return out

In [23]:
# Load the Chinese side of the parallel corpus: one list entry per line,
# trailing newlines kept.
with open("../data/translation/chinese.txt", encoding="utf8") as f:
    chinese_text = list(f)

# Load the English side the same way.
with open("../data/translation/english.txt", encoding="utf8") as f:
    english_text = list(f)

# Pretrained GPT-J tokenizer, used for both languages in the cells below.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

In [None]:
# Take the second line of each corpus file as a sample sentence pair.
ch = chinese_text[1]
en = english_text[1]

print(ch)

# The Chinese ids are wrapped in an extra list so the tensor gets a leading
# axis of size 1; the English ids are kept as a plain Python list.
ch_token = torch.tensor([tokenizer(ch)['input_ids']])
en_token = tokenizer(en)['input_ids']

print(ch_token.size())

In [24]:
t = Transformer(50257, 50257)
def count_parameters(model):
    """Return the total element count of ``model``'s trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
print(count_parameters(t))

45122305


In [34]:

# Smoke test of the model on a tiny vocabulary.
# src vocab size = 13
# trg vocab size = 21
t = Transformer(13,21)

# One source and one target sequence of 8 token ids each (batch size 1).
# No masks are passed, so both mask arguments keep their default of None.
src = torch.tensor([[2,5,6,7,8,3,1,1]])
trg = torch.tensor([[2,6,7,8,3,1,1,1]])
output = t(src, trg)


# Expected size: (batch, target length, trg vocab size) = (1, 8, 21).
print(output.size())
print(output)

'''

output[i][j][k] represents the probability of the k-th 
token in the target vocabulary being the next token after 
the j-th token in the input sequence, in the i-th example in the batch

'''


torch.Size([1, 8, 21])
tensor([[[0.0964, 0.0528, 0.0289, 0.0235, 0.0483, 0.0395, 0.0616, 0.0681,
          0.0569, 0.0507, 0.0420, 0.0380, 0.0332, 0.0576, 0.0226, 0.0661,
          0.0255, 0.0332, 0.0598, 0.0423, 0.0531],
         [0.1113, 0.1418, 0.0331, 0.0305, 0.0247, 0.0150, 0.0160, 0.0624,
          0.0692, 0.0240, 0.0359, 0.0528, 0.0277, 0.0292, 0.0461, 0.0395,
          0.0213, 0.0478, 0.0512, 0.0440, 0.0766],
         [0.0496, 0.0725, 0.0397, 0.0264, 0.0440, 0.0345, 0.0361, 0.0908,
          0.0857, 0.0493, 0.0414, 0.0308, 0.0241, 0.0412, 0.0202, 0.0395,
          0.0377, 0.0364, 0.0408, 0.0435, 0.1160],
         [0.0821, 0.0637, 0.0195, 0.0186, 0.0397, 0.0431, 0.0262, 0.0649,
          0.1942, 0.0473, 0.0451, 0.0548, 0.0192, 0.0354, 0.0190, 0.0218,
          0.0156, 0.0476, 0.0551, 0.0331, 0.0538],
         [0.0894, 0.0640, 0.0305, 0.0216, 0.0294, 0.0338, 0.0463, 0.1095,
          0.0495, 0.0478, 0.0291, 0.0217, 0.0581, 0.0610, 0.0101, 0.0291,
          0.0333, 0.0903, 0.0333,

In [37]:
print(output[0][1][2])

tensor(0.0331, grad_fn=<SelectBackward0>)


In [5]:
# Load the Chinese side of the parallel corpus: one list entry per line,
# trailing newlines kept.
with open("../data/translation/chinese.txt", encoding="utf8") as f:
    chinese_text = list(f)

# Load the English side the same way.
with open("../data/translation/english.txt", encoding="utf8") as f:
    english_text = list(f)

# Preview the first ten lines of each file, then a completion marker
# (one item per output line).
print(chinese_text[:10], english_text[:10], 'done', sep='\n')


# # Convert the text to PyTorch tensors
# english_tensor = torch.tensor([line for line in english_text])
# chinese_tensor = torch.tensor([line for line in chinese_text])

# # Print the first line of the text
# print(english_text[0])
# print(chinese_text[0])

['1929年还是1989年?\n', '巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正在发生的情况。\n', '一开始，很多人把这次危机比作1982年或1973年所发生的情况，这样得类比是令人宽心的，因为这两段时期意味着典型的周期性衰退。\n', '如今人们的心情却是沉重多了，许多人开始把这次危机与1929年和1931年相比，即使一些国家政府的表现仍然似乎把视目前的情况为是典型的而看见的衰退。\n', '目前的趋势是，要么是过度的克制（欧洲），要么是努力的扩展（美国）。\n', '欧洲在避免债务和捍卫欧元的名义下正变得谨慎，而美国已经在许多方面行动起来，以利用这一理想的时机来实行急需的结构性改革。\n', '然而，作为地域战略学家，无论是从政治意义还是从经济意义上，让我自然想到的年份是1989年。\n', '当然，雷曼兄弟公司的倒闭和柏林墙的倒塌没有任何关系。\n', '事实上，从表面上看，两者似乎是完全是相反的：一个是象征着压抑和人为分裂的柏林墙的倒塌，而另一个是看似坚不可摧的并令人安心的金融资本主义机构的倒塌。\n', '然而，和1989年一样，2008-2009年很可能也能被视为一个划时代的改变，其带来的发人深省的后果将在几十年后仍能让我们感受得到。\n']
['1929 or 1989?\n', 'PARIS – As the economic crisis deepens and widens, the world has been searching for historical analogies to help us understand what has been happening.\n', 'At the start of the crisis, many people likened it to 1982 or 1973, which was reassuring, because both dates refer to classical cyclical downturns.\n', 'Today, the mood is much grimmer, with references to 1929 and 1931 beginning to abound, even if some gove

In [21]:
# from transformers import GPTJModel



# # model = torch.load('../models/GPT-J/GPT-J.bin')
# # print(model)

# Load only the GPT-J tokenizer; loading the model itself is left disabled
# below.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

print(tokenizer)

# model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")

# Second line of each corpus file as a sample sentence pair.
ch = chinese_text[1]
en = english_text[1]

print(ch)
print(en)

# Token ids as plain Python lists (no tensor conversion in this cell).
ch_token = tokenizer(ch)['input_ids']
en_token = tokenizer(en)['input_ids']

print(ch_token)
print(en_token)


PreTrainedTokenizerFast(name_or_path='EleutherAI/gpt-j-6B', vocab_size=50257, model_max_len=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)})
巴黎-随着经济危机不断加深和蔓延，整个世界一直在寻找历史上的类似事件希望有助于我们了解目前正在发生的情况。

PARIS – As the economic crisis deepens and widens, the world has been searching for historical analogies to help us understand what has been happening.

[32432, 112, 165, 119, 236, 12, 49694, 237, 163, 251, 222, 163, 119, 237, 38184, 236, 39355, 109, 17312, 118, 38834, 23877, 255, 27950, 254, 162, 115, 109, 161, 240, 234, 164, 242, 241, 161, 119, 114, 171, 120, 234, 46763, 112, 10310, 103, 10310, 244, 45911, 234, 31660, 33566, 112, 28839, 101, 433

In [20]:
# Decode three consecutive ids taken from the Chinese token list printed
# above; together they decode to a single character, i.e. one Chinese
# character can span several tokens with this tokenizer.
tokens = torch.tensor([[165, 119, 236]])
word = tokenizer.batch_decode(tokens)
print(word)

['黎']


In [19]:
# x = 'My name is Eddy, and he is Matt'
# a = tokenizer(x)
# b = tokenizer(x, return_tensors='pt').input_ids

# print(a)
# print(b)

# tokens = torch.tensor([[3666, 11, 1438, 11, 318, 11, 1717, 11, 9892, 11, 290, 11, 339, 11, 318, 11, 4705]])
# word = tokenizer.batch_decode(tokens)
# print(word)


print(tokenizer)

# gen_tokens = model.generate(b, do_sample=True, temperature=0.9, max_length=100,)

# print(gen_tokens)


PreTrainedTokenizerFast(name_or_path='EleutherAI/gpt-j-6B', vocab_size=50257, model_max_len=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)})


In [9]:
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for sequence-first inputs.

    A table of shape (max_len, 1, d_model) is precomputed and registered as
    the buffer ``pe``; the first ``seq_len`` rows are added to the input
    before dropout.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = torch.arange(max_len).unsqueeze(1) * div_term

        table = torch.zeros(max_len, 1, d_model)
        table[:, 0, 0::2] = angles.sin()  # even feature columns
        table[:, 0, 1::2] = angles.cos()  # odd feature columns
        self.register_buffer('pe', table)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            ``dropout(x + pe[:seq_len])``, same shape as ``x``.
        """
        seq_len = x.size(0)
        return self.dropout(x + self.pe[:seq_len])

In [10]:
class TransformerModel(nn.Module):
    """Token embedding -> positional encoding -> TransformerEncoder -> linear
    projection back to vocabulary size."""

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5) -> None:
        super().__init__()
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.transformer_encoder = TransformerEncoder(
            TransformerEncoderLayer(d_model, nhead, d_hid, dropout),
            nlayers,
        )
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Uniformly initialise the embedding and output weights in
        [-0.1, 0.1] and zero the output bias."""
        bound = 0.1
        nn.init.uniform_(self.encoder.weight, -bound, bound)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -bound, bound)

    def forward(self, src, src_mask):
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        # Embeddings are scaled by sqrt(d_model) before positions are added.
        embedded = self.encoder(src) * math.sqrt(self.d_model)
        hidden = self.transformer_encoder(self.pos_encoder(embedded), src_mask)
        return self.decoder(hidden)

def generate_square_subsequent_mask(sz: int):
    """Return an (sz, sz) float mask: -inf strictly above the diagonal and
    zeros on and below it."""
    all_blocked = torch.full((sz, sz), float('-inf'))
    return all_blocked.triu(diagonal=1)

In [11]:

from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Build a vocabulary from the WikiText2 training split, tokenised with
# torchtext's 'basic_english' tokenizer; '<unk>' is the only special token.
# NOTE(review): train_iter is consumed by map() here; if it is a one-shot
# iterator it has to be re-created before being used again - confirm.
# NOTE(review): this rebinds the global `tokenizer`, which earlier cells use
# for the GPT-J tokenizer.
train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
# Use the '<unk>' index as the vocabulary's default index.
vocab.set_default_index(vocab['<unk>'])

# from datasets import load_dataset

# dataset = load_dataset('wikitext', 'wikitext-103-v1')

In [12]:
print(vocab)

# for i in range(70, 90):
#     print(dataset['train'][i])

Vocab()


In [13]:
# Hyperparameters for the TransformerModel experiment below.
# NOTE(review): ntokens is a hard-coded constant, not derived from `vocab`.
ntokens = 20000  # size of vocabulary
emsize = 200  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # number of heads in nn.MultiheadAttention
dropout = 0.2  # dropout probability


# NOTE(review): d_model is passed as the literal 2, not `emsize`, so `emsize`
# is unused and the model is built with 2-dimensional embeddings (the printed
# layers show in_features=2) - confirm this is intentional.
model = TransformerModel(ntokens, 2, nhead, d_hid, nlayers, dropout)

def count_parameters(model):
    """Return the total element count of ``model``'s trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


print(count_parameters(model))
print(model)

# from torchtext.datasets import WikiText2
# from torchtext.data.utils import get_tokenizer
# from torchtext.vocab import build_vocab_from_iterator

# train_iter = WikiText2(split='train')
# print(train_iter)

102068
TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=2, out_features=2, bias=True)
        )
        (linear1): Linear(in_features=2, out_features=200, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=200, out_features=2, bias=True)
        (norm1): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=2, out_features=2, bias=True)
        )
        (linear1)

In [4]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Query, key and value are bias-free linear projections of the same input;
    the attended values are projected back to ``input_dim``.

    Args:
        input_dim: Feature width of the input and of the output.
        attention_dim: Feature width of the projected query/key/value.
    """

    def __init__(self, input_dim, attention_dim):
        super().__init__()

        # Kept for the 1/sqrt(attention_dim) scaling in forward(); without it
        # forward() fails with AttributeError.
        self.attention_dim = attention_dim

        self.query_linear = nn.Linear(input_dim, attention_dim, bias=False)
        self.key_linear = nn.Linear(input_dim, attention_dim, bias=False)
        self.value_linear = nn.Linear(input_dim, attention_dim, bias=False)

        self.output_linear = nn.Linear(attention_dim, input_dim)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, input_dim).
            mask: Optional tensor broadcastable to (batch, seq_len, seq_len);
                positions equal to 0 are excluded from attention.

        Returns:
            Tuple ``(y, attention_weights)`` where ``y`` has shape
            (batch, seq_len, input_dim) and ``attention_weights`` has shape
            (batch, seq_len, seq_len) with each row summing to 1.
        """
        query = self.query_linear(x)
        key = self.key_linear(x)
        value = self.value_linear(x)

        attention_weights = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(self.attention_dim)

        # Mask the raw scores before the softmax so masked positions end up
        # with ~0 weight.
        if mask is not None:
            attention_weights = attention_weights.masked_fill(mask == 0, -1e10)

        attention_weights = torch.softmax(attention_weights, dim=2)
        attention_output = torch.bmm(attention_weights, value)

        y = self.output_linear(attention_output)

        return y, attention_weights

# a = SelfAttention(100, 100)

def count_parameters(model):
    """Return the total element count of ``model``'s trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# print(count_parameters(a))
# print(a)


# Hand-written 4x3 integer matrix, apparently meant as a query weight.
# NOTE(review): q_w is not used while the assignment below stays commented
# out; its shape (4, 3) is also the transpose of the (3, 4) weight printed
# below.
q_w = torch.tensor([[1,0,1], [1,0,0], [0,0,1], [0,1,1]])

a = SelfAttention(4, 3)
# a.query_linear.weight = torch.nn.Parameter()
print(a.query_linear.weight.size())


# Three bias-free 4->3 projections (3 * 12) plus the 3->4 output layer with
# bias (12 + 4) give 52 trainable parameters.
print(count_parameters(a))



# print(count_parameters(t))
# print(t)


# e = nn.Embedding(3, 3)
# x = torch.tensor([[1,0,2]])
# print(e(x))


torch.Size([3, 4])
52
