In [5]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from dataclasses import dataclass

In [6]:
torch.cuda.is_available()

True

In [7]:
@dataclass
class ModelArgs:
    batch_size: int = 32
    d_model: int = 512
    hidden_dim: int = 1024 # hidden dim for feed forward layer
    num_blocks: int = 32
    num_q_heads: int = 32
    num_kv_heads: int = 16
    vocab_size: int = -1 # initialized later
    eps: float = 1e-6 # eps for RMSNorm
    max_batch_size: int = 32
    max_seq_len: int = 512
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    freq_base: int = 10000
    epochs: int = 5

In [8]:
def compute_freqs(d_model:int, seq_len: int, device: torch.device | str, base: int):
    """
    d_model: embedding dim
    seq_len: sequence length
    device: cuda / cpu
    base: base for exponential of theta values
    """

    assert d_model % 2 == 0, "d_model has to be even"
    
    theta = 1. / (base ** (torch.arange(0, d_model, 2) / d_model)).to(device)
    m = torch.arange(seq_len).to(device)
    freqs = torch.outer(m, theta).float() # Since each m value corresponds to a single token, multiply every value of m by every value of theta, kind of like a nested for loop.
    freqs_complex = torch.polar(torch.ones_like(freqs), freqs) # turn into complex form, z = r*cis(theta), in this case, r is 1
    return freqs_complex

def apply_rotary_embeddings(x: torch.Tensor, freqs_complex: torch.Tensor, device: torch.device):
    """
    x: input sequence to add positional embedding, (batch, seq_len, emb_dim)
    freqs_complex: frequencies for rotary postitional embeddings
    device: cuda / cpu
    """
    # print(x.shape, freqs_complex.shape)
    x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2)) # (batch, seq_len, _, 2)
    # print(x_complex.shape)
    freqs_complex = freqs_complex.unsqueeze(0).unsqueeze(2)
    # print(freqs_complex.shape)
    x_rotated = x_complex * freqs_complex # * is for element-wise multiplication
    # print(x_rotated.shape)
    x_rotated = torch.view_as_real(x_rotated)
    # print(x_rotated.shape)
    x_rotated = x_rotated.reshape(*x.shape)
    # print(x_rotated.shape)
    return x_rotated.type_as(x).to(device)

a = compute_freqs(32, 10, torch.device("cuda"), 10000)
print(a.shape)
b = torch.randn((3, 10, 5, 32)).to(torch.device("cuda"))
apply_rotary_embeddings(b, a, torch.device("cuda"))

torch.Size([10, 16])


tensor([[[[-2.0066e+00, -1.5439e+00,  1.2610e+00,  ..., -1.3504e+00,
           -1.2820e+00, -4.1116e-01],
          [-1.0628e+00,  8.0939e-01,  8.9014e-01,  ...,  1.0547e+00,
           -9.1446e-01,  4.3935e-01],
          [-9.1582e-01, -2.2099e-01, -6.9655e-02,  ..., -2.7517e-01,
            6.7879e-01, -3.5210e-02],
          [-1.4875e-01,  2.6449e+00,  3.2572e-01,  ..., -1.0715e+00,
            1.8116e+00,  9.8089e-03],
          [-6.5293e-01, -1.3751e+00, -9.0568e-01,  ..., -1.2641e+00,
            1.0315e+00, -7.1314e-01]],

         [[ 2.5721e-01, -6.7754e-01, -2.2514e+00,  ..., -2.0556e+00,
            2.7232e+00,  1.0822e+00],
          [-4.7367e-01,  2.5354e-01, -4.7315e-01,  ..., -3.2901e+00,
            2.1026e+00, -7.3989e-01],
          [-1.4458e+00, -1.3209e+00,  4.0396e-02,  ..., -4.9933e-01,
           -1.1447e-01,  8.6106e-01],
          [-1.3568e+00, -3.3785e+00, -2.4562e-01,  ..., -3.1407e-01,
           -1.4376e+00, -1.8868e-01],
          [-3.6285e-01,  7.2419e-01

In [9]:
class RMSNorm(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.eps = args.eps
        self.gamma = nn.Parameter(torch.ones(args.d_model))

    def rms(self, x: torch.Tensor):
        x = torch.pow(x, 2)
        x = torch.mean(x, dim=-1, keepdim=True)
        x = torch.sqrt(x + self.eps) # add eps to in case x = 0 (sqrt(0) is undefined in math)
        return x

    def forward(self, x: torch.Tensor):
        return x / self.rms(x) * self.gamma

args = ModelArgs(d_model=32)
r = RMSNorm(args)
a = torch.randn((5, 10, 32))
r(a).shape

torch.Size([5, 10, 32])

In [10]:
class FeedForward(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.W = nn.Linear(args.d_model, args.hidden_dim)
        self.V = nn.Linear(args.d_model, args.hidden_dim)
        self.f = nn.Linear(args.hidden_dim, args.d_model)
    
    def forward(self, x: torch.Tensor):
        swiglu = F.silu(self.W(x)) * self.V(x)
        return self.f(swiglu)

l = FeedForward(args)
a = torch.randn((5, 10, 32))
l(a).shape

torch.Size([5, 10, 32])

In [11]:
class MultiHeadAttention(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.args = args
        self.num_rep = args.num_q_heads // args.num_kv_heads
        
        self.q = nn.Linear(args.d_model, args.num_q_heads * args.d_model, bias=False)
        self.k = nn.Linear(args.d_model, args.num_kv_heads * args.d_model, bias=False)
        self.v = nn.Linear(args.d_model, args.num_kv_heads * args.d_model, bias=False)

        self.out = nn.Linear(args.num_q_heads * args.d_model, args.d_model, bias=False)
        self.cache_k = torch.zeros((args.max_batch_size, args.max_seq_len, args.num_kv_heads, args.d_model))
        self.cache_v = torch.zeros((args.max_batch_size, args.max_seq_len, args.num_kv_heads, args.d_model))

    
    def repeat_kv(self, x: torch.Tensor, n_rep: int):
        batch_size, seq_len, num_kv_heads, emb_dim = x.shape
        if n_rep == 1:
            return x
        return x[:, :, :, None, :].expand(batch_size, seq_len, num_kv_heads, n_rep, emb_dim).reshape(batch_size, seq_len, num_kv_heads * n_rep, emb_dim)

    
    def forward(self, x: torch.Tensor, start_pos: int, freqs_complex: torch.Tensor, use_kv_cache=False):
        batch_size, seq_len, _ = x.shape

        q = self.q(x) # (batch_size, seq_len, head_num, emb_dim)
        k = self.k(x)
        v = self.v(x)
        
        q = q.view(batch_size, seq_len, self.args.num_q_heads, self.args.d_model)
        k = k.view(batch_size, seq_len, self.args.num_kv_heads, self.args.d_model)
        v = v.view(batch_size, seq_len, self.args.num_kv_heads, self.args.d_model)

        q = apply_rotary_embeddings(q, freqs_complex, device=self.args.device)
        k = apply_rotary_embeddings(k, freqs_complex, device=self.args.device)
        
        keys, values = k, v
        if use_kv_cache:
            self.cache_k[:batch_size, start_pos: start_pos + seq_len] = k
            self.cache_v[:batch_size, start_pos: start_pos + seq_len] = v

            keys = self.cache_k[:batch_size, :start_pos + seq_len] # all cache including added key
            values = self.cache_v[:batch_size, :start_pos + seq_len]

        keys = self.repeat_kv(keys, self.num_rep)
        values = self.repeat_kv(values, self.num_rep)

        q = q.permute(0, 2, 1, 3)
        keys = keys.permute(0, 2, 3, 1)
        values = values.permute(0, 2, 3, 1) # (batch_size, head_num, emb_dim, seq_len)
        
        scores = torch.matmul(q, keys) / self.args.d_model # (batch_size, head_num, seq_len, seq_len)
        scores = F.softmax(scores.float(), dim=-1).type_as(q)
        
        output = torch.matmul(scores, values) # (batch_size, head_num, seq_len, emb_dim)
        output = output.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_len, -1)

        return self.out(output)

In [12]:
class EncoderBlock(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.attention = MultiHeadAttention(args)
        self.feed_forward = FeedForward(args)
        self.norm1 = RMSNorm(args)
        self.norm2 = RMSNorm(args)

    def forward(self, x: torch.Tensor, start_pos: int, freqs_complex: torch.Tensor):
        h = x + self.attention(self.norm1(x), start_pos, freqs_complex)
        out = h + self.feed_forward(self.norm2(h))
        return out

In [13]:
class Transformer(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()

        self.args = args

        self.embedding_layer = nn.Embedding(args.vocab_size, args.d_model)

        self.encoder_layers = nn.ModuleList()
        for _ in range(args.num_blocks):
            self.encoder_layers.append(EncoderBlock(args))

        self.norm = RMSNorm(args)
        self.output = nn.Linear(args.d_model, args.vocab_size, bias=False)
        self.freqs_complex = compute_freqs(args.d_model, args.max_seq_len, device=args.device, base=args.freq_base)

    def forward(self, tokens: torch.Tensor, start_pos: int):
        h = self.embedding_layer(tokens)
        freqs_complex = self.freqs_complex[:, start_pos: start_pos + tokens.shape[1]]

        h = self.norm(h)

        for layer in self.encoder_layers:
            h = layer(h, start_pos, freqs_complex)

        output = self.output(h).float()
        return output

In [14]:
import torch
import torch.nn as nn

class TransformerSparseEmbeddings(nn.Module):
    def __init__(self, args):
        super().__init__()

        self.args = args

        # Use Sparse Embedding
        self.embedding_layer = nn.EmbeddingBag(args.vocab_size, args.d_model, sparse=True)

        self.encoder_layers = nn.ModuleList()
        for _ in range(args.num_blocks):
            self.encoder_layers.append(EncoderBlock(args))

        self.norm = RMSNorm(args)
        self.output = nn.Linear(args.d_model, args.vocab_size, bias=False)
        self.freqs_complex = compute_freqs(args.d_model, args.max_seq_len, device=args.device, base=args.freq_base)

    def forward(self, tokens: torch.Tensor, start_pos: int):
        # Modify the embedding lookup to work with sparse embeddings
        offsets = torch.arange(0, tokens.size(0) * tokens.size(1), tokens.size(1), device=tokens.device)
        h = self.embedding_layer(tokens.view(-1), offsets)

        h = h.view(tokens.size(0), tokens.size(1), -1)
        freqs_complex = self.freqs_complex[:, start_pos: start_pos + tokens.shape[1]]

        h = self.norm(h)

        for layer in self.encoder_layers:
            h = layer(h, start_pos, freqs_complex)

        output = self.output(h).float()
        return output

In [15]:
import sentencepiece as spm

In [16]:
import pandas as pd
df = pd.read_csv('./DeepLearning/Ja-En-LLaMA/en-ja.bicleaner05.txt', sep="\\t", header=None)

  df = pd.read_csv('./DeepLearning/Ja-En-LLaMA/en-ja.bicleaner05.txt', sep="\\t", header=None)


In [17]:
df.head()[[3, 4]]

Unnamed: 0,3,4
0,And everyone will not care that it is not you.,鼻・口のところはあらかじめ少し切っておくといいですね。
1,And everyone will not care that it is not you.,アドレス置いとくので、消されないうちにメールくれたら嬉しいです。
2,Sponsored link This advertisement is displayed...,スポンサードリンク この広告は一定期間更新がない場合に表示されます。
3,"Also, it will always be hidden when becoming a...",また、 プレミアムユーザー になると常に非表示になります。
4,It will return to non-display when content upd...,コンテンツの更新が行われると非表示に戻ります。


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25740835 entries, 0 to 25740834
Data columns (total 5 columns):
 #   Column  Dtype  
---  ------  -----  
 0   0       object 
 1   1       object 
 2   2       float64
 3   3       object 
 4   4       object 
dtypes: float64(1), object(4)
memory usage: 981.9+ MB


In [19]:
df[3][9]

'Go to the original video hierarchy of the conversion source, copy and paste the following is fine. ffmpeg -i sample.mp4 -strict -2 video.webm summary I’ve been using the upload and embed method to Youtube to set up videos on the web.'

In [20]:
# using trained tokenizers from http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/
english_tokenizer = spm.SentencePieceProcessor("./DeepLearning/Ja-En-LLaMA/enja_spm_models/spm.en.nopretok.model")
japanese_tokenizer = spm.SentencePieceProcessor("./DeepLearning/Ja-En-LLaMA/enja_spm_models/spm.ja.nopretok.model")

In [21]:
df[3].isna().unique(), df[4].isna().unique()

(array([False]), array([False]))

In [22]:
english_tokenizer.encode("Go to the original video hierarchy of the conversion source, copy and paste the following is fine. ffmpeg -i sample.mp4 -strict -2 video.webm summary I’ve been using the upload and embed method to Youtube to set up videos on the web.", out_type=str)

['▁Go',
 '▁to',
 '▁the',
 '▁original',
 '▁video',
 '▁hierarchy',
 '▁of',
 '▁the',
 '▁conversion',
 '▁source',
 ',',
 '▁copy',
 '▁and',
 '▁paste',
 '▁the',
 '▁following',
 '▁is',
 '▁fine',
 '.',
 '▁',
 'ff',
 'mp',
 'eg',
 '▁-',
 'i',
 '▁sample',
 '.',
 'mp',
 '4',
 '▁-',
 'strict',
 '▁-',
 '2',
 '▁video',
 '.',
 'web',
 'm',
 '▁summary',
 '▁I',
 '’',
 've',
 '▁been',
 '▁using',
 '▁the',
 '▁upload',
 '▁and',
 '▁embed',
 '▁method',
 '▁to',
 '▁You',
 'tube',
 '▁to',
 '▁set',
 '▁up',
 '▁videos',
 '▁on',
 '▁the',
 '▁web',
 '.']

In [23]:
japanese_tokenizer.encode("年金 日本に住んでいる20歳~60歳の全ての人は、公的年金制度に加入しなければなりません。", out_type=str)

['▁',
 '年',
 '金',
 '▁日本',
 'に住んでいる',
 '20',
 '歳',
 '~',
 '60',
 '歳の',
 '全ての',
 '人は',
 '、',
 '公的',
 '年',
 '金',
 '制度',
 'に',
 '加入',
 'しなければなりません',
 '。']

In [24]:
english_tokenizer.vocab_size(), japanese_tokenizer.vocab_size()

(32000, 32000)

In [25]:
from collections import Counter
from torchtext.vocab import vocab


def build_vocab(sentences, tokenizer):
    counter = Counter()
    for sentence in sentences:
        counter.update(tokenizer.encode(sentence, out_type=str))
    return vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>']) # specials: include special tokens in the mapping



In [26]:
print(len(df), len(df) // 10)

25740835 2574083


In [27]:
df = df.sample(frac=1)
english = df[3].tolist()[:len(df) // 500]
japanese = df[4].tolist()[:len(df) // 500]
assert len(english) == len(japanese)
l = len(english)
train_en = english[:int(0.7 * l)]
val_en = english[int(0.7 * l): int(0.85 * l)]
test_en = english[int(0.85 * l): l]
train_ja = japanese[:int(0.7 * l)]
val_ja = japanese[int(0.7 * l): int(0.85 * l)]
test_ja = japanese[int(0.85 * l): l]

In [28]:
print(len(train_en), len(df))

36036 25740835


In [29]:
vocab_ja = build_vocab(japanese, japanese_tokenizer)
vocab_en = build_vocab(english, english_tokenizer)

In [30]:
def data_process(ja, en):
    data = []
    for (raw_ja, raw_en) in zip(ja, en):
        ja_tensor = torch.tensor([vocab_ja[token] for token in japanese_tokenizer.encode(raw_ja.strip("\n"), out_type=str)], dtype=torch.long)
        en_tensor = torch.tensor([vocab_en[token] for token in english_tokenizer.encode(raw_en.rstrip("\n"), out_type=str)], dtype=torch.long)
        data.append((ja_tensor, en_tensor))
    return data

In [31]:
train = data_process(train_ja, train_en)

In [32]:
a = ModelArgs()
print(vars(a))

{'batch_size': 32, 'd_model': 512, 'hidden_dim': 1024, 'num_blocks': 32, 'num_q_heads': 32, 'num_kv_heads': 16, 'vocab_size': -1, 'eps': 1e-06, 'max_batch_size': 32, 'max_seq_len': 512, 'device': device(type='cuda'), 'freq_base': 10000, 'epochs': 5}


In [33]:
from torch.nn.utils.rnn import pad_sequence

PAD_IDX = vocab_ja['<pad>']
BOS_IDX = vocab_ja['<bos>']
EOS_IDX = vocab_ja['<eos>']

def generate_batch(data_batch):
    ja_batch, en_batch = [], []
    for (ja_item, en_item) in data_batch:
        ja_batch.append(torch.cat([torch.tensor([BOS_IDX]), ja_item, torch.tensor([EOS_IDX])], dim=0))
        en_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0))
    ja_batch = pad_sequence(ja_batch, padding_value=PAD_IDX) # pad sequences into equal length
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
    return ja_batch, en_batch

In [34]:
from torch.utils.data import DataLoader

args = ModelArgs()

train_iter = DataLoader(train, batch_size=args.batch_size, shuffle=True, collate_fn=generate_batch) # type: ignore

In [35]:
len(vocab_ja)

24181

In [36]:
args.vocab_size = len(vocab_ja)
# args = ModelArgs(batch_size=32, d_model=64, hidden_dim=512, num_blocks=8, num_q_heads=32, num_kv_heads=16, vocab_size=100)
print(vars(args))

{'batch_size': 32, 'd_model': 512, 'hidden_dim': 1024, 'num_blocks': 32, 'num_q_heads': 32, 'num_kv_heads': 16, 'vocab_size': 24181, 'eps': 1e-06, 'max_batch_size': 32, 'max_seq_len': 512, 'device': device(type='cuda'), 'freq_base': 10000, 'epochs': 5}


In [37]:
args = ModelArgs(batch_size=32, d_model=64, hidden_dim=512, num_blocks=8, num_q_heads=32, num_kv_heads=16, vocab_size=26904)
transformer = TransformerSparseEmbeddings(args)
transformer = transformer.to(args.device)

In [38]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001)

def train_epoch(model, train_iter, optimizer, device):
    model.train()
    losses = 0
    for i, (x, y) in enumerate(train_iter):
        x = x.to(device)
        y = y.to(device)
        y = y[:-1, :]
        logits = model(x, 0)

        y_out = y[1:,:]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), y_out.reshape(-1))
        loss.backward()
        optimizer.step()
        losses += loss.item()
    return losses / len(train_iter)