In [0]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import DataLoader

In [0]:
import os

# `!export ...` runs in a throwaway subshell, so the variable never reaches the
# kernel process that actually runs PyTorch. Set it on the kernel's own
# environment instead. NOTE(review): this should run before CUDA is first
# initialised for the setting to be picked up.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [0]:
import click
from pathlib import Path
from einops import rearrange
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

from torch.optim import Adam
import torch
import torch.nn as nn

In [0]:
# Mount Google Drive at /content/drive (re-mounting if it is already mounted)
# so that files stored there can be reached from this runtime.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
# Copy the `data_enru` folder from the mounted Drive into the working directory.
!cp -r '/content/drive/My Drive/data_enru' 'data_enru'

In [0]:
# Copy the Drive folder `data_orig` into the working directory under the name `data`.
!cp -r '/content/drive/My Drive/data_orig' 'data'

In [0]:
# Install einops (no version is pinned).
# NOTE(review): an earlier cell already does `from einops import rearrange`;
# on a fresh runtime this install has to be run before that cell.
!pip install einops



In [0]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [0]:
import pickle
import random
import numpy as np

from torch.utils.data import Dataset


class ParallelLanguageDataset(Dataset):
    """Parallel corpus whose items are whole, pre-assembled batches.

    The two pickled corpora are read with ``load_data`` and the sentence
    indices are grouped into batches by ``gen_batches``. Indexing the dataset
    therefore yields one complete batch rather than one sentence pair.
    """

    def __init__(self, data_path_1, data_path_2, num_tokens, max_seq_length):
        """Load both corpora and build the initial set of batches."""
        self.num_tokens = num_tokens
        loaded = load_data(data_path_1, data_path_2, max_seq_length)
        self.data_1, self.data_2, self.data_lengths = loaded
        self.batches = gen_batches(self.num_tokens, self.data_lengths)

    def __len__(self):
        """Number of batches currently available."""
        return len(self.batches)

    def __getitem__(self, idx):
        """Return ``(src, src_mask, tgt, tgt_mask)`` for batch ``idx``."""
        source_side = getitem(idx, self.data_1, self.batches, True)
        target_side = getitem(idx, self.data_2, self.batches, False)
        return (*source_side, *target_side)

    def shuffle_batches(self):
        """Re-shuffle the sentence indices and rebuild the batches."""
        self.batches = gen_batches(self.num_tokens, self.data_lengths)


def gen_batches(num_tokens, data_lengths):
    """Group sentence indices into batches under a token budget.

    Args:
        num_tokens: Target maximum of ``source_len + target_len`` summed over
            the sentences of one batch (padding is not counted).
        data_lengths: Mapping ``(source_len, target_len) -> list of sentence
            indices``. The lists are shuffled in place.

    Returns:
        A list of batches, each a list of sentence indices. Length groups are
        visited in sorted key order, and a group's leftover is merged into the
        previous batch when the combined token count still fits the budget.

    A sentence pair that is longer than ``num_tokens`` on its own is emitted
    as a single-sentence batch (exceeding the budget). Previously such a pair
    produced an empty slice on every iteration and the loop never terminated.
    """
    # Shuffle all the indices so batch membership changes between calls.
    for indices in data_lengths.values():
        random.shuffle(indices)

    batches = []
    # Sentinel larger than any budget: the very first chunk always opens a batch.
    prev_tokens_in_batch = 1e10
    for k in sorted(data_lengths):
        remaining = data_lengths[k]
        pair_len = k[0] + k[1]
        if pair_len <= 0:
            # Zero-length pairs contribute no tokens and are skipped.
            continue

        while remaining:
            budget = min(pair_len * len(remaining), num_tokens)
            # Always take at least one sentence so the loop makes progress
            # even when a single pair does not fit the budget.
            sentences_in_batch = max(1, budget // pair_len)
            tokens_in_batch = sentences_in_batch * pair_len

            # Combine with previous batch?
            if tokens_in_batch + prev_tokens_in_batch <= num_tokens:
                batches[-1].extend(remaining[:sentences_in_batch])
                prev_tokens_in_batch += tokens_in_batch
            else:
                batches.append(remaining[:sentences_in_batch])
                prev_tokens_in_batch = tokens_in_batch
            remaining = remaining[sentences_in_batch:]
    return batches


def load_data(data_path_1, data_path_2, max_seq_length):
    """Load two pickled corpora and index usable sentence pairs by length.

    Args:
        data_path_1: Path of the first pickled corpus.
        data_path_2: Path of the second pickled corpus.
        max_seq_length: Upper bound on the first corpus' sentence length; the
            second corpus' sentences may be at most ``max_seq_length - 2``.

    Returns:
        ``(data_1, data_2, data_lengths)`` where ``data_lengths`` maps
        ``(len(sentence_1), len(sentence_2))`` to the list of pair indices
        having exactly those lengths. Pairs with an empty or over-long side
        are left out of the mapping.
    """
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    corpora = []
    for pickle_path in (data_path_1, data_path_2):
        with open(pickle_path, 'rb') as handle:
            corpora.append(pickle.load(handle))
    data_1, data_2 = corpora

    data_lengths = {}
    for index, (first, second) in enumerate(zip(data_1, data_2)):
        first_len = len(first)
        if first_len <= 0 or first_len > max_seq_length:
            continue
        second_len = len(second)
        if second_len <= 0 or second_len > max_seq_length - 2:
            continue
        data_lengths.setdefault((first_len, second_len), []).append(index)
    return data_1, data_2, data_lengths


def getitem(idx, data, batches, src):
    """Assemble one padded batch and its padding mask.

    Args:
        idx: Position of the batch inside ``batches``.
        data: Sequence of token-id lists, indexed by sentence index.
        batches: List of batches, each a list of sentence indices.
        src: When true the sentences are used as they are; otherwise every
            sentence is wrapped as ``[2] + sentence + [3]``.

    Returns:
        ``(tokens, mask)`` as NumPy arrays. Rows are right-padded with ``0`` to
        the longest sentence of the batch; ``mask`` is ``True`` exactly at the
        padded positions.
    """
    rows = []
    for sentence_index in batches[idx]:
        tokens = data[sentence_index]
        rows.append(tokens if src else [2] + tokens + [3])

    width = max((len(row) for row in rows), default=0)

    padded_rows, mask_rows = [], []
    for row in rows:
        missing = width - len(row)
        padded_rows.append(row + [0] * missing)
        mask_rows.append([False] * len(row) + [True] * missing)

    return np.array(padded_rows), np.array(mask_rows)

In [0]:
import numpy as np


# From https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Optim.py
class ScheduledOptim():
    '''Wraps an optimizer and sets its learning rate before every step.

    The rate is ``d_model ** -0.5`` multiplied by the smaller of
    ``step ** -0.5`` and ``step * n_warmup_steps ** -1.5``.
    '''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.init_lr = np.power(d_model, -0.5)
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0

    def step_and_update_lr(self):
        "Refresh the learning rate, then step the wrapped optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Clear the gradients held by the wrapped optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        '''Scale factor for the current step count.'''
        step = self.n_current_steps
        decay_term = np.power(step, -0.5)
        warmup_term = np.power(self.n_warmup_steps, -1.5) * step
        return np.min([decay_term, warmup_term])

    def _update_learning_rate(self):
        '''Advance the step counter and write the new rate to every param group.'''
        self.n_current_steps += 1
        new_lr = self.init_lr * self._get_lr_scale()

        for group in self._optimizer.param_groups:
            group['lr'] = new_lr

In [0]:
import math
from einops import rearrange

import torch
from torch import nn


class LanguageTransformer(nn.Module):
    """Encoder-decoder transformer over token ids.

    Source and target tokens have separate embedding tables but share one
    positional-encoding module; a final linear layer maps every decoder
    position to ``vocab_size`` scores.
    """

    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, pos_dropout, trans_dropout):
        super().__init__()
        self.d_model = d_model

        # Module creation order is kept so seeded initialisation is unchanged.
        self.embed_src = nn.Embedding(vocab_size, d_model)
        self.embed_tgt = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model, pos_dropout, max_seq_length)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=trans_dropout,
        )
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt, src_key_padding_mask, tgt_key_padding_mask, memory_key_padding_mask, tgt_mask):
        """Score every target position.

        ``src`` and ``tgt`` are 2-D id tensors with the batch dimension first;
        they are transposed to sequence-first before embedding. The result has
        the batch dimension first again, with ``vocab_size`` scores last.
        """
        embedding_scale = math.sqrt(self.d_model)

        src_seq_first = rearrange(src, 'n s -> s n')
        tgt_seq_first = rearrange(tgt, 'n t -> t n')
        src_encoded = self.pos_enc(self.embed_src(src_seq_first) * embedding_scale)
        tgt_encoded = self.pos_enc(self.embed_tgt(tgt_seq_first) * embedding_scale)

        decoded = self.transformer(
            src_encoded,
            tgt_encoded,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        batch_first = rearrange(decoded, 't n e -> n t e')
        return self.fc(batch_first)


# Source: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a sequence-first tensor.

    Args:
        d_model: Size of the last (embedding) dimension. Both even and odd
            values are supported.
        dropout: Dropout probability applied after the encoding is added.
        max_len: Number of positions for which the table is precomputed.

    The table is stored as the non-trainable buffer ``pe`` with shape
    ``(max_len, 1, d_model)`` so it broadcasts over the batch dimension.
    """

    def __init__(self, d_model, dropout=0.1, max_len=100):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns;
        # slicing avoids the shape mismatch the unsliced assignment raised.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Add the encoding for the first ``x.size(0)`` positions, then apply dropout."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [0]:
def main():
    """Build the data loaders, model, optimizer and loss, then run training.

    All sizes and hyper-parameters are read from module-level names
    (``num_tokens``, ``max_seq_length``, ``vocab_size``, ...), which must be
    defined before this function is called.
    """
    def build_loader(source_pickle, target_pickle):
        # Every dataset item is already a full batch, hence batch_size=1.
        dataset = ParallelLanguageDataset(source_pickle, target_pickle,
                                          num_tokens, max_seq_length)
        return DataLoader(dataset, batch_size=1, shuffle=True, num_workers=4, pin_memory=True)

    train_loader = build_loader('data/processed/en/train.pkl',
                                'data/processed/fr/train.pkl')
    valid_loader = build_loader('data/processed/en/val.pkl',
                                'data/processed/fr/val.pkl')

    model = LanguageTransformer(vocab_size, d_model, nhead, num_encoder_layers,
                                num_decoder_layers, dim_feedforward, max_seq_length,
                                pos_dropout, trans_dropout)
    model = model.to(device)

    # Xavier-initialise every parameter that has more than one dimension.
    for weight in (p for p in model.parameters() if p.dim() > 1):
        nn.init.xavier_normal_(weight)

    inner_optimizer = Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09)
    optim = ScheduledOptim(inner_optimizer, d_model, n_warmup_steps)

    criterion = nn.CrossEntropyLoss(ignore_index=0)

    # The returned loss histories are not used any further here.
    train_losses, val_losses = train(train_loader, valid_loader, model, optim, criterion, num_epochs)


def train(train_loader, valid_loader, model, optim, criterion, num_epochs):
    """Train ``model`` and checkpoint it whenever the validation loss improves.

    Args:
        train_loader: Loader yielding ``(src, src_key_padding_mask, tgt,
            tgt_key_padding_mask)``; element ``[0]`` of each is used, which
            presumably strips the batch dimension added by ``batch_size=1``.
            Its dataset must provide ``shuffle_batches()``.
        valid_loader: Loader passed to ``validate`` after every epoch.
        model: Network called as ``model(src, tgt_inp, src_key_padding_mask,
            tgt_key_padding_mask, memory_key_padding_mask, tgt_mask)``.
        optim: Object exposing ``zero_grad()`` and ``step_and_update_lr()``.
        criterion: Loss applied to flattened scores and flattened targets.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        ``(train_losses, val_losses)``: lists of ``(global_step, loss)`` pairs.
        Both use the same running step counter so they can share one axis.
    """
    import os  # function-scope import so this block does not rely on another cell

    print_every = 500
    checkpoint_path = 'output/transformer.pth'
    # Create the checkpoint directory up front; otherwise the first save would
    # fail only after a whole epoch has been trained.
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    model.train()

    lowest_val = 1e9
    train_losses = []
    val_losses = []
    total_step = 0
    for epoch in range(num_epochs):
        pbar = tqdm(total=print_every, leave=False)
        total_loss = 0

        # Regroup the sentences so every epoch sees different batches.
        train_loader.dataset.shuffle_batches()
        for step, (src, src_key_padding_mask, tgt, tgt_key_padding_mask) in enumerate(train_loader):
            total_step += 1

            src, src_key_padding_mask = src[0].to(device), src_key_padding_mask[0].to(device)
            tgt, tgt_key_padding_mask = tgt[0].to(device), tgt_key_padding_mask[0].to(device)

            memory_key_padding_mask = src_key_padding_mask.clone()
            # Teacher forcing: feed all but the last token, predict all but the first.
            tgt_inp, tgt_out = tgt[:, :-1], tgt[:, 1:]
            tgt_mask = gen_nopeek_mask(tgt_inp.shape[1]).to(device)

            optim.zero_grad()
            outputs = model(src, tgt_inp, src_key_padding_mask, tgt_key_padding_mask[:, :-1], memory_key_padding_mask, tgt_mask)
            loss = criterion(rearrange(outputs, 'b t v -> (b t) v'), rearrange(tgt_out, 'b o -> (b o)'))

            loss.backward()
            optim.step_and_update_lr()

            # Read the scalar once and reuse it.
            loss_value = loss.item()
            total_loss += loss_value
            # Record the global step (not the per-epoch one) so the x-values
            # do not restart every epoch and line up with val_losses.
            train_losses.append((total_step, loss_value))
            pbar.update(1)
            if step % print_every == print_every - 1:
                pbar.close()
                print(f'Epoch [{epoch + 1} / {num_epochs}] \t Step [{step + 1} / {len(train_loader)}] \t '
                      f'Train Loss: {total_loss / print_every}')
                total_loss = 0

                pbar = tqdm(total=print_every, leave=False)

        pbar.close()
        val_loss = validate(valid_loader, model, criterion)
        val_losses.append((total_step, val_loss))
        if val_loss < lowest_val:
            lowest_val = val_loss
            # Saves the whole module object (not just its state_dict).
            torch.save(model, checkpoint_path)
        print(f'Val Loss: {val_loss}')
    return train_losses, val_losses


def validate(valid_loader, model, criterion):
    """Return the mean per-batch loss of ``model`` over ``valid_loader``.

    The model is switched to eval mode for the pass and always switched back
    to train mode afterwards, even when the pass raises. Gradients are
    disabled for the whole loop.

    Args:
        valid_loader: Loader yielding ``(src, src_key_padding_mask, tgt,
            tgt_key_padding_mask)``; element ``[0]`` of each is used.
        model: Network to evaluate.
        criterion: Loss applied to flattened scores and flattened targets.

    Returns:
        Sum of the batch losses divided by ``len(valid_loader)``.
    """
    # len(loader) is the batch count. The former len(iter(loader)) built a
    # complete loader iterator (including its worker processes when
    # num_workers > 0) only to throw it away again -- possibly the origin of
    # stray BrokenPipeError messages from the workers.
    num_batches = len(valid_loader)
    pbar = tqdm(total=num_batches, leave=False)
    model.eval()

    total_loss = 0
    try:
        with torch.no_grad():
            for src, src_key_padding_mask, tgt, tgt_key_padding_mask in valid_loader:
                src, src_key_padding_mask = src[0].to(device), src_key_padding_mask[0].to(device)
                tgt, tgt_key_padding_mask = tgt[0].to(device), tgt_key_padding_mask[0].to(device)
                memory_key_padding_mask = src_key_padding_mask.clone()
                # Feed all but the last token, score against all but the first.
                tgt_inp = tgt[:, :-1]
                tgt_out = tgt[:, 1:].contiguous()
                tgt_mask = gen_nopeek_mask(tgt_inp.shape[1]).to(device)

                outputs = model(src, tgt_inp, src_key_padding_mask, tgt_key_padding_mask[:, :-1], memory_key_padding_mask, tgt_mask)
                loss = criterion(rearrange(outputs, 'b t v -> (b t) v'), rearrange(tgt_out, 'b o -> (b o)'))

                total_loss += loss.item()
                pbar.update(1)
    finally:
        # Restore the progress bar and training mode on every exit path.
        pbar.close()
        model.train()
    return total_loss / num_batches


def gen_nopeek_mask(length):
    """Build an additive causal mask of shape ``(length, length)``.

    Entry ``[i, j]`` is ``0.0`` when ``j <= i`` and ``-inf`` when ``j > i``, so
    position ``i`` cannot attend to later positions.

    Args:
        length: Side length of the square mask.

    Returns:
        A float tensor on the CPU holding the mask.
    """
    # Keep -inf strictly above the diagonal and zero everything else; this
    # gives the same values as transposing a boolean triangle and filling it
    # twice, without the extra passes.
    return torch.triu(torch.full((length, length), float('-inf')), diagonal=1)

In [0]:
# Number of passes over the training loader (used by train()).
num_epochs = 20
# Length filter applied by load_data: source sentences may have up to
# max_seq_length tokens, target sentences up to max_seq_length - 2.
max_seq_length = 96
# Token budget per batch handed to gen_batches (source + target lengths).
num_tokens = 2000

# Size of both embedding tables and of the output layer.
# NOTE(review): the "+4" presumably covers the four reserved ids
# (PAD/OOV/SOS/EOS) assigned in preprocess_data -- confirm that 1000 matches
# the vocabulary actually stored in the pickled data.
vocab_size = 1000+4
# Arguments forwarded to LanguageTransformer / nn.Transformer.
d_model = 512
num_encoder_layers = 6
num_decoder_layers = 6
dim_feedforward = 2048
nhead = 8
# Dropout inside PositionalEncoding and inside nn.Transformer, respectively.
pos_dropout = 0.1
trans_dropout = 0.1
# Warm-up length used by ScheduledOptim's learning-rate schedule.
n_warmup_steps = 4000

In [0]:
main()

  0%|          | 0/500 [00:00<?, ?it/s]Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


RuntimeError: ignored

In [0]:
def map_words(sentence, freq_list, oov_index=None):
    """Translate a tokenised sentence into vocabulary ids.

    Args:
        sentence: Iterable of word tokens.
        freq_list: Mapping from word to integer id.
        oov_index: Id to emit for words missing from ``freq_list``. With the
            default ``None`` such words are dropped from the result, which is
            the historical behaviour.

    Returns:
        List of ids in sentence order.
    """
    if oov_index is None:
        return [freq_list[word] for word in sentence if word in freq_list]
    # Keep the sentence length intact by substituting the OOV id.
    return [freq_list.get(word, oov_index) for word in sentence]

def preprocess_data(path, vocab_folder, lang, split = 0.8, max_vocab = None, seed = None):
    """Tokenise a text file, build a vocabulary and pickle train/val splits.

    Every line of ``path`` is lower-cased and split on single spaces; tokens
    listed in ``punct`` and empty tokens are discarded. The lines are shuffled
    and the first ``split`` fraction becomes the training set, the rest the
    validation set. The vocabulary is built from the training set only, most
    frequent word first, with ids starting at 4.

    Written files (under ``{vocab_folder}/processed/{lang}/``):
        ``train.pkl``, ``val.pkl``: lists of id lists.
        ``freq_list.pkl``: the word -> id mapping.

    Args:
        path: Text file with one sentence per line (read as UTF-8).
        vocab_folder: Root folder for the output.
        lang: Sub-folder name for this language.
        split: Fraction of lines used for training.
        max_vocab: Keep only this many most frequent words; ``None`` keeps all.
        seed: Seed for the shuffle. ``None`` uses the global ``random`` state.

    NOTE(review): when this is called once per language of a parallel corpus,
    pass the same ``seed`` to every call. Independent shuffles put different
    line numbers into each language's split, and consumers that pair the two
    pickles by position would then combine unrelated sentences.
    """
    # Function-scope imports: these names are not imported in the visible cells.
    import os
    from collections import Counter

    punct = ['(', ')', ':', '"', ' ']

    with open(path, 'r', encoding='utf-8') as fp:
        data = [line.strip() for line in fp]

    # Splitting on " " yields empty strings for repeated spaces and blank
    # lines; they are dropped so they cannot become vocabulary entries.
    proc_data = [
        [elem for elem in sentence.lower().split(" ") if elem and elem not in punct]
        for sentence in data
    ]

    indices = list(range(len(data)))
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(indices)

    # Default 80:20:0 train validation test split
    train_idx = int(len(data) * split)
    train_indices = indices[:train_idx]
    test_indices = indices[train_idx:]

    word_counts = Counter()
    for i in train_indices:
        word_counts.update(proc_data[i])

    # Map words in the dictionary to indices but reserve 0 for padding,
    # 1 for out of vocabulary words, 2 for start-of-sentence and 3 for end-of-sentence.
    # most_common() yields (word, count) pairs. Iterating the Counter itself
    # yields bare words, so indexing [0] picked each word's first character.
    freq_list = {word: i + 4 for i, (word, _count) in enumerate(word_counts.most_common(max_vocab))}
    freq_list['[PAD]'] = 0
    freq_list['[OOV]'] = 1
    freq_list['[SOS]'] = 2
    freq_list['[EOS]'] = 3
    proc_data = [map_words(sentence, freq_list) for sentence in tqdm(proc_data)]

    train = [proc_data[i] for i in train_indices]
    test = [proc_data[i] for i in test_indices]

    out_dir = f'{vocab_folder}/processed/{lang}'
    os.makedirs(out_dir, exist_ok=True)
    with open(f'{out_dir}/train.pkl', 'wb') as f:
        pickle.dump(train, f)
    with open(f'{out_dir}/val.pkl', 'wb') as f:
        pickle.dump(test, f)
    with open(f'{out_dir}/freq_list.pkl', 'wb') as f:
        pickle.dump(freq_list, f)


class TranslationDataset(Dataset):
    """Parallel corpus whose items are whole, token-budgeted batches.

    Two pickled lists of id lists are loaded and paired by position. Pairs are
    grouped by ``(source_len, target_len)`` and packed into batches whose
    summed ``source_len + target_len`` stays within ``num_tokens`` where
    possible. Indexing returns one padded batch with its padding masks.
    """

    def __init__(self, data_path_source, data_path_target, num_tokens, max_seq_length):
        """Load both corpora, index the usable pairs by length, build batches.

        Pairs are kept when the source has 1..max_seq_length tokens and the
        target 1..max_seq_length - 2 tokens (two ids are added to every target
        when a batch is assembled).
        """
        self.num_tokens = num_tokens

        # NOTE: pickle.load can execute arbitrary code; only use trusted files.
        with open(data_path_source, 'rb') as f:
            self.data_1 = pickle.load(f)
        with open(data_path_target, 'rb') as f:
            self.data_2 = pickle.load(f)

        self.data_lengths = {}
        for i, (str_1, str_2) in enumerate(zip(self.data_1, self.data_2)):
            if 0 < len(str_1) <= max_seq_length and 0 < len(str_2) <= max_seq_length - 2:
                self.data_lengths.setdefault((len(str_1), len(str_2)), []).append(i)

        self.batches = self._build_batches()

    def _build_batches(self):
        """Shuffle the index lists in place and pack them into batches."""
        for indices in self.data_lengths.values():
            random.shuffle(indices)

        batches = []
        # Sentinel larger than any budget: the first chunk always opens a batch.
        prev_tokens_in_batch = 1e10
        for k in sorted(self.data_lengths):
            remaining = self.data_lengths[k]
            pair_len = k[0] + k[1]
            if pair_len <= 0:
                continue

            while remaining:
                budget = min(pair_len * len(remaining), self.num_tokens)
                # Take at least one sentence: a pair longer than the budget
                # would otherwise yield empty slices forever.
                sentences_in_batch = max(1, budget // pair_len)
                tokens_in_batch = sentences_in_batch * pair_len

                # Combine with previous batch?
                if tokens_in_batch + prev_tokens_in_batch <= self.num_tokens:
                    batches[-1].extend(remaining[:sentences_in_batch])
                    prev_tokens_in_batch += tokens_in_batch
                else:
                    batches.append(remaining[:sentences_in_batch])
                    prev_tokens_in_batch = tokens_in_batch
                remaining = remaining[sentences_in_batch:]
        return batches

    def __proc_data_mask(self, idx, src = True):
        """Return ``(tokens, mask)`` arrays for one side of batch ``idx``.

        Target sentences are wrapped as ``[2] + sentence + [3]``. Rows are
        right-padded with ``0``; ``mask`` is ``True`` at padded positions.
        """
        sentence_indices = self.batches[idx]
        if src:
            batch = [self.data_1[i] for i in sentence_indices]
        else:
            batch = [[2] + self.data_2[i] + [3] for i in sentence_indices]

        seq_length = max((len(sentence) for sentence in batch), default=0)

        padded, masks = [], []
        for sentence in batch:
            missing = seq_length - len(sentence)
            masks.append([False] * len(sentence) + [True] * missing)
            padded.append(sentence + [0] * missing)

        return np.array(padded), np.array(masks)

    def __getitem__(self, item):
        """Return ``(src, src_mask, tgt, tgt_mask)`` for batch ``item``."""
        src, src_mask = self.__proc_data_mask(item, True)
        tgt, tgt_mask = self.__proc_data_mask(item, False)

        return src, src_mask, tgt, tgt_mask

    def __len__(self):
        """Number of batches currently available."""
        return len(self.batches)

    def shuffle_batches(self):
        """Re-shuffle the sentence indices and rebuild the batches."""
        self.batches = self._build_batches()
        

In [0]:
# Fetch the reference implementation. If the target directory already exists
# git refuses to clone and prints an error (see the recorded output below).
!git clone https://github.com/andrewpeng02/transformer-translation.git

fatal: destination path 'transformer-translation' already exists and is not an empty directory.


In [0]:
# Copy the contents of the cloned repository into the current working directory.
!cp -r transformer-translation/* ./ 

In [0]:
# Run train.py from the working directory as a separate process.
!python train.py

  0% 0/500 [00:00<?, ?it/s]torch.Size([7, 142, 512]) torch.Size([8, 142, 512])
  0% 1/500 [00:00<06:22,  1.30it/s]torch.Size([10, 95, 512]) torch.Size([12, 95, 512])
  0% 2/500 [00:01<05:16,  1.57it/s]torch.Size([10, 95, 512]) torch.Size([12, 95, 512])
  1% 3/500 [00:01<04:23,  1.89it/s]torch.Size([8, 117, 512]) torch.Size([10, 117, 512])
  1% 4/500 [00:01<03:45,  2.20it/s]torch.Size([9, 111, 512]) torch.Size([10, 111, 512])
  1% 5/500 [00:01<03:18,  2.50it/s]torch.Size([13, 74, 512]) torch.Size([15, 74, 512])
  1% 6/500 [00:02<02:58,  2.77it/s]torch.Size([8, 76, 512]) torch.Size([16, 76, 512])
  1% 7/500 [00:02<02:42,  3.03it/s]torch.Size([6, 166, 512]) torch.Size([7, 166, 512])
  2% 8/500 [00:02<02:34,  3.19it/s]torch.Size([8, 133, 512]) torch.Size([8, 133, 512])
  2% 9/500 [00:03<02:28,  3.30it/s]torch.Size([16, 22, 512]) torch.Size([19, 22, 512])
  2% 10/500 [00:03<02:10,  3.75it/s]torch.Size([10, 100, 512]) torch.Size([11, 100, 512])
  2% 11/500 [00:03<02:11,  3.71it/s]torch.Size(