In [1]:
import numpy as np
import random
import torch
import torch.nn as nn
from torch.optim import Adam, AdamW
from torch.utils.data import DataLoader
import torch.nn.functional as F

In [2]:
torch.manual_seed(0)
random.seed(0)

In [3]:
import pandas as pd


train_dataset = pd.read_csv('/kaggle/input/machine-translation-ioai/train.csv').values
test_dataset = pd.read_csv('/kaggle/input/machine-translation-ioai/test.csv')

In [4]:
MAX_LENGTH = max(map(lambda x: len(x[0]), train_dataset)) + 1

MAX_LENGTH

41

In [5]:
SOS_token = 0
EOS_token = 1


class Lang:

    def __init__(self, name):
        self.name = name
        self.word2index = {
            'SOS': 0,
            'EOS': 1
        }
        self.index2word = {
            0: 'SOS',
            1: 'EOS'
        }

    @property
    def n_words(self) -> int:
        return len(self.index2word)

    def add_sentence(self, sentence):
        for word in list(sentence):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word

In [6]:
input_lang = Lang('human')
output_lang = Lang('iso')

for pair in train_dataset:
    input_lang.add_sentence(pair[0])
    output_lang.add_sentence(pair[1])

print(input_lang.name, input_lang.n_words)
print(output_lang.name, output_lang.n_words)

human 82
iso 13


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
class Encoder(nn.Module):

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, x, hidden):
        embedded = self.embedding(x).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [9]:
class Decoder(nn.Module):

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.map_encoder = nn.Linear(hidden_size, hidden_size)
        self.map_combined = nn.Linear(2 * hidden_size, hidden_size)
        # self.bn_combined = nn.BatchNorm1d(hidden_size)
        self.dropout = nn.Dropout(p=.1)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x, hidden, encoder_outs):
        embed = self.embedding(x).view(-1, 1, self.hidden_size)
        embed = self.dropout(embed)

        attn_w = self.map_encoder(encoder_outs)
        attn_w = F.softmax(hidden.view(1,self.hidden_size) @ attn_w.view(-1, self.hidden_size).T,dim=-1)

        attn_val = attn_w @ encoder_outs 
        
        combined = torch.cat([embed, attn_val.unsqueeze(0)], dim=-1)
        combined = self.map_combined(combined)
        # combined = self.bn_combined(combined)
        output = self.relu(combined)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [10]:
def sentence2idx(lang, sentence):
    return [lang.word2index[word] for word in list(sentence)]


def sentence2tensor(lang, sentence):
    indexes = sentence2idx(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def pair2tensor(x):
    input_tensor = sentence2tensor(input_lang, x[0])
    target_tensor = sentence2tensor(output_lang, x[1])
    return input_tensor, target_tensor

In [11]:
def train_single(
        input_tensor, target_tensor,
        encoder, decoder,
        encoder_optimizer, decoder_optimizer,
        criterion
):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss = 0

    encoder_hidden = encoder.init_hidden()

    # enocder_outs = torch.zeros((input_tensor.shape[0], MAX_LENGTH, encoder.hidden_size), device=device)
    enocder_outs = torch.zeros((MAX_LENGTH, encoder.hidden_size), device=device)

    for ei, elem in enumerate(input_tensor):
        encoder_output, encoder_hidden = encoder(elem, encoder_hidden)
        # enocder_outs[:, ei] = encoder_output[:,0]
        enocder_outs[ei] = encoder_output[0,0]

    # decoder_input = torch.tensor([[SOS_token] for i in range(input_tensor.shape[0])], device=device)
    decoder_input = torch.tensor([[SOS_token]], device=device)


    decoder_hidden = encoder_hidden

    teacher_forcing_ratio = 0.5
    use_teacher_forcing = False
    # use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        for elem in target_tensor:
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, enocder_outs)
            loss += criterion(decoder_output, elem)
            decoder_input = elem
    else:
        for elem in target_tensor:
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, enocder_outs)
            _, topi = decoder_output.data.topk(1)
            decoder_input = topi.squeeze().detach()

            loss += criterion(decoder_output, elem)
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / len(target_tensor)

In [12]:
def train(encoder, decoder, n_epochs=5, print_every=100):
    encoder.train()
    decoder.train()

    encoder_optimizer = AdamW(encoder.parameters(), lr=3e-4, weight_decay=1e-6)
    decoder_optimizer = AdamW(decoder.parameters(), lr=3e-4, weight_decay=1e-6)
    encoder_scheduler = torch.optim.lr_scheduler.StepLR(encoder_optimizer, gamma=0.65, step_size=2)
    decoder_scheduler = torch.optim.lr_scheduler.StepLR(decoder_optimizer, gamma=0.65, step_size=2)

    training_pairs = [
        pair2tensor(x) for x in train_dataset
    ]

    # dataloader = DataLoader(training_pairs, shuffle=True, batch_size=32)

    criterion = nn.NLLLoss()

    best_loss = 10e9
    best_encoder = -1
    best_decoder = -1

    for epoch in range(n_epochs):
        print_loss_total = 0

        print(f'Epoch [{epoch + 1:02d}/{n_epochs:02d}]')

        # for i, batch in enumerate(dataloader):
        for i, batch in enumerate(training_pairs):

            input_tensor = batch[0]
            target_tensor = batch[1]

            # print(input_tensor.shape)
            loss = train_single(
                input_tensor, target_tensor,
                encoder, decoder,
                encoder_optimizer, decoder_optimizer,
                criterion
            )
            print_loss_total += loss

            if (i + 1) % print_every == 0:
                print_loss_avg = print_loss_total / print_every

                if print_loss_avg < best_loss:
                    best_loss = print_loss_avg
                    best_encoder = encoder
                    best_decoder = decoder
                
                print_loss_total = 0
                print(f'Training ({i / len(training_pairs) * 100:.1f}%) loss: {print_loss_avg:.4f}')

    encoder_scheduler.step()
    decoder_scheduler.step()

    return best_encoder, best_decoder

In [13]:
encoder_model = Encoder(input_lang.n_words, 1024).to(device)
decoder_model = Decoder(1024, output_lang.n_words).to(device)

encoder_model, decoder_model = train(encoder_model, decoder_model, n_epochs=20)

#lr 3e-4, 512, 10 -> 0.71
#lr 3e-4, 768, 20 -> 0.71

Epoch [01/20]
Training (9.0%) loss: 1.7694
Training (18.2%) loss: 1.2040
Training (27.3%) loss: 0.7353
Training (36.4%) loss: 0.6459
Training (45.6%) loss: 0.5833
Training (54.7%) loss: 0.5987
Training (63.8%) loss: 0.5666
Training (73.0%) loss: 0.5324
Training (82.1%) loss: 0.4801
Training (91.2%) loss: 0.4519
Epoch [02/20]
Training (9.0%) loss: 0.3958
Training (18.2%) loss: 0.3808
Training (27.3%) loss: 0.3570
Training (36.4%) loss: 0.3371
Training (45.6%) loss: 0.4728
Training (54.7%) loss: 0.3311
Training (63.8%) loss: 0.2639
Training (73.0%) loss: 0.2195
Training (82.1%) loss: 0.2134
Training (91.2%) loss: 0.1754
Epoch [03/20]
Training (9.0%) loss: 0.1486
Training (18.2%) loss: 0.1269
Training (27.3%) loss: 0.1173
Training (36.4%) loss: 0.1039
Training (45.6%) loss: 0.0827
Training (54.7%) loss: 0.0776
Training (63.8%) loss: 0.0692
Training (73.0%) loss: 0.0604
Training (82.1%) loss: 0.0750
Training (91.2%) loss: 0.0551
Epoch [04/20]
Training (9.0%) loss: 0.0656
Training (18.2%) l

In [14]:
@torch.no_grad()
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    encoder.eval()
    decoder.eval()

    input_tensor = sentence2tensor(input_lang, sentence)
    encoder_hidden = encoder.init_hidden()

    enocder_outs = torch.zeros((MAX_LENGTH, encoder.hidden_size), device=device)

    for ei, elem in enumerate(input_tensor):
        encoder_output, encoder_hidden = encoder(elem, encoder_hidden)
        enocder_outs[ei] = encoder_output[0,0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    decoded_words = []

    for di in range(max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, enocder_outs)
        _, topi = decoder_output.data.topk(1)
        decoded_words.append(output_lang.index2word[topi.item()])

        if topi.item() == EOS_token:
            break

        decoder_input = topi.squeeze().detach()

    return decoded_words


def predict_(encoder, decoder, dataset):
    result = []

    for _ in dataset:
        result.append(evaluate(encoder, decoder, _)[:10])

    return result

In [15]:
def evaluateRandomly(encoder, decoder, n=10):

    acc = 0

    for i in range(n):
        pair = random.choice(train_dataset)
        print(pair[0])
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ''.join(output_words)
        print('<', output_sentence.replace('EOS', ''))
        print(output_sentence.replace('EOS', '') == pair[1])
        print('')
        acc += int(output_sentence.replace('EOS', '') == pair[1])
    print('Accuracy:', acc / n)

In [16]:
evaluateRandomly(encoder_model, decoder_model, n=100)

sipchil 10 2077
> sipchil 10 2077
= 17-10-2077
< 17-10-2077
True

двадцать шестого августа 2007
> двадцать шестого августа 2007
= 26-08-2007
< 26-08-2007
True

седмог 05 2049
> седмог 05 2049
= 07-05-2049
< 07-05-2049
True

siebzehnter märz 2007
> siebzehnter märz 2007
= 17-03-2007
< 17-03-2007
True

18.12.49
> 18.12.49
= 18-12-2049
< 18-12-2049
True

двадесет шестог априла 2007
> двадесет шестог априла 2007
= 26-04-2007
< 26-04-2007
True

25 февраля 2049
> 25 февраля 2049
= 25-02-2049
< 25-02-2049
True

der neunzehnte april 2007
> der neunzehnte april 2007
= 19-04-2007
< 19-04-2007
True

восьмого  02 2007
> восьмого  02 2007
= 08-02-2007
< 08-02-2007
True

17 ноябрдә 2049
> 17 ноябрдә 2049
= 17-11-2049
< 17-11-2049
True

12 semptembre 2077
> 12 semptembre 2077
= 12-09-2077
< 12-09-2077
True

15 parwol 2007
> 15 parwol 2007
= 15-08-2007
< 15-08-2007
True

le quatre février 2049
> le quatre février 2049
= 04-02-2049
< 04-02-2049
True

җиденче июлдә 2007
> җиденче июлдә 2007
= 07-07-2007

In [17]:
test_dataset = pd.read_csv('/kaggle/input/machine-translation-ioai/test.csv')

In [18]:
test_prediction = predict_(encoder_model, decoder_model, test_dataset['data'])

In [19]:
test_prediction = [''.join(x) for x in test_prediction]

In [20]:
test_dataset['label'] = test_prediction

In [21]:
test_dataset[['id', 'label']].to_csv('submission.csv', index=None)