In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

In [2]:
class Lang:
    def __init__(self ,id):
        self.id = id
        self.word2index = {"<SOS>" : 0, "<EOS>": 1}
        self.word2count = {}
        self.index2word = {0 : "<SOS>", 1: "<EOS>"}
        self.nwords = 2
        pass
    def word2vec(self, word):
        pass
    def vec2word(self, vec):
        pass

    def addSentence(self, sentence):
        for word in sentence.split(" "):
            self.addWord(word)
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.nwords
            self.index2word[self.nwords] = word
            self.nwords += 1
            self.word2count[word] = 0
        self.word2count[word] += 1

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

In [3]:
with open("eng-fra.txt", "r") as f:
    lines = f.read().strip().split("\n")
    pairs = [[normalizeString(pair) for pair in l.split("\t")] for l in lines]

In [4]:
fr = Lang("fr")
en = Lang("en")
for pair in pairs:
    fr.addSentence(pair[1])
    en.addSentence(pair[0])

In [5]:
MAX_LENGTH = 10

fr_prefixes = (
    "je suis",
    "il est", "c est",
    "elle est", "ce sont",
    "vous etes", "tu es",
    "nous sommes", "on est",
    "ils sont", "elles sont"
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(fr_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [6]:
with open("eng-fra.txt", "r") as f:
    lines = f.read().strip().split("\n")
    pairs = filterPairs([[normalizeString(pair) for pair in l.split("\t")] for l in lines])

In [7]:
fr = Lang("fr")
en = Lang("en")
for pair in pairs:
    fr.addSentence(pair[1])
    en.addSentence(pair[0])

In [8]:
en.nwords

3223

In [36]:
device = "cuda"
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, nlayers, dropout = 0.3):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.nlayers = nlayers
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(input_size , embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, nlayers, dropout = dropout, batch_first = True)
    def forward(self, x):
        # [1 x seq_length ]
        embedding = self.dropout(self.embedding(x))
        # [1 x seq_length x embedding_size]
        outputs, (hidden, cell) =  self.rnn(embedding)
        return (hidden, cell)
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, nlayers, output_size, dropout = 0.3):
        super(Decoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.nlayers = nlayers
        self.output_size = output_size
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, nlayers, dropout = dropout, batch_first = True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
    def forward_step(self, x, hidden, cell, teacher = None):
        # x : 1 x 1
        # 1 word at a time
        x = self.relu(self.dropout(self.embedding(x)))
        
        output, (hidden, cell) = self.rnn(x, )
        # output : [1 x N x hidden_size]
        preds = self.fc(output)

        # 1 x N x length_vocab
        preds = preds.squeeze(0)
        
        return preds, hidden, cell

    def forward(self,  encoder_hidden, encoder_cell):
        batch_size = encoder_hidden.size(1)
        x = torch.empty(1,1,  device = device, dtype = torch.long,).fill_(0)
        
        hidden, cell = encoder_hidden, encoder_cell
        outputs = []
        for i in range(MAX_LENGTH):
            output, hidden, cell = self.forward_step(x, hidden, cell)
            outputs.append(output)
            x = torch.argmax(output).unsqueeze(0).unsqueeze(0)
            #x = top.squeeze(-1).detach()
        outputs = torch.cat(outputs, dim = 0)
        outputs = F.log_softmax(outputs, dim = -1)
        return outputs
class Seq2Seq(nn.Module):
    def __init__(self, input_size, embedding_size, encoder_hidden_size, decoder_hidden_size, nlayers_encoder, nlayers_decoder, output_size):
        super(Seq2Seq, self).__init__()
        self.encoder = Encoder(input_size, embedding_size, encoder_hidden_size, nlayers_encoder)
        self.decoder = Decoder(output_size, embedding_size, decoder_hidden_size, nlayers_decoder, output_size)
    def forward(self, x):
        hidden, cell = self.encoder(x)
        return self.decoder(hidden, cell)
        

In [37]:
enc = Encoder(3, 10, 4, 2).to(device)

In [29]:
enc(torch.tensor([[1, 2,0], [1, 0, 0]], device =device))[0].shape

torch.Size([2, 2, 4])

In [38]:
X, Y= [], []
for pair in pairs:
    english_words = pair[0].split(" ")
    french_words = pair[1].split(" ")
    english_indexes = [0]
    french_indexes = [0]
    for word in english_words:
        english_indexes.append(en.word2index[word])
    for word in french_words:
        french_indexes.append(fr.word2index[word])
    french_indexes.append(1)
    X.append(torch.tensor(english_indexes, device = device))
    Y.append(torch.tensor(french_indexes, device = device))

In [35]:
outputs.shape

torch.Size([10, 4478])

In [50]:
from tqdm import tqdm
model = Seq2Seq(en.nwords, 256, 128, 128, 2, 2, fr.nwords).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
criterion = nn.CrossEntropyLoss()
n_epochs = 3
flatten = nn.Flatten()
for i in tqdm(range(n_epochs)):
    for x, y in zip(X, Y):
        optimizer.zero_grad()
        outputs = model(x)
        outputs = outputs.view(-1, outputs.size(-1)
        y = y.view(-1)
        print(outputs.shape, y.shape)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
    
        
    

  0%|                                                                                             | 0/3 [00:00<?, ?it/s]

torch.Size([1, 10, 4478]) torch.Size([1, 5, 1])





RuntimeError: Expected target size [1, 4478], got [1, 5, 1]

In [47]:
import os
from string import ascii_letters

In [95]:
dir = "./data/names"
lang2label = {}
index = 0
names = {}
for file in os.listdir(dir):
    lang, _ = file.split(".")
    lang2label[lang] = torch.tensor(index)
    names[lang] = []
    with open(dir + "/" + file, "r") as f:
        for line in f:
            names[lang].append(unicodeToAscii(line.strip()))
    index += 1

In [99]:
char2idx = {letter: i for i, letter in enumerate(ascii_letters + " .,:;-'")}
num_letters = len(char2idx); num_letters

59

In [174]:
num_langs = len(lang2label)

In [108]:
def name2tensor(name):
    res = torch.zeros(len(name), 1, num_letters, dtype = torch.long)
    for i in range(len(name)):
        res[i, 0, char2idx[name[i]]] = 1
    return res


In [156]:
def construct_dataset():
    dir = "./data/names"
    lang2label = {}
    index = 0
    names = {}
    X, y = [], []
    for file in os.listdir(dir):
        lang, _ = file.split(".")
        lang2label[lang] = torch.tensor(index)
        names[lang] = []
        with open(dir + "/" + file, "r") as f:
            for line in f:
                line = normalizeString(line)
                X.append(name2tensor(line))

                y.append(torch.tensor(index))
        index += 1
    return X, y

In [None]:
X, y = construct_dataset()



In [162]:
from sklearn.model_selection import train_test_split
index_train, index_test = train_test_split(range(len(X)), test_size = 0.1, shuffle = True)

In [165]:
X_train, y_train = [], []
X_test, y_test = [], []
for i in index_train:
    X_train.append(X[i])
    y_train.append(y[i])
for j in index_test:
    X_test.append(X[j])
    y_test.append(y[j])

In [223]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.in2hidden = nn.Linear(input_size + hidden_size, hidden_size)
        self.in2output = nn.Linear(input_size + hidden_size, output_size)
        self.relu = nn.ReLU()
    def forward(self, x, hidden = None):
        batch_size = x.size(0)
        if hidden is None:
            hidden = torch.zeros(batch_size, self.hidden_size)
        combined = torch.cat((x, hidden), dim = 1)
        hidden = self.relu(self.in2hidden(combined))
        output = self.in2output(combined)
        return output, hidden


class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.forget_gate = nn.Linear(input_size + hidden_size, hidden_size)
        self.input_gate = nn.Linear(input_size + hidden_size, hidden_size)
        self.candidate_gate = nn.Linear(input_size + hidden_size, hidden_size)
        self.filter_gate = nn.Linear(input_size + hidden_size, hidden_size)
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        self.in2hidden = nn.Linear(input_size + hidden_size, hidden_size)
        self.in2output = nn.Linear(input_size + hidden_size, output_size)
        self.relu = nn.ReLU()
    def forward(self, x, hidden = None, cell = None):
        batch_size = x.size(0)
        if hidden is None:
            hidden = torch.zeros(batch_size, self.hidden_size)
            cell = torch.zeros(batch_size, self.hidden_size)
        combined = torch.cat((x, hidden), dim = 1)
        cell = cell * self.sigmoid(self.forget_gate(combined))
        cell = cell + self.sigmoid(self.input_gate(combined)) * self.tanh(self.candidate_gate(combined))
        hidden = self.sigmoid(self.filter_gate(combined)) * self.tanh(cell)
        output = self.in2output(combined)
        return output, (hidden, cell)

In [224]:
rnn = LSTM(3, 32, 10)

In [226]:
lr = 1e-3
criterion = nn.CrossEntropyLoss()
model = LSTM(num_letters, 256, num_langs) 
optimizer = torch.optim.Adam(params = model.parameters(),  lr = lr)
n_epochs = 2
losses = []
from tqdm import tqdm
for i in tqdm(range(n_epochs)):
    loss_total = 0
    for j in range(len(X_train)):
        
        x, y = X_train[i], y_train[i]
        optimizer.zero_grad()
        hidden = None
        cell = None
        for c in x:
            output, (hidden, cell) = model(c, hidden, cell)
        
        loss = criterion(output, y.unsqueeze(0))
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        loss_total += loss.item() / len(X_train)
    losses.append(loss_total)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [07:15<00:00, 217.59s/it]


In [229]:
model(c)

(tensor([[-7.6276e-03, -1.1010e-01, -5.7426e-03,  3.2927e-02, -7.8736e-02,
          -7.9761e-02,  7.5931e-02, -4.1672e-03,  5.3505e-03, -4.5437e-02,
          -1.2615e-02, -4.1347e-02, -7.3037e-02, -7.2559e-03, -5.3806e-02,
          -2.1059e-02, -7.8413e-02, -9.2925e-05]], grad_fn=<AddmmBackward0>),
 (tensor([[ 0.0325, -0.0264,  0.0376, -0.0371, -0.0182,  0.0233, -0.0119, -0.0261,
            0.0038, -0.0226,  0.0166, -0.0223,  0.0284,  0.0090, -0.0232, -0.0061,
            0.0059,  0.0270,  0.0019, -0.0074, -0.0058,  0.0301, -0.0155,  0.0329,
            0.0037, -0.0055, -0.0123,  0.0144,  0.0225, -0.0341,  0.0375, -0.0221,
            0.0273, -0.0273, -0.0389, -0.0065,  0.0219,  0.0377, -0.0286,  0.0008,
           -0.0148, -0.0331, -0.0007, -0.0207, -0.0147,  0.0056, -0.0044, -0.0183,
           -0.0132, -0.0323,  0.0112, -0.0235, -0.0274, -0.0292,  0.0203,  0.0260,
           -0.0134, -0.0244,  0.0326,  0.0261,  0.0063, -0.0257, -0.0224, -0.0159,
            0.0134,  0.0045,  0.0

In [230]:
seq = X_train[0]
hidden = None
for c in seq:
    output, (hidden, cell) = model(c, hidden, cell)

In [231]:
output.argmax()

tensor(6)

In [232]:
losses

[0.0020898328232687472, 0.0]

In [238]:
model.eval()
acc = 0
with torch.no_grad():
    for i in range(len(X_train)):
        x, y = X_train[i], y_train[i]
        hidden = None
        cell = None
        for c in x:
            output, (hidden, cell) = model(c, hidden, cell)
        label_predicted = torch.argmax(output)
        acc += (label_predicted.item() == y.item())
    acc /= len(X_train)

In [239]:
acc

0.4692239566035647

In [192]:
losses

[0.001673747174622966, 0.0]