In [111]:
import os, sys, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize

import torch
import torchtext
import torch.nn as nn
from torch.utils import data

import torchvision.transforms as transforms

import torchtext.data as ttd
from torchtext.vocab import GloVe

from collections import OrderedDict

from datetime import datetime

In [9]:
# config
MAX_SEQUENCE_LENGTH = 100
MAX_NUM_WORDS = 20_000
NUM_SAMPLES = 10_000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 64
EPOCHS = 100
LATENT_DIM = 256

SOS = "<sos>"
EOS = "<eos>"

In [12]:
# load in the data
input_texts = []  # sentence in original language
target_texts = []  # sentence in target language
target_text_inputs = []  #  sentence in target language offset by 1
t = 0
for line in open("./fra.txt"):
    # only keep a limited number of samples
    t += 1
    if t > NUM_SAMPLES:
        break
    line = line.rstrip()
    # input and targets are separeted by tab
    if "\t" not in line:
        continue

    # split up the input and translation
    input_text, translation = line.split("\t")[:2]

    # make the target input and output
    # we'll be using teacher forcing
    #target_text = translation + " <eos>"
    #target_text_input = "<sos> " + translation

    input_texts.append(input_text)
    target_texts.append(translation)
    #target_text_inputs.append(target_text_input)
print("num samples:", len(input_texts))

num samples: 10000


In [29]:
# some checks
idx = np.random.randint(len(input_texts))
print(input_texts[idx])

idx = np.random.randint(len(target_texts))
print(target_texts[idx])

I forbid that.
Ne me pousse pas !


In [19]:
# build vocab
voc = []
voc_fr = [EOS, SOS]
#stop_words = stopwords.words("english")

for line in input_texts:
    line = word_tokenize(re.sub("\W+", " ", line.lower())) 
    for w in line:
        if w not in voc: # and w not in stop_words:
            voc.append(w)
            
for line in target_texts:
    line = word_tokenize(re.sub("\W+", " ", line.lower())) 
    for w in line:
        if w not in voc_fr: # and w not in stop_words:
            voc_fr.append(w)

In [20]:
# convert vocabulary to indices and keep the order 
word2idx = {}
for idx, w in enumerate(voc):
    word2idx[w] = idx + 1 # key=word, item=index

word2idx_keys = word2idx.keys()
print("Length of vocab : {0:d} tokens".format(len(word2idx)))

Length of vocab : 1999 tokens


In [21]:
# convert vocabulary to indices and keep the order 
word2idx_fr = {}
for idx, w in enumerate(voc_fr):
    word2idx_fr[w] = idx + 1 # key=word, item=index

word2idx_fr_keys = word2idx_fr.keys()
print("Length of vocab : {0:d} tokens".format(len(word2idx_fr)))

Length of vocab : 3932 tokens


In [74]:
# tokenizing inputs and targets
input_sequences = []
target_sequences = []
target_sequences_inputs = []

for line in input_texts:
    line = word_tokenize(re.sub("\W+", " ", line.lower())) 
    input_sequence = [word2idx[w] for w in line]
    input_sequences.append(input_sequence)
    

for line in target_texts:
    line = word_tokenize(re.sub("\W+", " ", line.lower())) 
    sequence = [word2idx_fr[w] for w in line] 
    target_sequence = sequence + [word2idx_fr[EOS]]
    target_sequence_input = [word2idx_fr[SOS]] + sequence
    target_sequences.append(target_sequence)
    target_sequences_inputs.append(target_sequence_input)
    


In [75]:
# sanity checks
target_sequences[-1], target_sequences_inputs[-1]

([132, 2481, 1927, 1], [2, 132, 2481, 1927])

In [76]:
# determine maximum length input/target sequence
max_len_input = max(len(s) for s in input_sequences)
print("max_len_input:", max_len_input)

max_len_target = max(len(s) for s in target_sequences)
print("max_len_target:", max_len_target)

max_len_input: 5
max_len_target: 13


In [77]:
# function to pad the sequences
def pad_sequences(sequences, maxlen, padding='pre'):
    seq = sequences.copy()
    if padding == 'post':
        for i in range(len(sequences)):
            if len(sequences[i]) < maxlen:
                seq[i] += (maxlen - len(sequences[i])) * [0]
    elif padding == 'pre':
        for i in range(len(sequences)):
            if len(sequences[i]) < maxlen:
                seq[i] = (maxlen - len(sequences[i])) * [0] + sequences[i]
    
    seq = np.array(seq)
    return seq
    

In [78]:
# pad the sequences
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)
print("encoder_data.shape:", len(encoder_inputs),len(encoder_inputs[0]) )
print("encoder_data[0]:", encoder_inputs[0])

decoder_inputs = pad_sequences(
    target_sequences_inputs, maxlen=max_len_target, padding="post"
)
print("decoder_data.shape:", len(decoder_inputs),len(decoder_inputs[0]))
print("decoder_data[0]:", decoder_inputs[0])

decoder_targets = pad_sequences(target_sequences, maxlen=max_len_target, padding="post")

encoder_data.shape: 10000 5
encoder_data[0]: [0 0 0 0 1]
decoder_data.shape: 10000 13
decoder_data[0]: [2 3 0 0 0 0 0 0 0 0 0 0 0]


In [51]:
# load in pre-trained word vectors 
# can download here 
# word2vec = torchtext.vocab.GloVe(name="6B", dim=EMBEDDING_DIM) 
print("loading word vectors ...")
word2vec_path = '../../Lazyprogrammer/large_files/glove.6B/glove.6B.%sd.txt'
word2vec = {}
with open(
    os.path.join(word2vec_path % EMBEDDING_DIM)
) as f:
    # is just a space-separated text file in the format:
    # word vec[0] vec[1] vec[2]
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.array(values[1:], dtype="float32")
        word2vec[word] = vec
    print("Found %s word vectors." % len(word2vec))

loading word vectors ...
Found 400000 word vectors.


In [52]:
# prepare embedding matrix
print("Filling pre-trained embeddings...")
num_words = min(MAX_NUM_WORDS, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx_inputs.items():
    if i < num_words:
        embedding_vector = word2vec.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all zeros
            embedding_matrix[i] = embedding_vector

Filling pre-trained embeddings...


In [79]:
# store number of output wors for later
# remember to add 1 since indexing start at 1
num_words_output = len(word2idx_outputs) + 1

# one-hot the targets 
decoder_targets_one_hot = np.zeros(
    (len(input_sequences), max_len_target, num_words_output), dtype='float32'
)
for i, d in enumerate(decoder_targets):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i, t, word] = 1

In [80]:
# load pre-trained word embeddings into an embedding layer
# freeze the layer
embedding_layer = nn.Embedding(num_words, EMBEDDING_DIM,)  # vocab size  # embedding dim
embedding_layer.weight = nn.Parameter(torch.from_numpy(embedding_matrix).float())
#embedding_layer.requires_grad = False


In [82]:
print(max_len_input)
print(max_len_target)

5
13


In [None]:
# T_encoder = 5
# T_decoder = 13
# encoder_input size N x 5
# decoder_input size N x 13


In [102]:
# build the model
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        self.encoder = nn.LSTM(input_size=EMBEDDING_DIM, hidden_size=LATENT_DIM, batch_first=True) # -> N x T x LATENT_DIM 
        self.embed = embedding_layer
        self.embed_decoder = nn.Embedding(num_words_output, EMBEDDING_DIM) # vocab_size_output X Latent Dim 
        self.decoder = nn.LSTM(input_size=EMBEDDING_DIM, hidden_size=LATENT_DIM, batch_first=True)
        self.fc = nn.Linear(LATENT_DIM, num_words_output)
        
    def forward(self, X_encoder, X_decoder):
        h0 = torch.zeros(1, X_encoder.size(0), LATENT_DIM)#.to(device)
        c0 = torch.zeros(1, X_encoder.size(0), LATENT_DIM)#.to(device)
        
        out = self.embed(X_encoder) # N x T_input -> N x T_encoder x EMBEDDING_DIM
        _ , (h, c) = self.encoder(out, (h0, c0))  # (h, c) : ((1, N , LATENT_DIM), (1, N , LATENT_DIM))
        
        decoder_inputs_x = self.embed_decoder(X_decoder) # # N x T_decoder -> N x T_decoder x EMBEDDING_DIM
        decoder_outputs, _ = self.decoder(decoder_inputs_x, (h, c)) # -> N x T_decoder x LATENT_DIM
        
        final_out = self.fc(decoder_outputs) # -> N x T_decoder x num_words_output
        
        return final_out
  
        

In [103]:
model = RNN()

In [122]:
# simple test 
x_in = torch.ones(1,5, dtype=torch.int) 
x_out = torch.ones(1,13, dtype=torch.int) 
y = model(x_in, x_out)
print(y.shape)
#EMBEDDING_DIM = 100
#LATENT_DIM = 256

torch.Size([1, 13, 5681])


In [124]:
# tensors 
encoder_inputs_tensor = torch.from_numpy(encoder_inputs).int()
decoder_inputs_tensor = torch.from_numpy(decoder_inputs).int()
decoder_targets_one_hot_tensor = torch.from_numpy(decoder_targets_one_hot).float()

In [125]:
# build data set
class Translation(data.Dataset):
    def __init__(self):
        pass
    
    def __len__(self):
        return len(encoder_inputs)

    def __getitem__(self, idx):
        return encoder_inputs_tensor[idx], decoder_inputs_tensor[idx], decoder_targets_one_hot_tensor[idx]


In [126]:
# instantiate dataset
translation_dataset = Translation()

In [127]:
data_loader = torch.utils.data.DataLoader(
    dataset=translation_dataset,
    shuffle=True,
    batch_size=BATCH_SIZE)

In [128]:
for inputs, inputs_d, targets in data_loader:
    print(inputs.shape, inputs_d.shape, targets.shape)
    break

torch.Size([64, 5]) torch.Size([64, 13]) torch.Size([64, 13, 5681])


In [129]:
# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

In [131]:
# training loop

#EPOCHS = 3
# loss to store
train_losses = np.zeros(EPOCHS)
test_losses = np.zeros(EPOCHS)

for it in range(EPOCHS):

    train_loss = []
    for inputs_encoder, inputs_decoder, targets in data_loader:
        # move data to gpu
        #inputs, targets = inputs.to(device), targets.to(device)
        
        # forward
        outputs = model(inputs_encoder, inputs_decoder)

        # compute loss
        loss = criterion(outputs, targets)

        # zero the grad
        optimizer.zero_grad()

        # backward
        loss.backward()
        optimizer.step()

        # store loss
        train_loss.append(loss.item())
    
    # final loss
    train_loss = np.mean(train_loss)
    train_losses[it] = train_loss


    # prints
    print(f"Epoch {it+1}/{EPOCHS}. Train Loss = {train_loss:.4f}")


Epoch 1/3. Train Loss = 0.0033
Epoch 2/3. Train Loss = 0.0033
Epoch 3/3. Train Loss = 0.0033


In [132]:
# making a sampling model
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
       
    def forward(self, X_encoder):
        # encoder_input size 1 x 5
        h0 = torch.zeros(1, X_encoder.size(0), LATENT_DIM)#.to(device) 
        c0 = torch.zeros(1, X_encoder.size(0), LATENT_DIM)#.to(device)
        
        out = model.embed(X_encoder) # 1 x T_encoder -> 1 x T_encoder x EMBEDDING_DIM
        _ , (h, c) = model.encoder(out, (h0, c0))  # (h, c) : ((1, 1 , LATENT_DIM), (1, 1 , LATENT_DIM))

        return h, c
        

In [133]:
encoder = Encoder()

In [140]:
# simple test 
x_in = torch.ones(1,1, dtype=torch.int) 
#x_out = torch.ones(1,13, dtype=torch.int) 
h, c = encoder(x_in)
print(h.shape, c.shape)


torch.Size([1, 1, 256]) torch.Size([1, 1, 256])


In [137]:
# making a sampling model
class SamplingModel(nn.Module):
    def __init__(self):
        super(SamplingModel, self).__init__()
        
    
    def forward(self, X_decoder, h, c):
        # decoder_input size 1 x 13
        # (h, c) : ((1, 1 , LATENT_DIM), (1, 1 , LATENT_DIM))
        decoder_inputs_x = model.embed_decoder(X_decoder) # 1 x T_decoder -> 1 x T_decoder x EMBEDDING_DIM
        out, (h, c) = model.decoder(decoder_inputs_x, (h, c)) # -> 1 x T_decoder x LATENT_DIM
        out = model.fc(out) # -> 1 x T_decoder x NUM_words out
        
        return out, h, c
        

In [138]:
sampling_model = SamplingModel()

In [141]:
# simple test 
#x_in = torch.ones(1,5, dtype=torch.int) 
x_out = torch.ones(1,1, dtype=torch.int) 
out, h, c = sampling_model(x_out, h, c)
print(out.shape, h.shape, c.shape)


torch.Size([1, 1, 5681]) torch.Size([1, 1, 256]) torch.Size([1, 1, 256])


In [142]:
# map indexes back into real words
# so wwe can view the results
idx2word_eng = {v: w for w, v in word2idx.items()}
idx2word_fr = {v: w for w, v in word2idx_fr.items()}

In [143]:
# generate empty target seq of length 1
target_seq = torch.zeros((1, 1))

# populate the first character of target sequence with the start character
# NOTE: tokenizer lower cases all words
target_seq[0, 0] = word2idx_outputs["<sos>"]

In [160]:
def decode_sequence(input_seq):
    # encode the input as state vectors.
    h, c = encoder(input_seq)

    # generate empty target seq of length 1
    target_seq = torch.zeros((1, 1)).int()

    # populate the first character of target sequence with the start character
    # NOTE: tokenizer lower cases all words
    target_seq[0, 0] = word2idx_outputs["<sos>"]

    # if we get this we break
    eos = word2idx_outputs["<eos>"]

    # create translation
    output_sentence = []
    for _ in range(max_len_target):
        output_tokens, h, c = sampling_model(target_seq, h, c)

        # get next word
        idx = np.argmax(output_tokens.detach().numpy()[0, 0, :])

        # end of sentence EOS
        if eos == idx:
            break

        word = ""
        if idx > 0:
            word = idx2word_fr[idx]
            output_sentence.append(word)

        # update the decoder input
        # which is just the word just generated
        target_seq[0, 0] = idx
        #states_value = [h, c]

    return " ".join(output_sentence)


In [151]:
x = torch.randn(4,4)
m = torch.max(x, 1)
print(m)

torch.return_types.max(
values=tensor([1.2423, 1.4502, 1.1779, 1.6262]),
indices=tensor([3, 3, 0, 3]))


In [148]:
# do some translation
i = np.random.choice(len(input_texts))
input_seq = encoder_inputs_tensor[i : i + 1]
input_seq, type(input_seq)

(tensor([[  0,   0, 610, 251,  56]], dtype=torch.int32), torch.Tensor)

In [161]:
while True:
    # do some translation
    i = np.random.choice(len(input_texts))
    input_seq = encoder_inputs_tensor[i : i + 1]
    translation = decode_sequence(input_seq)
    print("_")
    print("Input:", input_texts[i])
    print("Translation:", translation)

    ans = input("Continue? [Y/n]")
    if ans and ans.lower().startswith("n"):
        break


_
Input: I saw that.
Translation: j ai été du un
Continue? [Y/n]y
_
Input: Tom grumbled.
Translation: j adore l essayer
Continue? [Y/n]y
_
Input: Tie your shoe.
Translation: puis je m en
Continue? [Y/n]y
_
Input: I'm a lawyer.
Translation: j ai été du un
Continue? [Y/n]y
_
Input: Get the box.
Translation: puis je partir
Continue? [Y/n]y
_
Input: Talk to me!
Translation: puis je partir
Continue? [Y/n]y
_
Input: Well done!
Translation: puis je partir
Continue? [Y/n]y
_
Input: I'll pay you.
Translation: j ai été du un
Continue? [Y/n]y
_
Input: You're naive.
Translation: puis je m en
Continue? [Y/n]y
_
Input: They're cute.
Translation: j ai été du un
Continue? [Y/n]y
_
Input: Can you ski?
Translation: puis je m en aller
Continue? [Y/n]y
_
Input: Bring the key.
Translation: puis je partir
Continue? [Y/n]y
_
Input: Straighten up.
Translation: puis je partir
Continue? [Y/n]n
