In [222]:
from models.language_model import LanguageModel
import pandas as pd
from tqdm import tqdm
import numpy as np
from nltk.util import trigrams
import torch
from torch import nn
from torch.optim import Adam


## Import Training Data

In [119]:
# Read the dataset; its 'english' and 'french' columns are consumed by extract_data().
df = pd.read_csv('data/small_data.csv')

In [120]:
# Restrict the experiment to the first 1000 rows of the frame.
small_data = df[:1000]

In [121]:
def extract_data(df):
    """Pull the two language columns out of ``df`` as plain Python lists.

    Args:
        df: Table-like object indexable by column name whose 'english' and
            'french' columns expose ``.tolist()``.

    Returns:
        A ``(source, target)`` tuple: the 'english' column followed by the
        'french' column, each converted with ``.tolist()``.
    """
    english_sentences, french_sentences = (
        df[column].tolist() for column in ('english', 'french')
    )
    return english_sentences, french_sentences

In [137]:
# source is the 'english' column and target the 'french' column of the same rows.
source, target = extract_data(small_data)

In [138]:
# Split every sentence into a list of tokens. source/target are rebound to the
# tokenised form, so re-running this cell on the already-tokenised lists fails
# (lists have no .split).
# NOTE(review): pre_process is defined in a later cell (In [146]); on a fresh
# kernel this cell raises NameError until that definition has been run.
source = pre_process(source)
target = pre_process(target)

100%|██████████| 1000/1000 [00:00<00:00, 313569.38it/s]
100%|██████████| 1000/1000 [00:00<00:00, 411569.42it/s]


In [140]:
# Inspect the first tokenised French sentence.
target[0]

['les',
 'délégations',
 'qui',
 'souhaitent',
 's',
 'inscrire',
 'sur',
 'la',
 'liste',
 'des',
 'orateurs',
 'sont',
 'priées',
 'de',
 'se',
 'mettre',
 'en',
 'rapport',
 'avec',
 'le']

In [141]:
def build_vocab(data):
    """Assign a consecutive integer id to every distinct token in ``data``.

    Ids 0 and 1 are reserved for the sentence markers '<s>' and '<e>'; all
    other tokens are numbered from 2 upwards in order of first appearance.
    Tokens that compare equal to the integers 0 or 1 are ignored.

    Args:
        data: Iterable of token sequences (one sequence per sentence).

    Returns:
        ``(vocab, decode_vocab)`` where ``vocab`` maps token -> id and
        ``decode_vocab`` is the inverse mapping id -> token.
    """
    vocab = {'<s>': 0, '<e>': 1}

    for sentence in data:
        for token in sentence:
            if token == 0 or token == 1:
                continue
            # The next free id always equals the current vocabulary size.
            vocab.setdefault(token, len(vocab))

    decode_vocab = {index: word for word, index in vocab.items()}
    return vocab, decode_vocab

In [142]:
def hottify(vocab):
    """Build a one-hot numpy vector for every entry of ``vocab``.

    Args:
        vocab: Mapping token -> integer index; every index must be a valid
            position in a vector of length ``len(vocab)``.

    Returns:
        Dict mapping each token to its own float array of length
        ``len(vocab)`` that is 1 at the token's index and 0 elsewhere.
        No reverse (vector -> token) mapping is returned; numpy arrays are
        not hashable and so cannot serve as dict keys.
    """
    size = len(vocab)
    hotties = {}

    for word, index in vocab.items():
        one_hot = np.zeros(size)
        one_hot[index] = 1
        hotties[word] = one_hot

    return hotties


In [249]:
import pickle

In [253]:
# Load the pickled lookup that later cells index as `a[word]` (for example,
# a['delegations'] is a float array).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load files from a trusted source. The one-letter name `a` is easy to
# overwrite by accident — consider a descriptive name.
with open('data/small_vectors.pickle', 'rb') as file:
    a = pickle.load(file)

In [256]:
# Spot-check one entry of the loaded vectors.
a['delegations']

array([-0.15722656,  0.24609375,  0.35351562,  0.34179688,  0.1796875 ,
       -0.16601562, -0.16894531,  0.01672363,  0.05371094,  0.05566406,
        0.16796875, -0.21582031, -0.07666016,  0.26171875, -0.18164062,
        0.20019531,  0.00113678,  0.25976562,  0.07080078, -0.03173828,
        0.04174805,  0.2265625 ,  0.06494141, -0.23730469,  0.00254822,
        0.0612793 , -0.24902344, -0.09326172,  0.25976562, -0.15625   ,
        0.14160156, -0.17480469, -0.14257812,  0.69921875,  0.21191406,
       -0.2578125 ,  0.39453125,  0.2890625 , -0.11425781,  0.0859375 ,
        0.01806641,  0.06542969,  0.04956055,  0.21972656, -0.10546875,
       -0.10351562, -0.10058594, -0.03100586, -0.22363281,  0.11669922,
        0.2421875 ,  0.13574219, -0.08349609,  0.25      , -0.40039062,
       -0.0859375 , -0.09765625,  0.19042969, -0.03540039, -0.17089844,
        0.20410156, -0.36132812,  0.00323486, -0.16601562, -0.03540039,
       -0.10595703, -0.40625   ,  0.31640625,  0.18164062, -0.07

In [143]:
# English token -> id mapping and its inverse (id -> token).
vocab, decode = build_vocab(source)

In [144]:
# French token -> id mapping, its inverse, and one one-hot vector per French token.
french_vocab, decode_french = build_vocab(target)
french_hotties = hottify(french_vocab)

In [146]:
def pre_process(lang_data):
    """Tokenise each sentence by splitting on single space characters.

    Args:
        lang_data: Iterable of sentence strings.

    Returns:
        List with one token list per sentence, in input order. Because the
        split is on a literal ' ', consecutive spaces yield empty-string
        tokens. Progress is reported through tqdm.
    """
    tokenised = []
    for sentence in tqdm(lang_data):
        tokenised.append(sentence.split(' '))
    return tokenised

In [147]:
# Build the n-gram language model over the tokenised English sentences.
# LanguageModel comes from models.language_model; its internals are not shown
# in this notebook.
english_language_model = LanguageModel(source)

100%|██████████| 1000/1000 [00:00<00:00, 9369.96it/s]
100%|██████████| 1000/1000 [00:00<00:00, 60563.19it/s]
100%|██████████| 1000/1000 [00:00<00:00, 29763.94it/s]

Counting Ngrams
Calculating Bigram MLE
Calculating Triigram MLE





In [223]:
# NOTE(review): `net` is created in the following cell and the Net class is
# defined further down, so this cell fails on a fresh kernel. An optimizer
# built here also keeps references to the tensors of whichever `net` existed
# at that moment; re-creating `net` afterwards leaves it updating stale weights.
# Only the two weight matrices are handed to Adam; the layer biases are not.
loss = nn.CrossEntropyLoss()
optimizer = Adam((net.fc1.weights, net.fc2.weights), lr=0.01)

In [224]:
# Net(input_size=50, hidden_size=1, output_size=number of French tokens).
net = Net(50, 1, len(french_hotties))

In [255]:
# Debugging pass over a single (English, French) sentence pair: both loops
# `break` after their first iteration and neither backward() nor
# optimizer.step() is called, so no parameter is updated here.
# NOTE(review): the recorded run of this cell ended in KeyError: 'a' —
# presumably the `a[...]` lookup below has no vector for the token 'a';
# confirm and decide how out-of-vocabulary words should be handled.
optimizer.zero_grad()
for i, x in zip(source, target):
    
    # Word trigrams of the English and of the French sentence; the zip() below
    # pairs them by position and stops at the shorter list.
    trs = list(trigrams(i))
    french_trs = list(trigrams(x))
    
    for tri, french in zip(trs, french_trs):
        # The first two English words of the trigram are the query context.
        pre = tri[:2]
        # One vector from `a` per candidate returned by most_likely(); the key is
        # the last element of each candidate's first field (presumably the
        # predicted word — TODO confirm against LanguageModel.most_likely).
        # The comprehension's `i` is scoped to the comprehension and does not
        # overwrite the outer sentence variable `i`, but the reuse is confusing.
        poss = torch.Tensor([np.array([a[i[0][-1]]]) for i in english_language_model.most_likely(pre, 50, False)])
        # NOTE(review): french_target is assigned but never read.
        french_target = french[-1]      
        
        # One-hot vector of the trigram's last French word; argmax of it is
        # that word's vocabulary id.
        y = torch.Tensor(french_hotties[french[-1]])
        out = net.forward(poss)
        print(torch.argmax(y))
        print(out.shape)
        print(torch.argmax(out, dim=1).shape)
        # NOTE(review): nn.CrossEntropyLoss is documented to take unnormalised
        # scores, while Net.forward ends in a Softmax; the target here is a
        # single class id — verify it matches the batch dimension of `out`.
        error = loss(out, torch.argmax(y))
        break
        
    break

KeyError: 'a'

In [214]:
from torch.nn import Sigmoid, Softmax
from torch import mm


class Net:
    """Two-layer feed-forward classifier built from hand-rolled ``fc`` layers.

    Because of the ``fc`` shape convention (weights of shape
    ``(output_size, input_size)`` that are right-multiplied), the data flow is:

        x: (n, hidden_size) --fc1--> (n, input_size) --fc2--> (n, output_size)

    Args:
        input_size: Width of the intermediate representation produced by fc1.
        hidden_size: Number of features in each input row given to ``forward``.
        output_size: Number of output classes (e.g. size of the French vocabulary).

    Attributes:
        fc1, fc2: The two ``fc`` layers; their ``weights`` / ``bias`` tensors
            are the trainable parameters.
        out_layer: Softmax over dim 1, applied to fc2's raw scores.
    """

    def __init__(self, input_size, hidden_size, output_size):
        # The argument order of both fc calls is unchanged so the weight
        # shapes seen by existing callers stay exactly the same.
        self.fc1 = fc(input_size, hidden_size)
        # The last layer must emit raw scores: squashing them through a sigmoid
        # first confines every score to (0, 1), which makes the softmax below
        # almost uniform regardless of the input.
        self.fc2 = fc(output_size, input_size, activation=False)
        self.out_layer = Softmax(dim=1)

    def parameters(self):
        """Return every trainable tensor (weights and biases of both layers)."""
        return [self.fc1.weights, self.fc1.bias, self.fc2.weights, self.fc2.bias]

    def forward(self, x):
        """Map ``x`` of shape (n, hidden_size) to class probabilities (n, output_size).

        Each output row sums to 1. Note that ``nn.CrossEntropyLoss`` expects
        unnormalised scores rather than probabilities.
        """
        hidden = self.fc1.forward(x)
        scores = self.fc2.forward(hidden)
        return self.out_layer(scores)


class fc:
    """Fully connected layer: ``x @ weights + bias``, optionally through a sigmoid.

    The parameter names are historical and read "backwards": ``weights`` has
    shape ``(output_size, input_size)`` and is right-multiplied, so the layer
    consumes rows with ``output_size`` features and produces rows with
    ``input_size`` features.

    Args:
        input_size: Number of features produced per row (length of ``bias``).
        output_size: Number of features expected per input row.
        activation: When True (default) a sigmoid is applied to the result;
            when False the raw affine output is returned.
    """

    def __init__(self, input_size, output_size, activation=True):
        self.weights = torch.rand(output_size, input_size, requires_grad=True)
        # Float, trainable bias. The earlier `.long()` cast truncated every
        # value in [0, 1) to 0 and produced an integer tensor, which can never
        # receive gradients.
        self.bias = torch.rand(input_size, requires_grad=True)
        self.activation = Sigmoid() if activation else None

    def forward(self, x):
        """Apply the layer to a 2-D tensor ``x`` of shape (n, output_size)."""
        out = mm(x, self.weights) + self.bias
        if self.activation is None:
            return out
        return self.activation(out)

In [245]:
# Print the arg-max class id of every row of `out` (the tensor left over from
# the forward-pass cell).
# NOTE(review): in the recorded output every row selects the same id (1550),
# i.e. the prediction does not vary across input rows.
for i in out:
    print(torch.argmax(i))

tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)
tensor(1550)


In [247]:
# Translate the class id printed by the previous cell back to its French token.
decode_french[1550]

'206'