In [38]:
import  numpy as np, pandas as pd, subprocess, os, torch, pickle, csv
from nltk.tokenize import word_tokenize as wt
from torch import nn
from torch.optim import Adam
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data import Field, Iterator, TabularDataset, BucketIterator
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import torch.nn.functional as F
import random
from tqdm import tqdm

In [2]:
def get_languages(csv_file, preset):
    with open(csv_file, 'r') as csv_File: #opens csv containing language codes and their names
        reader = csv.reader(csv_File)
        language_table = {row[0].split(';')[1] : row[0].split(';')[0] for row in reader} #dictionary mapping code to name
    if preset == 'y':
        #Preset language codes to be used in model, chosen at random
        language_codes = ["srd", "krc", "nob", "pnb",
                          "mai", "eng", "be-tarask",
                          "xho", "tet", "tha"]
        language_names = [(key, value) for key, value in language_table.items() if value in language_codes]
        return language_names
    elif preset == 'n':
        '''
        experimental function for allowing user to choose which languages to use
        '''
        languages = []
        while len(languages) != 10:
            language = input("Enter language ").capitalize()
            #print(language)
            if language in languages:
                print("You've already said that one! ")
            elif language in language_table:
                languages.append(language)
                print(languages)
            else:
                print('Language not recognised. Please refer to language labels')
                print(languages)
                continue
        language_codes = [language_table[i] for i in language_table if i in languages] #collects languages from predetermined set,
        language_names =  [(key, value) for key, value in language_table.items() if value in language_codes] #dictionary mapping code to name
        return language_names
    else:
        print("has to be y or n dummy") #in case of user error

In [3]:
def gen_data(training_file, training_labels, language_codes, training):
    '''
    Function generates the set based on pre defined language codes and creates various
    attributes to the object
    '''
    if training == True:
        data = [i.split('\n')[:-1] for i in open(training_file, 'r')] #opens text file and splits on new line
        labels = [i.split('\n')[:-1] for i in open(training_labels, 'r')] #opens label file and splits on white space
        things = list(zip([i[0] for i in data], [i[0] for i in labels])) #zips sentences with corrosponding language label
        sets = [(i[0],i[1]) for i in things] #this might actually do the same thing as the above not sure
        
        x = [i[0][:100] for i in sets if i[1] in language_codes] #Matrix of sentences to be used in the model
        y = [i[1] for i in sets if i[1] in language_codes] #labels for each of the sentences
        raw_data = ''.join([i for i in x]) #concatenation of all characters in the training set
        vocab = {char: ord(char) for char in set(raw_data)} #dictionary mapping character to ord(integer)
        int2char = {num : char for char, num in vocab.items()} #dictionary mapping integer to character
        return x, y, vocab, int2char
    else:
        data = [i.split('\n')[:-1] for i in open(training_file, 'r')] #opens text file and splits on new line
        labels = [i.split('\n')[:-1] for i in open(training_labels, 'r')] #opens label file and splits on white space
        things = list(zip([i[0] for i in data], [i[0] for i in labels])) #zips sentences with corrosponding language label
        sets = [(i[0],i[1]) for i in things] #this might actually do the same thing as the above not sure
        x = [i[0][:100] for i in sets if i[1] in language_codes] #Matrix of sentences to be used in the model
        y = [i[1] for i in sets if i[1] in language_codes] #labels for each of the sentences
        return x, y
        

In [4]:
language_names = get_languages('./data/raw/labels.csv', 'y')

In [5]:
language_codes = [i[1] for i in language_names]

In [6]:
x_train = 'data/raw/x_train.txt'
y_train = 'data/raw/y_train.txt'

In [7]:
x_train, y_train, vocab, int2char = gen_data(x_train, y_train, language_codes, training=True)

In [8]:
x_test = 'data/raw/x_test.txt'
y_test = 'data/raw/y_test.txt'

In [9]:
x_test, y_test = gen_data(x_test, y_test, language_codes, training=False)

In [10]:
def langencoder(language_codes):
    one_hot_lang = {}
    lang2int = {lang : (num) for num, lang in dict(enumerate(language_codes)).items()}
    '''for lang, num in lang2int.items():
        one_hot_lang[lang] = np.zeros(9)
        one_hot_lang[lang] = np.insert(one_hot_lang[lang],num, 1)
        #one_hot_lang[lang] = list([0,0,0,0,0,0,0,0,0,0,0]).insert(num, 1)
    '''
    #hot2lang = {hot : lang for lang, hot in one_hot_lang.items()}
    
    return lang2int#, hot2lang

In [11]:
lang2int = langencoder(language_codes)

In [12]:
def build_vocab(x_train):
    total_data = ''.join(x_train)
    int2char = dict(enumerate(set(total_data)))
    char2int = {char : num for num, char in int2char.items() }
    #char2int['<niv>'] = max(char2int.values()) +1
    return char2int

vocab = build_vocab(x_train)

In [13]:
def output_vocab(vocab):
    directory = 'vocab/'
    if os.path.exists(directory) == False:
        os.mkdir(directory)
        
        
    with open('{}vocab.pkl'.format(directory), 'wb') as file:
        pickle.dump(vocab, file)
        file.close()
    

In [14]:
output_vocab(vocab)

In [15]:
from torch.nn.utils.rnn import pad_sequence

In [16]:
def build_data(x,y, lang2int,vocab):
    labels = []
    vectors = []
    sets = zip(x,y)
    for samples in sets:
        
        sample = [i for i in samples[0]]
        label = samples[1]
        #print(label)
        count = 100
        while count != 0:
            #x = [(i,int2label[samples[1]]) for i in samples[0]]
            vector = []
            encoded = []
            for i in sample:
                if i in vocab:
                    encoded.append(vocab[i])
                else:
                    encoded.append(random.randint(0,len(vocab)))
                
            for i in range(1,101):
                vectors.append(torch.LongTensor(encoded[:i])) #, int(lang2int[label])))
                labels.append(lang2int[label])
                count -=1
            #vectors += vector
            #label_matrix += labels
    return pad_sequence(vectors, batch_first=True, padding_value=0), labels



In [17]:
train_data, train_labels = build_data(x_train, y_train, lang2int, vocab)

In [18]:
test_data, test_labels = build_data(x_test, y_test, lang2int, vocab)

In [19]:
from torch.utils.data import Dataset

In [20]:
class RTDataset(Dataset):
    def __init__(self, train_x, train_y):
        self.train_x = train_x
        self.train_y = train_y
        
    def __getitem__(self, index):
        return (self.train_x[index], self.train_y[index])
    
    def __len__(self):
        return len(self.train_x)
        
        

In [21]:
train_data_set = RTDataset(train_data, train_labels)

In [22]:
test_data_set = RTDataset(test_data, test_labels)

In [67]:
train_loader = DataLoader(train_data_set, batch_size=100, shuffle=True)

In [68]:
test_loader = DataLoader(test_data_set, batch_size=100, shuffle=True)

In [69]:
class GRUNet(nn.Module):
    def __init__(self, vocab_size, seq_len, input_size, hidden_size, num_layers, output_size, dev, dropout=0.0):
        super().__init__()
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.device = device
        self.emb = nn.Embedding(vocab_size, input_size).to(device)
        self.gru = nn.GRU(input_size, hidden_size,
                          num_layers=self.num_layers, batch_first=True, dropout=dropout).to(device)
        self.fc = nn.Linear(hidden_size * seq_len, output_size).to(device)
        # self.softmax = nn.Softmax(dim=2)

    def forward(self, sequence, hidden_layer):
        output = self.emb(sequence).to(device)
        hidden_layer = hidden_layer.to(self.device)
        output, hidden_layer = self.gru(output, hidden_layer)
        output = output.contiguous().view(-1, self.hidden_size *
                                          len(sequence[0]))
        output = self.fc(output).to(device)

        return output, hidden_layer
    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size).float().to(self.device)


In [26]:
def get_vocab(path):
    with open(path+'vocab.pkl', 'rb')as file:
        vocab = pickle.load(file)
    return vocab

In [27]:
'''def get_dataloaders(filepath):
    dataloaders = []
    for i in os.listdir(filepath):
        if i == '.ipynb_checkpoints':
            continue
        with open(filepath + i, "rb") as input_file:
            dataloaders.append(pickle.load(input_file))
            input_file.close()
    return dataloaders[0], dataloaders[1]

def train(training_dataloader, model):
    pass


train_loader, test_loader = get_dataloaders('dataloaders/')'''

'def get_dataloaders(filepath):\n    dataloaders = []\n    for i in os.listdir(filepath):\n        if i == \'.ipynb_checkpoints\':\n            continue\n        with open(filepath + i, "rb") as input_file:\n            dataloaders.append(pickle.load(input_file))\n            input_file.close()\n    return dataloaders[0], dataloaders[1]\n\ndef train(training_dataloader, model):\n    pass\n\n\ntrain_loader, test_loader = get_dataloaders(\'dataloaders/\')'

In [28]:
vocab = get_vocab('vocab/')

In [70]:
vocab_size = len(vocab)
sequence = 100
batch_size = 100
input_size = 100
hidden_size = 256
nr_layers = 2
output_size = 10
device = torch.device('cuda:0')


In [71]:
model = GRUNet(vocab_size, sequence, input_size, 
               hidden_size, nr_layers, output_size, device, dropout=0.0)
    

In [72]:
model

GRUNet(
  (emb): Embedding(879, 100)
  (gru): GRU(100, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=25600, out_features=10, bias=True)
)

In [73]:
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

In [75]:
model.train()
model = model.to(device)
print('Training')
epoch_nr = 0
EPOCH = list(range(20))
for epoch in tqdm(EPOCH):
    epoch_nr += 1
    epoch_loss = []
    h = model.init_hidden(batch_size)
    count = 0

    percent = 10
    with tqdm(total=len(train_loader)) as pbar:
        for (x,y) in train_loader:
        
            count +=1 
            x = x.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            h = h.data
            out, h = model(x, h)
            loss = criterion(out, y.long())
            loss.backward()
            epoch_loss.append(loss.item())
            optimizer.step()
            #print('Loss per timestep = {}'.format(loss.item()))
            pbar.update(1)
            
        avg_loss = sum(epoch_loss) / len(epoch_loss)
        print("Average loss at epoch %d: %.7f" % (epoch_nr, avg_loss)) 
    

Training


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 1: 0.0845533



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 2: 0.0543851



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 3: 0.0845897



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 4: 0.0557049



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 5: 0.0458393



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 6: 0.0744213



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 7: 0.2028613



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 8: 0.4564747



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 9: 0.4002304



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 10: 0.4465500



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 11: 0.4379047



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 12: 0.4698699



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 13: 0.4770084



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 14: 0.4945874



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 15: 0.4728319



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 16: 0.5613053



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 17: 0.5397002



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 18: 0.5576397



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 19: 0.5606002



HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

Average loss at epoch 20: 0.5960788




In [84]:
trained_model = model

In [79]:
save_model(trained_model, 3)

In [80]:
def save_model(model, model_nr):
    directory = 'trained_models/'
    if os.path.exists(directory) == False:
        os.mkdir(directory)
        
    torch.save(model.state_dict(), '{}gru_model_nr{}.pt'.format(directory,model_nr))
        
def load_model(path):
    model = os.listdir(path)[2]
    print(model)
    with open(path+model, 'rb') as input_model:
        print(input_model)
        trained_model = torch.load(input_model)
        
        
    return trained_model
        

In [81]:
dp = load_model('trained_models/')

gru_model_nr2.pt
<_io.BufferedReader name='trained_models/gru_model_nr2.pt'>


In [82]:
trained_model = GRUNet(vocab_size, sequence, input_size, 
               hidden_size, nr_layers, output_size, device, dropout=0.0)

In [83]:
trained_model.load_state_dict(dp)

RuntimeError: Error(s) in loading state_dict for GRUNet:
	Unexpected key(s) in state_dict: "gru.weight_ih_l2", "gru.weight_hh_l2", "gru.bias_ih_l2", "gru.bias_hh_l2". 
	size mismatch for gru.weight_ih_l0: copying a param with shape torch.Size([600, 100]) from checkpoint, the shape in current model is torch.Size([768, 100]).
	size mismatch for gru.weight_hh_l0: copying a param with shape torch.Size([600, 200]) from checkpoint, the shape in current model is torch.Size([768, 256]).
	size mismatch for gru.bias_ih_l0: copying a param with shape torch.Size([600]) from checkpoint, the shape in current model is torch.Size([768]).
	size mismatch for gru.bias_hh_l0: copying a param with shape torch.Size([600]) from checkpoint, the shape in current model is torch.Size([768]).
	size mismatch for gru.weight_ih_l1: copying a param with shape torch.Size([600, 200]) from checkpoint, the shape in current model is torch.Size([768, 256]).
	size mismatch for gru.weight_hh_l1: copying a param with shape torch.Size([600, 200]) from checkpoint, the shape in current model is torch.Size([768, 256]).
	size mismatch for gru.bias_ih_l1: copying a param with shape torch.Size([600]) from checkpoint, the shape in current model is torch.Size([768]).
	size mismatch for gru.bias_hh_l1: copying a param with shape torch.Size([600]) from checkpoint, the shape in current model is torch.Size([768]).
	size mismatch for fc.weight: copying a param with shape torch.Size([10, 20000]) from checkpoint, the shape in current model is torch.Size([10, 25600]).

In [43]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [95]:
lang2int

{'eng': 0,
 'be-tarask': 1,
 'krc': 2,
 'mai': 3,
 'nob': 4,
 'pnb': 5,
 'srd': 6,
 'tet': 7,
 'tha': 8,
 'xho': 9}

In [None]:

predictions = []
correct = []
correct = 0
correct_map = {}
count = 0
batch_nr = 1
with tqdm(total=len(test_loader)) as pbar:
    for batch in test_loader:
        batch_nr += 1
        hidden_layer = model.init_hidden(1).to(device)
        for i ,examples in enumerate(zip(batch[0], batch[1])):
            prediction = trained_model(examples[0].unsqueeze(0).to(device), hidden_layer)
            _, indeces = torch.max(prediction[0].data, dim=1)
            #predictions.append(indices[0].item())
            
            if indeces[0].item() == examples[1].item():
                
                correct += 1
            count += 1
        pbar.update(1)
    print('Accuracy after batch {}: {}'.format(batch_nr, ((correct / count) * 100)))
        
        
        
        
        
        
    


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))

In [90]:
len(test_loader)

5000

In [91]:
correct

440730

In [94]:
correct / 500000 * 100

88.146