In [3]:
import  numpy as np, pandas as pd, subprocess, os, torch, pickle, csv
from nltk.tokenize import word_tokenize as wt
from torch import nn
from torch.optim import Adam
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data import Field, Iterator, TabularDataset, BucketIterator
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import torch.nn.functional as F
import random
from tqdm import tqdm

In [4]:
def get_languages(csv_file, preset):
    with open(csv_file, 'r') as csv_File: #opens csv containing language codes and their names
        reader = csv.reader(csv_File)
        language_table = {row[0].split(';')[1] : row[0].split(';')[0] for row in reader} #dictionary mapping code to name
    if preset == 'y':
        #Preset language codes to be used in model, chosen at random
        language_codes = ["srd", "krc", "nob", "pnb",
                          "mai", "eng", "be-tarask",
                          "xho", "tet", "tha"]
        language_names = [(key, value) for key, value in language_table.items() if value in language_codes]
        return language_names
    elif preset == 'n':
        '''
        experimental function for allowing user to choose which languages to use
        '''
        languages = []
        while len(languages) != 10:
            language = input("Enter language ").capitalize()
            #print(language)
            if language in languages:
                print("You've already said that one! ")
            elif language in language_table:
                languages.append(language)
                print(languages)
            else:
                print('Language not recognised. Please refer to language labels')
                print(languages)
                continue
        language_codes = [language_table[i] for i in language_table if i in languages] #collects languages from predetermined set,
        language_names =  [(key, value) for key, value in language_table.items() if value in language_codes] #dictionary mapping code to name
        return language_names
    else:
        print("has to be y or n dummy") #in case of user error

In [5]:
def gen_data(training_file, training_labels, language_codes, training):
    '''
    Function generates the set based on pre defined language codes and creates various
    attributes to the object
    '''
    if training == True:
        data = [i.split('\n')[:-1] for i in open(training_file, 'r')] #opens text file and splits on new line
        labels = [i.split('\n')[:-1] for i in open(training_labels, 'r')] #opens label file and splits on white space
        things = list(zip([i[0] for i in data], [i[0] for i in labels])) #zips sentences with corrosponding language label
        sets = [(i[0],i[1]) for i in things] #this might actually do the same thing as the above not sure
        
        x = [i[0][:100] for i in sets if i[1] in language_codes] #Matrix of sentences to be used in the model
        y = [i[1] for i in sets if i[1] in language_codes] #labels for each of the sentences
        raw_data = ''.join([i for i in x]) #concatenation of all characters in the training set
        vocab = {char: ord(char) for char in set(raw_data)} #dictionary mapping character to ord(integer)
        int2char = {num : char for char, num in vocab.items()} #dictionary mapping integer to character
        return x, y, vocab, int2char
    else:
        data = [i.split('\n')[:-1] for i in open(training_file, 'r')] #opens text file and splits on new line
        labels = [i.split('\n')[:-1] for i in open(training_labels, 'r')] #opens label file and splits on white space
        things = list(zip([i[0] for i in data], [i[0] for i in labels])) #zips sentences with corrosponding language label
        sets = [(i[0],i[1]) for i in things] #this might actually do the same thing as the above not sure
        x = [i[0][:100] for i in sets if i[1] in language_codes] #Matrix of sentences to be used in the model
        y = [i[1] for i in sets if i[1] in language_codes] #labels for each of the sentences
        return x, y
        

In [6]:
language_names = get_languages('./data/raw/labels.csv', 'y')

In [7]:
language_codes = [i[1] for i in language_names]

In [8]:
x_train = 'data/raw/x_train.txt'
y_train = 'data/raw/y_train.txt'

In [9]:
x_train, y_train, vocab, int2char = gen_data(x_train, y_train, language_codes, training=True)

In [10]:
x_test = 'data/raw/x_test.txt'
y_test = 'data/raw/y_test.txt'

In [11]:
x_test, y_test = gen_data(x_test, y_test, language_codes, training=False)

In [12]:
int2lang = dict(enumerate(language_codes))

In [13]:
lang2int = {num : lang for lang, num in int2lang.items()}

In [14]:
int2lang

{0: 'eng',
 1: 'be-tarask',
 2: 'krc',
 3: 'mai',
 4: 'nob',
 5: 'pnb',
 6: 'srd',
 7: 'tet',
 8: 'tha',
 9: 'xho'}

In [15]:
def build_vocab(x_train):
    total_data = ''.join(x_train)
    int2char = dict(enumerate(set(total_data)))
    char2int = {char : num for num, char in int2char.items() }
    #char2int['<niv>'] = max(char2int.values()) +1
    return char2int

vocab = build_vocab(x_train)

In [16]:
def output_vocab(vocab):
    directory = 'vocab/'
    if os.path.exists(directory) == False:
        os.mkdir(directory)
        
        
    with open('{}vocab.pkl'.format(directory), 'wb') as file:
        pickle.dump(vocab, file)
        file.close()
    

In [17]:
output_vocab(vocab)

In [18]:
from torch.nn.utils.rnn import pad_sequence

In [84]:
def build_data(x,y, lang2int,vocab):
    labels = []
    vectors = []
    sets = zip(x,y)
    for samples in sets:
        
        sample = [i for i in samples[0]]
        label = samples[1]
        
        #print(label)
        count = 100
        while count != 0:
            #x = [(i,int2label[samples[1]]) for i in samples[0]]
            vector = []
            encoded = []
            for i in sample:
                if i in vocab:
                    encoded.append(vocab[i])
                else:
                    encoded.append(random.randint(0,len(vocab)))
                
            for i in range(1,101):
                vectors.append(torch.LongTensor(encoded[:i])) #, int(lang2int[label])))
                labels.append(lang2int[label])
                count -=1
            #vectors += vector
            #label_matrix += labels
    return pad_sequence(vectors, batch_first=True, padding_value=0), labels



In [40]:
def build_data(x,y, lang2int,vocab):
    labels = []
    vectors = []
    sets = zip(x,y)
    examples = []
    for samples in sets:
        
        sample = [i for i in samples[0]]
        label = samples[1]
        example = [[]]
        for i in range(len(sample)):
            for x in sample[:i]:
                if x in vocab:
                    example.append(torch.LongTensor([vocab[x]]))
                else:
                    example.append(torch.LongTensor(random.randint(0,len(vocab))))
                                                    
            #example.append(torch.LongTensor([vocab[i] for i in sample[:i] if i in vocab else]))
        
        vectors.append((pad_sequence(example[2:], batch_first=True, padding_value=0), lang2int[label]))
        #print(vectors)
    return vectors
    
   
    
    
    

In [85]:
train_data, train_labels = build_data(x_train, y_train, lang2int, vocab)

In [None]:
train_data

In [86]:
test_data, test_labels = build_data(x_test, y_test, lang2int, vocab)

In [87]:
from torch.utils.data import Dataset

In [88]:
class RTDataset(Dataset):
    def __init__(self, train_x, train_y):
        self.train_x = train_x
        self.train_y = train_y
        
    def __getitem__(self, index):
        return (self.train_x[index], self.train_y[index])
    
    def __len__(self):
        return len(self.train_x)
        
        

In [89]:
train_data_set = RTDataset(train_data, train_labels)

In [90]:
test_data_set = RTDataset(test_data, test_labels)

In [106]:
train_loader = DataLoader(train_data_set, batch_size=1, shuffle=True)

In [107]:
test_loader = DataLoader(test_data_set, batch_size=1, shuffle=True)

In [108]:
def save_dataloaders(train_loader, test_loader):
    directory = 'dataloaders/'
    if os.path.exists(directory) == False:
        os.mkdir(directoy)
        
    train = 'training_loader.pkl'
    test = 'testing_loader.pkl'
    
    for i in zip([train_loader, test_loader], [train,test]):
        with open(directory+i[1], 'wb') as file:
            pickle.dump(i[0],file)

In [109]:
save_dataloaders(train_loader, test_loader)

In [110]:
from torch import nn

In [111]:
class GRUNet(nn.Module):
    def __init__(self, vocab_size, seq_len, input_size, hidden_size, num_layers, output_size, dev, dropout=0.0):
        super().__init__()
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.device = device
        self.emb = nn.Embedding(vocab_size, input_size).to(device)
        self.gru = nn.GRU(input_size, hidden_size,
                          num_layers=self.num_layers, batch_first=True, dropout=dropout).to(device)
        self.fc = nn.Linear(hidden_size * seq_len, output_size).to(device)

    def forward(self, sequence, hidden_layer):
        output = self.emb(sequence).to(device)
        hidden_layer = hidden_layer.to(self.device)
        output, hidden_layer = self.gru(output, hidden_layer)
        output = output.contiguous().view(-1, self.hidden_size *
                                          len(sequence[0]))
        output = self.fc(output).to(device)

        return output, hidden_layer
    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size).float().to(self.device)


In [118]:
vocab_size = len(vocab)
sequence = 100
batch_size = 100
input_size = 100
hidden_size = 256
nr_layers = 2
output_size = 10
device = torch.device('cuda:0')
batch_size = 1

In [119]:
model = GRUNet(vocab_size, sequence, input_size, 
               hidden_size, nr_layers, output_size, device, dropout=0.0)
    

In [120]:
model

GRUNet(
  (emb): Embedding(879, 100)
  (gru): GRU(100, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=25600, out_features=10, bias=True)
)

In [121]:
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

In [122]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [123]:
model.train()
model = model.to(device)
print('Training')
epoch_nr = 0
EPOCH = list(range(10))
tenp = len(train_loader) / 10
for epoch in tqdm(EPOCH):
    epoch_nr += 1
    epoch_loss = []
    h = model.init_hidden(batch_size)
    count = 0

    percent = 0
    with tqdm(total=len(train_loader)) as pbar:
        for (x,y) in train_loader:
            percent += 10
            count +=1 
            x = x.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            h = h.data
            out, h = model(x, h)
            loss = criterion(out, y.long())
            loss.backward()
            epoch_loss.append(loss.item())
            optimizer.step()
            #print('Loss per timestep = {}'.format(loss.item()))
            pbar.update(1)
            
            if count % tenp == 0:
                print("Training {}% complete".format(percent))
                
        avg_loss = sum(epoch_loss) / len(epoch_loss)
        print("Average loss at epoch %d: %.7f" % (epoch_nr, avg_loss)) 
    

Training


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500000), HTML(value='')))




KeyboardInterrupt: 

In [None]:
trained_model = model

In [None]:
save_model(trained_model, 1)

In [33]:
import os, torch, pickle
from torch.utils.data import DataLoader, Dataset
from LangIdentDataset import RTDataset 
from GRUNetwork import GRUNet
import torch

In [34]:
def langencoder(language_codes):
    one_hot_lang = {}
    lang2int = {lang : (num) for num, lang in dict(enumerate(language_codes)).items()}
  
    return lang2int

In [35]:
def save_model(model, model_nr):
    directory = 'trained_models/'
    if os.path.exists(directory) == False:
        os.mkdir(directory)
    
    #model.save_state_dict(directory+'model_1.pt')
    torch.save(model.state_dict(), '{}gru_model_nr{}.pt'.format(directory,model_nr))
        
def load_model(path):
    model = os.listdir(path)[0]
    print(model)
    with open(path+model, 'rb') as input_model:
        trained_model = torch.load(input_model)
        
        
    return trained_model
        

In [36]:
def get_vocab(path):
    with open(path+'vocab.pkl', 'rb')as file:
        vocab = pickle.load(file)
    return vocab

In [37]:
lang2int = langencoder(language_codes)
int2lang = {num : lang for lang, num in lang2int.items()}

In [38]:
vocab = get_vocab('vocab/')

In [39]:
vocab_size = len(vocab)
sequence = 100
batch_size = 100
input_size = 100
hidden_size = 256
nr_layers = 2
output_size = 10
device = torch.device('cuda:00')



In [40]:
dp = load_model('trained_models/')

gru_model_nr1.pt


In [41]:
trained_model = GRUNet(vocab_size, sequence, input_size, 
               hidden_size, nr_layers, output_size, device, dropout=0.0)

In [42]:
trained_model.load_state_dict(dp)

<All keys matched successfully>

In [43]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [44]:
def get_test_loader(path):
    loaders = os.listdir(path)
    with open(path+loaders[0], 'rb') as file:
        print(file)
        training_loader = pickle.load(file)
        
    with open(path+loaders[1], 'rb') as file:
        testing_loader = pickle.load(file)
        
    return training_loader, testing_loader

In [60]:
training_loader, testing_loader = get_test_loader('dataloaders/')

<_io.BufferedReader name='dataloaders/training_loader.pkl'>


In [49]:
lang_1  = language_names_dict[int2lang[0]]
lang_2  = language_names_dict[int2lang[1]]
lang_3  = language_names_dict[int2lang[2]]
lang_4  = language_names_dict[int2lang[3]]
lang_5  = language_names_dict[int2lang[4]]
lang_6  = language_names_dict[int2lang[5]]
lang_7  = language_names_dict[int2lang[6]]
lang_8  = language_names_dict[int2lang[7]]
lang_9  = language_names_dict[int2lang[8]]
lang_10 = language_names_dict[int2lang[9]]


In [50]:
total_guesses= 'total guesses'
correct_guesses = 'correct guesses'
language_guessed = 'language guessed'
num_characters = 'number of characters'
incorrect_guesses = 'incorrect guesses'
total_stats = 'total stats'
num_correct = 'num correct'
num_incorrect = 'num incorrect'
total_predictions = 'total predictions'
accuracy = 'accuracy'

In [51]:
language_stats = {lang_1  : {total_guesses : 0, correct_guesses : 0, incorrect_guesses : 0, language_guessed : [], num_characters : []},
                  lang_2  : {total_guesses : 0, correct_guesses : 0, incorrect_guesses : 0, language_guessed : [], num_characters : []},
                  lang_3  : {total_guesses : 0, correct_guesses : 0, incorrect_guesses : 0, language_guessed : [], num_characters : []},
                  lang_4  : {total_guesses : 0, correct_guesses : 0, incorrect_guesses : 0, language_guessed : [], num_characters : []},
                  lang_5  : {total_guesses : 0, correct_guesses : 0, incorrect_guesses : 0, language_guessed : [], num_characters : []},
                  lang_6  : {total_guesses : 0, correct_guesses : 0, incorrect_guesses : 0, language_guessed : [], num_characters : []},
                  lang_7  : {total_guesses : 0, correct_guesses : 0, incorrect_guesses : 0, language_guessed : [], num_characters : []},
                  lang_8  : {total_guesses : 0, correct_guesses : 0, incorrect_guesses : 0, language_guessed : [], num_characters : []},
                  lang_9  : {total_guesses : 0, correct_guesses : 0, incorrect_guesses : 0, language_guessed : [], num_characters : []}, 
                  lang_10 : {total_guesses : 0, correct_guesses : 0, incorrect_guesses : 0, language_guessed : [], num_characters : []},
                  total_stats : {num_correct : 0, num_incorrect : 0, total_predictions : 0, accuracy : 0}
                 }

In [52]:
def update_stats(language_stats,prediction, correct_language, int2lang, characters):
    language_stats[total_stats][total_predictions] += 1
    language_stats[language_names_dict[int2lang[correct_language]]][total_guesses] += 1
    language_stats[language_names_dict[int2lang[correct_language]]][language_guessed].append(prediction)
    if prediction == correct_language:
        language_stats[language_names_dict[int2lang[correct_language]]][correct_guesses] += 1
        language_stats[language_names_dict[int2lang[correct_language]]][num_characters].append(characters)
        language_stats[total_stats][num_correct] += 1
        print(characters)
    elif prediction != correct_language:
        language_stats[language_names_dict[int2lang[correct_language]]][incorrect_guesses] += 1
        language_stats[total_stats][num_incorrect] += 1
    
    language_stats[total_stats][accuracy] = round(language_stats[total_stats][num_correct] / language_stats[total_stats][total_predictions] * 100, 2)
    return language_stats
                                     
                                     
    

In [81]:
for i in range(0,50000,100):
    hidden_layer = trained_model.init_hidden(1).to(device)
    count = 0
    for x in range(100):
        count += 1
        #print(test_data_set[i + x][0])
        prediction = trained_model(test_data_set[x][0].unsqueeze(0).to(device), hidden_layer)
        _, indeces = torch.max(prediction[0].data, dim=1)
        if indeces[0].item() == test_data_set[x][1]:
            print(count)
            break
            print('correct')
        else:
            continue
            

    
    

10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
1

In [70]:

correct = 0
count = 0
percent = 0
batch_nr = 1
with tqdm(total=len(testing_loader)) as pbar:
    tenp = 25000
    for batch in testing_loader:
        batch_nr += 1
        hidden_layer = trained_model.init_hidden(1).to(device)
        for i in range(100):
            for example in test_data_set:

                prediction = trained_model(examples[0].unsqueeze(0).to(device), hidden_layer)
                _, indeces = torch.max(prediction[0].data, dim=1)
                characters = len(torch.nonzero(examples[0]))
                update_stats(language_stats, indeces[0].item(), examples[1].item(), int2lang, characters)

                if indeces[0].item() == examples[1].item():

                    correct += 1
                count += 1
        break 
        pbar.update(1)
        
        if count % tenp == 0:
            percent += 10
            print('Accuracy after {}% tested: {}'.format(percent, (correct / count) * 100))
        '''if count == 1000:
            break'''
        
        
        
        
    


NameError: name 'testing_loader' is not defined

In [1]:
len(train_data_set)

NameError: name 'train_data_set' is not defined

In [2]:
for a in range(0,50000,100):
    
    '''for i in range(100):
        while correct != True:
        for i in range(100):
            prediction = trained_model(examples[0].unsqueeze(0).to(device), hidden_layer)
            _, indeces = torch.max(prediction[0].data, dim=1)
            characters = len(torch.nonzero(examples[0]))
            update_stats(language_stats, indeces[0].item(), examples[1].item(), int2lang, characters)

            if indeces[0].item() == examples[1].item():
                    correct += 1
            if a == 6:
                correct = True
                break
            else:
                correct = False
                continue'''
       
        

In [70]:
language_stats

{'English': {'total guesses': 0,
  'correct guesses': 0,
  'incorrect guesses': 0,
  'language guessed': [],
  'number of characters': []},
 'Belarusian (Taraschkewiza)': {'total guesses': 0,
  'correct guesses': 0,
  'incorrect guesses': 0,
  'language guessed': [],
  'number of characters': []},
 'Karachay-Balkar': {'total guesses': 0,
  'correct guesses': 0,
  'incorrect guesses': 0,
  'language guessed': [],
  'number of characters': []},
 'Maithili': {'total guesses': 1,
  'correct guesses': 1,
  'incorrect guesses': 0,
  'language guessed': [3],
  'number of characters': [2]},
 'Bokmål': {'total guesses': 0,
  'correct guesses': 0,
  'incorrect guesses': 0,
  'language guessed': [],
  'number of characters': []},
 'Western Panjabi': {'total guesses': 1,
  'correct guesses': 1,
  'incorrect guesses': 0,
  'language guessed': [5],
  'number of characters': [23]},
 'Sardinian': {'total guesses': 0,
  'correct guesses': 0,
  'incorrect guesses': 0,
  'language guessed': [],
  'number

In [45]:
languages = [i[0] for i in language_names]

In [46]:
with open('results.pickle', 'wb') as handle:
    pickle.dump(language_stats, handle, protocol=pickle.HIGHEST_PROTOCOL)

NameError: name 'language_stats' is not defined

In [47]:
from collections import Counter

In [48]:
language_names_dict = {i[1] : i[0] for i in language_names}

In [None]:
def further_analysis(language_stats, languages,int2lang):
    
    for i in languages:
        lang_guessed = []
    
        lang_guessed = dict(Counter([int2lang[x] for x in language_stats[i]['language guessed']]))
        x = [(value,key) for key, value in lang_guessed.items()]
        sec_max = sorted(x)[-2][1]
        las = sorted(x)[0][1]
        num_char = language_stats[i][num_characters]
        avg_char = round(sum(num_char) / len(num_char))
        print(len(num_char))
        print('Language: {}'.format(i))
        print('Total guesses: {}'.format(language_stats[i]['total guesses']))
        print('Total correct: {}'.format(language_stats[i]['correct guesses']))
        print('Total accuracy for {}: {}%'.format(i,str(round(language_stats[i]['correct guesses']/ language_stats[i]['total guesses'] * 100,2))))
        print('Languages Guessed: {}'.format(dict(Counter(lang_guessed))))
        print('Most incorrectly guessed: {}'.format(sec_max))
        print('Least incorrectly guessed: {}'.format(las))
        print('Average characters until correct guess: {}'.format(avg_char))
        print('\n')
        '''data = {'language'      : i, 
                'total_guesses' : language_stats[i]['total guesses'],
                'total_correct' : language_stats[i]['correct guesses'],
                'accuracy'      :  str(round(language_stats[i]['correct guesses']/ language_stats[i]['total guesses'] * 100,2)),
                'languages_guessed' : dict(Counter(lang_guessed))}'''
        

In [None]:
further_analysis(language_stats, languages, int2lang)

In [None]:
language_names_dict
num_lan_name = language_names_dict.values()

In [None]:
int2lang = dict(enumerate(num_lan_name))