In [1]:
import os, torch, pickle, torch, config
from torch.utils.data import DataLoader, Dataset
from LangIdentDataset import RTDataset 
import stats
from GRUNetwork import RNN_GRU
import get_data

In [2]:
import pandas as pd
import csv, os, argparse
import config

def get_args():
    parser = argparse.ArgumentParser(
        description="")
    parser.add_argument("-P", "--preset", dest='preset', type=str,
                        help="Choose to use default language set or your own", default="y")

    parser.add_argument("-F", "--Folder", dest='folder', type=str, default="data/raw/",
                        help="Directory that contains training data")
    args = parser.parse_args()
    return args

def get_files_from_folder(folder):
    files = os.listdir(folder)

    return folder+files[0], folder+files[1], folder+files[2], folder+files[3], folder+files[4]

def get_languages(csv_file, preset):
    with open(csv_file, 'r') as csv_File: #opens csv containing language codes and their names
        reader = csv.reader(csv_File)
        language_table = {row[0].split(';')[1] : row[0].split(';')[0] for row in reader} #dictionary mapping code to name
    if preset == 'y':
        #Preset language codes to be used in model, chosen at random
        language_codes = ["srd", "krc", "nob", "pnb",
                          "mai", "eng", "be-tarask",
                          "xho", "tet", "tha"]
        language_names = [(key, value) for key, value in language_table.items() if value in language_codes]
        return language_names
    elif preset == 'n':
        '''
        experimental function for allowing user to choose which languages to use
        '''
        languages = []
        while len(languages) != 10:
            language = input("Enter language ").capitalize()
            #print(language)
            if language in languages:
                print("You've already said that one! ")
            elif language in language_table:
                languages.append(language)
                print(' '.join(languages))
            else:
                print('Language not recognised. Please refer to language labels')
                print(' '.join(languages))
                continue
        language_codes = [language_table[i] for i in language_table if i in languages] #collects languages from predetermined set,
        language_names =  [(key, value) for key, value in language_table.items() if value in language_codes] #dictionary mapping code to name
        return language_names
    else:
        print("has to be y or n dummy") #in case of user error



In [35]:
def langencoder(language_codes):
    one_hot_lang = {}
    lang2int = {lang : (num) for num, lang in dict(enumerate(language_codes)).items()}
  
    return lang2int

def load_model(path, config):
    model = os.listdir(path)[0]
    if config['device'] == 'gpu':
        device = torch.device('cuda:01')
    else:
        device = torch.device('cpu')
    print(model)
    with open(path+model, 'rb') as input_model:
        data = torch.load(input_model)
    trained_model = RNN_GRU(vocab_size=config['vocab_size'], seq_len=100, input_size=100, 
               hidden_size=256, num_layers=2, output_size=10, device=device, dropout=0.0)
    trained_model.load_state_dict(data)    
    return trained_model

def get_vocab(path):
    with open(path+'vocab.pkl', 'rb')as file:
        vocab = pickle.load(file)
    return vocab

def get_test_loader(path):
    loaders = os.listdir(path)
    print(loaders)
    with open(path+loaders[0], 'rb') as file:
        testing_loader = pickle.load(file)
    
    return testing_loader



In [48]:
h = trained_model.init_hidden(100)

torch.Size([2, 100, 256])

In [37]:
CONFIG = config.get_config('config/config.json')
if CONFIG['device'] == 'cpu':
    device = torch.device('cpu')
else:
    device = 'cuda:01'
language_codes = [i[1] for i in CONFIG['languages']]
language_names = CONFIG['languages']
lang2int = langencoder(language_codes)
int2lang = {num : lang for lang, num in lang2int.items()}
vocab = get_vocab('vocab/')
trained_model = load_model('trained_models/', CONFIG).to(device)
test_data = get_test_loader('dataloaders/')
language_names_dict = {i[1] : i[0] for i in language_names}
language_stats = stats.gen_empty_stats(int2lang, language_names_dict)

gru_model_100batches_10epochs.pt
['training_loader.pkl', 'test_dataset.pkl']


In [9]:
print(len(test_data))
for i in test_data:
    print(len(i[0]))
    break

5000
100


In [52]:
def test_model(trained_model, test_data, language_stats, device, language_names_dict):
    correct_per_example = 0
    total_predictions = 5000
    percent = 0
    batch_nr = 1
    tenp = 500
    num_characters = []
    count = 0
    for x, y in test_data:
        batch_nr +=1
        
        hidden_layer = trained_model.init_hidden(1).to(device)
        for examples in zip(x,y):
            #total_predictions += 1
            count += 1
            prediction = trained_model(examples[0].unsqueeze(0).to(device), hidden_layer)
            _, indeces = torch.max(prediction[0].data, dim=1)
            characters = len(torch.nonzero(examples[0]))
            stats.update_stats(language_stats, indeces[0].item(), examples[1].item(), int2lang, characters, language_names_dict)
            
            if indeces[0].item() == examples[1].item():
                num_characters.append(characters)
                correct_per_example += 1
                break
            else:
                continue
                
        if count % tenp == 0:
            percent += 10
            print('Accuracy after {}% tested: {}'.format(percent, (correct / total_predictions * 100)))
    return language_stats

In [53]:
language_stats = test_model(trained_model, test_data, language_stats, device, language_names_dict)

(tensor([446,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]), tensor(9))


In [27]:
CONFIG = config.get_config('config/config.json')

In [29]:
language_names

[['English', 'eng'],
 ['Belarusian (Taraschkewiza)', 'be-tarask'],
 ['Karachay-Balkar', 'krc'],
 ['Maithili', 'mai'],
 ['Bokmål', 'nob'],
 ['Western Panjabi', 'pnb'],
 ['Sardinian', 'srd'],
 ['Tetum', 'tet'],
 ['Thai', 'tha'],
 ['Xhosa', 'xho']]

In [12]:
languages = [i[0] for i in language_names]

In [30]:
def further_analysis(language_stats, language_names ,int2lang):
    
    for i in languages:
        lang_guessed = [i[0] for i in languages]
    
        lang_guessed = dict(Counter([int2lang[x] for x in language_stats[i]['languages_guessed']]))
        x = [(value,key) for key, value in lang_guessed.items()]
        sec_max = sorted(x)[-2][1]
        las = sorted(x)[0][1]
        num_char = language_stats[i]['num_characters']
        avg_char = sum(num_char) / len(num_char)
        print('Language: {}'.format(i))
        print('Total guesses: {}'.format(language_stats[i]['total_guesses']))
        print('Total correct: {}'.format(language_stats[i]['correct_guesses']))
        print('Total accuracy for {}: {}%'.format(i,str(round(language_stats[i]['correct_guesses']/ language_stats[i]['total_guesses'] * 100,2))))
        print('Languages Guessed: {}'.format(dict(Counter(lang_guessed))))
        print('Most incorrectly guessed: {}'.format(sec_max))
        print('Least incorrectly guessed: {}'.format(las))
        print('Average characters until correct guess: {}'.format(avg_char))
        print('\n')
        '''data = {'language'      : i, 
                'total_guesses' : language_stats[i]['total guesses'],
                'total_correct' : language_stats[i]['correct guesses'],
                'accuracy'      :  str(round(language_stats[i]['correct guesses']/ language_stats[i]['total guesses'] * 100,2)),
                'languages_guessed' : dict(Counter(lang_guessed))}'''
        

In [31]:
further_analysis(language_stats, language_names, int2lang)

English
500
Language: English
Total guesses: 581
Total correct: 500
Total accuracy for English: 86.06%
Languages Guessed: {'eng': 500, 'be-tarask': 19, 'tha': 15, 'mai': 7, 'krc': 9, 'nob': 1, 'tet': 30}
Most incorrectly guessed: tet
Least incorrectly guessed: nob
Average characters until correct guess: 1.162


Belarusian (Taraschkewiza)
500
Language: Belarusian (Taraschkewiza)
Total guesses: 536
Total correct: 500
Total accuracy for Belarusian (Taraschkewiza): 93.28%
Languages Guessed: {'be-tarask': 500, 'tet': 9, 'mai': 2, 'eng': 13, 'tha': 10, 'krc': 1, 'nob': 1}
Most incorrectly guessed: eng
Least incorrectly guessed: krc
Average characters until correct guess: 1.072


Karachay-Balkar
500
Language: Karachay-Balkar
Total guesses: 1721
Total correct: 500
Total accuracy for Karachay-Balkar: 29.05%
Languages Guessed: {'tha': 241, 'mai': 138, 'tet': 229, 'krc': 500, 'eng': 527, 'be-tarask': 75, 'nob': 3, 'xho': 6, 'pnb': 2}
Most incorrectly guessed: krc
Least incorrectly guessed: pnb
Av

In [15]:
from collections import Counter

NameError: name 'num_characters' is not defined