In [1]:
import  argparse, numpy as np, pandas as pd, subprocess
from nltk.tokenize import word_tokenize as wt
import csv
import torch
from torch import nn
from torch.optim import Adam
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data import Field, Iterator, TabularDataset, BucketIterator
import tqdm
from tqdm import tnrange
from sklearn.metrics import accuracy_score
from sklearn import metrics
from tqdm import tqdm_notebook
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import torch.nn.functional as F

In [2]:
def get_languages(csv_file, preset):
    with open(csv_file, 'r') as csv_File: #opens csv containing language codes and their names
        reader = csv.reader(csv_File)
        language_table = {row[0].split(';')[1] : row[0].split(';')[0] for row in reader} #dictionary mapping code to name
    if preset == 'y':
        #Preset language codes to be used in model, chosen at random
        language_codes = ["srd", "krc", "nob", "pnb",
                          "mai", "eng", "be-tarask",
                          "xho", "tet", "tha"]
        language_names = [(key, value) for key, value in language_table.items() if value in language_codes]
        return language_names
    elif preset == 'n':
        '''
        experimental function for allowing user to choose which languages to use
        '''
        languages = []
        while len(languages) != 10:
            language = input("Enter language ").capitalize()
            #print(language)
            if language in languages:
                print("You've already said that one! ")
            elif language in language_table:
                languages.append(language)
                print(languages)
            else:
                print('Language not recognised. Please refer to language labels')
                print(languages)
                continue
        language_codes = [language_table[i] for i in language_table if i in languages] #collects languages from predetermined set,
        language_names =  [(key, value) for key, value in language_table.items() if value in language_codes] #dictionary mapping code to name
        return language_names
    else:
        print("has to be y or n dummy") #in case of user error

In [3]:
def gen_data(training_file, training_labels, language_codes, training):
    '''
    Function generates the set based on pre defined language codes and creates various
    attributes to the object
    '''
    if training == True:
        data = [i.split('\n')[:-1] for i in open(training_file, 'r')] #opens text file and splits on new line
        labels = [i.split('\n')[:-1] for i in open(training_labels, 'r')] #opens label file and splits on white space
        things = list(zip([i[0] for i in data], [i[0] for i in labels])) #zips sentences with corrosponding language label
        sets = [(i[0],i[1]) for i in things] #this might actually do the same thing as the above not sure
        
        x = [i[0][:100] for i in sets if i[1] in language_codes] #Matrix of sentences to be used in the model
        y = [i[1] for i in sets if i[1] in language_codes] #labels for each of the sentences
        raw_data = ''.join([i for i in x]) #concatenation of all characters in the training set
        vocab = {char: ord(char) for char in set(raw_data)} #dictionary mapping character to ord(integer)
        int2char = {num : char for char, num in vocab.items()} #dictionary mapping integer to character
        return x, y, vocab, int2char
    else:
        data = [i.split('\n')[:-1] for i in open(training_file, 'r')] #opens text file and splits on new line
        labels = [i.split('\n')[:-1] for i in open(training_labels, 'r')] #opens label file and splits on white space
        things = list(zip([i[0] for i in data], [i[0] for i in labels])) #zips sentences with corrosponding language label
        sets = [(i[0],i[1]) for i in things] #this might actually do the same thing as the above not sure
        x = [i[0][:100] for i in sets if i[1] in language_codes] #Matrix of sentences to be used in the model
        y = [i[1] for i in sets if i[1] in language_codes] #labels for each of the sentences
        return x, y
        

In [7]:
language_names = get_languages('./data/raw/labels.csv', 'y')

In [8]:
language_codes = [i[1] for i in language_names]

In [9]:
x_train = 'data/raw/x_train.txt'
y_train = 'data/raw/y_train.txt'

In [10]:
x_train, y_train, vocab, int2char = gen_data(x_train, y_train, language_codes, training=True)

In [11]:
x_test = 'data/raw/x_test.txt'
y_test = 'data/raw/y_test.txt'

In [12]:
x_test, y_test = gen_data(x_test, y_test, language_codes, training=False)

In [13]:
def langencoder(language_codes):
    one_hot_lang = {}
    lang2int = {lang : (num) for num, lang in dict(enumerate(language_codes)).items()}
    '''for lang, num in lang2int.items():
        one_hot_lang[lang] = np.zeros(9)
        one_hot_lang[lang] = np.insert(one_hot_lang[lang],num, 1)
        #one_hot_lang[lang] = list([0,0,0,0,0,0,0,0,0,0,0]).insert(num, 1)
    '''
    #hot2lang = {hot : lang for lang, hot in one_hot_lang.items()}
    
    return lang2int#, hot2lang

In [14]:
lang2int = langencoder(language_codes)

In [15]:
def build_vocab(x_train):
    total_data = ''.join(x_train)
    int2char = dict(enumerate(set(total_data)))
    char2int = {char : (num + 1) for num, char in int2char.items() }
    char2int['<niv>'] = max(char2int.values()) +1
    return char2int

vocab = build_vocab(x_train)

In [16]:
def one_hot_vocab(vocab):
    one_hot_vocab = {}
    for char, num in vocab.items():
        empty_vector = np.zeros(len(vocab) + 1)
        one_hot_vocab[char] = np.insert(empty_vector, num, 1)
    one_hot_vocab['<niv>'] = np.insert(np.zeros(len(vocab) + 1), len(vocab) + 1, 1)
    return one_hot_vocab

In [17]:
one_hot_vocab = one_hot_vocab(vocab)

In [18]:
from torch.nn.utils.rnn import pad_sequence

In [19]:
def build_data(x,y, lang2int,vocab):
    labels = []
    vectors = []
    sets = zip(x,y)
    for samples in sets:
        
        sample = [i for i in samples[0]]
        label = samples[1]
        #print(label)
        count = 100
        while count != 0:
            #x = [(i,int2label[samples[1]]) for i in samples[0]]
            vector = []
            encoded = []
            for i in sample:
                if i in vocab:
                    encoded.append(vocab[i])
                else:
                    encoded.append(vocab['<niv>'])
                
            for i in range(1,101):
                vectors.append(torch.LongTensor(encoded[:i])) #, int(lang2int[label])))
                labels.append(lang2int[label])
                count -=1
            #vectors += vector
            #label_matrix += labels
    return pad_sequence(vectors, batch_first=True, padding_value=0), labels



In [20]:
train_data, train_labels = build_data(x_train, y_train, lang2int, vocab)

In [21]:
test_data, test_labels = build_data(x_test, y_test, lang2int, vocab)

In [22]:
from torch.utils.data import Dataset

In [23]:
class RTDataset(Dataset):
    def __init__(self, train_x, train_y):
        self.train_x = train_x
        self.train_y = train_y
        
    def __getitem__(self, index):
        return (self.train_x[index], self.train_y[index])
    
    def __len__(self):
        return len(self.train_x)
        
        

In [24]:
train_data_set = RTDataset(train_data, train_labels)

In [25]:
test_data_set = RTDataset(test_data, test_labels)

In [26]:
train_loader = DataLoader(train_data_set, batch_size=200, shuffle=True)

In [27]:
test_loader = DataLoader(test_data_set, batch_size=200, shuffle=True)

In [28]:
class GRUNet(nn.Module):
    def __init__(self, vocab_size, seq_len, input_size, hidden_size, num_layers, output_size, dev, dropout=0.0):
        super().__init__()
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dev = dev
        self.emb = nn.Embedding(vocab_size, input_size)
        self.gru = nn.GRU(input_size, hidden_size,
                          num_layers=self.num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size * seq_len, output_size)
        # self.softmax = nn.Softmax(dim=2)

    def forward(self, sequence, hidden_layer):
        output = self.emb(sequence)
        hidden_layer = hidden_layer.to(self.dev)
        output, hidden_layer = self.gru(output, hidden_layer)
        output = output.contiguous().view(-1, self.hidden_size *
                                          len(sequence[0]))
        output = self.fc(output)
        # don't need the softmax here as CrossEntropy loss already does softmax at its end
        # output = self.softmax(output)
        return output, hidden_layer

    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size).float()


In [40]:
import pickle, os, zipfile 

In [114]:
import os, base64, addpermissions

ModuleNotFoundError: No module named 'addpermissions'

In [111]:
loaders= [train_loader,test_loader]

In [131]:
def load_dataloaders(filepath):
    directory = 'dataloaders/'
    dataloaders = []
    for filename in os.listdir(filepath):
        file = gzip.GzipFile(directory + filename, 'rb')
        pickle.load(file.read())
      

    return dataloaders[0], dataloaders[1]

In [132]:
training, testing = load_dataloaders('dataloaders/')

TypeError: file must have 'read' and 'readline' attributes

In [67]:
save(train_loader, 'train_loader.zip')

In [82]:
directory = 'dataloaders/'

In [85]:
training = 'dataloaders/training_dataloader.zip'
testing = 'dataloaders/testing_dataloader.zip'
for i in zip([training,testing], loaders):
    print('Pickling Dataloader {}'.format(str(i[0])))
    file = gzip.GzipFile(i[0], 'wb')
    file.write(pickle.dumps(i[1], 1))
    file.close()

Pickling Dataloader dataloaders/training_dataloader.zip
Pickling Dataloader dataloaders/testing_dataloader.zip


In [62]:
training = 'dataloaders/training_dataloader.pickle'
testing = 'dataloaders/testing_dataloader.pickle'
for i in zip([training,testing], loaders):
    print('Pickling Dataloader {}'.format(str(i[0])))
    with open(i[0], 'wb') as output_file:
        pickle.dump(i[1], output_file)
        output_file.close()
print('Zipping')


Zipping


In [63]:
with zipfile.ZipFile('training_dataloader.pickle.zip', 'r') as zip_ref:
    zip_ref.extractall('/')

BadZipFile: File is not a zip file

In [29]:
vocab_size = len(vocab)
seq_len = 100
batch_size = 200
input_size = 100
hidden_size = 128
num_layers = 1
output_size = 10
dev = torch.device('cuda:01')


In [None]:
model = GRUNet(vocab_size, seq_len, input_size, 
               hidden_size, num_layers, output_size, dev, dropout=0.0)
    

In [None]:
len(train_data)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.0005)

In [None]:
model.train()
model = model.to(dev)
print('Training')
for i in range(5):
    
    epoch_loss = []
    h = model.init_hidden(batch_size)
    count = 0
    
    percent = 10
    for (x, y) in train_loader:
        tenp = round(len(x) / 0.2)
        count +=1 
        x = x.to(dev)
        y = y.to(dev)
        optimizer.zero_grad()
        h = h.data
        out, h = model(x, h)
        loss = criterion(out, y.long())
        loss.backward()
        epoch_loss.append(loss.item())
        
        if count % tenp == 0:
            print('Training Epoch {}% complete'.format(percent))
            percent += 10
        #print('Loss per timestep = {}'.format(loss.item()))
        
        
    avg_loss = sum(epoch_loss) / len(epoch_loss)
    print("Average loss at epoch %d: %.7f" % (i + 1, avg_loss)) 
    

In [None]:
gru_model_try = model

In [None]:
for batch in test_loader:
    hidden_layer = model.init_hidden(1)
    for examples in batch[0]:
        prediction = gru_model_try(examples.unsqueeze(0).to('cuda:01'), hidden_layer)
        
        break
    for labels in batch[1]:
        #print(labels, hidden_layer)
        break
    break

In [None]:
training = 'dataloaders/training_dataloader.pickle'
testing = 'dataloaders/testing_dataloader.pickle'
for i in zip([training,testing], loaders):
    print('Pickling Dataloader {}'.format(str(i[0])))
    with open(i[0], 'wb') as output_file:
        pickle.dump(i[1], output_file)
        output_file.close()
print('Zipping')
for i in os.listdir(dir):
    output_zipped = zipfile.ZipFile(i, 'w')
    output_zipped.write(dir + i, compress_type=ZIP_DEFLATED)