### Dataset Preparation
This subsection contains methods to produce uniformly distributed chunks of our data set, from which we can then obtain n-grams of different sizes. The Wikipedia Language Identification (WiLI-2018) database provides example sentences in the text files x_train.txt and x_test.txt, with the corresponding labels, in the same order, in y_train.txt and y_test.txt.
We read these examples and cluster them by their respective language label.

In [None]:
import numpy as np
import pandas as pd
import string
import re
import nltk
from nltk import ngrams
import collections
from collections import defaultdict
from collections import Counter

In [None]:
# read data
# written for the WiLI-2018 data set: https://zenodo.org/record/841984
# make sure txt-files are in the specified directory when running this
def _read_lines(path):
    """Return the content of a UTF-8 text file split on newlines.

    The file handle is closed as soon as the content has been read. A file
    that ends with a newline yields a trailing empty string, exactly as
    ``read().split('\\n')`` does.
    """
    with open(path, encoding="utf8") as handle:
        return handle.read().split('\n')


X_train = _read_lines('x_train.txt')
Y_train = _read_lines('y_train.txt')
X_test = _read_lines('x_test.txt')
Y_test = _read_lines('y_test.txt')
# semicolon-separated table; the code below reads its 'Label' and 'English' columns
labels = pd.read_csv('labels.csv', delimiter = ';')

In [None]:
# Preprocessing the data

def preprocess(X, Y, label_table=None):
    """Clean the example texts and translate label codes to language names.

    X           : list of raw example strings
    Y           : list of label codes, in the same order as X
    label_table : optional table with 'Label' and 'English' columns used to
                  translate codes to names; defaults to the module-level
                  ``labels`` table
    returns     : (x_train, y_train) - cleaned examples and their language
                  names, always of equal length and in matching order

    Examples that contain nothing but whitespace after cleaning are dropped
    together with their label. Raises KeyError for a label code that is
    missing from the table.
    """
    if label_table is None:
        label_table = labels

    # convert language labels to language Name => 'en' -> 'English'
    lab_dict = dict(zip(label_table['Label'], label_table['English']))
    # the code 'nan' is special-cased instead of being looked up in the table
    names = [lab_dict[item] if item != 'nan' else 'Min Nan Chinese' for item in Y]

    # remove unnecessary characters from data
    extras = '!"$%&/{}[]()=?\\`´*+~#-_.:,;<>|1234567890°-\''  # Characters to remove from data
    strip_extras = re.compile('[' + re.escape(extras) + ']')
    squeeze_spaces = re.compile(' +')

    x_train = []
    y_train = []
    # Walk examples and labels together so that a dropped example can never
    # shift the labels of the examples that follow it.
    for example, name in zip(X, names):
        processed = squeeze_spaces.sub(' ', strip_extras.sub('', example))
        if processed.strip():  # skip examples that are only whitespace after cleaning
            x_train.append(processed)
            y_train.append(name)

    return x_train, y_train

# x_train = [ex1,ex2,ex3,...]
# y_train = [lang_of_ex1,......]

In [None]:
# group the examples, and their positions, by language label
def data_by_lang(X, Y):
    """Return (lang_corpora, lang_idx): examples and positions per language.

    X : list of examples
    Y : list of language labels; Y[i] is the label of X[i]
    """
    lang_corpora, lang_idx = defaultdict(list), defaultdict(list)
    for position, example in enumerate(X):
        language = Y[position]
        lang_corpora[language].append(example)
        lang_idx[language].append(position)

    return lang_corpora, lang_idx
# lang_corpora = { 'Lang1' : [ex1,ex2,...], 'Lang2' : [ex1, ex2,,...],...}
# lang_idx = { 'Lang1' : [23,41,..index of example in Lang1..], 'Lang2' : [1,19,....],...}

In [None]:
# extract uniformly distributed list of examples from our data set
# takes an optional argument to constrain the list of languages
def get_data_chunk(X, Y, n_instances, lang_keys=None):
    """Return the first ``n_instances`` examples of every selected language.

    X           : list of examples
    Y           : list of language labels, aligned with X
    n_instances : maximum number of examples to take per language
    lang_keys   : optional list of languages to keep; all languages in Y
                  are used when it is empty or None
    returns     : (x_train, y_train), grouped language by language

    Languages appear in the order given by ``lang_keys`` (duplicates ignored)
    or, without it, in order of first appearance in Y, so the result is the
    same on every run. A language with fewer than ``n_instances`` examples
    contributes all it has, and a requested language without any example
    contributes nothing, instead of raising IndexError.
    """
    # positions of the examples of each language, in data order
    positions = defaultdict(list)
    for pos, (_, lang) in enumerate(zip(X, Y)):
        positions[lang].append(pos)

    langs = list(dict.fromkeys(lang_keys)) if lang_keys else list(positions)
    limit = max(n_instances, 0)

    x_train = []
    y_train = []
    for lang in langs:
        for pos in positions.get(lang, [])[:limit]:
            x_train.append(X[pos])
            y_train.append(Y[pos])

    return x_train, y_train

# x_train [ lang1_ex,lang1_ex,..n_instance_of_Lang1..,lang2_ex,lang2_ex,....,...]
# y_train [ lang1,lang1,....lang2,lang2,...]

In [None]:
# creating n-grams for each data entry
# optional arguments:
#    lang_keys - constrains the languages to use
#    stepsize  - specifies the amount of characters
#                to jump until the next n-gram
# returns a list of n-grams
def make_n_grams(X, Y, n, lang_keys=None, stepsize=1):
    """Split every selected example into character n-grams.

    X         : list of example strings
    Y         : list of language labels, aligned with X
    n         : length of each n-gram in characters
    lang_keys : optional list of languages to keep; all languages in Y are
                used when it is empty or None
    stepsize  : distance in characters between the starts of two
                consecutive n-grams (must be >= 1)
    returns   : one list of n-grams per kept example, in input order

    Examples of languages that are not selected are left out entirely, so
    the result can be shorter than X. An example shorter than ``n`` yields
    an empty list. Raises ValueError if ``stepsize`` is smaller than 1.
    """
    # an explicit check rather than an assert, so it still runs under `python -O`
    if stepsize < 1:
        raise ValueError("stepsize must be >= 1, got {}".format(stepsize))

    langs = set(lang_keys) if lang_keys else set(Y)

    return [
        [sent[j:j + n] for j in range(0, len(sent) - n + 1, stepsize)]
        for sent, lang in zip(X, Y)
        if lang in langs
    ]

# x_to_grams = [[ngram_in_ex1],[ngram_in_ex2],....]

In [None]:
# counting and sorting n-grams for each language
# returns a sorted dict of lang : {n-gram : count}
def sort_by_tf(X, Y):
    """Count the n-grams of every language and order them by frequency.

    X       : list of n-gram lists, one per example
    Y       : list of language labels, aligned with X
    returns : mapping lang -> {n-gram: count}; each inner dict is ordered by
              descending count, and n-grams with equal counts keep the order
              in which they were first seen

    The result is a defaultdict, so looking up a language without any
    example gives an empty dict.
    """
    # one running counter per language; each example is counted once
    counts_per_lang = defaultdict(Counter)
    for grams, lang in zip(X, Y):
        counts_per_lang[lang].update(grams)

    # sort by term frequency
    sorted_tf_per_lang = defaultdict(dict)
    for lang, counts in counts_per_lang.items():
        sorted_tf_per_lang[lang] = dict(counts.most_common())

    return sorted_tf_per_lang


### Understanding Data
In the following we review some examples to get an understanding of our data...
Particularly interesting are languages with a degree of similarity. Here we print examples of languages that use the Latin alphabet.

In [None]:
import matplotlib
import matplotlib.pyplot as plt

In [None]:
x_train, y_train = preprocess(X_train[:-1],Y_train[:-1])

lang_corpora, lang_idx = data_by_lang(x_train, y_train)

# Produce charts of the counts of frequent n-grams shared by two languages,
# and a table suggesting similar languages based on these.
m_samples = 20  # number of most frequent n-grams compared per language
latin_languages = ['German', 'English', 'French', 'Spanish', 'Italian', 'Portuguese',
                   'Estonian', 'Turkish', 'Romanian', 'Swedish', 'Latin', 'Dutch']

# make_n_grams() leaves out every example whose language is not requested, so
# the labels are filtered the same way to stay aligned with its output.
latin_set = set(latin_languages)
y_latin = [lang for lang in y_train if lang in latin_set]

for n_gram_size in range(3, 6):
    # similarity is symmetric, so both languages of a matching pair are recorded
    ng_related = {lang: [] for lang in latin_languages}

    x_train_grams = make_n_grams(x_train, y_train, n_gram_size, latin_languages)
    sorted_tf_per_lang = sort_by_tf(x_train_grams, y_latin)

    # The per-language entries are ordered by descending count, so the first
    # m_samples keys are the most frequent n-grams. Computed once per language.
    top_grams = {lang: list(sorted_tf_per_lang[lang])[:m_samples] for lang in latin_languages}

    # Visit every unordered pair of languages exactly once. The language list
    # itself is never modified while it is being iterated.
    for pos, lang_key in enumerate(latin_languages):
        for otherlang in latin_languages[pos + 1:]:
            # compares the two top m lists for common elements:
            common_ngrams = list(set(top_grams[lang_key]).intersection(top_grams[otherlang]))

            if len(common_ngrams) <= 3:  # only report pairs sharing 4 or more frequent n-grams
                continue

            print(lang_key, "and", otherlang, "have the following frequent", n_gram_size,"-grams in common:",common_ngrams)
            ng_related[lang_key].append(otherlang)
            ng_related[otherlang].append(lang_key)

            # counts of the shared n-grams in each of the two languages
            counts_langkey = [sorted_tf_per_lang[lang_key][gram] for gram in common_ngrams]
            counts_otherlang = [sorted_tf_per_lang[otherlang][gram] for gram in common_ngrams]

            # make spaces visible in the axis labels
            common_ngrams = [gram.replace(' ', '_') for gram in common_ngrams]

            # code for bar chart:
            x = np.arange(len(common_ngrams))  # the label locations
            width = 0.35  # the width of the bars

            fig, ax = plt.subplots()
            ax.bar(x - width/2, counts_langkey, width, color = 'r', label=lang_key)
            ax.bar(x + width/2, counts_otherlang, width, color = 'g', label=otherlang)
            ax.set_ylabel('Count')
            ax.set_title('Frequency of %s-grams in given languages' % (n_gram_size))
            ax.set_xticks(x)
            ax.set_xticklabels(common_ngrams, fontsize=12)
            ax.legend()
            fig.tight_layout()

            plt.show()

        print('\n ')

    print('similar languages based on ', n_gram_size, '- grams:')
    for key, val in ng_related.items():
        print(key, ':', val)
    print('\n ')

### Naive Bayes Classifier
To obtain a baseline for the language identification task we employ a simple Naive Bayes classifier. Our first step is to collect the top n-grams into feature matrices...

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
import time,math

In [None]:
def time_since(since):
    """Format the time elapsed since ``since`` as '<minutes>m <seconds>s'.

    since : a timestamp as returned by time.time()
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    # the seconds are truncated, not rounded
    return f'{minutes:d}m {int(seconds)}s'

In [None]:
# extracts lists of top n frequent n-grams from data
def get_top_n_features(X_grams, Y, n_features):
    '''
    Collect the most frequent n-grams of every language into one feature list.

    X_grams : [[ngram_in_ex1],[ngram_in_ex2],....]
    Y : ['lang1','lang1',...,'lang2',...]
    n_features : number of ngram to pick from each language

    Returns a sorted list of unique n-grams. Sorting keeps the feature order,
    and therefore the column order of any matrix built from it, identical
    from one run to the next.
    '''
    sorted_freq_per_lang = sort_by_tf(X_grams, Y)
    per_language = max(n_features, 0)

    features = set()
    for grams_dict in sorted_freq_per_lang.values():
        # each dict is ordered by descending count, so its first keys are the
        # most frequent n-grams; take exactly n_features of them
        features.update(list(grams_dict)[:per_language])

    return sorted(features)

    # features : ['and','he ','öä ',....] Top ngrams from corpus

In [None]:
# convert data to feature matrix
def create_feature_matrix(X, features):
    '''
    Build a matrix of smoothed n-gram counts, one row per example.

    X : [[ngram_in_ex1],[ngram_in_ex2],....]
    features : ['and','he ','öä ',....] Top ngrams from corpus

    Returns a float array of shape (len(X), len(features)). Entry [i][j] is
    the number of times features[j] occurs in example i, plus one, so an
    n-gram that does not occur gets the value 1.
    '''
    mat = np.zeros((len(X), len(features)))
    for row, gram_list in enumerate(X):
        # count every n-gram of the example once instead of rescanning the
        # example for each feature
        counts = Counter(gram_list)
        mat[row] = [counts[gram] + 1 for gram in features]

    return mat
    # mat : array([[4,1,2,1,...],[1,1,1,2,3,1,...],...])

In [None]:
# Data preparation for the Naive Bayes baseline: clean the training data, take
# a fixed number of examples per language, split them into character n-grams,
# pick the most frequent n-grams as features and build the count matrix.
n_instances = 100 # examples taken per language
n_gram_size = 5 # characters per n-gram
n_features = 10 # n-grams picked per language
start = time.time()
print("Starting preprocessing at {} ..".format(time_since(start)))

# the last entry of each list is dropped - presumably the empty string left
# behind by the trailing newline of the file; TODO confirm
x_train, y_train = preprocess(X_train[:-1], Y_train[:-1])

print("Preprocessing Done at {}.".format(time_since(start)))

# keep n_instances examples per language to get a smaller data subset
x_train, y_train = get_data_chunk(x_train, y_train, n_instances)

print("Making ngrams at {} ..".format(time_since(start)))
x_train_grams = make_n_grams(x_train, y_train, n_gram_size)

print('Extracting features at {} ...'.format(time_since(start)))
# the feature vocabulary is built from the reduced training subset only
features = get_top_n_features(x_train_grams,y_train,n_features)

print('Creating feature matrix at {}  ....'.format(time_since(start)))
# one row per training example, one column per feature n-gram
feature_matrix = create_feature_matrix(x_train_grams, features)
print('Data Preperation completed after {}'.format(time_since(start)))

TypeError: ignored

In [None]:
# Gaussian Naive Bayes Model Training
# The language names are encoded as integer class ids; the fitted encoder is
# kept so that other cells can reuse it.
encoder = LabelEncoder()
X = feature_matrix
Y = encoder.fit_transform(y_train)
model = GaussianNB()
model.fit(X,Y)

In [None]:
# model testing
# Prepare the first 20000 test examples exactly like the training data and
# score the fitted Naive Bayes model on them.
start = time.time()
print('Test Data preperation starting ...')
x,y = preprocess(X_test[:20000], Y_test[:20000])

print("Making ngrams at {} ..".format(time_since(start)))
x_test_grams = make_n_grams(x, y, n_gram_size)

print('Creating feature matrix at {}  ....'.format(time_since(start)))
# same feature list, and therefore the same columns, as during training
x = create_feature_matrix(x_test_grams, features)

print('Test Data Preperation completed after {}'.format(time_since(start)))

# Reuse the encoder fitted on the training labels. Refitting it on the test
# labels could assign different class ids whenever the test subset does not
# contain exactly the same set of languages.
y = encoder.transform(y)
y_pred = model.predict(x)
conf_matrix = confusion_matrix(y_pred=y_pred, y_true=y)
# round after scaling to a percentage, so no float noise is printed
acc = round(accuracy_score(y_pred=y_pred, y_true=y) * 100, 2)
print(f"Accuracy is {acc}%")

In [None]:
# Model                Instance_per_language   N_gram       Features_per_language         Accuracy             Test_Instance
# GaussianNB              150                     3                   40                     79%                   20k
# GaussianNB              150                     4                   40                     87%                   20k
# GaussianNB              150                     5                   40                     87%                   25k
# GaussianNB              200                     5                   30                     85%                   25k
# MultinomialNB           150                     3                   40                     77%                   25k  
# MultinomialNB           150                     4                   40                     73%                   25k  

### RNN Classifier


In [None]:
import torch
import time
from torch.autograd import Variable 
import torch
import torch.nn as nn
import torch.nn.functional as F

**CHANGE TO GPU MODE**

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class RNN(nn.Module):
    """GRU classifier over sequences of n-gram indices.

    input_size  : size of the n-gram vocabulary (number of embedding rows)
    hid_size    : size of the GRU hidden state
    output_size : number of classes/languages
    layers      : number of stacked GRU layers
    embedding   : size of the embedding vector of one n-gram
    """

    def __init__(self,input_size,hid_size,output_size,layers,embedding):
        super().__init__()
        self.hidden_dim = hid_size
        self.layers = layers
        self.embedding_size = embedding
        self.dropout = nn.Dropout(0.4)
        self.input_size = input_size
        self.output_size = output_size
        self.embeddings = nn.Embedding(self.input_size,self.embedding_size)
        self.rnn = nn.GRU(input_size=self.embedding_size,hidden_size=self.hidden_dim,num_layers = self.layers)
        self.linear = nn.Linear(self.hidden_dim,self.output_size)

    def forward(self,x):
        """Return class scores of shape B x L for a batch of index sequences.

        x : B x S tensor of n-gram indices, where B is the batch size and S
            the (padded) sequence length
        """
        batch_size = x.size(0)
        x = x.t()  # S x B, the GRU is not batch-first
        embedded = self.embeddings(x)  # S x B x E, here E is the embedding size
        hidden = self._init_hidden(batch_size)
        output, hidden = self.rnn(embedded, hidden)
        output = self.dropout(output)
        # only the output of the last time step is classified
        fc_output = self.linear(output[-1])  # B x L , L is number of classes/languages
        return fc_output

    def _init_hidden(self,batch_size):
        """Return a zero initial hidden state of shape layers x B x hidden_dim.

        The state is created with the device and dtype of the model's own
        weights, so it follows the model wherever it is moved.
        """
        return self.embeddings.weight.new_zeros(self.layers, batch_size, self.hidden_dim)

In [None]:
def padding(vector_inps, lengths, target_device=None):
    '''
    This function takes variable length vectors and converts them to equal length by padding with 0.

    Input : "vector_inps"   list of vectors containing indices of ngrams
            "lengths"       tensor with the length of each vector
            "target_device" optional device of the result; the module-level
                            "device" is used when it is None
    Output : long tensor with one row per vector. Every row has the length of
             the longest vector (the maximum in "lengths"); shorter vectors
             are filled up with 0 on the right. An empty list of vectors
             gives an empty 0 x 0 tensor.
    '''
    if target_device is None:
        target_device = device

    n_rows = len(vector_inps)
    # the maximum of an empty tensor is undefined, so an empty batch is special-cased
    max_len = int(lengths.max()) if n_rows else 0

    inp_tensor = torch.zeros((n_rows, max_len), dtype=torch.long, device=target_device)
    for idx, (seq, seq_len) in enumerate(zip(vector_inps, lengths)):
        inp_tensor[idx, :seq_len] = torch.as_tensor(seq, dtype=torch.long, device=target_device)
    return inp_tensor

    # inp_tensor : tensor([[12,342,...,0],[56,2311,....],....])


In [None]:
def train(decoder, inp, target, batch=100, optimizer=None, loss_fn=None):
    '''
    Run one optimisation step over the whole data set, processed in mini-batches.

    decoder   : Model
    inp       : tensor([[12,342,...,0],[56,2311,....],....]) Example encodings
    target    : tensor([41,127,234,16,....]) Label Encodings
    batch     : number of examples per mini-batch (must be >= 1)
    optimizer : optimizer to step; the module-level "decoder_optimizer" is
                used when it is None
    loss_fn   : loss function; the module-level "criterion" is used when it
                is None

    Returns the sum of the mini-batch losses as a float (0.0 for empty input).

    Every example is used, including a final batch smaller than "batch". The
    gradients of all mini-batches are accumulated and applied in one
    optimizer step, which gives the same update as summing the losses first
    but frees each batch's graph as soon as it has been back-propagated.
    '''
    if batch < 1:
        raise ValueError("batch must be >= 1, got {}".format(batch))
    if optimizer is None:
        optimizer = decoder_optimizer
    if loss_fn is None:
        loss_fn = criterion

    decoder.zero_grad()
    total_loss = 0.0
    n_examples = len(inp)
    for begin in range(0, n_examples, batch):
        chunk = inp[begin:begin + batch]
        # Input to model should be in form B x S where B is batch size and S is sequence size
        output = decoder(chunk.view(chunk.size(0), -1))
        batch_loss = loss_fn(output, target[begin:begin + batch])
        batch_loss.backward()
        total_loss += batch_loss.item()

    if n_examples:
        optimizer.step()

    return total_loss


In [None]:
def create_Encodings(X_grams,Y,word_to_ix):
    '''
    Encode n-gram lists and language names as index tensors.

    X_grams : [[ngram_in_ex1],[ngram_in_ex2],....]
    Y : ['lang1','lang1',...,'lang2',....]
    word_to_ix : {' of':1,'apf':2,....} This is vocabulary of selected top ngrams

    Returns (inp, target):
      inp    : padded long tensor with one row of n-gram indices per example
      target : long tensor with one label index per example, on "device"

    Only n-grams found in the vocabulary are kept. Examples without a single
    known n-gram are dropped together with their label. Label indices follow
    the alphabetical order of the language names, so two calls that see the
    same set of languages produce the same indices. Raises ValueError when no
    example contains a known n-gram.

    NOTE(review): rows are padded with 0 - if the vocabulary also uses index
    0 for an n-gram, padding cannot be told apart from it; confirm.
    '''
    x_grams = []
    y_grams = []
    gram_len = []
    report_every = 500
    print('Creating Encoding for X')
    for j, gramlist in enumerate(X_grams):
        # dictionary membership test instead of scanning a list of all keys
        grams = [word_to_ix[w] for w in gramlist if w in word_to_ix]

        if grams: # resulting grams list must not be empty
            x_grams.append(grams) # Add encodings to x_grams
            gram_len.append(len(grams)) # Number of grams per sentence, used for padding
            y_grams.append(Y[j]) # Add corresponding language to y_grams

        if j % report_every == 0:
            print("Iteration {} completed at time {}".format(j,time_since(start)))

    if not x_grams:
        raise ValueError("no example contains an n-gram of the vocabulary")

    gram_len = torch.LongTensor(gram_len)
    inp = padding(x_grams,gram_len) # input tensor with rows of equal length

    print('Creating Encoding for Y')
    # sorted, so the mapping does not depend on set iteration order
    labels_to_idx = {lang: i for i, lang in enumerate(sorted(set(y_grams)))}
    # created directly on the selected device, so this also runs without CUDA
    target = torch.tensor([labels_to_idx[lang] for lang in y_grams],
                          dtype=torch.long, device=device)
    return inp,target

    # inp  : tensor([[12,342,...,0],[56,2311,....],....])
    # target : tensor([41,127,234,16,....])


In [None]:
# GPU MODE AHEAD
# Data preparation for the RNN: clean the training data, take a fixed number
# of examples per language, split them into character n-grams, build the
# n-gram vocabulary and encode examples and labels as index tensors.
n_instances = 130 # examples taken per language
n_gram_size = 4 # characters per n-gram
n_features = 15 # n-grams picked per language
start = time.time()
print("Starting preprocessing at {} ..".format(time_since(start)))

# the last entry of each list is dropped - presumably the empty string left
# behind by the trailing newline of the file; TODO confirm
x_train, y_train = preprocess(X_train[:-1], Y_train[:-1])

print("Preprocessing Done at {}.".format(time_since(start)))

# keep n_instances examples per language to get a smaller data subset
x_train, y_train = get_data_chunk(x_train, y_train, n_instances)

print("Making ngrams at {} ..".format(time_since(start)))
x_train_grams = make_n_grams(x_train, y_train, n_gram_size)

print('Extracting features at {} ...'.format(time_since(start)))
# the feature vocabulary is built from the reduced training subset only
features = get_top_n_features(x_train_grams,y_train,n_features)
# vocabulary: n-gram -> position in the feature list
word_to_ix = {word: i for i, word in enumerate(features)}

print('Preparing encoding at {}'.format(time_since(start)))
inp, target = create_Encodings(x_train_grams,y_train,word_to_ix)
print('Encoding completed at {}'.format(time_since(start)))

In [None]:
# Build the GRU classifier and train it on the encoded training data.
print('Training Phase..')
hidden_size = 64
input_size = len(features) # Number of grams in vocabulary
# Number of languages. Label indices start at 0, so the largest index plus one
# is the number of classes. (A set built from a tensor would count examples,
# because every element is a distinct tensor object.)
output_size = int(target.max().item()) + 1
n_layers = 1 # Number of layers of RNN
embedding_size = 64
lr = 0.001
model = RNN(input_size,hidden_size,output_size,n_layers,embedding_size).to(device)
decoder_optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
start = time.time()
print_every = 500
all_losses = []
loss_avg = 0
n_epochs = 3000
# `inp` is the padded index tensor returned by create_Encodings and is used as
# it is. The former `create_batch(x_train,3)` call was dropped: no such
# function is defined in this notebook, and it would have replaced the
# encoded input that `target` is aligned with.
for epoch in range(1, n_epochs+1):
    loss = train(model,inp,target)
    all_losses.append(loss)
    if(epoch%print_every == 0):
      print('[{} ({} {}%) {:.4f}]'.format(time_since(start), epoch, epoch/n_epochs * 100, loss))

print("Training Complete.")

In [None]:
def predict(model, test_x, test_y):
    '''
    Print and return the accuracy of the model on the given examples.

    model  : Model
    test_x : tensor([[12,342,...,0],[56,2311,....],....]) Example encodings
    test_y : tensor([41,127,234,16,....]) Label Encodings

    Returns the accuracy in percent as a float (0.0 for empty input).

    The model is switched to evaluation mode for the forward pass, so that
    dropout is inactive, and gradients are not tracked. Its previous
    training/evaluation mode is restored afterwards.
    '''
    total = len(test_x)
    correct = 0
    if total:
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                out = model(test_x)  # out : B x L, where B is batch size and L is number of Labels/Classes
                out = out.argmax(dim=1)
        finally:
            model.train(was_training)
        correct = int(out.eq(test_y.view_as(out)).sum().item())

    print(correct)
    accuracy = correct / total * 100 if total else 0.0
    print("Accuracy is {}%".format(accuracy))
    return accuracy

In [None]:
# model testing
start = time.time()
print('Test Data preperation starting ...')
x,y = preprocess(X_test[:20000], Y_test[:20000])

print("Making ngrams at {} ..".format(time_since(start)))
x_test_grams = make_n_grams(x, y, n_gram_size)

print('Preparing encoding at {}'.format(time_since(start)))
inp_t, true = create_Encodings(x_test_grams,y,word_to_ix)
print('Encoding completed at {}'.format(time_since(start)))

predict(model,inp_t,true)