## Test Example

In [51]:
import pandas as pd

#Read in full dataset
data = pd.read_csv('data/sentences.csv',
                            sep='\t', 
                            encoding='utf8', 
                            index_col=0,
                            names=['lang','text'])

#Filter by text length
len_cond = [True if 20<=len(s)<=200 else False for s in data['text']]
data = data[len_cond]

#Filter by text language
lang = ['deu', 'eng', 'fra', 'ita', 'por', 'spa']
data = data[data['lang'].isin(lang)]

#Select 50000 rows for each language
data_trim = pd.DataFrame(columns=['lang','text'])

for l in lang:
    lang_trim = data[data['lang'] ==l].sample(50000,random_state = 100)
    data_trim = data_trim.append(lang_trim)

#Create a random train, valid, test split
data_shuffle = data_trim.sample(frac=1)

train = data_shuffle[0:210000]
valid = data_shuffle[210000:270000]
test = data_shuffle[270000:300000]

  data_trim = data_trim.append(lang_trim)
  data_trim = data_trim.append(lang_trim)
  data_trim = data_trim.append(lang_trim)
  data_trim = data_trim.append(lang_trim)
  data_trim = data_trim.append(lang_trim)
  data_trim = data_trim.append(lang_trim)


In [52]:
from sklearn.feature_extraction.text import CountVectorizer

def get_trigrams(corpus,n_feat=200):
    """
    Returns a list of the N most common character trigrams from a list of sentences
    params
    ------------
        corpus: list of strings
        n_feat: integer
    """
    
    #fit the n-gram model
    vectorizer = CountVectorizer(analyzer='char',
                            ngram_range=(3, 3)
                            ,max_features=n_feat)
    
    X = vectorizer.fit_transform(corpus)
    
    #Get model feature names
    feature_names = vectorizer.get_feature_names_out()
    
    return feature_names

In [53]:
#obtain trigrams from each language
features = {}
features_set = set()

for l in lang:
    
    #get corpus filtered by language
    corpus = train[train.lang==l]['text']
    
    #get 200 most frequent trigrams
    trigrams = get_trigrams(corpus)
    
    #add to dict and set
    features[l] = trigrams 
    features_set.update(trigrams)

    
#create vocabulary list using feature set
vocab = dict()
for i,f in enumerate(features_set):
    vocab[f]=i

In [54]:
#train count vectoriser using vocabulary
vectorizer = CountVectorizer(analyzer='char',
                             ngram_range=(3, 3),
                            vocabulary=vocab)

#create feature matrix for training set
corpus = train['text']   
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()

train_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)

In [55]:
#Scale feature matrix 
train_min = train_feat.min()
train_max = train_feat.max()
train_feat = (train_feat - train_min)/(train_max-train_min)

#Add target variable 
train_feat['lang'] = list(train['lang'])

In [56]:
#create feature matrix for validation set
corpus = valid['text']   
X = vectorizer.fit_transform(corpus)

valid_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)
valid_feat = (valid_feat - train_min)/(train_max-train_min)
valid_feat['lang'] = list(valid['lang'])

#create feature matrix for test set
corpus = test['text']   
X = vectorizer.fit_transform(corpus)

test_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)
test_feat = (test_feat - train_min)/(train_max-train_min)
test_feat['lang'] = list(test['lang'])

In [59]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

#Fit encoder
encoder = LabelEncoder()
encoder.fit(['deu', 'eng', 'fra', 'ita', 'por', 'spa'])

def encode(y):
    """
    Returns a list of one hot encodings
    Params
    ---------
        y: list of language labels
    """
    
    y_encoded = encoder.transform(y)
    y_dummy = np_utils.to_categorical(y_encoded)
    
    return y_dummy

In [60]:
from keras.models import Sequential
from keras.layers import Dense


#Get training data
x = train_feat.drop('lang',axis=1)
y = encode(train_feat['lang'])

#Define model
model = Sequential()
model.add(Dense(500, input_dim=663, activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(6, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

#Train model
model.fit(x, y, epochs=4, batch_size=100)

(4075959, 2)
Epoch 1/4


ValueError: in user code:

    File "C:\Users\Derek\anaconda3\lib\site-packages\keras\engine\training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Derek\anaconda3\lib\site-packages\keras\engine\training.py", line 1268, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Derek\anaconda3\lib\site-packages\keras\engine\training.py", line 1249, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Derek\anaconda3\lib\site-packages\keras\engine\training.py", line 1050, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\Derek\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Derek\anaconda3\lib\site-packages\keras\engine\input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_8" is incompatible with the layer: expected shape=(None, 663), found shape=(100, 667)


In [15]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score,confusion_matrix

x_test = test_feat.drop('lang',axis=1)
y_test = test_feat['lang']

#Get predictions on test set
labels = model.predict_classes(x_test)
predictions = encoder.inverse_transform(labels)

#Accuracy on test set
accuracy = accuracy_score(y_test,predictions)
print(accuracy)

#Create confusion matrix
lang = ['deu', 'eng', 'fra', 'ita', 'por', 'spa']
conf_matrix = confusion_matrix(y_test,predictions)
conf_matrix_df = pd.DataFrame(conf_matrix,columns=lang,index=lang)

#Plot confusion matrix heatmap
plt.figure(figsize=(10, 10), facecolor='w', edgecolor='k')
sns.set(font_scale=1.5)
sns.heatmap(conf_matrix_df,cmap='coolwarm',annot=True,fmt='.5g',cbar=False)
plt.xlabel('Predicted',fontsize=22)
plt.ylabel('Actual',fontsize=22)

AttributeError: 'Sequential' object has no attribute 'predict_classes'

## Clean Dataset

In [80]:
import pandas as pd
import numpy as np

data_original = pd.read_csv('data/sentences.csv',sep='\t', encoding='utf8', header = None)
data_original.drop([0], axis=1,inplace=True)
data_original.columns = ['lang','sent']
data_original = data_original[data_original.sent.str.len()<=200]#Only select sentences less that 200 characters
data_original = data_original[data_original.sent.str.len()>=20]#Only select sentences greater that 20 characters
print(len(data_original)) #5788767 rows
print(data_original['lang'].nunique()) #314 unique languages
data_original.head()

9381771
401


Unnamed: 0,lang,sent
4,cmn,今天是６月１８号，也是Muiriel的生日！
20,cmn,选择什么是“对”或“错”是一项艰难的任务，我们却必须要完成它。
66,cmn,我们看东西不是看其实质，而是以我们的主观意识看它们的。
70,cmn,生活就是當你忙著進行你的計劃時總有其他的事情發生。
75,deu,Lass uns etwas versuchen!


In [81]:
#We want English, German, Russian, Spanish, French, Japanese, Portuguese, Italian
lang = ['eng','deu','spa','fra','por','ita']
data = data_original[data_original['lang'].isin(lang)]
print(len(data)) #2759972 rows
data.head()

4075959


Unnamed: 0,lang,sent
75,deu,Lass uns etwas versuchen!
76,deu,Ich muss schlafen gehen.
78,deu,Heute ist der 18. Juni und das ist der Geburts...
79,deu,"Herzlichen Glückwunsch zum Geburtstag, Muiriel!"
80,deu,Muiriel ist jetzt 20.


In [82]:
data['lang'].value_counts()

eng    1589318
ita     723534
deu     571659
fra     487634
por     366601
spa     337213
Name: lang, dtype: int64

In [83]:
#Select 50000 rows for each language
data_trim = pd.DataFrame(columns=['lang','sent'])
data_trim
for l in lang:
    data_trim = data_trim.append(data[data['lang'] ==l].sample(50000,random_state = 100))
data_trim['lang'].value_counts()

  data_trim = data_trim.append(data[data['lang'] ==l].sample(50000,random_state = 100))
  data_trim = data_trim.append(data[data['lang'] ==l].sample(50000,random_state = 100))
  data_trim = data_trim.append(data[data['lang'] ==l].sample(50000,random_state = 100))
  data_trim = data_trim.append(data[data['lang'] ==l].sample(50000,random_state = 100))
  data_trim = data_trim.append(data[data['lang'] ==l].sample(50000,random_state = 100))
  data_trim = data_trim.append(data[data['lang'] ==l].sample(50000,random_state = 100))


eng    50000
deu    50000
spa    50000
fra    50000
por    50000
ita    50000
Name: lang, dtype: int64

In [84]:
#Divide data into training, validation and testing
from sklearn.model_selection import train_test_split

X = data_trim['sent']
y = data_trim['lang']
X_train, X, y_train, y = train_test_split(X, y, test_size=0.30, random_state=101)
X_valid, X_test, y_valid, y_test = train_test_split(X, y, test_size=1/3, random_state=101)

In [85]:
#save datsets
train = pd.concat([pd.Series(y_train),pd.Series(X_train) ], axis=1)
valid = pd.concat([pd.Series(y_valid),pd.Series(X_valid) ], axis=1)
test = pd.concat([pd.Series(y_test),pd.Series(X_test) ], axis=1)
print(len(train), len(valid), len(test))

train.to_csv('data/train.csv')
valid.to_csv('data/valid.csv')
test.to_csv('data/test.csv')


210000 60000 30000


## Create Model Features

In [86]:
#Imports
import numpy as np
import pandas as pd
import string
from collections import Counter
import json

In [87]:
train = pd.read_csv("data/train.csv")
train.drop(['Unnamed: 0'], axis=1,inplace=True)
print(len(train))
train.head()

210000


Unnamed: 0,lang,sent
0,eng,I guess we have no choice.
1,fra,« Où est Tom ? » – « Il est à la maison. »
2,deu,Er spricht unsere Sprache nicht.
3,por,Eu conheço Tom melhor do que você conhece.
4,deu,Ein edler Mensch widmet sich dem Erreichen hoh...


In [88]:
#Remove punctuation and numbers
def clean_text(sent):
    "Takes in a string and returns it with no numbers or punctuation and normalized spaces"
    remove=string.punctuation + "1234567890" #Characters to be removed
    table=str.maketrans("","",remove)    
    sent = sent.translate(table)  
    sent = " ".join(sent.split()) #Normalize spaces
    return sent

sent = "Hello. #This function908 Removes  numbers12,    punctuation... and     normalizes spaces"
clean_text(sent)

'Hello This function Removes numbers punctuation and normalizes spaces'

In [89]:
def char_trigram(sent):
    "Takes a string and returns a list of character n-grams"
    return [sent[i:i+3] for i in range(len(sent)-3+1)]

sent = "This is a sentence."
char_trigram(sent)

['Thi',
 'his',
 'is ',
 's i',
 ' is',
 'is ',
 's a',
 ' a ',
 'a s',
 ' se',
 'sen',
 'ent',
 'nte',
 'ten',
 'enc',
 'nce',
 'ce.']

In [90]:
def trigram_list(sent_list):
    "Takes in a list of sentences and returns a list of trigrams "
    sent_clean = list(map(clean_text, sent_list))
    sent_trigram = list(map(char_trigram,sent_clean))
    list_trigram = [item for sublist in sent_trigram for item in sublist]
    return list_trigram

sent_list = train['sent'][0:10]
trigram_list(sent_list)

['I g',
 ' gu',
 'gue',
 'ues',
 'ess',
 'ss ',
 's w',
 ' we',
 'we ',
 'e h',
 ' ha',
 'hav',
 'ave',
 've ',
 'e n',
 ' no',
 'no ',
 'o c',
 ' ch',
 'cho',
 'hoi',
 'oic',
 'ice',
 '« O',
 ' Où',
 'Où ',
 'ù e',
 ' es',
 'est',
 'st ',
 't T',
 ' To',
 'Tom',
 'om ',
 'm »',
 ' » ',
 '» –',
 ' – ',
 '– «',
 ' « ',
 '« I',
 ' Il',
 'Il ',
 'l e',
 ' es',
 'est',
 'st ',
 't à',
 ' à ',
 'à l',
 ' la',
 'la ',
 'a m',
 ' ma',
 'mai',
 'ais',
 'iso',
 'son',
 'on ',
 'n »',
 'Er ',
 'r s',
 ' sp',
 'spr',
 'pri',
 'ric',
 'ich',
 'cht',
 'ht ',
 't u',
 ' un',
 'uns',
 'nse',
 'ser',
 'ere',
 're ',
 'e S',
 ' Sp',
 'Spr',
 'pra',
 'rac',
 'ach',
 'che',
 'he ',
 'e n',
 ' ni',
 'nic',
 'ich',
 'cht',
 'Eu ',
 'u c',
 ' co',
 'con',
 'onh',
 'nhe',
 'heç',
 'eço',
 'ço ',
 'o T',
 ' To',
 'Tom',
 'om ',
 'm m',
 ' me',
 'mel',
 'elh',
 'lho',
 'hor',
 'or ',
 'r d',
 ' do',
 'do ',
 'o q',
 ' qu',
 'que',
 'ue ',
 'e v',
 ' vo',
 'voc',
 'ocê',
 'cê ',
 'ê c',
 ' co',
 'con',
 'onh',


In [91]:
def most_frequent(n,trigrams):
    "Takes in a list of trigrams and returns the n most frequent trigrams"
    common = []
    for e in Counter(trigrams).most_common(n):
        common.append(e[0])
    return common

sent_list = train['sent'][0:1000]
trigrams = trigram_list(sent_list)
most_frequent(20, trigrams)

[' de',
 'er ',
 'as ',
 'en ',
 'es ',
 ' qu',
 'de ',
 'que',
 'te ',
 ' co',
 're ',
 ' a ',
 'est',
 'Tom',
 'om ',
 'ue ',
 'ch ',
 'ent',
 ' pa',
 'to ']

In [92]:
def lang_features(n_list,lang):
    """Returns a dictionary of the most frequent trigrams for a given language. Each element is a list of the n most
    frequent trigrams when n is a element of n_list"""
    
    train_lang = train[train['lang'] == lang]
    sent_list = train_lang['sent']
    trigrams = trigram_list(sent_list)
    
    freq = {}
    for n in n_list:
        freq[n] = most_frequent(int(n), trigrams)
    return freq 
    
lang_features(['20','30'],'eng')

{'20': [' th',
  'he ',
  ' to',
  'the',
  'to ',
  'om ',
  'Tom',
  'hat',
  'nt ',
  'ing',
  'ed ',
  'at ',
  'is ',
  'ng ',
  'tha',
  ' do',
  ' wa',
  ' yo',
  'you',
  'e t'],
 '30': [' th',
  'he ',
  ' to',
  'the',
  'to ',
  'om ',
  'Tom',
  'hat',
  'nt ',
  'ing',
  'ed ',
  'at ',
  'is ',
  'ng ',
  'tha',
  ' do',
  ' wa',
  ' yo',
  'you',
  'e t',
  're ',
  ' a ',
  ' an',
  ' ha',
  ' he',
  'as ',
  't t',
  'er ',
  'd t',
  'nd ']}

In [93]:
#Create a dictionary of the features for all the languages 
lang = ['eng','deu','spa','fra','por','ita']
n_list = ['50','100','200']
lang_trigrams = {}
for l in lang:
    lang_trigrams[l] = lang_features(n_list,l)
    print(l)
lang_trigrams

eng
deu
spa
fra
por
ita


{'eng': {'50': [' th',
   'he ',
   ' to',
   'the',
   'to ',
   'om ',
   'Tom',
   'hat',
   'nt ',
   'ing',
   'ed ',
   'at ',
   'is ',
   'ng ',
   'tha',
   ' do',
   ' wa',
   ' yo',
   'you',
   'e t',
   're ',
   ' a ',
   ' an',
   ' ha',
   ' he',
   'as ',
   't t',
   'er ',
   'd t',
   'nd ',
   ' is',
   ' in',
   'ry ',
   'her',
   'ou ',
   ' be',
   'and',
   've ',
   'in ',
   'thi',
   ' of',
   'll ',
   'ary',
   'e a',
   'ere',
   'Mar',
   's a',
   'was',
   'e w',
   ' To'],
  '100': [' th',
   'he ',
   ' to',
   'the',
   'to ',
   'om ',
   'Tom',
   'hat',
   'nt ',
   'ing',
   'ed ',
   'at ',
   'is ',
   'ng ',
   'tha',
   ' do',
   ' wa',
   ' yo',
   'you',
   'e t',
   're ',
   ' a ',
   ' an',
   ' ha',
   ' he',
   'as ',
   't t',
   'er ',
   'd t',
   'nd ',
   ' is',
   ' in',
   'ry ',
   'her',
   'ou ',
   ' be',
   'and',
   've ',
   'in ',
   'thi',
   ' of',
   'll ',
   'ary',
   'e a',
   'ere',
   'Mar',
   's a',
   'was',

In [94]:
#From the lang_trigrams select list of unique trigrams i.e. final feature list
features = {} #final feature list
for n in n_list:
    n_trigrams = []
    for l in lang:
            n_trigrams = n_trigrams + lang_trigrams[l][n]
    features[n] = sorted(list(set(n_trigrams)))

with open('data/features.json', 'w') as outfile:
    json.dump(features, outfile)

In [95]:
with open('data/features.json', encoding='utf-8') as data_file:
    features = json.loads(data_file.read())
print(features['50'])

[' To', ' a ', ' an', ' au', ' be', ' ca', ' ch', ' co', ' da', ' de', ' di', ' do', ' ei', ' el', ' en', ' es', ' ge', ' ha', ' he', ' in', ' is', ' la', ' le', ' lo', ' ma', ' me', ' mi', ' ne', ' ni', ' no', ' nã', ' o ', ' of', ' pa', ' pe', ' po', ' qu', ' se', ' si', ' so', ' st', ' te', ' th', ' to', ' um', ' un', ' vo', ' wa', ' wi', ' yo', ' zu', ' à ', ' è ', ' é ', 'Eu ', 'Ich', 'Je ', 'Mar', 'Non', 'Tom', 'a c', 'a d', 'a e', 'a s', 'ach', 'ado', 'ais', 'ait', 'and', 'ar ', 'ara', 'are', 'ary', 'as ', 'at ', 'ato', 'ch ', 'che', 'cht', 'com', 'con', 'cos', 'cê ', 'd t', 'da ', 'das', 'de ', 'den', 'der', 'di ', 'die', 'do ', 'e a', 'e c', 'e d', 'e e', 'e i', 'e l', 'e n', 'e p', 'e s', 'e t', 'e w', 'ed ', 'ein', 'el ', 'em ', 'en ', 'ent', 'er ', 'ere', 'es ', 'est', 'eu ', 'gen', 'hat', 'he ', 'hen', 'her', 'ht ', 'i s', 'ia ', 'ich', 'ie ', 'ien', 'in ', 'ine', 'ing', 'is ', 'ist', 'it ', 'la ', 'le ', 'les', 'll ', 'lle', 'n d', 'na ', 'nd ', 'ne ', 'nen', 'ng ', 'nic'

In [96]:
def vectorize(sent,feature_list):
    "Takes in a list trigram (sentence in trigram form) and returns a feature in vector form, given a list of features"
    vector = [sent.count(f) for f in feature_list]
    return(vector)
print(vectorize([' Ma', ' To', ' a ', ' an', ' au', " Ma", "e t",'ccc','ach','st ','ach'],features['50']))
print(vectorize([' Ma', ' To', ' a ', ' an', ' au', " Ma", "e t",'ccc','ach','st ','ach'],features['100']))
print(vectorize(["aaa", "bbb",'eee','aaa','ccc','aaa'],features['50']))

[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[2, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [97]:
def create_features(name,number):
    "Creates the apporiate feature matrix"
    mat = pd.read_csv("data/{}.csv".format(name))
    mat.drop(['Unnamed: 0'], axis=1,inplace=True)
    
    sent_list = mat['sent']
    sent_clean = list(map(clean_text, sent_list))
    sent_trigram = list(map(char_trigram,sent_clean))
    lang = mat['lang']
    
    vectors = [vectorize(s,features[number]) for s in sent_trigram]
    df = pd.DataFrame(vectors, columns=features[number])
    
    df['lang'] = lang
    df.to_csv('data/features/{}_{}.csv'.format(name,number))
    return df

In [99]:
#Create feature vector matrix size 50
create_features('train','50')
create_features('valid','50')
create_features('test','50')

Unnamed: 0,To,a,an,au,be,ca,ch,co,da,de,...,to,ue,un,und,us,ve,was,you,ão,lang
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,eng
1,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,spa
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,eng
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,eng
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,ita
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,spa
29996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,eng
29997,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,eng
29998,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,fra


In [100]:
#size 100
create_features('train','100')
create_features('valid','100')
create_features('test','100')

#size 200
create_features('train','200')
create_features('valid','200')
create_features('test','200')

Unnamed: 0,Ge,I,Ma,Sc,Si,To,a,ac,al,an,...,y w,you,zio,zu,ás,ão,ère,ía,ón,lang
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,eng
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spa
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,eng
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,eng
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ita
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,spa
29996,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,eng
29997,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,eng
29998,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,fra


## Create Model Features

In [101]:
#Imports
import numpy as np
import pandas as pd
import string
from collections import Counter
import json

In [102]:
train = pd.read_csv("data/train.csv")
train.drop(['Unnamed: 0'], axis=1,inplace=True)
print(len(train))
train.head()

210000


Unnamed: 0,lang,sent
0,eng,I guess we have no choice.
1,fra,« Où est Tom ? » – « Il est à la maison. »
2,deu,Er spricht unsere Sprache nicht.
3,por,Eu conheço Tom melhor do que você conhece.
4,deu,Ein edler Mensch widmet sich dem Erreichen hoh...


In [103]:
#Remove punctuation and numbers
def clean_text(sent):
    "Takes in a string and returns it with no numbers or punctuation and normalized spaces"
    remove=string.punctuation + "1234567890" #Characters to be removed
    table=str.maketrans("","",remove)    
    sent = sent.translate(table)  
    sent = " ".join(sent.split()) #Normalize spaces
    return sent

sent = "Hello. #This function908 Removes  numbers12,    punctuation... and     normalizes spaces"
clean_text(sent)

'Hello This function Removes numbers punctuation and normalizes spaces'

In [104]:
def char_trigram(sent):
    "Takes a string and returns a list of character n-grams"
    return [sent[i:i+3] for i in range(len(sent)-3+1)]

sent = "This is a sentence."
char_trigram(sent)

['Thi',
 'his',
 'is ',
 's i',
 ' is',
 'is ',
 's a',
 ' a ',
 'a s',
 ' se',
 'sen',
 'ent',
 'nte',
 'ten',
 'enc',
 'nce',
 'ce.']

In [105]:
def trigram_list(sent_list):
    "Takes in a list of sentences and returns a list of trigrams "
    sent_clean = list(map(clean_text, sent_list))
    sent_trigram = list(map(char_trigram,sent_clean))
    list_trigram = [item for sublist in sent_trigram for item in sublist]
    return list_trigram

sent_list = train['sent'][0:10]
trigram_list(sent_list)

['I g',
 ' gu',
 'gue',
 'ues',
 'ess',
 'ss ',
 's w',
 ' we',
 'we ',
 'e h',
 ' ha',
 'hav',
 'ave',
 've ',
 'e n',
 ' no',
 'no ',
 'o c',
 ' ch',
 'cho',
 'hoi',
 'oic',
 'ice',
 '« O',
 ' Où',
 'Où ',
 'ù e',
 ' es',
 'est',
 'st ',
 't T',
 ' To',
 'Tom',
 'om ',
 'm »',
 ' » ',
 '» –',
 ' – ',
 '– «',
 ' « ',
 '« I',
 ' Il',
 'Il ',
 'l e',
 ' es',
 'est',
 'st ',
 't à',
 ' à ',
 'à l',
 ' la',
 'la ',
 'a m',
 ' ma',
 'mai',
 'ais',
 'iso',
 'son',
 'on ',
 'n »',
 'Er ',
 'r s',
 ' sp',
 'spr',
 'pri',
 'ric',
 'ich',
 'cht',
 'ht ',
 't u',
 ' un',
 'uns',
 'nse',
 'ser',
 'ere',
 're ',
 'e S',
 ' Sp',
 'Spr',
 'pra',
 'rac',
 'ach',
 'che',
 'he ',
 'e n',
 ' ni',
 'nic',
 'ich',
 'cht',
 'Eu ',
 'u c',
 ' co',
 'con',
 'onh',
 'nhe',
 'heç',
 'eço',
 'ço ',
 'o T',
 ' To',
 'Tom',
 'om ',
 'm m',
 ' me',
 'mel',
 'elh',
 'lho',
 'hor',
 'or ',
 'r d',
 ' do',
 'do ',
 'o q',
 ' qu',
 'que',
 'ue ',
 'e v',
 ' vo',
 'voc',
 'ocê',
 'cê ',
 'ê c',
 ' co',
 'con',
 'onh',


In [106]:
def most_frequent(n,trigrams):
    "Takes in a list of trigrams and returns the n most frequent trigrams"
    common = []
    for e in Counter(trigrams).most_common(n):
        common.append(e[0])
    return common

sent_list = train['sent'][0:1000]
trigrams = trigram_list(sent_list)
most_frequent(20, trigrams)

[' de',
 'er ',
 'as ',
 'en ',
 'es ',
 ' qu',
 'de ',
 'que',
 'te ',
 ' co',
 're ',
 ' a ',
 'est',
 'Tom',
 'om ',
 'ue ',
 'ch ',
 'ent',
 ' pa',
 'to ']

In [107]:
def lang_features(n_list,lang):
    """Returns a dictionary of the most frequent trigrams for a given language. Each element is a list of the n most
    frequent trigrams when n is a element of n_list"""
    
    train_lang = train[train['lang'] == lang]
    sent_list = train_lang['sent']
    trigrams = trigram_list(sent_list)
    
    freq = {}
    for n in n_list:
        freq[n] = most_frequent(int(n), trigrams)
    return freq 
    
lang_features(['20','30'],'eng')

{'20': [' th',
  'he ',
  ' to',
  'the',
  'to ',
  'om ',
  'Tom',
  'hat',
  'nt ',
  'ing',
  'ed ',
  'at ',
  'is ',
  'ng ',
  'tha',
  ' do',
  ' wa',
  ' yo',
  'you',
  'e t'],
 '30': [' th',
  'he ',
  ' to',
  'the',
  'to ',
  'om ',
  'Tom',
  'hat',
  'nt ',
  'ing',
  'ed ',
  'at ',
  'is ',
  'ng ',
  'tha',
  ' do',
  ' wa',
  ' yo',
  'you',
  'e t',
  're ',
  ' a ',
  ' an',
  ' ha',
  ' he',
  'as ',
  't t',
  'er ',
  'd t',
  'nd ']}

In [108]:
#Create a dictionary of the features for all the languages 
lang = ['eng','deu','spa','fra','por','ita']
n_list = ['50','100','200']
lang_trigrams = {}
for l in lang:
    lang_trigrams[l] = lang_features(n_list,l)
    print(l)
lang_trigrams

eng
deu
spa
fra
por
ita


{'eng': {'50': [' th',
   'he ',
   ' to',
   'the',
   'to ',
   'om ',
   'Tom',
   'hat',
   'nt ',
   'ing',
   'ed ',
   'at ',
   'is ',
   'ng ',
   'tha',
   ' do',
   ' wa',
   ' yo',
   'you',
   'e t',
   're ',
   ' a ',
   ' an',
   ' ha',
   ' he',
   'as ',
   't t',
   'er ',
   'd t',
   'nd ',
   ' is',
   ' in',
   'ry ',
   'her',
   'ou ',
   ' be',
   'and',
   've ',
   'in ',
   'thi',
   ' of',
   'll ',
   'ary',
   'e a',
   'ere',
   'Mar',
   's a',
   'was',
   'e w',
   ' To'],
  '100': [' th',
   'he ',
   ' to',
   'the',
   'to ',
   'om ',
   'Tom',
   'hat',
   'nt ',
   'ing',
   'ed ',
   'at ',
   'is ',
   'ng ',
   'tha',
   ' do',
   ' wa',
   ' yo',
   'you',
   'e t',
   're ',
   ' a ',
   ' an',
   ' ha',
   ' he',
   'as ',
   't t',
   'er ',
   'd t',
   'nd ',
   ' is',
   ' in',
   'ry ',
   'her',
   'ou ',
   ' be',
   'and',
   've ',
   'in ',
   'thi',
   ' of',
   'll ',
   'ary',
   'e a',
   'ere',
   'Mar',
   's a',
   'was',

In [109]:
#From the lang_trigrams select list of unique trigrams i.e. final feature list
features = {} #final feature list
for n in n_list:
    n_trigrams = []
    for l in lang:
            n_trigrams = n_trigrams + lang_trigrams[l][n]
    features[n] = sorted(list(set(n_trigrams)))

with open('data/features.json', 'w') as outfile:
    json.dump(features, outfile)

In [110]:
with open('data/features.json', encoding='utf-8') as data_file:
    features = json.loads(data_file.read())
print(features['50'])

[' To', ' a ', ' an', ' au', ' be', ' ca', ' ch', ' co', ' da', ' de', ' di', ' do', ' ei', ' el', ' en', ' es', ' ge', ' ha', ' he', ' in', ' is', ' la', ' le', ' lo', ' ma', ' me', ' mi', ' ne', ' ni', ' no', ' nã', ' o ', ' of', ' pa', ' pe', ' po', ' qu', ' se', ' si', ' so', ' st', ' te', ' th', ' to', ' um', ' un', ' vo', ' wa', ' wi', ' yo', ' zu', ' à ', ' è ', ' é ', 'Eu ', 'Ich', 'Je ', 'Mar', 'Non', 'Tom', 'a c', 'a d', 'a e', 'a s', 'ach', 'ado', 'ais', 'ait', 'and', 'ar ', 'ara', 'are', 'ary', 'as ', 'at ', 'ato', 'ch ', 'che', 'cht', 'com', 'con', 'cos', 'cê ', 'd t', 'da ', 'das', 'de ', 'den', 'der', 'di ', 'die', 'do ', 'e a', 'e c', 'e d', 'e e', 'e i', 'e l', 'e n', 'e p', 'e s', 'e t', 'e w', 'ed ', 'ein', 'el ', 'em ', 'en ', 'ent', 'er ', 'ere', 'es ', 'est', 'eu ', 'gen', 'hat', 'he ', 'hen', 'her', 'ht ', 'i s', 'ia ', 'ich', 'ie ', 'ien', 'in ', 'ine', 'ing', 'is ', 'ist', 'it ', 'la ', 'le ', 'les', 'll ', 'lle', 'n d', 'na ', 'nd ', 'ne ', 'nen', 'ng ', 'nic'

In [111]:
def vectorize(sent,feature_list):
    "Takes in a list trigram (sentence in trigram form) and returns a feature in vector form, given a list of features"
    vector = [sent.count(f) for f in feature_list]
    return(vector)
print(vectorize([' Ma', ' To', ' a ', ' an', ' au', " Ma", "e t",'ccc','ach','st ','ach'],features['50']))
print(vectorize([' Ma', ' To', ' a ', ' an', ' au', " Ma", "e t",'ccc','ach','st ','ach'],features['100']))
print(vectorize(["aaa", "bbb",'eee','aaa','ccc','aaa'],features['50']))

[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[2, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [112]:
def create_features(name,number):
    "Creates the apporiate feature matrix"
    mat = pd.read_csv("data/{}.csv".format(name))
    mat.drop(['Unnamed: 0'], axis=1,inplace=True)
    
    sent_list = mat['sent']
    sent_clean = list(map(clean_text, sent_list))
    sent_trigram = list(map(char_trigram,sent_clean))
    lang = mat['lang']
    
    vectors = [vectorize(s,features[number]) for s in sent_trigram]
    df = pd.DataFrame(vectors, columns=features[number])
    
    df['lang'] = lang
    df.to_csv('data/features/{}_{}.csv'.format(name,number))
    return df

In [113]:
#Create feature vector matrix size 50
create_features('train','50')
create_features('valid','50')
create_features('test','50')

Unnamed: 0,To,a,an,au,be,ca,ch,co,da,de,...,to,ue,un,und,us,ve,was,you,ão,lang
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,eng
1,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,spa
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,eng
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,eng
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,ita
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,spa
29996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,eng
29997,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,eng
29998,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,fra


In [114]:
#size 100
create_features('train','100')
create_features('valid','100')
create_features('test','100')

#size 200
create_features('train','200')
create_features('valid','200')
create_features('test','200')

Unnamed: 0,Ge,I,Ma,Sc,Si,To,a,ac,al,an,...,y w,you,zio,zu,ás,ão,ère,ía,ón,lang
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,eng
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spa
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,eng
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,eng
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ita
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,spa
29996,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,eng
29997,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,eng
29998,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,fra


## ANN Tuning

In [124]:
def prepare_data(df):
    "Rerformates data so it is appropriate for Tensorflow DNNC"
    x = df.drop(['lang'], axis=1)
    x.columns = ['trigram_'+str(col) for col in list(range(len(x.columns)))]
    y = df['lang']
    y = y.map({"eng": 0, "deu": 1, "spa": 2, "fra": 3, "por": 4, "ita": 5})
    return (x,y)

def get_data(feat_type):
    "Gets the training, valid and test data bases for a specific feature type"
    train = pd.read_csv("ANN_features/train_{}.csv".format(feat_type),index_col=0)
    valid = pd.read_csv("ANN_features/valid_{}.csv".format(feat_type),index_col=0)
    
    train_red = train[0:50000] #Reduce number of records for testing purposes 
    valid_red = valid[0:5000]
    (train_x,train_y) = prepare_data(train_red)
    (valid_x,valid_y) = prepare_data(valid_red)
    return (train_x,train_y), (valid_x,valid_y)

In [125]:
#Input functions 
def train_input_fn(features, labels, batch_size =100):
    """An input function for training"""
    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # Shuffle, repeat, and batch the examples.
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)
    # Return the dataset.
    return dataset

def eval_input_fn(features, labels, batch_size=100):
    """An input function for evaluation or prediction"""
    features=dict(features)
    if labels is None:
        # No labels, use only features.
        inputs = features
    else:
        inputs = (features, labels)

    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # Batch the examples
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)

    # Return the dataset.
    return dataset


 #TensorFlow (2016) An Example of a DNNClassifier for the Iris dataset. [Source code]. WWW.tensorflow.org

In [126]:
def fit_model(hidden):
    
    # Feature columns describe how to use the input.
    my_feature_columns = []
    for key in train_x.keys():
        my_feature_columns.append(tf.feature_column.numeric_column(key=key))

    "Fits a DNNC with the desired features and stores validation results "
    # Build a DNN.
    classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    # Two hidden layers of 196 nodes each.
    hidden_units=hidden,
    # 6 languages.
    n_classes=6)

    # Train the Model.
    classifier.train(
    input_fn=lambda:train_input_fn(train_x, train_y),
    steps=1000)

    predictions = list(classifier.predict(input_fn=lambda:eval_input_fn(valid_x,labels=None)))

    pred_y = []
    for p in predictions:
        pred_y.append(p['class_ids'][0])
        
    return pred_y

In [127]:
(train_x,train_y), (valid_x,valid_y) = get_data('50')
print(len(train_x),len(valid_x))
train_x.head()

FileNotFoundError: [Errno 2] No such file or directory: 'ANN_features/train_50.csv'

In [None]:
#Feature: 50 hidden layer: [152]
pred_y_50_1 = fit_model([152])
print(classification_report(valid_y,pred_y_50_1,digits=4))
print(confusion_matrix(valid_y,pred_y_50_1))

In [None]:
#Feature: 50 hidden layer: [101]
pred_y_50_2 = fit_model([101])
print(classification_report(valid_y,pred_y_50_2,digits=4))
print(confusion_matrix(valid_y,pred_y_50_2))

In [None]:
#Feature: 50 hidden layer: [51]
pred_y_50_3 = fit_model([51])
print(classification_report(valid_y,pred_y_50_3,digits=4))
print(confusion_matrix(valid_y,pred_y_50_3))

In [None]:
(train_x,train_y), (valid_x,valid_y) = get_data('100')
print(len(train_x),len(valid_x))
train_x.head()

In [None]:
#Feature: 100 hidden layer: [266]
pred_y_100_1 = fit_model([266])
print(classification_report(valid_y,pred_y_100_1,digits=4))
print(confusion_matrix(valid_y,pred_y_100_1))

In [None]:
#Feature: 100 hidden layer: [178]
pred_y_100_2 = fit_model([178])
print(classification_report(valid_y,pred_y_100_2,digits=4))
print(confusion_matrix(valid_y,pred_y_100_2))

In [119]:
#Feature: 100 hidden layer: [88]
pred_y_100_3 = fit_model([88])
print(classification_report(valid_y,pred_y_100_3,digits=4))
print(confusion_matrix(valid_y,pred_y_100_3))

NameError: name 'train_x' is not defined

In [120]:
(train_x,train_y), (valid_x,valid_y) = get_data('200')
print(len(train_x),len(valid_x))
train_x.head()


FileNotFoundError: [Errno 2] No such file or directory: 'ANN_features/train_200.csv'

In [121]:
#Feature: 200 hidden layer: [339]
pred_y_200_2 = fit_model([339])
print(classification_report(valid_y,pred_y_200_2,digits=4))
print(confusion_matrix(valid_y,pred_y_200_2))

NameError: name 'train_x' is not defined