In [49]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score


Read Dataset

In [26]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so that
# files stored on Drive can be opened by path in the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load the tab-separated sentences file from the mounted Drive.
# The absolute path matches the '/content/gdrive' mount point, so the read no
# longer depends on the notebook's current working directory (the previous
# '../content/...' form only resolved from a directory directly under '/').
# The first column of the file becomes the index; the remaining columns are
# named 'lang' and 'text'.
data = pd.read_csv('/content/gdrive/MyDrive/sentences.csv', sep='\t',
                            encoding='utf8',
                            index_col=0,
                            names=['lang','text'])
data.head(50)

Filter Data by Text Length

In [28]:
# Keep only sentences whose length is between 20 and 200 characters (inclusive).
# The vectorised .str.len() yields NaN for missing or non-string cells, and
# between() treats NaN as False, so such rows are dropped instead of making a
# bare len() call raise TypeError.
len_cond = data['text'].str.len().between(20, 200)
data = data[len_cond]

Filter by Text Language

In [None]:
# Language codes of the six languages the classifier will distinguish
lang = [
    'deu', 'eng', 'fra',
    'ita', 'por', 'spa',
]

# Discard every sentence whose language code is not in the list above
data = data.loc[data['lang'].isin(lang)]
data.head(50)

Select at most 50,000 rows per language

In [30]:
# Draw up to 50,000 sentences per language.
# - n is capped at the number of rows available, so a language with fewer than
#   50,000 sentences contributes all of its rows instead of making sample()
#   raise ValueError.
# - random_state fixes the draw so the subset is the same on every run.
# - The per-language samples are collected in a list and concatenated once:
#   DataFrame.append was removed in pandas 2.0.
lang_samples = []
for l in lang:
    lang_rows = data[data['lang'] == l]
    lang_trim = lang_rows.sample(n=min(50000, len(lang_rows)), random_state=100)
    lang_samples.append(lang_trim)

data_trim = pd.concat(lang_samples)


Split Data into Training, Testing and Validation sets

In [31]:
# Shuffle all rows. The fixed seed makes the train/validation/test split (and
# therefore the vocabulary and the reported accuracy) reproducible across runs.
data_shuffle = data_trim.sample(frac=1, random_state=100)

# 70% / 20% / 10% positional split. Integer arithmetic avoids float rounding
# and gives exactly 210,000 / 60,000 / 30,000 rows for a 300,000-row frame,
# while still producing a sensible split if fewer rows are available.
n_rows = len(data_shuffle)
train_end = n_rows * 7 // 10
valid_end = n_rows * 9 // 10

train = data_shuffle.iloc[:train_end]
valid = data_shuffle.iloc[train_end:valid_end]
test = data_shuffle.iloc[valid_end:]

Feature Engineering

Function to get the 200 most common character trigrams from a corpus (it is called once for each of the 6 languages)

In [32]:
def get_trigrams(corpus,n_feat=200):
    """
    Returns a list of the most frequent character trigrams in a corpus
    Params
    ---------
        corpus: iterable of text strings to count trigrams over
        n_feat: maximum number of trigrams to keep (CountVectorizer max_features)
    """

    #fit the n-gram model; only the learned vocabulary is needed, so the
    #count matrix is not kept
    vectorizer = CountVectorizer(analyzer='char',ngram_range=(3, 3) ,max_features=n_feat)
    vectorizer.fit(corpus)

    #Get model feature names. get_feature_names() was removed in
    #scikit-learn 1.2, so prefer its replacement and fall back on old versions.
    #The result is converted to a list to keep the original return type.
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()

Use the get_trigrams function to obtain the 200 most common trigrams from each language and add them to a set

In [33]:
# features: language code -> list of that language's most frequent trigrams
# features_set: union of those trigrams across all languages
features = {}
features_set = set()

for l in lang:

    #get corpus filtered by language (training rows only)
    corpus = train[train.lang==l]['text']

    #get 200 most frequent trigrams
    trigrams = get_trigrams(corpus)

    #add to dict and set
    features[l] = trigrams
    features_set.update(trigrams)


#create vocabulary (trigram -> column index) using the feature set.
#Iterating a set of strings has a hash-seed-dependent order, so the trigrams
#are sorted first to give every run the same trigram-to-column mapping.
vocab = {trigram: idx for idx, trigram in enumerate(sorted(features_set))}

Vectorize Sentences in the training dataset

In [34]:
#count vectoriser restricted to the fixed trigram vocabulary
vectorizer = CountVectorizer(analyzer='char',
                             ngram_range=(3, 3),
                            vocabulary=vocab)

#create feature matrix for training set: one row per sentence,
#one column per vocabulary trigram, values are occurrence counts
corpus = train['text']
X = vectorizer.fit_transform(corpus)

#Column labels for the feature matrix. get_feature_names() was removed in
#scikit-learn 1.2, so prefer its replacement and fall back on old versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

train_feat = pd.DataFrame(data=X.toarray(),columns=feature_names)

#We end up with a feature matrix

Use min-max scaling to scale our feature matrix

In [35]:
# Per-trigram extremes, measured on the training features only; the same
# values are reused later to scale the validation and test features.
train_max = train_feat.max()
train_min = train_feat.min()

# Min-max scaling: subtract the column minimum, then divide by the column range
train_feat = train_feat.sub(train_min).div(train_max - train_min)

# Attach the target variable; a plain list is used so the labels are assigned
# by position rather than aligned on the index.
train_feat['lang'] = train['lang'].tolist()

Vectorize and scale the validation and test sets using the training vocabulary and the training min/max

In [36]:
def _vectorize_split(split):
    """
    Returns the scaled trigram feature matrix for a held-out split
    Params
    ---------
        split: DataFrame with 'text' and 'lang' columns
    """
    #transform() only counts the fixed-vocabulary trigrams; held-out data is
    #never used to fit anything
    counts = vectorizer.transform(split['text'])
    feat = pd.DataFrame(data=counts.toarray(),columns=feature_names)

    #scale with the training min/max so all splits share one scale
    feat = (feat - train_min)/(train_max-train_min)

    #attach the target variable by position
    feat['lang'] = list(split['lang'])
    return feat


#create feature matrix for validation set
valid_feat = _vectorize_split(valid)

#create feature matrix for test set
test_feat = _vectorize_split(test)

Encode the language labels in readiness for training: label-encode them to integers, then one-hot encode the integers

In [38]:
#Fit encoder on the same language list that was used to filter the data
encoder = LabelEncoder()
encoder.fit(lang)

def encode(y):
    """
    Returns a 2-D array of one hot encodings, one row per label and
    one column per language known to the encoder
    Params
    ---------
        y: list of language labels
    """

    #language label -> integer class index
    y_encoded = encoder.transform(y)

    #Fix the width to the full class count: without num_classes,
    #to_categorical sizes the output from the largest index present, so a
    #batch that lacks the last language would come back with too few columns.
    y_dummy = np_utils.to_categorical(y_encoded, num_classes=len(encoder.classes_))

    return y_dummy

Train the DNN. It has 3 hidden layers with 500, 500 and 250 nodes respectively, and an output layer with 6 nodes (one per language). ReLU is used to activate the hidden layers, whereas the output layer is activated using softmax.

In [43]:
#Get training data
x = train_feat.drop('lang',axis=1)
y = encode(train_feat['lang'])

#Get validation data, used only to monitor loss/accuracy after each epoch
x_valid = valid_feat.drop('lang',axis=1)
y_valid = encode(valid_feat['lang'])

#Define model: three ReLU hidden layers (500, 500, 250 nodes) and a softmax output.
#The input width is read from the feature matrix rather than hard-coded,
#because the size of the trigram vocabulary depends on the sampled data.
#The output width is read from the one-hot labels so the two always agree.
model = Sequential()
model.add(Dense(500, input_dim=x.shape[1], activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

#Train model
model.fit(x, y, epochs=4, batch_size=100, validation_data=(x_valid, y_valid))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fe57d679790>

Get Predictions and accuracy on test set

In [51]:
# Separate the scaled test features from their true language labels
y_test = test_feat['lang']
x_test = test_feat.drop(columns='lang')

# Softmax scores per sentence -> index of the highest-scoring class
# -> language code via the fitted label encoder
labels = model.predict(x_test)
classes_x = labels.argmax(axis=1)
predictions = encoder.inverse_transform(classes_x)

# Fraction of test sentences whose predicted language matches the true one
accuracy = accuracy_score(y_test, predictions)
print(accuracy)

0.9865666666666667
