## Libraries <a class="anchor" id="zero-bullet"></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Data set preprocessing

Read the data set <a class="anchor" id="read_data"></a>

In [3]:
df_eng = pd.read_csv('Eng.csv')
df_tr = pd.read_csv('Turkish.csv')

In [4]:
df_eng.describe()

Unnamed: 0,Words
count,21665
unique,21665
top,prow
freq,1


In [5]:
df_eng.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21666 entries, 0 to 21665
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Words   21665 non-null  object
dtypes: object(1)
memory usage: 169.4+ KB


In [6]:
df_tr.describe()

Unnamed: 0,Kelimeler
count,2235
unique,2235
top,ugunma
freq,1


In [7]:
df_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2235 entries, 0 to 2234
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Kelimeler  2235 non-null   object
dtypes: object(1)
memory usage: 17.6+ KB


### No Turkish characters

In [8]:
check_tr_chr = df_tr['Kelimeler'].str.contains('ç|ğ|ı|ö|ş|ü|', regex=False)
check_tr_chr[check_tr_chr].index

Int64Index([], dtype='int64')

## Feature Creation
### Since the data is provided as single words, context or grammar does not exist
### The position of letters and their sequence can be useful features
### First thing to do to is split the words into letters and record their postions

In [9]:
df_tr_split = df_tr['Kelimeler'].apply(lambda x: pd.Series(list(x)))
df_tr_split

Unnamed: 0,0,1,2,3,4,5
0,f,r,a,k,,
1,f,a,s,i,t,
2,f,e,c,i,r,
3,f,o,n,t,,
4,f,l,u,t,,
...,...,...,...,...,...,...
2230,k,a,l,i,c,
2231,k,o,r,v,e,t
2232,k,a,s,a,c,i
2233,k,o,c,m,a,


In [10]:
df_eng_split = df_eng['Words'].astype(str).apply(lambda x: pd.Series(list(x)))
df_eng_split

Unnamed: 0,0,1,2,3,4,5
0,f,l,y,i,n,g
1,f,i,l,l,e,t
2,f,i,a,n,c,e
3,f,a,i,l,e,d
4,f,a,n,o,n,s
...,...,...,...,...,...,...
21661,k,a,i,n,,
21662,k,v,u,t,z,a
21663,k,o,r,e,r,o
21664,k,e,e,n,l,y


### Add the language of the word to the dataframe

In [11]:
df_eng_split['lang'] = 0
df_tr_split['lang'] = 1

### Append the dataframes to create 1 dataframe before the creation of features so that they share the same features

In [12]:
df_split = df_eng_split.append(df_tr_split)

### Create numeric features with the help of dummy variables(one hot encoding)

In [13]:
df_dummy = pd.get_dummies(df_split, columns=[0,1,2,3,4,5])

### The provided datasets are unbalanced
### The cause of this unbalance is not stated
### Depending on the cause of this unbalance, the testing methods for the algorihms should be different
### There are 2 scenarios exploring 2 possible causes

## 1. If the real life distribution of languages is 50-50

### Separate the appended dataset into 2 with respect to language

In [14]:
df_eng_dummy = df_dummy[df_dummy['lang']==0]
df_tr_dummy = df_dummy[df_dummy['lang']==1]

### Since the distribution is 50-50 test data should be balanced
### 447 words from each language for test data

In [15]:
df_eng_dummy = df_eng_dummy.sample(frac=1,random_state=42).reset_index(drop=True)
df_tr_dummy = df_tr_dummy.sample(frac=1,random_state=42).reset_index(drop=True)

In [16]:
df_tr_train = df_tr_dummy[:1788]
df_tr_test = df_tr_dummy[1788:]

In [17]:
df_eng_train = df_eng_dummy[:21219]
df_eng_test = df_eng_dummy[21219:]

### Create train and test data sets

In [18]:
X_train = df_eng_train.append(df_tr_train)
del X_train['lang']
X_test = df_eng_test.append(df_tr_test)
del X_test['lang']
y_train = df_eng_train.append(df_tr_train)['lang']
y_test = df_eng_test.append(df_tr_test)['lang']

## Testing algorithms
### The problem can be solved as a binary classification problem

#### Classifiers:

* Logistic Regression
* Decision Tree
* XGBoost - Not in the sklearn library, XGBoost is a very popular classification algorithm that uses NNs. In my previous work, I have observed that for some cases it works much better than sklearn's algorithms.
* Linear Discriminant Analysis
* Quadratic Discriminant Analysis
* Random Forest
* K-Nearest Neighbors
* Naive Bayes
* Multi-Layer Perceptron
* AdaBoost

#### Scoring:

* precision score
* recall score
* F1 score
* support score
* accuracy score
* AUC/ROC

#### Logistic Regression

In [19]:
LR = LogisticRegression()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(LR, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
LR_fit_time = scores['fit_time'].mean()
LR_score_time = scores['score_time'].mean()
LR_accuracy = scores['test_accuracy'].mean()
LR_precision = scores['test_precision_macro'].mean()
LR_recall = scores['test_recall_macro'].mean()
LR_f1 = scores['test_f1_weighted'].mean()
LR_roc = scores['test_roc_auc'].mean()

#### Decision Tree

In [20]:
decision_tree = DecisionTreeClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(decision_tree, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
dtree_fit_time = scores['fit_time'].mean()
dtree_score_time = scores['score_time'].mean()
dtree_accuracy = scores['test_accuracy'].mean()
dtree_precision = scores['test_precision_macro'].mean()
dtree_recall = scores['test_recall_macro'].mean()
dtree_f1 = scores['test_f1_weighted'].mean()
dtree_roc = scores['test_roc_auc'].mean()

#### Support Vector Machine

In [21]:
SVM = SVC(probability = True)

scoring = ['accuracy','precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(SVM, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
SVM_fit_time = scores['fit_time'].mean()
SVM_score_time = scores['score_time'].mean()
SVM_accuracy = scores['test_accuracy'].mean()
SVM_precision = scores['test_precision_macro'].mean()
SVM_recall = scores['test_recall_macro'].mean()
SVM_f1 = scores['test_f1_weighted'].mean()
SVM_roc = scores['test_roc_auc'].mean()

#### XGBoost

In [22]:
xgb_model = xgb.XGBClassifier(max_depth=5, learning_rate=0.08, objective= 'binary:logistic',n_jobs=-1)

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(xgb_model, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
XG_fit_time = scores['fit_time'].mean()
XG_score_time = scores['score_time'].mean()
XG_accuracy = scores['test_accuracy'].mean()
XG_precision = scores['test_precision_macro'].mean()
XG_recall = scores['test_recall_macro'].mean()
XG_f1 = scores['test_f1_weighted'].mean()
XG_roc = scores['test_roc_auc'].mean()

#### Linear Discriminant Analysis

In [23]:
LDA = LinearDiscriminantAnalysis()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(LDA, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
LDA_fit_time = scores['fit_time'].mean()
LDA_score_time = scores['score_time'].mean()
LDA_accuracy = scores['test_accuracy'].mean()
LDA_precision = scores['test_precision_macro'].mean()
LDA_recall = scores['test_recall_macro'].mean()
LDA_f1 = scores['test_f1_weighted'].mean()
LDA_roc = scores['test_roc_auc'].mean()

#### Quadratic Discriminant Analysis

In [24]:
QDA = QuadraticDiscriminantAnalysis()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(QDA, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
QDA_fit_time = scores['fit_time'].mean()
QDA_score_time = scores['score_time'].mean()
QDA_accuracy = scores['test_accuracy'].mean()
QDA_precision = scores['test_precision_macro'].mean()
QDA_recall = scores['test_recall_macro'].mean()
QDA_f1 = scores['test_f1_weighted'].mean()
QDA_roc = scores['test_roc_auc'].mean()

#### Random Forest Classifier

In [25]:
random_forest = RandomForestClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(random_forest, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
forest_fit_time = scores['fit_time'].mean()
forest_score_time = scores['score_time'].mean()
forest_accuracy = scores['test_accuracy'].mean()
forest_precision = scores['test_precision_macro'].mean()
forest_recall = scores['test_recall_macro'].mean()
forest_f1 = scores['test_f1_weighted'].mean()
forest_roc = scores['test_roc_auc'].mean()

#### K-Nearest Neighbors

In [26]:
KNN = KNeighborsClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(KNN, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
KNN_fit_time = scores['fit_time'].mean()
KNN_score_time = scores['score_time'].mean()
KNN_accuracy = scores['test_accuracy'].mean()
KNN_precision = scores['test_precision_macro'].mean()
KNN_recall = scores['test_recall_macro'].mean()
KNN_f1 = scores['test_f1_weighted'].mean()
KNN_roc = scores['test_roc_auc'].mean()

#### Naive Bayes

In [27]:
bayes = GaussianNB()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(bayes, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
bayes_fit_time = scores['fit_time'].mean()
bayes_score_time = scores['score_time'].mean()
bayes_accuracy = scores['test_accuracy'].mean()
bayes_precision = scores['test_precision_macro'].mean()
bayes_recall = scores['test_recall_macro'].mean()
bayes_f1 = scores['test_f1_weighted'].mean()
bayes_roc = scores['test_roc_auc'].mean()

#### Multi-Layer Perceptron

In [28]:
MLP = MLPClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(MLP, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
MLP_fit_time = scores['fit_time'].mean()
MLP_score_time = scores['score_time'].mean()
MLP_accuracy = scores['test_accuracy'].mean()
MLP_precision = scores['test_precision_macro'].mean()
MLP_recall = scores['test_recall_macro'].mean()
MLP_f1 = scores['test_f1_weighted'].mean()
MLP_roc = scores['test_roc_auc'].mean()

#### AdaBoost

In [29]:
Ada = AdaBoostClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(Ada, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
Ada_fit_time = scores['fit_time'].mean()
Ada_score_time = scores['score_time'].mean()
Ada_accuracy = scores['test_accuracy'].mean()
Ada_precision = scores['test_precision_macro'].mean()
Ada_recall = scores['test_recall_macro'].mean()
Ada_f1 = scores['test_f1_weighted'].mean()
Ada_roc = scores['test_roc_auc'].mean()

### Comparison <a class="anchor" id="sum_1"></a>

In [30]:
models_initial  = pd.DataFrame({
    'Model'       : ['Logistic Regression', 'Decision Tree', 'Support Vector Machine', 'XGBoost', 'Linear Discriminant Analysis', 'Quadratic Discriminant Analysis', 'Random Forest', 'K-Nearest Neighbors', 'Bayes', 'MLP', 'Ada'],
    'Fitting time': [LR_fit_time, dtree_fit_time, SVM_fit_time, XG_fit_time, LDA_fit_time, QDA_fit_time, forest_fit_time, KNN_fit_time, bayes_fit_time, MLP_fit_time, Ada_fit_time],
    'Scoring time': [LR_score_time, dtree_score_time, SVM_score_time, XG_score_time, LDA_score_time, QDA_score_time, forest_score_time, KNN_score_time, bayes_score_time, MLP_score_time, Ada_score_time],
    'Accuracy'    : [LR_accuracy, dtree_accuracy, SVM_accuracy, XG_accuracy, LDA_accuracy, QDA_accuracy, forest_accuracy, KNN_accuracy, bayes_accuracy, MLP_accuracy, Ada_accuracy],
    'Precision'   : [LR_precision, dtree_precision, SVM_precision, XG_precision, LDA_precision, QDA_precision, forest_precision, KNN_precision, bayes_precision, MLP_precision, Ada_precision],
    'Recall'      : [LR_recall, dtree_recall, XG_recall, SVM_recall, LDA_recall, QDA_recall, forest_recall, KNN_recall, bayes_recall, MLP_recall, Ada_recall],
    'F1_score'    : [LR_f1, dtree_f1, SVM_f1, XG_f1, LDA_f1, QDA_f1, forest_f1, KNN_f1, bayes_f1, MLP_f1, Ada_f1],
    'AUC_ROC'     : [LR_roc, dtree_roc, SVM_roc, XG_roc, LDA_roc, QDA_roc, forest_roc, KNN_roc, bayes_roc, MLP_roc, Ada_roc],
    }, columns = ['Model', 'Fitting time', 'Scoring time', 'Accuracy', 'Precision', 'Recall', 'F1_score', 'AUC_ROC',])

models_initial.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Fitting time,Scoring time,Accuracy,Precision,Recall,F1_score,AUC_ROC
2,Support Vector Machine,233.567928,2.953619,0.933368,0.907694,0.548088,0.911422,0.868292
6,Random Forest,3.554085,0.268532,0.931803,0.810018,0.609895,0.915721,0.851397
3,XGBoost,8.489035,0.060797,0.928326,0.877371,0.581288,0.901316,0.841395
0,Logistic Regression,0.787187,0.032039,0.926718,0.783522,0.564369,0.904104,0.835005
7,K-Nearest Neighbors,0.824703,59.64528,0.925588,0.814793,0.534315,0.896449,0.690418
10,Ada,1.318253,0.16091,0.925023,0.754702,0.560132,0.902226,0.821558
9,MLP,51.725407,0.05051,0.923198,0.729672,0.681041,0.918722,0.868794
4,Linear Discriminant Analysis,0.919291,0.032215,0.919459,0.706886,0.627276,0.91004,0.816762
1,Decision Tree,0.454136,0.016586,0.905029,0.659407,0.643267,0.902405,0.644344
8,Bayes,0.047549,0.031118,0.267657,0.544488,0.596059,0.329306,0.610336


### SVM has the highest accuracy and precision but its recall is one of the lowest. It is also by far the slowest algorithm to train.
### Random Forest has high recall but its precison is low
### Depending on the importance of precision and recall either algorithm can be chosen
### XGBoost can be a good alternative as it is a midway point between precision and recall
### F1 and AUC_ROC scores are very comperable between top performing algorithms. They will not be much use when deciding on which algorithm to choose.

## Voting classifier
### Every algorithm has strong and weak parts.
### Ensembling has the potential create a stronger model using the strong parts of many algorithms.

Documentation: If ‘hard’, uses predicted class labels for majority rule voting. Else if ‘soft’, predicts the class label based on the argmax of the sums of the predicted probabilities, which is recommended for an ensemble of well-calibrated classifiers.

In [31]:
models = [LogisticRegression(),
         DecisionTreeClassifier(),
         SVM,
         xgb_model,
         LinearDiscriminantAnalysis(),
         QuadraticDiscriminantAnalysis(),
         RandomForestClassifier(),
         KNeighborsClassifier(),
         GaussianNB(),
         MLPClassifier(),
         AdaBoostClassifier()]

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']

In [32]:
for model in models:
    scores = cross_validate(model, X_train, y_train, scoring=scoring, cv=5)
    #print(model, scores['fit_time'].mean(), scores['score_time'].mean(), scores['test_accuracy'].mean(),
          #scores['test_precision_macro'].mean(), scores['test_recall_macro'].mean(), 
          #scores['test_f1_weighted'].mean(), scores['test_roc_auc'].mean())

### Hard <a class="anchor" id="hard"></a>

In [33]:
models_ens = list(zip(['LR', 'DT', 'SVM', 'XGB', 'LDA', 'QDA', 'RF', 'KNN', 'NB', 'MLP', 'Ada'], models))

model_ens = VotingClassifier(estimators = models_ens, voting = 'hard')
model_ens.fit(X_train, y_train)
pred = model_ens.predict(X_test)
#prob = model_ens.predict_proba(X_test)[:,1]

acc_hard = accuracy_score(y_test, pred)
prec_hard = precision_score(y_test, pred)
recall_hard = recall_score(y_test, pred)
f1_hard = f1_score(y_test, pred)
roc_auc_hard = 'not applicable'

### Soft <a class="anchor" id="soft"></a>

In [34]:
model_ens = VotingClassifier(estimators = models_ens, voting = 'soft')
model_ens.fit(X_train, y_train)
pred = model_ens.predict(X_test)
prob = model_ens.predict_proba(X_test)[:,1]

acc_soft = accuracy_score(y_test, pred)
prec_soft = precision_score(y_test, pred)
recall_soft = recall_score(y_test, pred)
f1_soft = f1_score(y_test, pred)
roc_auc_soft = roc_auc_score(y_test, prob)

### Comparison <a class="anchor" id="sum_3"></a>

In [35]:
models_ensembling = pd.DataFrame({
    'Model'       : ['Ensembling_hard', 'Ensembling_soft'],
    'Accuracy'    : [acc_hard, acc_soft],
    'Precision'   : [prec_hard, prec_soft],
    'Recall'      : [recall_hard, recall_soft],
    'F1_score'    : [f1_hard, f1_soft],
    'AUC_ROC'     : [roc_auc_hard, roc_auc_soft],
    }, columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1_score', 'AUC_ROC'])

models_ensembling.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1_score,AUC_ROC
1,Ensembling_soft,0.643177,0.950704,0.302013,0.458404,0.855107
0,Ensembling_hard,0.604027,0.989474,0.210291,0.346863,not applicable


### The accuracy and recall of the ensembles are really low, but they are very precise. If precision is really really important they can be used but for most cases they will not be useful.

## 2. If the real life distribution of languages is the same as the provided data

In [54]:
X = df_dummy.copy()
del X['lang']
y = df_dummy['lang']

### The distribution of test data should be the same as the provided datasets. If English is much more common, correctly labeling it is more important.
### Stratification helps us create the correct distribution

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y, shuffle=True)

#### Logistic Regression

In [56]:
LR = LogisticRegression()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(LR, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
LR_fit_time = scores['fit_time'].mean()
LR_score_time = scores['score_time'].mean()
LR_accuracy = scores['test_accuracy'].mean()
LR_precision = scores['test_precision_macro'].mean()
LR_recall = scores['test_recall_macro'].mean()
LR_f1 = scores['test_f1_weighted'].mean()
LR_roc = scores['test_roc_auc'].mean()

#### Decision Tree

In [57]:
decision_tree = DecisionTreeClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(decision_tree, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
dtree_fit_time = scores['fit_time'].mean()
dtree_score_time = scores['score_time'].mean()
dtree_accuracy = scores['test_accuracy'].mean()
dtree_precision = scores['test_precision_macro'].mean()
dtree_recall = scores['test_recall_macro'].mean()
dtree_f1 = scores['test_f1_weighted'].mean()
dtree_roc = scores['test_roc_auc'].mean()

#### Support Vector Machine

In [58]:
SVM = SVC(probability = True)

scoring = ['accuracy','precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(SVM, X_train, y_train, scoring=scoring, cv=20)

sorted(scores.keys())
SVM_fit_time = scores['fit_time'].mean()
SVM_score_time = scores['score_time'].mean()
SVM_accuracy = scores['test_accuracy'].mean()
SVM_precision = scores['test_precision_macro'].mean()
SVM_recall = scores['test_recall_macro'].mean()
SVM_f1 = scores['test_f1_weighted'].mean()
SVM_roc = scores['test_roc_auc'].mean()

#### XGBoost

In [59]:
xgb_model = xgb.XGBClassifier(max_depth=5, learning_rate=0.08, objective= 'binary:logistic',n_jobs=-1)

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(xgb_model, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
XG_fit_time = scores['fit_time'].mean()
XG_score_time = scores['score_time'].mean()
XG_accuracy = scores['test_accuracy'].mean()
XG_precision = scores['test_precision_macro'].mean()
XG_recall = scores['test_recall_macro'].mean()
XG_f1 = scores['test_f1_weighted'].mean()
XG_roc = scores['test_roc_auc'].mean()

#### Linear Discriminant Analysis

In [60]:
LDA = LinearDiscriminantAnalysis()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(LDA, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
LDA_fit_time = scores['fit_time'].mean()
LDA_score_time = scores['score_time'].mean()
LDA_accuracy = scores['test_accuracy'].mean()
LDA_precision = scores['test_precision_macro'].mean()
LDA_recall = scores['test_recall_macro'].mean()
LDA_f1 = scores['test_f1_weighted'].mean()
LDA_roc = scores['test_roc_auc'].mean()

#### Quadratic Discriminant Analysis

In [61]:
QDA = QuadraticDiscriminantAnalysis()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(QDA, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
QDA_fit_time = scores['fit_time'].mean()
QDA_score_time = scores['score_time'].mean()
QDA_accuracy = scores['test_accuracy'].mean()
QDA_precision = scores['test_precision_macro'].mean()
QDA_recall = scores['test_recall_macro'].mean()
QDA_f1 = scores['test_f1_weighted'].mean()
QDA_roc = scores['test_roc_auc'].mean()

#### Random Forest Classifier

In [62]:
random_forest = RandomForestClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(random_forest, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
forest_fit_time = scores['fit_time'].mean()
forest_score_time = scores['score_time'].mean()
forest_accuracy = scores['test_accuracy'].mean()
forest_precision = scores['test_precision_macro'].mean()
forest_recall = scores['test_recall_macro'].mean()
forest_f1 = scores['test_f1_weighted'].mean()
forest_roc = scores['test_roc_auc'].mean()

#### K-Nearest Neighbors

In [63]:
KNN = KNeighborsClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(KNN, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
KNN_fit_time = scores['fit_time'].mean()
KNN_score_time = scores['score_time'].mean()
KNN_accuracy = scores['test_accuracy'].mean()
KNN_precision = scores['test_precision_macro'].mean()
KNN_recall = scores['test_recall_macro'].mean()
KNN_f1 = scores['test_f1_weighted'].mean()
KNN_roc = scores['test_roc_auc'].mean()

#### Naive Bayes

In [64]:
bayes = GaussianNB()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(bayes, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
bayes_fit_time = scores['fit_time'].mean()
bayes_score_time = scores['score_time'].mean()
bayes_accuracy = scores['test_accuracy'].mean()
bayes_precision = scores['test_precision_macro'].mean()
bayes_recall = scores['test_recall_macro'].mean()
bayes_f1 = scores['test_f1_weighted'].mean()
bayes_roc = scores['test_roc_auc'].mean()

#### Multi-Layer Perceptron

In [65]:
MLP = MLPClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(MLP, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
MLP_fit_time = scores['fit_time'].mean()
MLP_score_time = scores['score_time'].mean()
MLP_accuracy = scores['test_accuracy'].mean()
MLP_precision = scores['test_precision_macro'].mean()
MLP_recall = scores['test_recall_macro'].mean()
MLP_f1 = scores['test_f1_weighted'].mean()
MLP_roc = scores['test_roc_auc'].mean()

#### AdaBoost

In [66]:
Ada = AdaBoostClassifier()

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']
scores = cross_validate(Ada, X_train, y_train, scoring=scoring, cv=5)

sorted(scores.keys())
Ada_fit_time = scores['fit_time'].mean()
Ada_score_time = scores['score_time'].mean()
Ada_accuracy = scores['test_accuracy'].mean()
Ada_precision = scores['test_precision_macro'].mean()
Ada_recall = scores['test_recall_macro'].mean()
Ada_f1 = scores['test_f1_weighted'].mean()
Ada_roc = scores['test_roc_auc'].mean()

### Comparison <a class="anchor" id="sum_2"></a>

In [67]:
models_initial  = pd.DataFrame({
    'Model'       : ['Logistic Regression', 'Decision Tree', 'Support Vector Machine', 'XGBoost', 'Linear Discriminant Analysis', 'Quadratic Discriminant Analysis', 'Random Forest', 'K-Nearest Neighbors', 'Bayes', 'MLP', 'Ada'],
    'Fitting time': [LR_fit_time, dtree_fit_time, SVM_fit_time, XG_fit_time, LDA_fit_time, QDA_fit_time, forest_fit_time, KNN_fit_time, bayes_fit_time, MLP_fit_time, Ada_fit_time],
    'Scoring time': [LR_score_time, dtree_score_time, SVM_score_time, XG_score_time, LDA_score_time, QDA_score_time, forest_score_time, KNN_score_time, bayes_score_time, MLP_score_time, Ada_score_time],
    'Accuracy'    : [LR_accuracy, dtree_accuracy, SVM_accuracy, XG_accuracy, LDA_accuracy, QDA_accuracy, forest_accuracy, KNN_accuracy, bayes_accuracy, MLP_accuracy, Ada_accuracy],
    'Precision'   : [LR_precision, dtree_precision, SVM_precision, XG_precision, LDA_precision, QDA_precision, forest_precision, KNN_precision, bayes_precision, MLP_precision, Ada_precision],
    'Recall'      : [LR_recall, dtree_recall, XG_recall, SVM_recall, LDA_recall, QDA_recall, forest_recall, KNN_recall, bayes_recall, MLP_recall, Ada_recall],
    'F1_score'    : [LR_f1, dtree_f1, SVM_f1, XG_f1, LDA_f1, QDA_f1, forest_f1, KNN_f1, bayes_f1, MLP_f1, Ada_f1],
    'AUC_ROC'     : [LR_roc, dtree_roc, SVM_roc, XG_roc, LDA_roc, QDA_roc, forest_roc, KNN_roc, bayes_roc, MLP_roc, Ada_roc],
    }, columns = ['Model', 'Fitting time', 'Scoring time', 'Accuracy', 'Precision', 'Recall', 'F1_score', 'AUC_ROC',])

models_initial.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Fitting time,Scoring time,Accuracy,Precision,Recall,F1_score,AUC_ROC
2,Support Vector Machine,140.257172,2.060714,0.92173,0.894857,0.559684,0.897866,0.866609
6,Random Forest,2.709547,0.200707,0.92,0.817677,0.616878,0.901735,0.849454
0,Logistic Regression,0.682664,0.037814,0.915537,0.802023,0.586587,0.892649,0.829942
3,XGBoost,6.610818,0.049063,0.915258,0.863695,0.594313,0.885749,0.843781
7,K-Nearest Neighbors,0.51307,32.405215,0.913417,0.778225,0.582481,0.890452,0.754896
9,MLP,39.706351,0.042994,0.912469,0.741434,0.694335,0.90769,0.864565
10,Ada,1.04192,0.1241,0.911409,0.76026,0.573348,0.887157,0.816656
4,Linear Discriminant Analysis,0.733972,0.033805,0.90834,0.723882,0.631322,0.896482,0.813442
1,Decision Tree,0.352374,0.014924,0.887978,0.664211,0.653262,0.886048,0.653976
8,Bayes,0.037983,0.025934,0.278438,0.551693,0.593176,0.327992,0.605396


### Very Similar to the first scenario
### SVM has the highest accuracy and precision but its recall is the lowest. It is also by far the slowest algorithm to train.
### Random Forest has high recall but its precison is low
### Depending on the importance of precision and recall either algorithm can be chosen
### XGBoost can be a good alternative as it is a midway point between precision and recall
### F1 and AUC_ROC scores are very comperable between top performing algorithms. They will not be much use when deciding on which algorithm to choose.

## Voting classifier <a class="anchor" id="voting"></a>


Documentation: If ‘hard’, uses predicted class labels for majority rule voting. Else if ‘soft’, predicts the class label based on the argmax of the sums of the predicted probabilities, which is recommended for an ensemble of well-calibrated classifiers.

In [68]:
models = [LogisticRegression(),
         DecisionTreeClassifier(),
         SVM,
         xgb_model,
         LinearDiscriminantAnalysis(),
         QuadraticDiscriminantAnalysis(),
         RandomForestClassifier(),
         KNeighborsClassifier(),
         GaussianNB(),
         MLPClassifier(),
         AdaBoostClassifier()]

scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']

In [69]:
for model in models:
    scores = cross_validate(model, X_train, y_train, scoring=scoring, cv=5)
    #print(model, scores['fit_time'].mean(), scores['score_time'].mean(), scores['test_accuracy'].mean(),
          #scores['test_precision_macro'].mean(), scores['test_recall_macro'].mean(), 
          #scores['test_f1_weighted'].mean(), scores['test_roc_auc'].mean())

### Hard <a class="anchor" id="hard"></a>

In [70]:
models_ens = list(zip(['LR', 'DT', 'SVM', 'XGB', 'LDA', 'QDA', 'RF', 'KNN', 'NB', 'MLP', 'Ada'], models))

model_ens = VotingClassifier(estimators = models_ens, voting = 'hard')
model_ens.fit(X_train, y_train)
pred = model_ens.predict(X_test)
#prob = model_ens.predict_proba(X_test)[:,1]

acc_hard = accuracy_score(y_test, pred)
prec_hard = precision_score(y_test, pred)
recall_hard = recall_score(y_test, pred)
f1_hard = f1_score(y_test, pred)
roc_auc_hard = 'not applicable'

### Soft <a class="anchor" id="soft"></a>

In [71]:
model_ens = VotingClassifier(estimators = models_ens, voting = 'soft')
model_ens.fit(X_train, y_train)
pred = model_ens.predict(X_test)
prob = model_ens.predict_proba(X_test)[:,1]

acc_soft = accuracy_score(y_test, pred)
prec_soft = precision_score(y_test, pred)
recall_soft = recall_score(y_test, pred)
f1_soft = f1_score(y_test, pred)
roc_auc_soft = roc_auc_score(y_test, prob)

### Comparison <a class="anchor" id="sum_3"></a>

In [72]:
models_ensembling = pd.DataFrame({
    'Model'       : ['Ensembling_hard', 'Ensembling_soft'],
    'Accuracy'    : [acc_hard, acc_soft],
    'Precision'   : [prec_hard, prec_soft],
    'Recall'      : [recall_hard, recall_soft],
    'F1_score'    : [f1_hard, f1_soft],
    'AUC_ROC'     : [roc_auc_hard, roc_auc_soft],
    }, columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1_score', 'AUC_ROC'])

models_ensembling.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1_score,AUC_ROC
1,Ensembling_soft,0.922021,0.648562,0.363148,0.465596,0.868849
0,Ensembling_hard,0.921017,0.760479,0.227191,0.349862,not applicable


### The ensembles will not be useful in this scenario.

## Future Work:
### Creating new feautures with n-gram model can improve the performance of the algorithms. As it can detect syllables and common letter patterns better.
### Recurrent Neural Networks are really good with sequence data. With sufficient data RNNs can provide better results.