<h1><center>Text Classification</center></h1>

In [1]:
%matplotlib inline

import time
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

import stanza

#from preprocess import * 
from custom_preprocessing import CustomPreProcessing
from custom_preprocessing import PreProcessing
from class_metric import Metrics

import sklearn
from sklearn.utils import class_weight
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import decomposition, ensemble
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import neighbors
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import matthews_corrcoef

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.utils import np_utils
from sklearn.metrics import make_scorer

import itertools
from textblob import TextBlob 
import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier
sns.set(style="darkgrid")


import string
import fasttext
import fasttext.util
from tqdm import tqdm

# ---- Call tqdm to see progress bar with pandas
tqdm().pandas()

Using TensorFlow backend.
0it [00:00, ?it/s]


In [2]:
sklearn.__version__

'0.23.0'

---

<center><h2>Parameters</h2></center>

---

This part allows you to determine the text column to classify as well as the label column.

In [3]:
TEXT = "text"
LABEL = "label"

In [4]:
# ---- Create object to preprocess the text 
preprocess = CustomPreProcessing()
preproc = PreProcessing()
Metric = Metrics()

Welcome in this custom preprocessing class
        
Welcome in the preprocessing


---

<center><h2>List of Models</h2></center>

---

In [5]:
save_results           = False
lang                   = False
sample                 = True
multinomial_naive_bayes= True
logistic_regression    = True
svm_model              = False
sgd                    = True
random_forest          = True
gradient_boosting      = True
xgboost_classifier     = True
shallow_network        = True
deep_nn                = True
rnn                    = True
lstm                   = True
cnn                    = True
gru                    = True
cnn_lstm               = True
cnn_gru                = True
bidirectional_rnn      = True
bidirectional_lstm     = True
bidirectional_gru      = True
rcnn                   = True
pre_trained            = True

---

---

<center><i><h1>Sand Box to Load Data</h1></i></center>

---

The sandbox is the working area of your data if it has not been processed before using the pipe

In [11]:
def load_imdb_sentiment_analysis_dataset(data_path, seed=123):
    """Loads the IMDb movie reviews sentiment analysis dataset.

    # Arguments
        data_path: string, path to the data directory.
        seed: int, seed for randomizer.

    # Returns
        A tuple of training and validation data.
        Number of training samples: 25000
        Number of test samples: 25000
        Number of categories: 2 (0 - negative, 1 - positive)

    # References
        Mass et al., http://www.aclweb.org/anthology/P11-1015

        Download and uncompress archive from:
        http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    """
    imdb_data_path = os.path.join(data_path, 'aclImdb')

    # Load the training data
    train_texts = []
    train_labels = []
    for category in ['pos', 'neg']:
        train_path = os.path.join(imdb_data_path, 'train', category)
        for fname in sorted(os.listdir(train_path)):
            if fname.endswith('.txt'):
                with open(os.path.join(train_path, fname)) as f:
                    train_texts.append(f.read())
                train_labels.append(0 if category == 'neg' else 1)

    # Load the validation data.
    test_texts = []
    test_labels = []
    for category in ['pos', 'neg']:
        test_path = os.path.join(imdb_data_path, 'test', category)
        for fname in sorted(os.listdir(test_path)):
            if fname.endswith('.txt'):
                with open(os.path.join(test_path, fname)) as f:
                    test_texts.append(f.read())
                test_labels.append(0 if category == 'neg' else 1)

    # Shuffle the training data and labels.
    random.seed(seed)
    random.shuffle(train_texts)
    random.seed(seed)
    random.shuffle(train_labels)

    return ((train_texts, np.array(train_labels)),
            (test_texts, np.array(test_labels)))

In [12]:
import os
import random
(x_train, y_train), (x_test, y_test) = load_imdb_sentiment_analysis_dataset(".")

In [13]:
y_train[(y_train!=1) & (y_train!=0)]

array([], dtype=int64)

In [14]:
df = pd.DataFrame(data=[x_train, y_train], index=["text", "label"]).T

In [15]:
df = df.append(pd.DataFrame(data=[x_test, y_test], index=["text", "label"]).T)

In [16]:
df.head()

Unnamed: 0,text,label
0,POSSIBLE SPOILERS<br /><br />The Spy Who Shagg...,0
1,"The long list of ""big"" names in this flick (in...",0
2,Bette Midler showcases her talents and beauty ...,1
3,Great movie when I saw it. Have to say one of ...,1
4,Although it's most certainly politically incor...,1


In [17]:
df = df[(df[LABEL]==1) | (df[LABEL]==0)]#.sum()

In [18]:
df.shape

(50000, 2)

In [19]:
df[LABEL].isnull().sum()

0

In [20]:
df["text"] = df["text"].apply(preprocess.func_remove_upper_case)
df["text"] = df["text"].apply(preproc.func_remove_URL)
df["text"] = df["text"].apply(preproc.func_remove_html)
df["text"] = df["text"].apply(preproc.func_remove_emoji)

---

<center><i><h1>Sart Pipeline</h1></i></center>

---

In [21]:
if lang:
    # ---- Language detection of the text
    df.loc[:,"language"] = df[TEXT].progress_apply(preproc.func_detect_lang_google)
    # ---- Extract most frequent language 
    language = df.language.value_counts().index.tolist()[0]
    print(f"The language most present in the dataset is {language}")
else:
    language="en"

---

---

<center><h3>Prepare data for ML Classic</h3></center>

---

In [22]:
if sample:
    df_save = df.copy()
    df = df.sample(5000, random_state=42)

In [23]:
#df = df_save.copy()

In [24]:
# ---- Load stopwords 
if language=="fr":
    stop_word = np.loadtxt("stopwords-fr/stopwords-fr.txt", dtype=str)
if language=="en":
    stop_word = np.loadtxt("stopwords_en.txt", dtype=str)

In [25]:
df.loc[:,TEXT+"_sw"] = df.loc[:,TEXT].progress_apply(lambda x : preproc.func_remove_stop_words(x, stop_word))

100%|██████████| 5000/5000 [00:08<00:00, 583.25it/s]


In [26]:
if df[TEXT+"_sw"].isnull().sum()>0:
    print("Empty text")
    df[TEXT+"_sw"][df[TEXT+"_sw"].isnull()] = "empty_text"

In [27]:
df[LABEL].isnull().sum()

0

---

---

<h1><center>Machine Learning</center></h1>

---

---

In [28]:
# split the dataset into training and validation datasets 
# ML classic 
train_x_sw, valid_x_sw, y_train_sw, y_valid_sw = model_selection.train_test_split(df[TEXT+"_sw"], df[LABEL], random_state=42, stratify=df[LABEL], test_size=0.2)

# For Embeddings
train_x, valid_x, y_train, y_valid = model_selection.train_test_split(df[TEXT], df[LABEL], random_state=42, stratify=df[LABEL], test_size=0.2)

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y_sw = encoder.fit_transform(y_train_sw)
valid_y_sw = encoder.fit_transform(y_valid_sw)
train_y = encoder.fit_transform(y_train)
valid_y = encoder.fit_transform(y_valid)

---

<center><h3>Classes Weight</h3></center>

---

In [29]:
# Compute the class weight with sklearn 
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)

In [30]:
print(*[f'Class weight: {round(i[0],4)}\tclass: {i[1]}' for i in zip(class_weights, np.unique(y_train))], sep='\n')

Class weight: 1.0194	class: 0
Class weight: 0.9814	class: 1


In [31]:
# Determined if the dataset is balanced or imbalanced 
ratio = np.min(df.label.value_counts()) / np.max(df.label.value_counts())
if ratio > 0.1:      # Ratio 1:10 -> limite blanced / imbalanced 
    balanced = True
    print(f"\nThe dataset is balanced (ratio={round(ratio, 3)})")
else:
    balanced = False
    print(f"\nThe dataset is imbalanced (ratio={round(ratio, 3)})")
    #from imblearn.over_sampling import ADASYN
    # put class for debalanced data 
    # in progress


The dataset is balanced (ratio=0.962)


---

<h2>Save Unique Labels</h2>

---

In [32]:
# Keep the unique label corresponding to their encoding correspondance
labels = df[LABEL].unique()
test=pd.DataFrame(data=np.transpose([labels,encoder.fit_transform(labels)]), columns=["labels", "encoding"]).sort_values(by=["encoding"])
labels=test.labels.tolist()

---

<h3>One-Hot encoding (CountVectorizing)</h3>

---

In [33]:
%%time
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df[TEXT]+"_sw")

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x_sw)
xvalid_count =  count_vect.transform(valid_x_sw)

CPU times: user 1.64 s, sys: 46.9 ms, total: 1.69 s
Wall time: 1.74 s


In [34]:
#xtrain_tfidf.toarray()[0][xtrain_tfidf.toarray()[0]  >0]

---

<h3>TF-IDF</h3>

---

In [35]:
%%time
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=10000)
tfidf_vect.fit(df[TEXT])
xtrain_tfidf =  tfidf_vect.transform(train_x_sw)
xvalid_tfidf =  tfidf_vect.transform(valid_x_sw)
print("word level tf-idf done")
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=10000)
tfidf_vect_ngram.fit(df[TEXT])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x_sw)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x_sw)
print("ngram level tf-idf done")
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char',  ngram_range=(2,3), max_features=10000) #token_pattern=r'\w{1,}',
tfidf_vect_ngram_chars.fit(df[TEXT])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x_sw) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x_sw) 
print("characters level tf-idf done")

word level tf-idf done
ngram level tf-idf done
characters level tf-idf done
CPU times: user 20.8 s, sys: 1.19 s, total: 22 s
Wall time: 22.5 s


---

<h2>Load Pre-Trained model fastText</h2>

---

In [36]:
%%time
if language=="fr":
    pretrained = fasttext.FastText.load_model('fastText/cc.fr.300.bin')
if language=="en":
    pretrained = fasttext.FastText.load_model('fastText/crawl-300d-2M-subword.bin')
    

CPU times: user 6.02 s, sys: 11.3 s, total: 17.3 s
Wall time: 17.9 s




---

<h2>Word Embeddings</h2>

---

In [37]:
%%time 
# create a tokenizer 
token = Tokenizer()
token.fit_on_texts(df[TEXT])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=300)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=300)

# create token-embedding mapping
embedding_matrix = np.zeros((len(word_index) + 1, 300))
words = []
for word, i in tqdm(word_index.items()):
    embedding_vector = pretrained.get_word_vector(word) #embeddings_index.get(word)
    words.append(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        

100%|██████████| 43321/43321 [00:00<00:00, 45221.84it/s]

CPU times: user 3.02 s, sys: 109 ms, total: 3.12 s
Wall time: 3.26 s





In [38]:
#words[1], embedding_matrix[1]

In [39]:
def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]

In [40]:
from sklearn.metrics import roc_auc_score

In [41]:
def report(clf, x, y, name='classifier', cv=5, fit_params=None):
    
    scoring = {'acc': 'accuracy',
           'balanced_accuracy':'balanced_accuracy',
           'prec': 'precision',
           'recall': 'recall',
           'f1-score':'f1',
           'tp': make_scorer(tp), 'tn': make_scorer(tn),
           'fp': make_scorer(fp), 'fn': make_scorer(fn),
            'cohens_kappa':make_scorer(cohen_kappa_score),
            'matthews_corrcoef':make_scorer(matthews_corrcoef),
              "roc_auc":make_scorer(roc_auc_score)}
    #if clf==XGBClassifier():
    scores = cross_validate(clf, x, y, scoring=scoring,
                         cv=cv, return_train_score=False, n_jobs=-1,  fit_params=fit_params)
    
    index = []
    value = []
    index.append("Model")
    value.append(name)
    for i in scores:
        if i == "estimator":
            continue
        for j in enumerate(scores[i]):
            index.append(i+"_cv"+str(j[0]+1))
            value.append(j[1])
        #if any(x in i for x in scoring.keys()):
        
        index.append(i+"_mean")
        value.append(np.mean(scores[i]))
        index.append(i+"_std")
        value.append(np.std(scores[i]))
        
    return pd.DataFrame(data=value, index=index).T

---

<center><h2>Multinomial Naive Bayes</h2></center>

---

In [42]:
df_results = pd.DataFrame()

In [43]:
%%time
if multinomial_naive_bayes:
    df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_count,train_y_sw, name='NB_Count_Vectors', cv=5))
    df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_tfidf,train_y_sw, name='NB_WordLevel_TF-IDF', cv=5))
    df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram,train_y_sw, name='NB_N-Gram_Vectors', cv=5))
    df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars,train_y_sw, name='NB_CharLevel_Vectors', cv=5))

CPU times: user 172 ms, sys: 4.41 s, total: 4.58 s
Wall time: 6.89 s


---

<center><h2>Logistic Regression</h2></center>

---

In [44]:
%%time
if logistic_regression:
    df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_count,train_y_sw, name='LR_Count_Vectors', cv=5))
    df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_tfidf,train_y_sw, name='LR_WordLevel_TF-IDF', cv=5))
    df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_tfidf_ngram,train_y_sw, name='LR_N-Gram_Vectors', cv=5))
    df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_tfidf_ngram_chars,train_y_sw, name='LR_CharLevel_Vectors', cv=5))

CPU times: user 266 ms, sys: 0 ns, total: 266 ms
Wall time: 4.59 s


---

<center><h2>SVM</h2></center>

---

In [45]:
%%time
if svm_model:
    df_results = df_results.append(report(svm.SVC(), xtrain_count,train_y_sw, name='SVM_Count_Vectors', cv=5))
    df_results = df_results.append(report(svm.SVC(), xtrain_tfidf,train_y_sw, name='SVM_WordLevel_TF-IDF', cv=5))
    df_results = df_results.append(report(svm.SVC(), xtrain_tfidf_ngram,train_y_sw, name='SVM_N-Gram_Vectors', cv=5))
    df_results = df_results.append(report(svm.SVC(), xtrain_tfidf_ngram_chars,train_y_sw, name='SVM_CharLevel_Vectors', cv=5))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 19.8 µs


---

<center><h2>RandomForest</h2></center>

---

In [46]:
%%time
if random_forest:
    df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), xtrain_count,train_y_sw, name='RF_Count_Vectors', cv=5))

CPU times: user 46.9 ms, sys: 62.5 ms, total: 109 ms
Wall time: 13.7 s


In [47]:
%%time
if random_forest:
    df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), xtrain_tfidf,train_y_sw, name='RF_WordLevel_TF-IDF', cv=5))

CPU times: user 62.5 ms, sys: 0 ns, total: 62.5 ms
Wall time: 7.98 s


In [48]:
%%time
if random_forest:
    df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), xtrain_tfidf_ngram,train_y_sw, name='RF_N-Gram_Vectors', cv=5))

CPU times: user 46.9 ms, sys: 15.6 ms, total: 62.5 ms
Wall time: 11.7 s


In [49]:
%%time
if random_forest:
    df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), xtrain_tfidf_ngram_chars,train_y_sw, name='RF_CharLevel_Vectors', cv=5))

CPU times: user 93.8 ms, sys: 15.6 ms, total: 109 ms
Wall time: 18.2 s


In [50]:
%%time
if random_forest:
    df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), train_seq_x,train_y, name='RF_Words', cv=5))

CPU times: user 78.1 ms, sys: 0 ns, total: 78.1 ms
Wall time: 7.66 s


---

<center><h2>Stochastic Descent</h2></center>

---

In [51]:
%%time
if sgd:
    df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), xtrain_count,train_y_sw, name='SGD_Count_Vectors', cv=5))
    df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), xtrain_tfidf,train_y_sw, name='SGD_WordLevel_TF-IDF', cv=5))
    df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), xtrain_tfidf_ngram,train_y_sw, name='SGD_N-Gram_Vectors', cv=5))
    df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), xtrain_tfidf_ngram_chars,train_y_sw, name='SGD_CharLevel_Vectors', cv=5))
    df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), train_seq_x,train_y, name='SGD_Words', cv=5))

CPU times: user 250 ms, sys: 78.1 ms, total: 328 ms
Wall time: 1.98 s


---

<center><h2>Gradient Boosting</h2></center>

---

In [52]:
%%time
if gradient_boosting:
    df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                               validation_fraction=0.2,
                                               n_iter_no_change=10, tol=0.01,
                                               random_state=0, verbose=0 ), xtrain_count,train_y_sw, name='GB_Count_Vectors', cv=5))

CPU times: user 46.9 ms, sys: 62.5 ms, total: 109 ms
Wall time: 35.6 s


In [53]:
%%time
if gradient_boosting:
    df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                               validation_fraction=0.2,
                                               n_iter_no_change=10, tol=0.01,
                                               random_state=0, verbose=0 ), xtrain_tfidf,train_y_sw, name='GB_WordLevel_TF-IDF', cv=5))

CPU times: user 62.5 ms, sys: 31.2 ms, total: 93.8 ms
Wall time: 26.2 s


In [54]:
%%time
if gradient_boosting:
    df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                               validation_fraction=0.2,
                                               n_iter_no_change=10, tol=0.01,
                                               random_state=0, verbose=0 ), xtrain_tfidf_ngram,train_y_sw, name='GB_N-Gram_Vectors', cv=5))

CPU times: user 15.6 ms, sys: 62.5 ms, total: 78.1 ms
Wall time: 2.45 s


In [55]:
%%time
if gradient_boosting:
    df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                               validation_fraction=0.2,
                                               n_iter_no_change=10, tol=0.01,
                                               random_state=0, verbose=0 ), xtrain_tfidf_ngram_chars,train_y_sw, name='GB_CharLevel_Vectors', cv=5))

CPU times: user 62.5 ms, sys: 31.2 ms, total: 93.8 ms
Wall time: 2min 38s


In [56]:
%%time
if gradient_boosting:
    df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                               validation_fraction=0.2,
                                               n_iter_no_change=10, tol=0.01,
                                               random_state=0, verbose=0 ), train_seq_x,train_y, name='GB_CharLevel_Vectors', cv=5))

CPU times: user 0 ns, sys: 46.9 ms, total: 46.9 ms
Wall time: 4.84 s


---

<h2>XGBoost Classifier</h2>

---

All the XGBoost have early stopping implemented with 10 rounds

In [57]:
%%time
if xgboost_classifier:
    fit_params={'early_stopping_rounds':10,\
                         'eval_set':[(xvalid_count, valid_y_sw)]}
    df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), xtrain_count,train_y_sw, name='XGB_Count_Vectors', cv=5, fit_params=fit_params))




CPU times: user 62.5 ms, sys: 1.08 s, total: 1.14 s
Wall time: 20.7 s


In [58]:
%%time
if xgboost_classifier:
    fit_params={'early_stopping_rounds':10,\
                         'eval_set':[(xvalid_tfidf, valid_y_sw)]}
    df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), xtrain_tfidf,train_y_sw, name='XGB_WordLevel_TF-IDF', cv=5, fit_params=fit_params))

CPU times: user 31.2 ms, sys: 1.69 s, total: 1.72 s
Wall time: 17.4 s


In [59]:
%%time
if xgboost_classifier:
    fit_params={'early_stopping_rounds':10,\
                         'eval_set':[(xvalid_tfidf_ngram, valid_y_sw)]}
    df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), xtrain_tfidf_ngram,train_y_sw, name='XGB_N-Gram_Vectors', cv=5, fit_params=fit_params))

CPU times: user 78.1 ms, sys: 15.6 ms, total: 93.8 ms
Wall time: 4.62 s


In [60]:
%%time
if xgboost_classifier:
    fit_params={'early_stopping_rounds':10,\
                         'eval_set':[(xvalid_tfidf_ngram_chars, valid_y_sw)]}
    df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), xtrain_tfidf_ngram_chars,train_y_sw, name='XGB_CharLevel_Vectors', cv=5, fit_params=fit_params))

CPU times: user 46.9 ms, sys: 62.5 ms, total: 109 ms
Wall time: 1min 42s


In [61]:
%%time
if xgboost_classifier:
    fit_params={'early_stopping_rounds':10,\
                         'eval_set':[(valid_seq_x,valid_y)]}
    df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), train_seq_x,train_y, name='XGB_Words', cv=5, fit_params=fit_params))

CPU times: user 78.1 ms, sys: 31.2 ms, total: 109 ms
Wall time: 5.59 s


---

In [62]:
df_results[[ "Model","test_acc_mean","test_acc_std", 
                        "test_balanced_accuracy_mean","test_balanced_accuracy_std", 
                       "test_prec_mean", "test_prec_std", 
                        "test_recall_mean","test_recall_std", 
                       "test_f1-score_mean", "test_f1-score_std", 
                       "test_cohens_kappa_mean", "test_cohens_kappa_std", "test_matthews_corrcoef_mean","test_matthews_corrcoef_std", 
                       "test_roc_auc_mean", "test_roc_auc_std"]].sort_values(by=["test_prec_mean", "test_recall_mean"], ascending=False).head()

Unnamed: 0,Model,test_acc_mean,test_acc_std,test_balanced_accuracy_mean,test_balanced_accuracy_std,test_prec_mean,test_prec_std,test_recall_mean,test_recall_std,test_f1-score_mean,test_f1-score_std,test_cohens_kappa_mean,test_cohens_kappa_std,test_matthews_corrcoef_mean,test_matthews_corrcoef_std,test_roc_auc_mean,test_roc_auc_std
0,NB_WordLevel_TF-IDF,0.84425,0.00979796,0.844484,0.00987974,0.85776,0.0148784,0.832679,0.0119426,0.844921,0.00933715,0.688565,0.0196391,0.689048,0.0197066,0.844484,0.00987974
0,NB_Count_Vectors,0.83125,0.00890926,0.831661,0.00888021,0.851671,0.0118938,0.810105,0.0166339,0.83022,0.00965036,0.662697,0.017784,0.663726,0.017619,0.831661,0.00888021
0,LR_Count_Vectors,0.8495,0.0120312,0.849242,0.0119993,0.844876,0.0109466,0.86311,0.015549,0.853854,0.0119668,0.69877,0.0240643,0.699015,0.0241805,0.849242,0.0119993
0,SGD_WordLevel_TF-IDF,0.84925,0.00963717,0.848965,0.00977927,0.843826,0.015964,0.864581,0.0105832,0.853936,0.00827072,0.698246,0.0193793,0.698716,0.018907,0.848965,0.00977927
0,RF_WordLevel_TF-IDF,0.828,0.009,0.828219,0.00899646,0.841285,0.00950077,0.816482,0.0106341,0.828675,0.00909323,0.656069,0.0179905,0.6564,0.0179992,0.828219,0.00899646


<center><h1>Deep Learning</h1></center>

---

<h3>Cohen’s kappa</h3>

The function [cohen_kappa_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html#sklearn.metrics.cohen_kappa_score) computes [Cohen’s kappa](https://en.wikipedia.org/wiki/Cohen%27s_kappa) statistic. This measure is intended to compare labelings by different human annotators, not a classifier versus a ground truth.

The kappa score (see docstring) is a number between -1 and 1. Scores above .8 are generally considered good agreement; zero or lower means no agreement (practically random labels).

Kappa scores can be computed for binary or multiclass problems, but not for multilabel problems (except by manually computing a per-label score) and not for more than two annotators.

<h3>Balanced Accuracy</h3>

Compute the balanced accuracy

The balanced accuracy in binary and multiclass classification problems to deal with imbalanced datasets. It is defined as the average of recall obtained on each class.

The best value is 1 and the worst value is 0 when adjusted=False

---

<h3>Early Stopping, Model saving, Class weight configuration</h3>

In [63]:
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='auto', patience=3)
check_p = tf.keras.callbacks.ModelCheckpoint("save_models/model.h5", save_best_only=True)

In [64]:
class_w = {}
for i in zip(range(len(class_weights)), class_weights):
    class_w[i[0]] = i[1]

---

In [65]:
from sklearn.model_selection import StratifiedKFold

In [66]:
from sklearn.metrics import balanced_accuracy_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

In [67]:
def cross_validate_NN(model, X, y, X_test, y_test, callbacks,name="NN", fit_params=None, scoring=None, n_splits=5):
    #print(model.__class__.__name__)
    # ---- Parameters initialisation
    seed = 42
    i = 1
    np.random.seed(seed)
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fit_time, score_time, acc, acc_balanced = [], [], [], []
    pred_weight, recall_weight, f1_score_weight = [], [], []
    tp_, tn_, fp_, fn_ = [], [], [], []
    cohen_kappa_, matthews_corrcoef_, roc_auc_ = [], [], []
    index = ["Model"]
    results = [name]
    # ---- Loop on k-fold for cross-valisation
    for train, test in kfold.split(X, y):
        # create model
        print(f"k-fold : {i}")
        fit_start = time.time()
        _model = model
        _model.fit(X[train], y[train],
                        epochs=1000, callbacks=[callbacks],
                        validation_split=0.2, verbose=False)
        
        fit_end = time.time() - fit_start

        _acc = _model.evaluate(X_test, y_test, verbose=0)

        score_start = time.time()
        y_pred = (model.predict(X_test)>0.5).astype(int)
        score_end = time.time() - score_start

        # ---- save each metric
        fit_time.append(fit_end)
        index.append('fit_time_cv'+str(i))
        results.append(fit_end)
        
        score_time.append(score_end)
        index.append('score_time_cv'+str(i))
        results.append(score_end)
        
        acc.append(accuracy_score(y_test, y_pred))
        index.append( 'test_acc_cv'+str(i))
        results.append(acc[-1])
        
        acc_balanced.append(balanced_accuracy_score(y_test, y_pred))
        index.append('test_balanced_accuracy_cv'+str(i))
        results.append(acc_balanced[-1])
        
        pred_weight.append(precision_score(y_test, y_pred))
        index.append('test_prec_cv'+str(i))
        results.append(pred_weight[-1])
        
        recall_weight.append(recall_score(y_test, y_pred))
        results.append(recall_weight[-1])
        index.append('test_recall_cv'+str(i))
        
        f1_score_weight.append(f1_score(y_test, y_pred) )
        index.append('test_f1-score_cv'+str(i))
        results.append(f1_score_weight[-1])
        
        tp_.append(tp(y_test, y_pred))
        index.append('test_tp_cv'+str(i))
        results.append(tp_[-1])
        
        tn_.append(tn(y_test, y_pred))
        index.append('test_tn_cv'+str(i))
        results.append(tn_[-1])
        
        fp_.append(fp(y_test, y_pred))
        index.append('test_fp_cv'+str(i))
        results.append(fp_[-1])
        
        fn_.append(fn(y_test, y_pred))
        index.append('test_fn_cv'+str(i))
        results.append(fn_[-1])
        
        cohen_kappa_.append(cohen_kappa_score(y_test, y_pred))
        index.append('test_cohens_kappa_cv'+str(i))
        results.append(cohen_kappa_[-1])
        
        matthews_corrcoef_.append(matthews_corrcoef(y_test, y_pred))
        index.append('test_matthews_corrcoef_cv'+str(i))
        results.append(matthews_corrcoef_[-1])
        
        roc_auc_.append(roc_auc_score(y_test, y_pred))
        index.append('test_roc_auc_cv'+str(i))
        results.append(roc_auc_[-1])
        
        i+=1

    index.extend([ "fit_time_mean","fit_time_std",'score_time_mean', 'score_time_std',
                  "test_acc_mean","test_acc_std", 
                        "test_balanced_accuracy_mean","test_balanced_accuracy_std", 
                       "test_prec_mean", "test_prec_std", 
                        "test_recall_mean","test_recall_std", 
                       "test_f1-score_mean", "test_f1-score_std", 'test_tp_mean', 'test_tp_std', 'test_tn_mean', 
                  'test_tn_std', 'test_fp_mean', 'test_fp_std','test_fn_mean', 'test_fn_std',
                       "test_cohens_kappa_mean", "test_cohens_kappa_std", "test_matthews_corrcoef_mean","test_matthews_corrcoef_std", 
                       "test_roc_auc_mean", "test_roc_auc_std"])

    results.extend([np.mean(fit_time), np.std(fit_time),np.mean(score_time), np.std(score_time),np.mean(acc), np.std(acc), np.mean(acc_balanced), 
     np.std(acc_balanced), np.mean(pred_weight),np.std(pred_weight),
     np.mean(recall_weight),np.std(recall_weight),np.mean(f1_score_weight), np.std(f1_score_weight), np.mean(tp_),np.std(tp_),np.mean(tn_),np.std(tn_),
                    np.mean(fp_),np.std(fp_), np.mean(fn_),np.std(fn_), np.mean(cohen_kappa_),np.std(cohen_kappa_),
     np.mean(matthews_corrcoef_),np.std(matthews_corrcoef_),np.mean(roc_auc_),np.std(roc_auc_)])

    #print(list(zip(index, results)))
    
    return pd.DataFrame(results, index=index).T

---

<h2>Shallow Neural Networks</h2>

In [68]:
def shallow_neural_networks(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 16)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
      embedded,
      keras.layers.GlobalAveragePooling1D(),
      
      keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [69]:
%%time
if shallow_network:
    df_results = df_results.append(cross_validate_NN(shallow_neural_networks(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="Shallow_NN_WE", scoring=None, n_splits=5))

k-fold : 1
k-fold : 2
k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 1h 22min 19s, sys: 7min 22s, total: 1h 29min 42s
Wall time: 35min 30s


In [70]:
#c = cross_validate_NN(shallow_neural_networks(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="Shallow_NN_WE", scoring=None, n_splits=5)
#c[["test_acc_cv1", "test_recall_cv1"]]

---

<h2>Deep Neural Networks</h2>

---

In [71]:
def deep_neural_networks(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 50)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
      embedded,
      keras.layers.GlobalAveragePooling1D(),
      keras.layers.Dense(16, activation='relu'),
      keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [72]:
%%time
if deep_nn:
    df_results = df_results.append(cross_validate_NN(deep_neural_networks(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="Deep_NN_WE",scoring=None, n_splits=5))

k-fold : 1
k-fold : 2
k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 4min 52s, sys: 34.2 s, total: 5min 26s
Wall time: 2min 14s


<h2>Deep Neural Networks variation 1</h2>

In [73]:
def deep_neural_networks_var1(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
      embedded,
      keras.layers.GlobalAveragePooling1D(),
      keras.layers.Dense(16, activation='relu'),
      keras.layers.Dense(16, activation='relu'),
      keras.layers.Dense(1  if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [74]:
%%time
if deep_nn:
    df_results = df_results.append(cross_validate_NN(deep_neural_networks_var1(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="Deep_NN_var1_WE",scoring=None, n_splits=5))

k-fold : 1
k-fold : 2
k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 2min 20s, sys: 13.3 s, total: 2min 33s
Wall time: 1min 2s


<h2>Deep Neural Networks variation 2</h2>

In [75]:
def deep_neural_networks_var2(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
      embedded,
      keras.layers.GlobalAveragePooling1D(),
      keras.layers.Dense(32, activation='relu'),
      keras.layers.Dense(16, activation='relu'),
      keras.layers.Dense(1  if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model


In [76]:
%%time
if deep_nn:
    df_results = df_results.append(cross_validate_NN(deep_neural_networks_var2(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="Deep_NN_var2_WE",scoring=None, n_splits=5))

k-fold : 1
k-fold : 2
k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 1min 47s, sys: 11.2 s, total: 1min 59s
Wall time: 47.7 s


---

<h2>Recurent Neural Network - RNN</h2>

---

In [77]:
def create_rnn_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 50)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.SimpleRNN(40, return_sequences=True),
    keras.layers.SimpleRNN(40, return_sequences=True),
    keras.layers.SimpleRNN(40, return_sequences=True),
    keras.layers.SimpleRNN(40),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer=tf.keras.optimizers.RMSprop(lr=1e-4),
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [78]:
%%time
if rnn:
    df_results = df_results.append(cross_validate_NN(create_rnn_model(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="RNN_WE",scoring=None, n_splits=5))

k-fold : 1
k-fold : 2
k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 52min 2s, sys: 6min 56s, total: 58min 59s
Wall time: 19min 20s


---

<h2>Convolutional Neural Network</h2>

---

In [79]:
def create_conv_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) +1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.Conv1D(100, 5, activation='relu'), # padding='same'
    keras.layers.Dropout(0.2),
    keras.layers.MaxPooling1D(pool_size=4),
    keras.layers.Conv1D(64, 5, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.MaxPooling1D(pool_size=4),
    keras.layers.Conv1D(32, 5, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.GlobalMaxPooling1D(),

    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer=tf.keras.optimizers.RMSprop(lr=1e-4),
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [80]:
%%time
if cnn:
    df_results = df_results.append(cross_validate_NN(create_rnn_model(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="CNN_WE",scoring=None, n_splits=5))

k-fold : 1
k-fold : 2
k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 40min 59s, sys: 5min 26s, total: 46min 26s
Wall time: 15min 13s


---

<h2>Recurrent Neural Network – LSTM</h2>

---

In [81]:
def create_lstm_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) +1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index)+1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.LSTM(32),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [82]:
%%time
if lstm:
    df_results = df_results.append(cross_validate_NN(create_lstm_model(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="LSTM_WE",scoring=None, n_splits=5))

k-fold : 1


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


k-fold : 2
k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 20min 50s, sys: 1min 45s, total: 22min 35s
Wall time: 6min 54s


---

<h2>CNN – LSTM</h2>

---

In [83]:
def create_cnn_lstm_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.Conv1D(128, 5, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.MaxPooling1D(pool_size=4),
    keras.layers.LSTM(32),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [84]:
%%time
if cnn_lstm:
    df_results = df_results.append(cross_validate_NN(create_cnn_lstm_model(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es,name="CNN_LSTM_WE", scoring=None, n_splits=5))

k-fold : 1
k-fold : 2
k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 30min 12s, sys: 1min 54s, total: 32min 6s
Wall time: 9min 6s


---

<h2>CNN – GRU</h2>

---

In [85]:
def create_cnn_gru_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.Conv1D(128, 5, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.MaxPooling1D(pool_size=4),
    keras.layers.GRU(32),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [86]:
%%time
if cnn_gru:
    df_results = df_results.append(cross_validate_NN(create_cnn_gru_model(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="CNN_GRU_WE", scoring=None, n_splits=5))

k-fold : 1


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


k-fold : 2
k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 26min 6s, sys: 1min 12s, total: 27min 19s
Wall time: 7min 48s


---

<h2>Recurrent Neural Network – GRU</h2>

---

tf.keras.layers.GRU(
    units, activation='tanh', recurrent_activation='sigmoid', use_bias=True,
    kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal',
    bias_initializer='zeros', kernel_regularizer=None, recurrent_regularizer=None,
    bias_regularizer=None, activity_regularizer=None, kernel_constraint=None,
    recurrent_constraint=None, bias_constraint=None, dropout=0.0,
    recurrent_dropout=0.0, implementation=2, return_sequences=False,
    return_state=False, go_backwards=False, stateful=False, unroll=False,
    time_major=False, reset_after=True, **kwargs
)

In [87]:
def create_gru_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.GRU(32),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [88]:
%%time
if gru:
    df_results = df_results.append(cross_validate_NN(create_gru_model(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="GRU_WE", scoring=None, n_splits=5))

k-fold : 1
k-fold : 2
k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 41min 18s, sys: 3min 36s, total: 44min 55s
Wall time: 14min 53s


---

<h2>Bidirectional RNN</h2>

---

In [89]:
def create_bidirec_rnn_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.Bidirectional(keras.layers.SimpleRNN(32, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.SimpleRNN(32, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.SimpleRNN(32, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.SimpleRNN(32)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [90]:
%%time
if bidirectional_rnn:
    df_results = df_results.append(cross_validate_NN(create_bidirec_rnn_model(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="BiRNN_WE",scoring=None, n_splits=5))

k-fold : 1
k-fold : 2
k-fold : 3
k-fold : 4


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


k-fold : 5
CPU times: user 1h 18min 16s, sys: 9min 21s, total: 1h 27min 38s
Wall time: 26min 47s


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


---

<h2>Bidirectional LSTM</h2>

---

In [91]:
def create_bidirec_lstm_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.Bidirectional(keras.layers.LSTM(32)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [92]:
%%time
if bidirectional_lstm:
    df_results = df_results.append(cross_validate_NN(create_bidirec_lstm_model(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="BiLSTM_WE",scoring=None, n_splits=5))

k-fold : 1


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


k-fold : 2


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 45min 13s, sys: 1min 19s, total: 46min 33s
Wall time: 12min 55s


---

<h2>Bidirectional GRU</h2>

---

In [93]:
def create_bidirec_gru_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.Bidirectional(keras.layers.GRU(32)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [94]:
%%time
if bidirectional_gru:
    df_results = df_results.append(cross_validate_NN(create_bidirec_gru_model(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="BiGRU_WE",scoring=None, n_splits=5))

k-fold : 1
k-fold : 2
k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 59min 28s, sys: 2min 15s, total: 1h 1min 43s
Wall time: 17min 8s


---

<h2>Recurrent Convolutional Neural Network</h2>

---

In [95]:
def create_rcnn(X, word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300,input_length=X.shape[1], weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.SpatialDropout1D(0.3),
    keras.layers.Bidirectional(keras.layers.GRU(32,return_sequences=True)),
    keras.layers.Convolution1D(32, 3, activation="relu"),
    keras.layers.GlobalMaxPool1D(),
    keras.layers.Dense(25, activation="relu"),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [96]:
%%time
if rcnn:
    df_results = df_results.append(cross_validate_NN(create_rcnn(train_seq_x, word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="RCNN_WE",scoring=None, n_splits=5))

k-fold : 1
k-fold : 2
k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 1h 1min 25s, sys: 2min 21s, total: 1h 3min 46s
Wall time: 17min 53s


---

<h2>Recurrent Convolutional Neural Network variation 1</h2>

---

In [97]:
def create_rcnn_var1(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.SpatialDropout1D(0.3),
    keras.layers.Bidirectional(keras.layers.LSTM(32,return_sequences=True)),
    keras.layers.Convolution1D(32, 3, activation="relu"),
    keras.layers.GlobalMaxPool1D(),
    keras.layers.Dense(25, activation="relu"),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [98]:
%%time
if rcnn:
    df_results = df_results.append(cross_validate_NN(create_rcnn_var1(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="RCNN_var1_WE",scoring=None, n_splits=5))

k-fold : 1
k-fold : 2
k-fold : 3


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


k-fold : 4
k-fold : 5
CPU times: user 55min 1s, sys: 1min 34s, total: 56min 36s
Wall time: 16min 44s


---

<h2>Recurrent Convulational Neural Network variation 2</h2>

---

In [99]:
def create_rcnn_var2(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.SpatialDropout1D(0.3),
    keras.layers.Bidirectional(keras.layers.GRU(32,return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.GRU(32,return_sequences=True)),
    keras.layers.Convolution1D(32, 3, activation="relu"),
    keras.layers.GlobalMaxPool1D(),
    keras.layers.Dense(25, activation="relu"),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [100]:
%%time
if rcnn:
    df_results = df_results.append(cross_validate_NN(create_rcnn_var2(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="RCNN_var2_WE",scoring=None, n_splits=5))

k-fold : 1
k-fold : 2
k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 2h 10min 54s, sys: 6min 19s, total: 2h 17min 14s
Wall time: 38min 10s


---

<h2>Recurrent Convulational Neural Network variation 3</h2>

---

In [101]:
def create_rcnn_var3(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.SpatialDropout1D(0.3),
    keras.layers.Bidirectional(keras.layers.GRU(32,return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.LSTM(32,return_sequences=True)),
    keras.layers.Convolution1D(32, 3, activation="relu"),
    keras.layers.GlobalMaxPool1D(),
    keras.layers.Dense(25, activation="relu"),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    if len(label)==2:
        model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
    else: 
        model.compile(optimizer='adam',
              loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
    #print(model.summary())
    
    return model

In [102]:
%%time
if rcnn:
    df_results = df_results.append(cross_validate_NN(create_rcnn_var3(word_index, pre_trained=pre_trained), train_seq_x, train_y, valid_seq_x, valid_y, es, name="RCNN_var3_WE",scoring=None, n_splits=5))

k-fold : 1
k-fold : 2


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


k-fold : 3
k-fold : 4
k-fold : 5
CPU times: user 50min 29s, sys: 2min 32s, total: 53min 1s
Wall time: 14min 52s


---

<center><h1>Results</h1></center>

---

In [103]:
df_results = df_results.reset_index()

In [107]:
df_results[[ "Model","test_acc_mean","test_acc_std", 
                        "test_balanced_accuracy_mean","test_balanced_accuracy_std", 
                       "test_prec_mean", "test_prec_std", 
                        "test_recall_mean","test_recall_std", 
                       "test_f1-score_mean", "test_f1-score_std", 
                       "test_cohens_kappa_mean", "test_cohens_kappa_std", "test_matthews_corrcoef_mean","test_matthews_corrcoef_std", 
                       "test_roc_auc_mean", "test_roc_auc_std"]][df_results["test_prec_mean"]<1].sort_values(by=["test_prec_mean", "test_recall_mean"], ascending=False)

Unnamed: 0,Model,test_acc_mean,test_acc_std,test_balanced_accuracy_mean,test_balanced_accuracy_std,test_prec_mean,test_prec_std,test_recall_mean,test_recall_std,test_f1-score_mean,test_f1-score_std,test_cohens_kappa_mean,test_cohens_kappa_std,test_matthews_corrcoef_mean,test_matthews_corrcoef_std,test_roc_auc_mean,test_roc_auc_std
41,RCNN_WE,0.8248,0.0238445,0.826875,0.0225728,0.921394,0.036202,0.723137,0.0868554,0.805326,0.0388524,0.651076,0.0466568,0.671749,0.0313675,0.826875,0.0225728
28,Shallow_NN_WE,0.7724,0.030748,0.774974,0.029947,0.874746,0.00343647,0.646275,0.0700919,0.741085,0.0492035,0.54713,0.0603521,0.568289,0.0487082,0.774974,0.029947
30,Deep_NN_var1_WE,0.8112,0.00670522,0.812709,0.00612512,0.874506,0.0224713,0.737255,0.0385033,0.798841,0.0140079,0.623457,0.0129221,0.632775,0.00743012,0.812709,0.00612512
35,CNN_LSTM_WE,0.8292,0.0416625,0.83006,0.0401371,0.873832,0.0431336,0.787059,0.126871,0.818868,0.0660651,0.659149,0.0817655,0.671717,0.0625067,0.83006,0.0401371
31,Deep_NN_var2_WE,0.8134,0.0157048,0.81485,0.0151005,0.873564,0.0121224,0.742353,0.0461947,0.801492,0.0240338,0.627823,0.0308348,0.636273,0.0241465,0.81485,0.0151005
29,Deep_NN_WE,0.8166,0.00233238,0.817939,0.00214471,0.872539,0.0184199,0.75098,0.0224251,0.806693,0.00616396,0.634089,0.00450131,0.641114,0.00511518,0.817939,0.00214471
1,NB_WordLevel_TF-IDF,0.84425,0.00979796,0.844484,0.00987974,0.85776,0.0148784,0.832679,0.0119426,0.844921,0.00933715,0.688565,0.0196391,0.689048,0.0197066,0.844484,0.00987974
0,NB_Count_Vectors,0.83125,0.00890926,0.831661,0.00888021,0.851671,0.0118938,0.810105,0.0166339,0.83022,0.00965036,0.662697,0.017784,0.663726,0.017619,0.831661,0.00888021
4,LR_Count_Vectors,0.8495,0.0120312,0.849242,0.0119993,0.844876,0.0109466,0.86311,0.015549,0.853854,0.0119668,0.69877,0.0240643,0.699015,0.0241805,0.849242,0.0119993
14,SGD_WordLevel_TF-IDF,0.84925,0.00963717,0.848965,0.00977927,0.843826,0.015964,0.864581,0.0105832,0.853936,0.00827072,0.698246,0.0193793,0.698716,0.018907,0.848965,0.00977927


In [109]:
if save_results:
    df_results.sort_values(by=["test_prec_mean", "test_recall_mean"], ascending=False).to_csv("model_selection_results_IMDB.csv", index=False)