<h1><center>Text Classification</center></h1>

In [1]:
%matplotlib inline

import time
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import os
import random
import re
from googletrans import Translator
import sklearn
from sklearn.utils import class_weight
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import decomposition, ensemble
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, recall_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.utils import to_categorical

import itertools
import seaborn as sns
from xgboost import XGBClassifier
sns.set(style="darkgrid")


import string
import fasttext
import fasttext.util
from tqdm import tqdm

# ---- Call tqdm to see progress bar with pandas
tqdm().pandas()

Using TensorFlow backend.
0it [00:00, ?it/s]
  from pandas import Panel


In [2]:
print(sklearn.__version__)
print(tf.__version__)

0.23.1
2.2.0


In [3]:
def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred)[1, 1]

---

<h2><a id="content">Content</a></h2>

- [Parameters](#part_1)
- [List of Models](#part_2)
- [List of Metrics for the Model Selection](#part_3)
- [Sand box to load Data](#part_4)
- [Start Pipeline](#part_5)
    - [Prepare data to ML classic](#part_5_1)
- [Machine Learning](#part_6)
    - [Class Weights](#part_6_1)
    - [Save Unique Labels](#part_6_2)
    - [DataFrame for the Results](#part_6_3)
    - [One-Hot encoding](#part_6_4)
    - [TF-IDF](#part_6_5)
    - [Load Pre-trained Model FastText](#part_6_6)
    - [Word Embeddings](#part_6_7)
    - [Multinomial Naive Bayes](#part_6_8)
    - [Logistic Regression](#part_6_9)
    - [SVM](#part_6_10)
    - [k-NN](#part_6_11)
    - [RandomForest](#part_6_12)
    - [Stochastic Gradient Descent](#part_6_13)
    - [Gradient Boosting](#part_6_14)
    - [XGBoost Classifier](#part_6_15)
    - [Adaboost Classifier](#part_6_16)
    - [Catboost Classifier](#part_6_17)
    - [LightGBM](#part_6_18)
    - [ExtraTreesClassifier](#part_6_19)
- [Deep Learning](#part_7)
    - [Shallow Neural Networks](#part_7_1)
    - [Deep Neural Networks](#part_7_2)
    - [Recurrent Neural Networks (RNN)](#part_7_3)
    - [Convolutional Neural Networks (CNN)](#part_7_4)
    - [Long Short Terme Memory (LSTM)](#part_7_5)
    - [CNN-LSTM](#part_7_6)
    - [CNN-GRU](#part_7_7)
    - [Gated Recurrent Unit (GRU)](#part_7_8)
    - [Biderectional RNN](#part_7_9)
    - [Biderectional LSTM](#part_7_10)
    - [Bidirectional GRU](#part_7_11)
    - [Recurent Convulotional Neural Nerworks (RCNN)](#part_7_12)
    - [Transformers](#part_7_13)
- [Results](#part_8)
- [Visualization](#part_9)

---

<center><h2><a id="part_1">Parameters</a></h2></center>

---

This part allows you to determine the text column to classify as well as the label column.

In [4]:
TEXT           = "mails"
LABEL          = "label"
NAME_SAVE_FILE = "model_selection_results_multiclasses" # put just the name the .csv will be added at the end

# global parameters
num_gpu                = len(tf.config.experimental.list_physical_devices('GPU'))   # detect the number of gpu
CV_splits              = 5        # Number of splits for cross-validation and k-folds
save_results           = True     # if you want an output file containing all the results
lang                   = False    # test if you want to use Google API detection
sample                 = True     # use just a sample of data
nb_sample              = 5000     # default value of rows if sample selected
save_model             = True     # concat all the data representation
root_dir               = ""       # Place here the path where you want your models stored or use /path/to/your/folder/

In [5]:
# Name file 
NAME_ENCODER                  = "encoder.sav"
NAME_COUNT_VECT_MODEL         = "count_vect_model.sav"
NAME_TF_IDF_MODEL             = "TF_IDF_model.sav"
NAME_TF_IDF_NGRAM_MODEL       = "TF_IDF_ngram_model.sav"
NAME_TF_IDF_NGRAM_CHAR_MODEL  = "TF_IDF_ngram_chars_model.sav"
NAME_TOKEN_EMBEDDINGS         = "token_embeddings.sav"

---

<center><h2><a id="part_2">List of Models</a></h2></center>

---

In [6]:
# models 
multinomial_naive_bayes= True
logistic_regression    = True
svm_model              = False
k_nn_model             = True
sgd                    = True
random_forest          = False
gradient_boosting      = True
xgboost_classifier     = True
adaboost_classifier    = False 
catboost_classifier    = False 
lightgbm_classifier    = False 
extratrees_classifier  = True
shallow_network        = True
deep_nn                = True
rnn                    = True
lstm                   = True
cnn                    = True
gru                    = True
cnn_lstm               = True
cnn_gru                = True
bidirectional_rnn      = True
bidirectional_lstm     = True
bidirectional_gru      = True
rcnn                   = True
transformers           = False
pre_trained            = False

---

<center><h2>Create folders to save models</h2></center>

---

In [7]:
if save_model:
    # will create the folder to save all the models
    try:
        dir_name =  NAME_SAVE_FILE
        os.makedirs(os.path.join(root_dir,dir_name))
        print("The folder is created")
    except:
        print("The folder can not be created")

The folder can not be created


---

<center><h2><a id="part_3">List of Metrics for the Model Selection</a></h2></center>

---

Here you can put all the metrics you want (included in sklearn.metrics).

In [8]:
score_metrics = {'acc': accuracy_score,
               'balanced_accuracy': balanced_accuracy_score,
               'prec': precision_score,
               'recall': recall_score,
               'f1-score': f1_score,
               'tp': tp, 'tn': tn,
               'fp': fp, 'fn': fn,
               'cohens_kappa':cohen_kappa_score,
               'matthews_corrcoef':matthews_corrcoef,
               "roc_auc":roc_auc_score}

---

<center><h2><a id="part_4">Sand Box to Load Data</a></h2></center>

---

The sandbox is the working area of your data if it has not been processed before using the pipe

In [9]:
# Functions for preprocessing
def remove_upper_case( text):
    '''
    Function to transform upper string in title words
    @param text: (str) text 
    @return: (str) text without upper words 
    '''
    sentences = text.split("\n")
    new_sentences = []
    for i in sentences:
        words = text.split()
        stripped = [w.title() if w.isupper() else w for w in words]
        new_sentences.append(" ".join(stripped))
    return "\n".join(new_sentences)
  
def remove_URL( text):
    '''
    Function to remove url from text.
    @param text: (str) sentence
    @return: (str) clean text

    '''
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)
    
    
def remove_html( text):
    '''
    Function regex to clean text from html balises.
    @param text: (str) sentence 
    @return: (str) clean text 
    '''
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)
    
    

def remove_emoji( text):
    '''
    Function to remove emojis, symbols and pictograms etc from text
    @param text: (str) sentences 
    @return: (str) clean text 
    '''
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [10]:
#%%script false --no-raise-error
df = pd.read_csv("../projet_classification_mails/mails_clean_concat_ref_folders_train_2.csv", sep=";")
df_test  = pd.read_csv("../projet_classification_mails/mails_clean_concat_ref_folders_test_2.csv", sep=";")


#df = pd.read_csv("../projet_classification_mails/mails_clean_concat_ref_folders_2.csv", sep=";")
#df_test = df[(df[LABEL]=="co") | (df[LABEL]=="ft")]

df[LABEL][(df[LABEL]!="ft_notaire")  ] = "other" #& (df[LABEL]!="annulation")
df.drop_duplicates(inplace=True)
 
df_test[LABEL][(df_test[LABEL]!="ft_notaire") ] = "other" #  & (df_test[LABEL]!="annulation")
df_test.drop_duplicates(inplace=True)


#f = df[~(df[LABEL]=="ft") & ~(df[LABEL]=="co")]
print(df[LABEL].value_counts())
print(df_test[LABEL].value_counts())
print(df[TEXT].isnull().sum())
#df[TEXT][df[TEXT].isnull()] = "empty"
df.dropna(subset=[TEXT], inplace=True)
df_test.dropna(subset=[TEXT], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[LABEL][(df[LABEL]!="ft_notaire")  ] = "other" #& (df[LABEL]!="annulation")


other         20440
ft_notaire     6492
Name: label, dtype: int64
other         6488
ft_notaire    2096
Name: label, dtype: int64
0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[LABEL][(df_test[LABEL]!="ft_notaire") ] = "other" #  & (df_test[LABEL]!="annulation")


In [11]:
%%script false --no-raise-error
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz

In [12]:
def load_imdb_sentiment_analysis_dataset(data_path, seed=123):
    """Loads the IMDb movie reviews sentiment analysis dataset.

    # Arguments
        data_path: string, path to the data directory.
        seed: int, seed for randomizer.

    # Returns
        A tuple of training and validation data.
        Number of training samples: 25000
        Number of test samples: 25000
        Number of categories: 2 (0 - negative, 1 - positive)

    # References
        Mass et al., http://www.aclweb.org/anthology/P11-1015

        Download and uncompress archive from:
        http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    """
    imdb_data_path = os.path.join(data_path, 'aclImdb')

    # Load the training data
    train_texts = []
    train_labels = []
    for category in ['pos', 'neg']:
        train_path = os.path.join(imdb_data_path, 'train', category)
        for fname in tqdm(sorted(os.listdir(train_path))):
            if fname.endswith('.txt'):
                with open(os.path.join(train_path, fname)) as f:
                    train_texts.append(f.read())
                train_labels.append(0 if category == 'neg' else 1)
    print("\nTrain done\n")
    # Load the validation data.
    test_texts = []
    test_labels = []
    for category in ['pos', 'neg']:
        test_path = os.path.join(imdb_data_path, 'test', category)
        for fname in tqdm(sorted(os.listdir(test_path))):
            if fname.endswith('.txt'):
                with open(os.path.join(test_path, fname)) as f:
                    test_texts.append(f.read())
                test_labels.append(0 if category == 'neg' else 1)
    print("\nTest done\n")
    # Shuffle the training data and labels.
    random.seed(seed)
    random.shuffle(train_texts)
    random.seed(seed)
    random.shuffle(train_labels)

    return ((train_texts, np.array(train_labels)),
            (test_texts, np.array(test_labels)))

In [13]:
%%time
%%script false --no-raise-error
(x_train, y_train), (x_test, y_test) = load_imdb_sentiment_analysis_dataset("../Datasets/")

df = pd.DataFrame(data=[x_train, y_train], index=["text", "label"]).T
df = df.append(pd.DataFrame(data=[x_test, y_test], index=["text", "label"]).T)

df[TEXT] = df[TEXT].apply(remove_upper_case)
df[TEXT] = df[TEXT].apply(remove_URL)
df[TEXT] = df[TEXT].apply(remove_html)
df[TEXT] = df[TEXT].apply(remove_emoji)

print(df.head())

CPU times: user 0 ns, sys: 125 ms, total: 125 ms
Wall time: 182 ms


In [14]:
#df_test[df_test[LABEL]=="ft_notaire"].iloc[60].values

---

<center><h2><a id="part_5">Sart Pipeline</a></h2></center>

---

In [15]:
def detect_lang_google( x):
        '''
        Function to detect the language of the string
        @param x: (str) sentences of text to detect language
        @return: (str or nan) language of the sentence
        '''
        translate = Translator()
        try:
            return translate.detect(x).lang
        except:
            return np.nan

In [16]:
if lang:
    # ---- Language detection of the text
    df.loc[:,"language"] = df[TEXT].progress_apply(detect_lang_google)
    # ---- Extract most frequent language 
    language = df.language.value_counts().index.tolist()[0]
    print(f"The language most present in the dataset is {language}")
else:
    language="fr"

---

---

<center><h3><a id="part_5_1">Prepare data for ML Classic</a></h3></center>

---

In [17]:
if sample:
    df_save = df.copy()
    df = df.sample(nb_sample, random_state=42)

In [18]:
df.shape

(5000, 3)

In [19]:
# ---- Load stopwords 
if language=="fr":
    stop_word = np.loadtxt("../stopwords/stopwords-fr.txt", dtype=str)
if language=="en":
    stop_word = np.loadtxt("../stopwords/stopwords_en.txt", dtype=str)

In [20]:
def remove_stop_words( x, stop_word):
        '''
        Function to remove a list of words
        @param x : (str) text 
        @param stop_word: (list) list of stopwords to delete 
        @return: (str) new string without stopwords 
        '''
        x_new = text_to_word_sequence(x)    # tokenize text 
        x_ = []
        for i in x_new:
            if i not in stop_word:
                x_.append(i)
        return " ".join(x_)

In [None]:
df.loc[:,TEXT+"_sw"] = df.loc[:,TEXT].progress_apply(lambda x : remove_stop_words(x, stop_word))

 69%|██████▊   | 3430/5000 [00:19<00:08, 179.04it/s]

In [None]:
# clean rows which are empty after proceding of stopwords removal 
if df[TEXT+"_sw"].isnull().sum()>0:
    print("Empty text")
    df.dropna(subset=[TEXT+"_w"], inplace=True)

---

---

<center><a id="part_6"><h1>Machine Learning</h1></a></center>

---

---

In [None]:
df_test.loc[:,TEXT+"_sw"] = df_test.loc[:,TEXT].progress_apply(lambda x : remove_stop_words(x, stop_word))

In [None]:
df[LABEL].value_counts(), df_test[LABEL].value_counts()

In [None]:
# split the dataset into training and validation datasets 
# ML classic 
train_x_sw, valid_x_sw, y_train_sw, y_valid_sw = model_selection.train_test_split(df[TEXT+"_sw"], df[LABEL], random_state=42, stratify=df[LABEL].values, test_size=0.2)

# For Embeddings
train_x, valid_x, y_train, y_valid = model_selection.train_test_split(df[TEXT], df[LABEL], random_state=42, stratify=df[LABEL], test_size=0.2)


train_x_sw, valid_x_sw, y_train_sw, y_valid_sw = df[TEXT+"_sw"].values, df_test[TEXT+"_sw"].values, df[LABEL].values, df_test[LABEL].values  
train_x, valid_x, y_train, y_valid = df[TEXT].values, df_test[TEXT].values, df[LABEL].values, df_test[LABEL].values 


# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y_sw = encoder.fit_transform(y_train_sw)
valid_y_sw = encoder.transform(y_valid_sw)
train_y = encoder.transform(y_train)
valid_y = encoder.transform(y_valid)

if save_model:
    # save the model to disk
    filename = NAME_ENCODER
    pickle.dump(encoder, open(os.path.join(root_dir, dir_name, NAME_ENCODER), 'wb'))

---

<center><a id="part_6_1"><h3>Class Weights</h3></a></center>

---

In [None]:
# Compute the class weight with sklearn 
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)

In [None]:
print(*[f'Class weight: {round(i[0],4)}\tclass: {i[1]}' for i in zip(class_weights, np.unique(y_train))], sep='\n')

In [None]:
# Determined if the dataset is balanced or imbalanced 
ratio = np.min(df[LABEL].value_counts()) / np.max(df[LABEL].value_counts())
if ratio > 0.1:      # Ratio 1:10 -> limite blanced / imbalanced 
    balanced = True
    print(f"\nThe dataset is balanced (ratio={round(ratio, 3)})")
else:
    balanced = False
    print(f"\nThe dataset is imbalanced (ratio={round(ratio, 3)})")
    #from imblearn.over_sampling import ADASYN
    # put class for debalanced data 
    # in progress

---

<center><a id="part_6_2"><h3>Save Unique Labels</h3></a></center>

---

In [None]:
# Keep the unique label corresponding to their encoding correspondance
labels = df[LABEL].unique()
test=pd.DataFrame(data=np.transpose([labels,encoder.transform(labels)]), columns=["labels", "encoding"]).sort_values(by=["encoding"])
labels=test.labels.tolist()
if any([0,1]) in labels and len(labels)==2:
    labels[labels.index(0)] = "negative"
    labels[labels.index(1)] = "positive"

---

<center><a id="part_6_3"><h3>DataFrame for the results</h3></a></center>

---

In [None]:
df_results = pd.DataFrame()

---

<center><a id="part_6_4"><h3>One-Hot encoding (CountVectorizing)</h3></a></center>

---

In [None]:
%%time
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df[TEXT]+"_sw")

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x_sw)
xvalid_count =  count_vect.transform(valid_x_sw)

if save_model:
    # save the model to disk
    filename = NAME_COUNT_VECT_MODEL
    pickle.dump(count_vect, open(os.path.join(root_dir, dir_name,filename), 'wb'))

---

<center><a id="part_6_5"><h3>TF-IDF</h3></a></center>

---

In [None]:
%%time
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=10000)
tfidf_vect.fit(df[TEXT])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)
print("word level tf-idf done")
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=10000)
tfidf_vect_ngram.fit(df[TEXT])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
print("ngram level tf-idf done")
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char',  ngram_range=(2,3), max_features=10000) #token_pattern=r'\w{1,}',
tfidf_vect_ngram_chars.fit(df[TEXT])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 
print("characters level tf-idf done")

if save_model:
    # save the model tf-idf to disk
    filename = NAME_TF_IDF_MODEL
    pickle.dump(tfidf_vect, open(os.path.join(root_dir, dir_name,filename), 'wb'))

    # save the model ngram to disk
    filename = NAME_TF_IDF_NGRAM_MODEL
    pickle.dump(tfidf_vect_ngram, open(os.path.join(root_dir, dir_name,filename), 'wb'))
    
    # save the model ngram char to disk
    filename = NAME_TF_IDF_NGRAM_CHAR_MODEL
    pickle.dump(tfidf_vect_ngram_chars, open(os.path.join(root_dir, dir_name,filename), 'wb'))

---

<center><a id="part_6_6"><h3>Load Pre-Trained model fastText</h3></a></center>

---

In [None]:
%%time
if language=="fr":
    #!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz
    #!gunzip cc.fr.300.bin.gz
    pretrained = fasttext.FastText.load_model('../Pretrained-models/cc.fr.300.bin')
if language=="en":
    #!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip
    #!unzip crawl-300d-2M-subword.zip
    pretrained = fasttext.FastText.load_model('../Pretrained-models/crawl-300d-2M-subword.bin')

---

<center><a id="part_6_7"><h3>Word Embeddings</h3></a></center>

---

In [None]:
%%time 
# create a tokenizer 
token = Tokenizer()
token.fit_on_texts(df[TEXT])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=300)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=300)

# create token-embedding mapping
embedding_matrix = np.zeros((len(word_index) + 1, 300))
words = []
for word, i in tqdm(word_index.items()):
    embedding_vector = pretrained.get_word_vector(word) #embeddings_index.get(word)
    words.append(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

if save_model:
    filename = NAME_TOKEN_EMBEDDINGS
    pickle.dump(token, open(os.path.join(root_dir, dir_name,filename), 'wb'))

In [None]:
def report(clf, x, y, X_test, y_test, name='classifier', cv=5, dict_scoring=None, fit_params=None, save=save_model):
    '''
    Function create a metric report automatically with cross_validate function.
    @param clf: (model) classifier
    @param x: (list or matrix or tensor) training x data
    @param y: (list) label data 
    @param name: (string) name of the model (default classifier)
    @param cv: (int) number of fold for cross-validation (default 5)
    @param dict_scoring: (dict) dictionary of metrics and names
    @param fit_aparams: (dict) add parameters for model fitting 
    @param save: (bool) determine if the model need to be saved
    @return: (pandas.dataframe) dataframe containing all the results of the metrics 
    for each fold and the mean and std for each of them
    '''
    
    '''{'acc': accuracy_score,
               'balanced_accuracy': balanced_accuracy_score,
               'prec': precision_score,
               'recall': recall_score,
               'f1-score': f1_score,
               'tp': tp, 'tn': tn,
               'fp': fp, 'fn': fn,
               'cohens_kappa':cohen_kappa_score,
               'matthews_corrcoef':matthews_corrcoef,
               "roc_auc":roc_auc_score}'''
    
    
    if dict_scoring!=None:
        score = dict_scoring.copy() # save the original dictionary
        for i in score.keys():
            if len(set(y))>2:
                if i in ["prec", "recall", "f1-score"]:
                    score[i] = make_scorer(score[i], average = 'weighted') # make each function scorer
                elif i=="roc_auc":
                    score[i] = make_scorer(score[i], average = 'weighted', multi_class="ovo",needs_proba=True) # make each function scorer
                else:
                    score[i] = make_scorer(score[i]) # make each function scorer
                    
            else:
                score[i] = make_scorer(score[i]) # make each function scorer
            
    try:
        scores = cross_validate(clf, x, y, scoring=score,
                         cv=cv, return_train_score=False, n_jobs=-1,  fit_params=fit_params)
    except:
        scores = cross_validate(clf, x, y, scoring=score,
                         cv=cv, return_train_score=False,  fit_params=fit_params)
        
     # Train test on the overall data
    fit_start = time.time()
    _model = clf
    _model.fit(x, y)
        
    fit_end = time.time() - fit_start

    
    score_start = time.time()
    y_pred = _model.predict(X_test)#>0.5).astype(int)
    score_end = time.time() - score_start
    
    if save:
        filename= name+".sav"
        pickle.dump(_model, open(os.path.join(root_dir, dir_name,filename), 'wb'))
    # initialisation 
    index = []
    value = []
    index.append("Model")
    value.append(name)
    for i in scores:  # loop on each metric generate text and values
        if i == "estimator":
            continue
        for j in enumerate(scores[i]):
            index.append(i+"_cv"+str(j[0]+1))
            value.append(j[1])
        
        
        index.append(i+"_mean")
        value.append(np.mean(scores[i]))
        index.append(i+"_std")
        value.append(np.std(scores[i]))
    
     # add metrics averall dataset on the dictionary 
    
    for i in scores:    # compute metrics 
        if i == "fit_time":
            
            scores[i] = np.append(scores[i] ,fit_end)
            index.append(i.split("test_")[-1]+'_overall')
            value.append(fit_end)
            continue
        if i == "score_time":
            
            scores[i] = np.append(scores[i] ,score_end)
            index.append(i.split("test_")[-1]+'_overall')
            value.append(score_end)
            continue
              
        
        scores[i] = np.append(scores[i] ,score[i.split("test_")[-1]](_model, X_test, y_test))
        index.append(i.split("test_")[-1]+'_overall')
        value.append(scores[i][-1])
    
    return pd.DataFrame(data=value, index=index).T

---

<center><a id="part_6_8"><h3>Multinomial Naive Bayes</h3></a></center>

---

In [None]:
%%time
if multinomial_naive_bayes:
    df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_count,train_y_sw, xvalid_count, valid_y, name='NB_Count_Vectors', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_tfidf,train_y, xvalid_tfidf, valid_y, name='NB_WordLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram,train_y, xvalid_tfidf_ngram, valid_y, name='NB_N-Gram_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars,train_y, xvalid_tfidf_ngram_chars, valid_y, name='NB_CharLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))

---

<center><a id="part_6_9"><h3>Logistic Regression</h3></a></center>

---

In [None]:
%%time
if logistic_regression:
    df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_count,train_y_sw,xvalid_count, valid_y, name='LR_Count_Vectors', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_tfidf,train_y, xvalid_tfidf, valid_y,name='LR_WordLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_tfidf_ngram,train_y, xvalid_tfidf_ngram, valid_y,name='LR_N-Gram_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(linear_model.LogisticRegression(max_iter=1000), xtrain_tfidf_ngram_chars,train_y,xvalid_tfidf_ngram_chars, valid_y, name='LR_CharLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))

---

<center><a id="part_6_10"><h3>SVM</h3></a></center>

---

In [None]:
%%time
if svm_model:
    df_results = df_results.append(report(svm.SVC(), xtrain_count,train_y_sw,xvalid_count, valid_y, name='SVM_Count_Vectors', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(svm.SVC(), xtrain_tfidf,train_y, xvalid_tfidf, valid_y,name='SVM_WordLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(svm.SVC(), xtrain_tfidf_ngram,train_y,xvalid_tfidf_ngram,valid_y, name='SVM_N-Gram_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(svm.SVC(), xtrain_tfidf_ngram_chars,train_y,xvalid_tfidf_ngram_chars, valid_y, name='SVM_CharLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))

---

<center><a id="part_6_11"><h3>k-NN</h3></a></center>

---

In [None]:
%%time
if k_nn_model:
    df_results = df_results.append(report(KNeighborsClassifier(n_neighbors=20, weights='distance', n_jobs=-1), xtrain_count,train_y_sw,xvalid_count, valid_y, name='kNN_Count_Vectors', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(KNeighborsClassifier(n_neighbors=20, weights='distance', n_jobs=-1), xtrain_tfidf,train_y,xvalid_tfidf, valid_y, name='kNN_WordLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(KNeighborsClassifier(n_neighbors=20, weights='distance', n_jobs=-1), xtrain_tfidf_ngram,train_y, xvalid_tfidf_ngram,valid_y,name='kNN_N-Gram_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(KNeighborsClassifier(n_neighbors=20, weights='distance', n_jobs=-1), xtrain_tfidf_ngram_chars,train_y,xvalid_tfidf_ngram_chars, valid_y, name='kNN_CharLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))

---

<center><a id="part_6_12"><h3>RandomForest</h3></a></center>

---

In [None]:
%%time
if random_forest:
    df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), xtrain_count,train_y_sw, xvalid_count, valid_y,name='RF_Count_Vectors', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), xtrain_tfidf,train_y, xvalid_tfidf, valid_y,name='RF_WordLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), xtrain_tfidf_ngram,train_y, xvalid_tfidf_ngram,valid_y, name='RF_N-Gram_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(ensemble.RandomForestClassifier(bootstrap=True,min_impurity_decrease=1e-7,n_jobs=-1, random_state=42), xtrain_tfidf_ngram_chars,train_y,xvalid_tfidf_ngram_chars, valid_y, name='RF_CharLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))

---

<center><a id="part_6_13"><h3>Stochastic Gradient Descent</h3></a></center>

---

In [None]:
%%time
if sgd:
    df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), xtrain_count,train_y_sw, xvalid_count, valid_y,name='SGD_Count_Vectors', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), xtrain_tfidf,train_y,xvalid_tfidf, valid_y, name='SGD_WordLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), xtrain_tfidf_ngram,train_y, xvalid_tfidf_ngram,valid_y,name='SGD_N-Gram_Vectors', cv=CV_splits, dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,   n_iter_no_change=10, early_stopping=True, n_jobs=-1 ), xtrain_tfidf_ngram_chars,train_y, xvalid_tfidf_ngram_chars, valid_y, name='SGD_CharLevel_Vectors', cv=CV_splits, dict_scoring=score_metrics, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_6_14"><h3>Gradient Boosting</h3></a></center>

---

In [None]:
%%time
if gradient_boosting:
    df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                               validation_fraction=0.2,
                                               n_iter_no_change=10, tol=0.01,
                                               random_state=0, verbose=0 ), xtrain_count,train_y_sw, xvalid_count, valid_y,name='GB_Count_Vectors', cv=CV_splits, dict_scoring=score_metrics, save=save_model))

In [None]:
%%time
if gradient_boosting:
    df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                               validation_fraction=0.2,
                                               n_iter_no_change=10, tol=0.01,
                                               random_state=0, verbose=0 ), xtrain_tfidf,train_y, xvalid_tfidf, valid_y,name='GB_WordLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))

In [None]:
%%time
if gradient_boosting:
    df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                               validation_fraction=0.2,
                                               n_iter_no_change=10, tol=0.01,
                                               random_state=0, verbose=0 ), xtrain_tfidf_ngram,train_y, xvalid_tfidf_ngram,valid_y, name='GB_N-Gram_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))

In [None]:
%%time
if gradient_boosting:
    df_results = df_results.append(report(ensemble.GradientBoostingClassifier(n_estimators=1000,
                                               validation_fraction=0.2,
                                               n_iter_no_change=10, tol=0.01,
                                               random_state=0, verbose=0 ), xtrain_tfidf_ngram_chars,train_y,xvalid_tfidf_ngram_chars, valid_y,  name='GB_CharLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_6_15"><h3>XGBoost Classifier</h3></a></center>

---

All the XGBoost have early stopping implemented with 10 rounds

In [None]:
%%time
if xgboost_classifier:
    fit_params={'early_stopping_rounds':10,\
                         'eval_set':[(xvalid_count, valid_y_sw)]}
    if num_gpu>0:    # Config for GPU
        df_results = df_results.append(report(XGBClassifier(tree_method='gpu_hist',n_estimators=1000, subsample=0.8), xtrain_count,train_y_sw,xvalid_count, valid_y, name='XGB_Count_Vectors', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics, save=save_model))
    else:
        # run on CPU
        df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), xtrain_count,train_y_sw, xvalid_count, valid_y, name='XGB_Count_Vectors', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics, save=save_model))
    if save_results:
        df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False) 

In [None]:
%%time
if xgboost_classifier:
    fit_params={'early_stopping_rounds':10,\
                         'eval_set':[(xvalid_tfidf, valid_y)]}
    if num_gpu>0:    # Config for GPU
        df_results = df_results.append(report(XGBClassifier(tree_method='gpu_hist', n_estimators=1000, subsample=0.8), xtrain_tfidf,train_y,xvalid_tfidf, valid_y, name='XGB_WordLevel_TF-IDF', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics, save=save_model))
    else:
        df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), xtrain_tfidf,train_y,xvalid_tfidf, valid_y, name='XGB_WordLevel_TF-IDF', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics, save=save_model))
    if save_results:
        df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

In [None]:
%%time
if xgboost_classifier:
    fit_params={'early_stopping_rounds':10,\
                         'eval_set':[(xvalid_tfidf_ngram, valid_y)]}
    if num_gpu>0:    # Config for GPU
        df_results = df_results.append(report(XGBClassifier(tree_method='gpu_hist',n_estimators=1000, subsample=0.8), xtrain_tfidf_ngram,train_y,  xvalid_tfidf_ngram, valid_y, name='XGB_N-Gram_TF-IDF', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics, save=save_model))
    else:
        df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), xtrain_tfidf_ngram,train_y, xvalid_tfidf_ngram, valid_y,name='XGB_N-Gram_TF-IDF', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics, save=save_model))
    if save_results:
        df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

In [None]:
%%time
if xgboost_classifier:
    fit_params={'early_stopping_rounds':10,\
                         'eval_set':[(xvalid_tfidf_ngram_chars, valid_y)]}
    if num_gpu>0:    # Config for GPU
        df_results = df_results.append(report(XGBClassifier(tree_method='gpu_hist',n_estimators=1000, subsample=0.8), xtrain_tfidf_ngram_chars,train_y, xvalid_tfidf_ngram_chars, valid_y, name='XGB_CharLevel_TF-IDF', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics, save=save_model))
    else:
        df_results = df_results.append(report(XGBClassifier(n_estimators=1000, subsample=0.8), xtrain_tfidf_ngram_chars,train_y, xvalid_tfidf_ngram_chars, valid_y, name='XGB_CharLevel_TF-IDF', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics, save=save_model))
    if save_results:
        df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_6_16"><h3>Adaboost Classifier</h3></a></center>

---

In [None]:
%%time 
if adaboost_classifier:
    # work in progress
    df_results = df_results.append(report(AdaBoostClassifier(n_estimators=1000), xtrain_count,train_y_sw, xvalid_count, valid_y, name='Adaboost_Count_Vectors', cv=CV_splits,  dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(AdaBoostClassifier(n_estimators=1000), xtrain_tfidf,train_y,xvalid_tfidf, valid_y,name='Adaboost_WordLevel_TF-IDF', cv=CV_splits,  dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(AdaBoostClassifier(n_estimators=1000), xtrain_tfidf_ngram,train_y, xvalid_tfidf_ngram, valid_y,name='Adaboost_N-Gram_TF-IDF', cv=CV_splits,  dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(AdaBoostClassifier(n_estimators=1000), xtrain_tfidf_ngram_chars,train_y, xvalid_tfidf_ngram_chars, valid_y,name='Adaboost_CharLevel_TF-IDF', cv=CV_splits,  dict_scoring=score_metrics, save=save_model))
    

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_6_17"><h3>Catboost Classifier</h3></a></center>

---

In [None]:
%%time 
if catboost_classifier:
    # work in progress
    if num_gpu>0:  # test gpu available
        df_results = df_results.append(report(CatBoostClassifier(n_estimators=1000, early_stopping_rounds=10, task_type="GPU"), xtrain_count,train_y_sw, xvalid_count, valid_y, name='Catboost_Count_Vectors', cv=CV_splits,  dict_scoring=score_metrics))
        df_results = df_results.append(report(CatBoostClassifier(n_estimators=1000, early_stopping_rounds=10, task_type="GPU"), xtrain_tfidf,train_y,xvalid_tfidf, valid_y,name='Catboost_WordLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics))
        df_results = df_results.append(report(CatBoostClassifier(n_estimators=1000, early_stopping_rounds=10, task_type="GPU"), xtrain_tfidf_ngram,train_y, xvalid_tfidf_ngram, valid_y,name='Catboost_N-Gram_TF-IDF', cv=CV_splits,  dict_scoring=score_metrics))
        df_results = df_results.append(report(CatBoostClassifier(n_estimators=1000, early_stopping_rounds=10, task_type="GPU"), xtrain_tfidf_ngram_chars,train_y, xvalid_tfidf_ngram_chars, valid_y,name='Catboost_CharLevel_TF-IDF', cv=CV_splits,  dict_scoring=score_metrics))
    else:
        df_results = df_results.append(report(CatBoostClassifier(n_estimators=1000, early_stopping_rounds=10), xtrain_count,train_y_sw, xvalid_count, valid_y, name='Catboost_Count_Vectors', cv=CV_splits,  dict_scoring=score_metrics))
        df_results = df_results.append(report(CatBoostClassifier(n_estimators=1000, early_stopping_rounds=10), xtrain_tfidf,train_y,xvalid_tfidf, valid_y,name='Catboost_WordLevel_TF-IDF', cv=CV_splits,  dict_scoring=score_metrics))
        df_results = df_results.append(report(CatBoostClassifier(n_estimators=1000, early_stopping_rounds=10), xtrain_tfidf_ngram,train_y, xvalid_tfidf_ngram, valid_y,name='Catboost_N-Gram_TF-IDF', cv=CV_splits,  dict_scoring=score_metrics))
        df_results = df_results.append(report(CatBoostClassifier(n_estimators=1000, early_stopping_rounds=10), xtrain_tfidf_ngram_chars,train_y, xvalid_tfidf_ngram_chars, valid_y,name='Catboost_CharLevel_TF-IDF', cv=CV_splits, dict_scoring=score_metrics))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_6_18"><h3>LightGBM Classifier</h3></a></center>

---

In [None]:
%%time 
if lightgbm_classifier:
    # work in progress
    fit_params = {'early_stopping_rounds':10,'eval_set':[(xvalid_count, valid_y)]}
    if num_gpu>0:
        df_results = df_results.append(report(LGBMClassifier(n_estimators = 1000, device = "gpu"), xtrain_count,train_y_sw, xvalid_count, valid_y, name='LGM_Count_Vectors', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics))
    else:   
        df_results = df_results.append(report(LGBMClassifier(n_estimators = 1000), xtrain_count,train_y_sw, xvalid_count, valid_y, name='LGM_Count_Vectors', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics))
    
    fit_params = {'early_stopping_rounds':10,'eval_set':[(xvalid_tfidf, valid_y)]}
    
    if num_gpu>0:
        df_results = df_results.append(report(LGBMClassifier(n_estimators = 1000, device = "gpu"), xtrain_tfidf,train_y_sw, xvalid_tfidf, valid_y, name='LGM_Count_Vectors', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics))
    else:   
        df_results = df_results.append(report(LGBMClassifier(n_estimators = 1000), xtrain_tfidf,train_y_sw, xvalid_tfidf, valid_y, name='LGM_Count_Vectors', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics))
    
    
    fit_params = {'early_stopping_rounds':10,'eval_set':[(xvalid_tfidf_ngram, valid_y)]}
    
    if num_gpu>0:
        df_results = df_results.append(report(LGBMClassifier(n_estimators = 1000, device = "gpu"), xtrain_tfidf_ngram,train_y_sw, xvalid_tfidf_ngram, valid_y, name='LGM_Count_Vectors', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics))
    else:   
        df_results = df_results.append(report(LGBMClassifier(n_estimators = 1000), xtrain_tfidf_ngram,train_y_sw, xvalid_tfidf_ngram, valid_y, name='LGM_Count_Vectors', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics))
    
    fit_params = {'early_stopping_rounds':10,'eval_set':[(xvalid_tfidf_ngram_chars, valid_y)]}
    
    if num_gpu>0:
        df_results = df_results.append(report(LGBMClassifier(n_estimators = 1000, device = "gpu"), xtrain_tfidf_ngram_chars,train_y_sw, xvalid_tfidf_ngram_chars, valid_y, name='LGM_Count_Vectors', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics))
    else:   
        df_results = df_results.append(report(LGBMClassifier(n_estimators = 1000), xtrain_tfidf_ngram_chars,train_y_sw, xvalid_tfidf_ngram_chars, valid_y, name='LGM_Count_Vectors', cv=CV_splits, fit_params=fit_params, dict_scoring=score_metrics))
    

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<h2><center><a id="part_6_19">ExtraTreesClassifier</a></center></h2>

---

In [None]:
%%time 
if extratrees_classifier:
    # work in progress
    df_results = df_results.append(report(ExtraTreesClassifier(n_estimators=100,min_impurity_decrease=1e-7, random_state=42), xtrain_count,train_y_sw, xvalid_count, valid_y, name='ExtraTrees_Count_Vectors', cv=CV_splits,  dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(ExtraTreesClassifier(n_estimators=100,min_impurity_decrease=1e-7, random_state=42), xtrain_tfidf,train_y,xvalid_tfidf, valid_y,name='ExtraTrees_WordLevel_TF-IDF', cv=CV_splits,  dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(ExtraTreesClassifier(n_estimators=100,min_impurity_decrease=1e-7, random_state=42), xtrain_tfidf_ngram,train_y, xvalid_tfidf_ngram, valid_y,name='ExtraTrees_N-Gram_TF-IDF', cv=CV_splits,  dict_scoring=score_metrics, save=save_model))
    df_results = df_results.append(report(ExtraTreesClassifier(n_estimators=100,min_impurity_decrease=1e-7, random_state=42), xtrain_tfidf_ngram_chars,train_y, xvalid_tfidf_ngram_chars, valid_y,name='ExtraTrees_CharLevel_TF-IDF', cv=CV_splits,  dict_scoring=score_metrics, save=save_model))
    

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

<center><a id="part_7"><h2>Deep Learning</h2></a></center>

[<h6>Back to top</h6>](#content)

---

<h3>Cohen’s kappa</h3>

The function [cohen_kappa_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html#sklearn.metrics.cohen_kappa_score) computes [Cohen’s kappa](https://en.wikipedia.org/wiki/Cohen%27s_kappa) statistic. This measure is intended to compare labelings by different human annotators, not a classifier versus a ground truth.

The kappa score (see docstring) is a number between -1 and 1. Scores above .8 are generally considered good agreement; zero or lower means no agreement (practically random labels).

Kappa scores can be computed for binary or multiclass problems, but not for multilabel problems (except by manually computing a per-label score) and not for more than two annotators.

<h3>Balanced Accuracy</h3>

Compute the balanced accuracy

The balanced accuracy in binary and multiclass classification problems to deal with imbalanced datasets. It is defined as the average of recall obtained on each class.

The best value is 1 and the worst value is 0 when adjusted=False

---

<h3>Early Stopping, Model saving, Class weight configuration</h3>

In [None]:
class_w = {}
for i in zip(range(len(class_weights)), class_weights):
    class_w[i[0]] = i[1]

---

In [None]:
from keras import backend as K

In [None]:
def cross_validate_NN(model, X, y, X_test, y_test,name="NN", fit_params=None, scoring=None, n_splits=5, save=save_model, batch_size = 32,  use_multiprocessing=True):
    '''
    Function create a metric report automatically with cross_validate function.
    @param model: (model) neural network model
    @param X: (list or matrix or tensor) training X data
    @param y: (list) label data 
    @param X_test: (list or matrix or tensor) testing X data
    @param y_test: (list) label test data 
    @param name: (string) name of the model (default classifier)
    @param fit_aparams: (dict) add parameters for model fitting 
    @param scoring: (dict) dictionary of metrics and names
    @param n_splits: (int) number of fold for cross-validation (default 5)
    @return: (pandas.dataframe) dataframe containing all the results of the metrics 
    for each fold and the mean and std for each of them
    '''
    # ---- Parameters initialisation
    es = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='auto', patience=3)
    seed = 42
    k = 1
    np.random.seed(seed)
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    
    # Creation of list for each metric
    if scoring==None:        # create a dictionary if none is passed
        dic_scoring = {}
    if scoring!=None:        # save the dict 
        dic_score = scoring.copy()
    
    dic_score["fit_time"] = None   # initialisation for time fitting and scoring
    dic_score["score_time"] = None
    scorer = {}
    for i in dic_score.keys(): 
        scorer[i] = []
    
    index = ["Model"]
    results = [name]
    # ---- Loop on k-fold for cross-valisation
    for train, test in kfold.split(X, y):   # training NN on each fold 
        # create model
        print(f"k-fold : {k}")
        fit_start = time.time()
        _model = tf.keras.models.clone_model(model)
        if len(np.unique(y))==2: # binary
            _model.compile(optimizer='adam',
                  loss=tf.losses.BinaryCrossentropy(from_logits=True),
                  metrics=['accuracy'])
        else:  # multiclass 
            _model.compile(optimizer='adam',
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
        _model.fit(X[train], y[train],
                        epochs=1000, callbacks=[es], validation_data=(X[test], y[test]),
                         verbose=False, batch_size = batch_size,  use_multiprocessing=use_multiprocessing)
        
        fit_end = time.time() - fit_start

        score_start = time.time()
        y_pred = (_model.predict(X[test])>0.5).astype(int)
        score_end = time.time() - score_start
        #if len(set(y))>2:
        #    y_pred =np.argmax(y_pred,axis=1)
        #print(y_test[0], y_pred[0])
        if len(set(y))==2:
            print(f"Precision: {round(100*precision_score(y[test], y_pred), 3)}% , Recall: {round(100*recall_score(y[test], y_pred), 3)}%, Time \t {round(fit_end, 4)} ms")
        else: 
            print(f"Precision: {round(100*precision_score(y[test], np.argmax(y_pred,axis=1), average='weighted'), 3)}% , Recall: \
        {round(100*recall_score(y[test], np.argmax(y_pred,axis=1), average='weighted'), 3)}%, Time \t {round(fit_end, 4)} ms")
        
        
        # ---- save each metric
        for i in dic_score.keys():    # compute metrics 
            if i == "fit_time":
                scorer[i].append(fit_end)
                index.append(i+'_cv'+str(k))
                results.append(fit_end)
                continue
            if i == "score_time":
                scorer[i].append(score_end)
                index.append(i+'_cv'+str(k))
                results.append(score_end)
                continue
            
            if len(set(y))>2:
                if i in ["prec", "recall", "f1-score"]:
                    scorer[i].append(dic_score[i](y[test], np.argmax(y_pred,axis=1), average = 'weighted')) # make each function scorer

                elif i=="roc_auc":
                    scorer[i].append(dic_score[i](to_categorical(y[test]), y_pred, average = 'macro', multi_class="ovo")) # make each function scorer
                else:
                    scorer[i].append(dic_score[i]( y[test], np.argmax(y_pred,axis=1))) # make each function scorer

            else:
                scorer[i].append(dic_score[i]( y[test], y_pred)) # make each function scorer
            #scorer[i].append(dic_score[i]( y[test], y_pred))
            index.append("test_"+i+'_cv'+str(k))
            results.append(scorer[i][-1])
        K.clear_session()
        del _model
        k+=1
    
    # Train test on the overall data
    print("Overall train-test data")
    fit_start = time.time()
    _model =  tf.keras.models.clone_model(model)
    if len(np.unique(y))==2: # binary
        _model.compile(optimizer='adam',
                  loss=tf.losses.BinaryCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    else:  # multiclass 
        _model.compile(optimizer='adam',
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
        _model.fit(X[train], y[train],
                        epochs=1000, callbacks=[es], validation_data=(X[test], y[test]),
                         verbose=False)
    if save:
        check_p = tf.keras.callbacks.ModelCheckpoint(os.path.join(root_dir, dir_name, name+".h5"), save_best_only=True)
        _model.fit(X, y,epochs=1000, callbacks=[es, check_p], validation_split=0.2, batch_size = batch_size, 
                   verbose=False, use_multiprocessing=use_multiprocessing)
        
    else:
        _model.fit(X, y,epochs=1000, callbacks=[es],  validation_split=0.2, batch_size = batch_size, 
                   verbose=False, use_multiprocessing=use_multiprocessing)
        
    fit_end = time.time() - fit_start

    #_acc = _model.evaluate(X_test, y_test, verbose=0)

    score_start = time.time()
    y_pred = (_model.predict(X_test)>0.5).astype(int)
    score_end = time.time() - score_start
    #if len(set(y))>2:
    #    y_pred =np.argmax(y_pred,axis=1)
    if len(set(y))==2:
        print(f"Precision: {round(100*precision_score(y_test, y_pred), 3)}% , Recall: {round(100*recall_score(y_test, y_pred), 3)}%, Time \t {round(fit_end, 4)} ms")
    else: 
        print(f"Precision: {round(100*precision_score(y_test, np.argmax(y_pred,axis=1), average='weighted'), 3)}% , Recall: \
        {round(100*recall_score(y_test, np.argmax(y_pred,axis=1), average='weighted'), 3)}%, Time \t {round(fit_end, 4)} ms")

    # Compute mean and std for each metric
    for i in scorer: 
        
        results.append(np.mean(scorer[i]))
        results.append(np.std(scorer[i]))
        if i == "fit_time":
            index.append(i+"_mean")
            index.append(i+"_std")
            continue
        if i == "score_time":
            index.append(i+"_mean")
            index.append(i+"_std")
            continue
        
        index.append("test_"+i+"_mean")
        index.append("test_"+i+"_std")
    # add metrics averall dataset on the dictionary 
    for i in dic_score.keys():    # compute metrics 
        if i == "fit_time":
            scorer[i].append(fit_end)
            index.append(i+'_overall')
            results.append(fit_end)
            continue
        if i == "score_time":
            scorer[i].append(score_end)
            index.append(i+'_overall')
            results.append(score_end)
            continue
        
        if len(set(y))>2:
            if i in ["prec", "recall", "f1-score"]:
                scorer[i].append(dic_score[i](y_test, np.argmax(y_pred,axis=1), average = 'weighted')) # make each function scorer

            elif i=="roc_auc":
                scorer[i].append(dic_score[i](to_categorical(y_test), y_pred, average = 'weighted', multi_class="ovo")) # make each function scorer
            else:
                scorer[i].append(dic_score[i]( y_test, np.argmax(y_pred,axis=1))) # make each function scorer

        else:
            #scorer[i].append(dic_score[i]( y[test], y_pred))                             
            scorer[i].append(dic_score[i](_model, X_test, y_test))
        index.append(i+'_overall')
        results.append(scorer[i][-1])
    
            
    return pd.DataFrame(results, index=index).T

---

<center><a id="part_7_1"><h3>Shallow Neural Networks</h3></a></center>

---

In [None]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

In [None]:
def shallow_neural_networks(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a shallow neural network for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model) shallow neural network 
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 16)
    else:
        print("Pre-trained model used")
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
      embedded,
      
      keras.layers.GlobalAveragePooling1D(),
      
      keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    return model

In [None]:
%%time
if shallow_network:
    df_results = df_results.append(cross_validate_NN(shallow_neural_networks(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y,  name="Shallow_NN_WE", scoring=score_metrics, 
                                                     n_splits=CV_splits, save=save_model))

In [None]:
#df_ = pd.read_csv(NAME_SAVE_FILE+".csv", sep=";")
#df_results = df_results.append(df_)

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_7_2"><h3>Deep Neural Networks</h3></a></center>

---

In [None]:
def deep_neural_networks(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a deep neural network for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model) deep neural network 
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 50)
    else:
        print("Pre-trained model used")
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
      embedded,
      keras.layers.GlobalAveragePooling1D(),
      keras.layers.Dense(16, activation="relu"),#tf.nn.swish),
      keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    #print(model.summary())
    
    return model

In [None]:
%%time
if deep_nn:
    df_results = df_results.append(cross_validate_NN(deep_neural_networks(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y,  name="Deep_NN_WE",scoring=score_metrics, 
                                                     n_splits=CV_splits , save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

<h4>Deep Neural Networks variation 1</h4>

In [None]:
def deep_neural_networks_var1(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a deep neural network for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model) deep neural network 
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
      embedded,
      keras.layers.GlobalAveragePooling1D(),
      keras.layers.Dense(16, activation="relu"),#tf.nn.swish),
      keras.layers.Dense(16, activation="relu"),#tf.nn.swish),
      keras.layers.Dense(1  if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    #print(model.summary())
    
    return model

In [None]:
%%time
if deep_nn:
    df_results = df_results.append(cross_validate_NN(deep_neural_networks_var1(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y, name="Deep_NN_var1_WE", 
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

<h4>Deep Neural Networks variation 2</h4>

In [None]:
def deep_neural_networks_var2(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a deep neural network for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model) deep neural network 
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
      embedded,
      keras.layers.GlobalAveragePooling1D(),
      keras.layers.Dense(32, activation='relu'),
      keras.layers.Dense(16, activation='relu'),
      keras.layers.Dense(1  if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    
    return model


In [None]:
%%time
if deep_nn:
    df_results = df_results.append(cross_validate_NN(deep_neural_networks_var2(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y,  name="Deep_NN_var2_WE",
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_7_3"><h3>Recurent Neural Network (RNN)</h3></a></center>

---

In [None]:
def create_rnn_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a recurrent neural network for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model) recurrent neural network 
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        print("Pre-trained model used")
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.SimpleRNN(32, return_sequences=True, activation='tanh'),
    keras.layers.SimpleRNN(32, return_sequences=True, activation='tanh'),
    keras.layers.SimpleRNN(32, return_sequences=True, activation='tanh'),
    keras.layers.SimpleRNN(32, activation="tanh"),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

   
    #print(model.summary())
    
    return model

In [None]:
%%time
if rnn:
    df_results = df_results.append(cross_validate_NN(create_rnn_model(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y, name="RNN_WE",
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_7_4"><h3>Convolutional Neural Network (CNN)</h3></a></center>

---

In [None]:
def create_conv_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a convulational neural network for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model) convulational neural network 
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) +1, 100)
    else:
        print("Pre-trained model used")
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.Conv1D(100, 5, activation="relu"),#tf.nn.swish), # padding='same'
    keras.layers.Dropout(0.2),
    keras.layers.MaxPooling1D(pool_size=4),
    keras.layers.Conv1D(64, 5, activation="relu"),#tf.nn.swish),
    keras.layers.Dropout(0.2),
    keras.layers.MaxPooling1D(pool_size=4),
    keras.layers.Conv1D(32, 5, activation="relu"),#tf.nn.swish),
    keras.layers.Dropout(0.2),
    keras.layers.GlobalMaxPooling1D(),

    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])
    
    return model

In [None]:
%%time
if cnn:
    df_results = df_results.append(cross_validate_NN(create_conv_model(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y, name="CNN_WE",
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_7_4"><h3>Long Short Term Memory (LSTM)</h3></a></center>

---

In [None]:
def create_lstm_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a lstm for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model)lstm 
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) +1, 100)
    else:
        print("Pre-trained model used")
        embedded = keras.layers.Embedding(len(word_index)+1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.LSTM(32, activation='tanh'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    return model

In [None]:
%%time
if lstm:
    df_results = df_results.append(cross_validate_NN(create_lstm_model(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y, name="LSTM_WE",
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_7_6"><h3>CNN – LSTM</h3></a></center>

---

In [None]:
def create_cnn_lstm_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a convulational neural network lstm for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model) convulational neural network lstm
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        print("Pre-trained model used")
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.Conv1D(128, 5, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.MaxPooling1D(pool_size=4),
    keras.layers.LSTM(32, activation='tanh'),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])
    
    return model

In [None]:
%%time
if cnn_lstm:
    df_results = df_results.append(cross_validate_NN(create_cnn_lstm_model(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y, name="CNN_LSTM_WE", 
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_7_7"><h3>CNN – GRU</h3></a></center>

---

In [None]:
def create_cnn_gru_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a convulational neural network GRU for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model) convulational neural network GRU
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        print("Pre-trained model used")
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.Conv1D(128, 5, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.MaxPooling1D(pool_size=4),
    keras.layers.GRU(32, activation='tanh'),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    return model

In [None]:
%%time
if cnn_gru:
    df_results = df_results.append(cross_validate_NN(create_cnn_gru_model(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y, name="CNN_GRU_WE", 
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_7_8"><h3>Gated Recurrent Units – GRU</h3></a></center>

---

tf.keras.layers.GRU(
    units, activation='tanh', recurrent_activation='sigmoid', use_bias=True,
    kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal',
    bias_initializer='zeros', kernel_regularizer=None, recurrent_regularizer=None,
    bias_regularizer=None, activity_regularizer=None, kernel_constraint=None,
    recurrent_constraint=None, bias_constraint=None, dropout=0.0,
    recurrent_dropout=0.0, implementation=2, return_sequences=False,
    return_state=False, go_backwards=False, stateful=False, unroll=False,
    time_major=False, reset_after=True, **kwargs
)

In [None]:
def create_gru_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a GRU for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model) GRU
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        print("Pre-trained model used")
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.GRU(32, activation='tanh'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    return model

In [None]:
%%time
if gru:
    df_results = df_results.append(cross_validate_NN(create_gru_model(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y, name="GRU_WE", 
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_7_9"><h3>Bidirectional RNN</h3></a></center>

---

In [None]:
def create_bidirec_rnn_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a bidirectionnal rnn for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model) bidirectionnal rnn
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        print("Pre-trained model used")
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.Bidirectional(keras.layers.SimpleRNN(32, return_sequences=True, activation="tanh")),
    keras.layers.Bidirectional(keras.layers.SimpleRNN(32, return_sequences=True, activation="tanh")),
    keras.layers.Bidirectional(keras.layers.SimpleRNN(32, return_sequences=True, activation="tanh")),
    keras.layers.Bidirectional(keras.layers.SimpleRNN(32, activation="tanh")),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    return model

In [None]:
%%time
if bidirectional_rnn:
    df_results = df_results.append(cross_validate_NN(create_bidirec_rnn_model(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y, name="BiRNN_WE",
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_7_10"><h3>Bidirectional LSTM</h3></a></center>

---

In [None]:
def create_bidirec_lstm_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a bidirectionnal lstm for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model) bidirectionnal lstm
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.Bidirectional(keras.layers.LSTM(32, activation="tanh")),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    return model

In [None]:
%%time
if bidirectional_lstm:
    df_results = df_results.append(cross_validate_NN(create_bidirec_lstm_model(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y, name="BiLSTM_WE",
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_7_11"><h3>Bidirectional GRU</h3></a></center>

---

In [None]:
def create_bidirec_gru_model(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a bidirectionnal gru for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model) bidirectionnal gru
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.Bidirectional(keras.layers.GRU(32, activation="tanh")),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    
    
    return model

In [None]:
%%time
if bidirectional_gru:
    df_results = df_results.append(cross_validate_NN(create_bidirec_gru_model(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y, name="BiGRU_WE",
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_7_12"><h3>Recurrent Convolutional Neural Network (RCNN)</h3></a></center>

---

In [None]:
def create_rcnn(X, word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a rcnn for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model)  rcnn
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300,input_length=X.shape[1], weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.SpatialDropout1D(0.3),
    keras.layers.Bidirectional(keras.layers.GRU(32,return_sequences=True, activation="tanh")),
    keras.layers.Convolution1D(32, 3, activation="tanh"),
    keras.layers.GlobalMaxPool1D(),
    keras.layers.Dense(25, activation="relu"),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    
    
    return model

In [None]:
%%time
if rcnn:
    df_results = df_results.append(cross_validate_NN(create_rcnn(train_seq_x, word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y, name="RCNN_WE",
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<h3>Recurrent Convolutional Neural Network variation 1</h3>

---

In [None]:
def create_rcnn_var1(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a rcnn for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model)  rcnn
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.SpatialDropout1D(0.3),
    keras.layers.Bidirectional(keras.layers.LSTM(32,return_sequences=True, activation="tanh")),
    keras.layers.Convolution1D(32, 3, activation="relu"),
    keras.layers.GlobalMaxPool1D(),
    keras.layers.Dense(25, activation="relu"),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    
    return model

In [None]:
%%time
if rcnn:
    df_results = df_results.append(cross_validate_NN(create_rcnn_var1(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y, name="RCNN_var1_WE",
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<h3>Recurrent Convulational Neural Network variation 2</h3>

---

In [None]:
def create_rcnn_var2(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    '''
    Function to generate a rcnn for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model)  rcnn
    '''
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.SpatialDropout1D(0.3),
    keras.layers.Bidirectional(keras.layers.GRU(32,return_sequences=True, activation="tanh")),
    keras.layers.Bidirectional(keras.layers.GRU(32,return_sequences=True, activation="tanh")),
    keras.layers.Convolution1D(32, 3, activation="relu"),
    keras.layers.GlobalMaxPool1D(),
    keras.layers.Dense(25, activation="relu"),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    
    return model

In [None]:
%%time
if rcnn:
    df_results = df_results.append(cross_validate_NN(create_rcnn_var2(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y, name="RCNN_var2_WE",
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<h3>Recurrent Convulational Neural Network variation 3</h3>

---

In [None]:
def create_rcnn_var3(word_index, label=labels, embedding_matrix=embedding_matrix, pre_trained=False):
    if pre_trained==False:
        embedded = keras.layers.Embedding(len(word_index) + 1, 100)
    else:
        embedded = keras.layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)
    
    model = keras.Sequential([
    embedded,
    keras.layers.SpatialDropout1D(0.3),
    keras.layers.Bidirectional(keras.layers.GRU(32,return_sequences=True, activation="tanh")),
    keras.layers.Bidirectional(keras.layers.LSTM(32,return_sequences=True, activation="tanh")),
    keras.layers.Convolution1D(32, 3, activation="relu"),
    keras.layers.GlobalMaxPool1D(),
    keras.layers.Dense(25, activation="relu"),
    keras.layers.Dropout(0.25),
    keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")])

    return model

In [None]:
%%time
if rcnn:
    df_results = df_results.append(cross_validate_NN(create_rcnn_var3(word_index, pre_trained=pre_trained), 
                                                     train_seq_x, train_y, valid_seq_x, valid_y,name="RCNN_var3_WE",
                                                     scoring=score_metrics, n_splits=CV_splits, save=save_model))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_7_13"><h3>Transformers</h3></a></center>

Tutorial available on Keras documentation, code example written by Apoorv Nandan (<a href="https://keras.io/examples/nlp/text_classification_with_transformer/">source: keras.io</a>)

---

In [None]:
from tensorflow.keras import layers

In [None]:
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def get_config(self):

        config = super(MultiHeadSelfAttention, self).get_config().copy()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'projection_dim': self.projection_dim,
            'query_dense': self.query_dense,
            'key_dense': self.key_dense,
            'value_dense': self.value_dense,
            'combine_heads':self.combine_heads
        })
        return config
        
        
    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)
    

In [None]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation=tf.nn.swish), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
    

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):

        config = super(TokenAndPositionEmbedding, self).get_config().copy()
        config.update({
            'token_emb': self.token_emb,
            'pos_emb': self.pos_emb,
           
        })
        return config
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [None]:
def transformers_classifier(word_index, label=labels):
    '''
    Function to generate a rcnn for binary or multiclass classification.
    @param word_index: (matrix) unique token in corpus
    @param label: (list) list of labels to determine if it,s a binary or multiclass
    @param embedding_matrix: (matrix) matrix of integer for each word in the 
    @param pre_trained: (bool) determine if the model will use pretrained model
    @return: (model)  rcnn
    '''
    embed_dim = 32  # Embedding size for each token
    num_heads = 2  # Number of attention heads
    ff_dim = 32  # Hidden layer size in feed forward network inside transformer
    vocab_size = len(word_index)+1
    maxlen = 300
    inputs = layers.Input(shape=(maxlen,))
    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(20, activation="relu")(x) #tf.nn.swish
    x = layers.Dropout(0.1)(x)
    #outputs = layers.Dense(1, activation="sigmoid")(x)
    
    outputs = keras.layers.Dense(1 if len(label)<=2 else len(label), activation='sigmoid' if len(label)<=2 else "softmax")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    
    
    return model


In [None]:
if transformers:
    
    df_results = df_results.append(cross_validate_NN(transformers_classifier(word_index, label=labels), 
                                                     train_seq_x[:25], train_y[:25], valid_seq_x, valid_y,name="transformers",
                                                     scoring=score_metrics, n_splits=CV_splits, save=False))

In [None]:
if save_results:
    df_results.to_csv(NAME_SAVE_FILE+".csv", sep=";", index=False)

---

<center><a id="part_8"><h2>Results</h2></a></center>

---

In [None]:
df_results[[ "Model","test_acc_mean","test_acc_std", "acc_overall",
                       "test_prec_mean", "test_prec_std", "prec_overall",
                        "test_recall_mean","test_recall_std", "recall_overall",
                       "test_f1-score_mean", "test_f1-score_std", "f1-score_overall",
                       "test_cohens_kappa_mean", "test_cohens_kappa_std", "cohens_kappa_overall",
                       "test_matthews_corrcoef_mean","test_matthews_corrcoef_std", "matthews_corrcoef_overall",
                       "test_roc_auc_mean", "test_roc_auc_std","roc_auc_overall"]].sort_values(
    by=["matthews_corrcoef_overall", "recall_overall"], ascending=False)[:25]

---

<center><a id="part_9"><h2>Visualization</h2></a></center>

---

In [None]:
df_=df_[df_results["matthews_corrcoef_overall"]>0.85]

In [None]:
fig, ax = plt.subplots(figsize=(25,25))
#plt.axis([0.85,1,0.85,1])
ax.scatter(df_["test_matthews_corrcoef_mean"], df_["test_acc_mean"])

for i, txt in enumerate(df_["Model"]):
    ax.annotate(txt, (df_["test_matthews_corrcoef_mean"].iloc[i], df_["test_acc_mean"].iloc[i]))

plt.xlabel("Matthews Corrcoef")
plt.ylabel("Recall")

plt.grid(True)
