# AI Challenge Notebook

### Team name: France-INSA/ENSEEIHT/VALDOM-UPENDO
### Team Members: 
* GHOMSI KONGA Serge
* KEITA Alfousseyni
* RIDA Moumni
* SANOU Désiré
* WAFFA PAGOU Brondon

## Introduction

For this project, we tested several models, including BERT, Logistic Regression, Embedding, lstm-gru-cnn-glove and SVC. With individual models, we didn't get the accuracy we were expecting. Therefore, we chose the ones giving the best accuracies (i.e. BERT, SVC and lstm-gru-cnn-glove) and performed majority voting on their predictions. <br> BERT had the best accuracy among the three models, so we gave it priority when all three predictions are different. <br>
In this notebook are the implementations of the three models, including preprocessing, and the majority voting code.

#  Data Analysis and Preprocessing

## Imports

In [None]:
import pandas as pd
from collections import Counter

In [None]:
import unicodedata 
import time
import pandas as pd
import numpy as np
import random
import nltk
import re 
import collections
import itertools
import pickle
import warnings
from tqdm import tqdm
import plotly.offline as pof
import plotly.graph_objects as go
warnings.filterwarnings("ignore")
import sklearn.metrics as smet
import matplotlib.pyplot as plt
import seaborn as sb
from scipy import sparse
from sklearn.metrics import classification_report
sb.set_style("whitegrid")
import sklearn.model_selection as sms
!pip install git+https://github.com/abhishekkrthakur/tez.git
!pip install transformers==3.5.0

# BERT Implementation

In [None]:
# Mount Google Drive under /content/drive; the data paths used later live below that mount point.
from google.colab import drive
drive.mount('/content/drive')

## Load Data

In [None]:
import pandas as pd
import pickle

# Folder on the mounted Drive that contains the challenge files.
DATA_PATH = "/content/drive/MyDrive/AI_DEFI/defi-ia-insa-toulouse"
# Train/test records are read from JSON, the training labels from CSV.
train_df = pd.read_json(DATA_PATH+"/train.json")
test_df = pd.read_json(DATA_PATH+"/test.json")
train_label = pd.read_csv(DATA_PATH+"/train_label.csv")
# presumably the category-id -> category-name table; verify against the file
categories_string = pd.read_csv(DATA_PATH+"/categories_string.csv")

In [None]:
import nltk
# Fetch the NLTK stop-word corpus that CleanText.__init__ reads.
nltk.download("stopwords")

## Text Preprocessing

In [None]:
import re
import unicodedata
from string import digits
from bs4 import BeautifulSoup #Nettoyage d'HTML

# Alias of string.digits; nothing in the visible notebook reads it.
digits_list = digits

class CleanText:
    """Text-normalisation steps used to build the cleaned description column.

    Every step is its own method so callers can chain them in any order.
    The instance holds the English stop-word set and a Snowball stemmer.
    """

    def __init__(self):
        # English stop words, accent-stripped so they compare equal to text
        # that already went through remove_accent().
        english_stopwords = nltk.corpus.stopwords.words('english')
        # A set gives O(1) membership tests in remove_stopwords(), which is
        # called once per token of every description.
        self.stopwords = {self.remove_accent(sw) for sw in english_stopwords}

        self.stemmer = nltk.stem.SnowballStemmer('english')

    @staticmethod
    def remove_html_code(txt):
        """Return the text content of *txt* with HTML markup removed."""
        # NOTE(review): from_encoding only matters for byte input; confirm
        # whether descriptions are ever passed as bytes.
        txt = BeautifulSoup(txt, "html.parser", from_encoding='utf-8').get_text()
        return txt

    @staticmethod
    def convert_text_to_lower_case(txt):
        """Return *txt* lower-cased."""
        return txt.lower()

    @staticmethod
    def remove_accent(txt):
        """Return *txt* with accents stripped and non-ASCII characters dropped."""
        # NFD splits accented letters into base letter + combining mark; the
        # ASCII round-trip then discards the marks.
        return unicodedata.normalize('NFD', txt).encode('ascii', 'ignore').decode("utf-8")

    @staticmethod
    def remove_non_letters(txt):
        """Replace every character other than ``a-z`` and ``_`` with a space.

        Upper-case letters are replaced too, so lower-case the text first.
        """
        return re.sub(r'[^a-z_]', ' ', txt)

    def remove_stopwords(self, txt):
        """Split *txt* on whitespace and return the tokens that are not stop words."""
        return [w for w in txt.split() if w not in self.stopwords]

    def get_stem(self, tokens):
        """Return the Snowball stem of every token, in order."""
        return [self.stemmer.stem(token) for token in tokens]

In [None]:
cleaner = CleanText()


def apply_all_transformation(txt):
    """Run the full cleaning pipeline on one raw description.

    Steps, in order: strip HTML, lower-case, strip accents, blank out
    non-letters, drop stop words, stem. Returns the list of stemmed tokens.
    """
    text = cleaner.remove_html_code(txt)
    # The string-to-string steps are applied in sequence.
    for step in (
        cleaner.convert_text_to_lower_case,
        cleaner.remove_accent,
        cleaner.remove_non_letters,
    ):
        text = step(text)
    # From here on we work on a token list rather than a string.
    tokens = cleaner.remove_stopwords(text)
    return cleaner.get_stem(tokens)

In [None]:
def clean_df_column(dataset, column, cleaned_column):
    """Store a cleaned version of ``dataset[column]`` in ``dataset[cleaned_column]``.

    Each value is run through apply_all_transformation and the resulting
    tokens are joined with single spaces. The frame is modified in place;
    nothing is returned.
    """
    cleaned_rows = []
    for raw_text in tqdm(dataset[str(column)]):
        tokens = apply_all_transformation(raw_text)
        cleaned_rows.append(" ".join(tokens))
    dataset[str(cleaned_column)] = cleaned_rows


clean_df_column(train_df, column='description', cleaned_column='description_cleaned')
clean_df_column(test_df, column='description', cleaned_column='description_cleaned')

## Train Model

In [None]:
import pandas as pd
import tez
import torch
import torch.nn as nn
import transformers
from sklearn import metrics, model_selection, preprocessing
from transformers import AdamW, get_linear_schedule_with_warmup


class BERTDataset:
    """Indexable dataset that tokenises one text per item for a BERT model.

    Args:
        text: indexable sequence of texts.
        target: indexable sequence of labels, aligned with ``text``.
        max_len: length every example is padded or truncated to. Defaults to
            64, the value that used to be hard-coded.
        model_name: tokenizer checkpoint to load. Defaults to
            ``"bert-base-uncased"``, the value that used to be hard-coded.
    """

    def __init__(self, text, target, max_len=64, model_name="bert-base-uncased"):
        self.text = text
        self.target = target
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            model_name, do_lower_case=True
        )
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, item):
        """Return the model inputs and target for example ``item`` as long tensors."""
        # Collapse any run of whitespace into a single space before tokenising.
        text = " ".join(str(self.text[item]).split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,  # no second sentence
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.long),
        }


class BERTBaseUncased(tez.Model):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_train_steps: total number of scheduler steps, used as the horizon
            of the linear learning-rate schedule.
        num_classes: number of output classes.
    """

    def __init__(self, num_train_steps, num_classes):
        super().__init__()
        # Not used by the model itself; kept because the attribute is part of
        # the existing class surface.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.bert = transformers.BertModel.from_pretrained("bert-base-uncased")
        self.bert_drop = nn.Dropout(0.3)
        # Read the width from the loaded config instead of hard-coding 768.
        self.out = nn.Linear(self.bert.config.hidden_size, num_classes)

        self.num_train_steps = num_train_steps
        self.step_scheduler_after = "batch"

    def fetch_optimizer(self):
        """Return AdamW with weight decay disabled for biases and LayerNorm weights."""
        param_optimizer = list(self.named_parameters())
        # "bias" already matches every bias (LayerNorm biases included), so
        # the second entry has to name the LayerNorm *weights* to exempt them.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.001,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        opt = AdamW(optimizer_parameters, lr=3e-5)
        return opt

    def fetch_scheduler(self):
        """Return a linear decay schedule without warm-up over ``num_train_steps``."""
        sch = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=0, num_training_steps=self.num_train_steps
        )
        return sch

    def loss(self, outputs, targets):
        """Return the cross-entropy loss, or None when no targets are given."""
        if targets is None:
            return None
        return nn.CrossEntropyLoss()(outputs, targets)

    def monitor_metrics(self, outputs, targets):
        """Return ``{"accuracy": ...}`` for the batch, or ``{}`` without targets."""
        if targets is None:
            return {}
        outputs = torch.argmax(outputs, dim=1).cpu().detach().numpy()
        targets = targets.cpu().detach().numpy()
        accuracy = metrics.accuracy_score(targets, outputs)
        return {"accuracy": accuracy}

    def forward(self, ids, mask, token_type_ids, targets=None):
        """Return ``(logits, loss, metrics)`` for one batch."""
        # return_dict=False guarantees a tuple, so the unpacking below keeps
        # working on transformers versions whose default output is a
        # dict-like object.
        _, o_2 = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        b_o = self.bert_drop(o_2)
        output = self.out(b_o)
        loss = self.loss(output, targets)
        acc = self.monitor_metrics(output, targets)
        return output, loss, acc

In [None]:
# Fit a label encoder on the training categories.
# NOTE(review): `category` is not read by the later cells shown here; they
# pass train_label.Category.values directly.
lbl_enc = preprocessing.LabelEncoder()
category = lbl_enc.fit_transform(train_label.Category.values)

### Split dataset

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the cleaned training descriptions, with a fixed seed.
# The held-out part is used both for validation during fit and for the test report.
X_train, X_test, y_train, y_test = train_test_split(train_df.description_cleaned.values,train_label.Category.values, test_size=0.20, random_state=42)

### Training

In [None]:
# Single source of truth for the values shared by the scheduler and fit().
TRAIN_BATCH_SIZE = 32
EPOCHS = 3

train_dataset = BERTDataset(text=X_train, target=y_train)
valid_dataset = BERTDataset(text=X_test, target=y_test)
# The scheduler is stepped after every batch, so its horizon must be
# batches-per-epoch x the number of epochs actually run. It was previously
# computed for 10 epochs while fit() ran 3, so the learning rate never
# completed its decay.
n_train_steps = int(len(X_train) / TRAIN_BATCH_SIZE * EPOCHS)
model = BERTBaseUncased(num_train_steps=n_train_steps, num_classes=train_label.Category.nunique())

tb_logger = tez.callbacks.TensorBoardLogger(log_dir=".logs/")
es = tez.callbacks.EarlyStopping(monitor="valid_loss", model_path="model.bin")
model.fit(
    train_dataset,
    valid_dataset=valid_dataset,
    train_bs=TRAIN_BATCH_SIZE,
    device="cuda",
    epochs=EPOCHS,
    callbacks=[tb_logger, es],
    fp16=True,
)
# NOTE(review): this is the same path that was given to EarlyStopping; if
# that callback keeps its best checkpoint there, it is overwritten by the
# final weights. Confirm this is intended.
model.save("model.bin")

### Test

In [None]:
# Targets are unused at inference time, so a zero vector stands in for them.
test_dataset = BERTDataset(text=X_test, target=np.zeros(len(X_test), dtype=int))
pred = model.predict(test_dataset, device="cuda")

# Materialise the per-batch outputs, take the arg-max class of every row,
# then flatten the batches into one list of predicted labels.
res = list(pred)
results = [torch.from_numpy(batch).argmax(dim=1).numpy() for batch in res]
predict_test = [label for batch in results for label in batch]

print(classification_report(y_test, predict_test))

### Prediction

In [None]:
# Zero-filled placeholder targets: BERTDataset requires a target per text.
tar =  np.full((len(test_df.description_cleaned.values), ), 0)
test_dataset = BERTDataset(text=test_df.description_cleaned.values, target= tar)
# Run inference on the challenge test set; the result is consumed batch by batch in the next cell.
pred = model.predict(test_dataset, device="cuda")

In [None]:
# Collect every output batch produced by model.predict.
res = list(pred)

# Arg-max over the class axis gives one predicted label per row.
results_pred = [torch.from_numpy(batch).argmax(dim=1).numpy() for batch in res]

# Flatten the per-batch arrays into a single list in test-set order.
predict_csv = [label for batch in results_pred for label in batch]

In [None]:
# Attach the BERT labels to the test frame and keep an (Id, Category) copy;
# test_df["Category"] is overwritten later by the SVC predictions.
test_df["Category"] = predict_csv
bert_pred = test_df[["Id","Category"]]


# SVC Implementation

In [None]:
# Download the NLTK "punkt" tokenizer data.
nltk.download('punkt')

In [None]:
# CountVectorizer is used below but was never imported, which raised a
# NameError when this cell ran.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
# The vocabulary is learned from the train and test descriptions together.
test =np.append(train_df["description_cleaned"].values,test_df["description_cleaned"].values)

# NOTE: despite the TF_IDF name, this is a plain count vectoriser over
# unigrams and bigrams; the variable names are kept for the cells below.
TF_IDF = CountVectorizer(ngram_range=(1,2))
TF_IDF.fit(test)
train_df_TFIDF = TF_IDF.transform(train_df["description_cleaned"].values)
test_df_TFIDF = TF_IDF.transform(test_df["description_cleaned"].values)


## Split dataset

In [None]:
# 80/20 split of the vectorised training data, fixed seed; rebinds the names used by the BERT section.
X_train, X_test, y_train, y_test = train_test_split(train_df_TFIDF, train_label.Category.values, test_size=0.2, random_state=42)

## Train Model

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

# Linear SVM with default hyper-parameters, fitted on the training split.
svc = LinearSVC()
svc.fit(X_train, y_train)

## Test

In [None]:
# Evaluate on the held-out split and print per-class metrics to 3 decimals.
y_test_pred =svc.predict(X_test)
print(classification_report(y_test, y_test_pred, digits=3))

## Prediction

In [None]:
# Overwrite the Category column with the SVC labels and keep an (Id, Category) copy for the vote.
test_df["Category"] = svc.predict(test_df_TFIDF)
svc_pred = test_df[["Id","Category"]]

# LSTM-GRU-CNN-Glove Implementation

## Load Libraries

In [None]:
from sklearn.model_selection import train_test_split
import sys, os, re, csv, codecs, numpy as np, pandas as pd
np.random.seed(13)
os.environ["OMP_NUM_THREADS"] = "4"
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer

import logging
from keras.callbacks import Callback

## Load Data

In [None]:
DATA_PATH = "/content/drive/MyDrive/AI_DEFI/defi-ia-insa-toulouse"
data_df = pd.read_json(DATA_PATH+"/train.json")
data_df = data_df.set_index('Id',drop=False)
data_df.index.name = None

test_df = pd.read_json(DATA_PATH+"/test.json")
test_df = test_df.set_index('Id',drop=False)
test_df.index.name = None

data_label = pd.read_csv(DATA_PATH+"/train_label.csv")
categories_string = pd.read_csv(DATA_PATH+"/categories_string.csv")

! mkdir data_glove
! wget -P data_glove https://drive.google.com/file/d/1QwQs-kS1HtH_QZ_k5Mf-chvLBHDTOysR/view?usp=sharing
#embedding_path = DATA_PATH+"/fasttext-crawl-300d-2m/crawl-300d-2M.vec"
embedding_path = data_glove+"/glove840b300dtxt/glove.840B.300d.txt"


embed_size = 300
max_features = 130000
max_len = 220


## Text Processing



In [None]:
# Two successive 80/20 splits with the same seed: first set aside the validation part,
# then split the remainder into train and test (64% / 16% / 20% overall).
X_train_test, X_valid, Y_train_test, Y_valid = train_test_split(data_df, data_label, test_size=0.2, random_state=13)

X_train, X_test, Y_train, Y_test = train_test_split(X_train_test, Y_train_test, test_size=0.2, random_state=13)


# Lower-cased raw descriptions for each split and for the challenge test set.
raw_text_train = X_train["description"].str.lower()
raw_text_valid = X_valid["description"].str.lower()
raw_text_test = X_test["description"].str.lower()
raw_text_df_test = test_df["description"].str.lower()

# The tokenizer vocabulary is fitted on the training split only and capped at max_features.
tk = Tokenizer(num_words = max_features, lower = True)
tk.fit_on_texts(raw_text_train)
X_train["description_seq"] = tk.texts_to_sequences(raw_text_train)
X_valid["description_seq"] = tk.texts_to_sequences(raw_text_valid)
X_test["description_seq"] = tk.texts_to_sequences(raw_text_test)
test_df["description_seq"] = tk.texts_to_sequences(raw_text_df_test)

# From here on the X_* names hold padded index arrays, not DataFrames.
X_train = pad_sequences(X_train.description_seq, maxlen = max_len)
X_valid = pad_sequences(X_valid.description_seq, maxlen = max_len)
X_test = pad_sequences(X_test.description_seq, maxlen = max_len)
# `test` is rebound here; it previously held the concatenated texts used to fit the vectoriser.
test = pad_sequences(test_df.description_seq, maxlen = max_len)


# Keep only the label values as arrays.
Y_train = Y_train.Category.values
Y_valid = Y_valid.Category.values
Y_test = Y_test.Category.values

## Utilities functions and class

In [None]:
def get_coefs(word, *arr):
    """Return ``(word, vector)`` where *vector* is ``arr`` as a float32 array.

    Meant to be called with the fields of one embedding-file line unpacked.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector



class RocAucEvaluation(Callback):
    """Keras callback that prints the macro F1 score on a validation set.

    The class name is kept for compatibility with existing callers; the
    metric it computes is macro F1, not ROC AUC.

    Args:
        validation_data: ``(X_val, y_val)`` pair. Anything that does not
            unpack into exactly two items raises ``ValueError``.
        interval: evaluate on every ``interval``-th epoch.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) started the lookup *after* Callback and so
        # skipped Callback.__init__; the plain form runs it.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Print the macro F1 on the validation data for qualifying epochs."""
        # Imported here because the notebook only imports f1_score in a cell
        # that runs after training, which made this method raise NameError.
        from sklearn.metrics import f1_score

        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            y_pred = np.argmax(y_pred, axis=1)

            f1_score_macro = f1_score(self.y_val, y_pred, average="macro")
            print("\n f1_score_macro - epoch: {:d} - f1_score_macro: {:.6f}".format(epoch+1, f1_score_macro))

## Training

In [None]:
def get_coefs(word,*arr):
    """Return ``(word, vector)`` with *vector* the remaining fields as float32."""
    return word, np.asarray(arr, dtype='float32')

# The context manager closes the embeddings file once it has been read;
# the encoding is explicit so the result does not depend on the locale.
with open(embedding_path, encoding="utf-8") as embedding_file:
    embedding_index = dict(get_coefs(*o.strip().split(" ")) for o in embedding_file)

word_index = tk.word_index
nb_words = min(max_features, len(word_index))
# Tokenizer indices start at 1, so a vocabulary smaller than max_features
# contains an index equal to nb_words, one past the end of an
# (nb_words, embed_size) matrix. Allocating max_features rows avoids that
# IndexError and matches Embedding(max_features, ...) in build_model.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embedding_index.get(word)
    # Words missing from the embeddings keep their all-zero row.
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [None]:
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D

# Checkpoint file written during training; build_model reloads the model from it.
file_path = "best_model.hdf5"
# Save only when the validation loss reaches a new minimum.
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                              save_best_only = True, mode = "min")
# Prints the macro F1 on the validation split at the end of every epoch.
ra_val = RocAucEvaluation(validation_data=(X_valid, Y_valid), interval = 1)
# Stop training once val_loss has not improved for 5 epochs.
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 5)

def build_model(lr = 0.0, lr_d = 0.0, units = 0, dr = 0.0,epochs=3, num_classes=28):
    """Build, train and return the GRU/LSTM + Conv1D classifier.

    A frozen embedding layer feeds two parallel branches (bidirectional GRU
    and bidirectional LSTM, each followed by a Conv1D). The average- and
    max-pooled outputs of both branches are concatenated and classified.
    The model is trained on the module-level ``X_train``/``Y_train`` with
    ``X_valid``/``Y_valid`` as validation data, then reloaded from
    ``file_path`` so the returned model is the checkpointed one.

    Args:
        lr: Adam learning rate.
        lr_d: Adam learning-rate decay.
        units: hidden units per direction in the GRU and LSTM layers.
        dr: spatial dropout rate applied to the embeddings.
        epochs: maximum number of training epochs.
        num_classes: size of the output layer; defaults to 28, the value
            that used to be hard-coded.

    Returns:
        The model loaded from ``file_path`` after training.
    """
    inp = Input(shape = (max_len,))
    x = Embedding(max_features, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    x1 = SpatialDropout1D(dr)(x)

    x = Bidirectional(GRU(units, return_sequences = True))(x1)
    x = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(x)

    y = Bidirectional(LSTM(units, return_sequences = True))(x1)
    y = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(y)

    avg_pool1 = GlobalAveragePooling1D()(x)
    max_pool1 = GlobalMaxPooling1D()(x)

    avg_pool2 = GlobalAveragePooling1D()(y)
    max_pool2 = GlobalMaxPooling1D()(y)

    x = concatenate([avg_pool1, max_pool1, avg_pool2, max_pool2])

    # The classes are mutually exclusive and the loss expects a probability
    # distribution over them, so the head is softmax; independent sigmoids
    # do not sum to one.
    x = Dense(num_classes, activation = "softmax")(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss = "sparse_categorical_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    model.fit(X_train, Y_train, batch_size = 128, epochs = epochs, validation_data = (X_valid, Y_valid),
              verbose = 1, callbacks = [ra_val, check_point, early_stop])
    # Return the checkpointed model rather than the weights of the last epoch.
    model = load_model(file_path)
    return model

In [None]:
# Train the recurrent model; this rebinds `model`, which previously held the BERT model.
model = build_model(lr = 1e-3, lr_d = 0, units = 128, dr = 0.2,epochs=4)


## Test

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score,accuracy_score

# Predicted class = index of the highest score in each output row.
y_pred = np.argmax(model.predict(X_test), axis=1)

# Print accuracy first, then the macro-averaged precision, recall and F1.
print("accuracy :", accuracy_score(Y_test, y_pred))
for label, scorer in (
    ("precision :", precision_score),
    ("recall :", recall_score),
    ("f1_score :", f1_score),
):
    print(label, scorer(Y_test, y_pred , average="macro"))

## Prediction

In [None]:
# Score the padded challenge test sequences and keep the arg-max class per row.
y_pred_test = model.predict(test)
lstm_gru_cnn_glove_pred = np.argmax(y_pred_test , axis=1)


# Final Majority Voting Implementation

### Function implementation

In [None]:
def GetKey(dictA, val):
    """Return the first key of *dictA* whose value equals *val*.

    When no value matches, the string "key doesn't exist" is returned.
    """
    matches = (key for key, value in dictA.items() if val == value)
    return next(matches, "key doesn't exist")


def make_final_prediction(prior_prediction, prediction2, prediction3):
    """Combine three aligned prediction sequences by majority vote.

    For each position, the label chosen by at least two models wins. When all
    three labels differ, the label from *prior_prediction* is used. Returns a
    list with one label per position; iteration stops at the shortest input.
    """
    final_votes = []
    for votes in zip(prior_prediction, prediction2, prediction3):
        tally = Counter(votes)
        if len(tally) == len(votes):
            # Complete disagreement: fall back to the prioritised model.
            final_votes.append(votes[0])
            continue
        # At least two models agree, so the highest count identifies the winner.
        top_count = max(tally.values())
        final_votes.append(GetKey(tally, top_count))
    return final_votes

### Function call

In [None]:
# BERT is passed first, so its label is the one kept when all three models disagree.
predictions=make_final_prediction(bert_pred["Category"], svc_pred["Category"] , lstm_gru_cnn_glove_pred)

# Displays the number of combined predictions.
len(predictions)

In [None]:
import os

# Attach the voted labels and keep only the two columns written to the CSV.
test_df["Category"] = predictions
predictions_csv = test_df[["Id","Category"]]
output_path = "data/predictions/final_prediction.csv"
# to_csv does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
predictions_csv.to_csv(output_path, index=False)

# Conclusion

On kaggle, the final accuracies we had were:
* Public score:  0.78049
* Private score: 0.78013

Based on these results, we can conclude that our final model wasn't overfitted to the test data.
This challenge helped us explore data preprocessing techniques and different models, and gain a better knowledge of their algorithms.