In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler,ReduceLROnPlateau
import seaborn as sns
import transformers

import nltk
import re


from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases dropped the old names, so fall back when 'seaborn' is unavailable.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
# Show up to 1000 characters per cell so long descriptions are not cut off.
pd.set_option('display.max_colwidth',1000)

In [None]:
# Report the TensorFlow version and the GPUs TensorFlow can see.
print(tf.__version__, tf.config.list_physical_devices('GPU'), sep='\n')

# Data import

In [None]:
# Input and output locations.
DATA_PATH = "/kaggle/input/defi-ia-insa-toulouse"
OUTPUT_PATH = "/kaggle/working"

# Train/test records are read from JSON, the training labels from a separate CSV.
train_df, test_df = (pd.read_json(f"{DATA_PATH}/{split}.json") for split in ("train", "test"))
train_label = pd.read_csv(f"{DATA_PATH}/train_label.csv")

# Data Observation

In [None]:
# Number of missing values in each column
train_df.isnull().sum()

In [None]:
# How many training rows fall into each target category
train_label.Category.value_counts()

In [None]:
# Peek at the first raw descriptions
train_df["description"].head()

# Data Preprocessing

## Cleaning Process

In [None]:
from bs4 import BeautifulSoup
import nltk

In [None]:
# Lower-cased copy of the raw description; every later cleaning step works on this column.
for frame in (train_df, test_df):
    frame["description_modified"] = [text.lower() for text in frame["description"]]

In [None]:
#Removing the html strips if it exists
def strip_html(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    """Delete every ``[...]`` group (brackets included) from ``text``.

    Args:
        text: String to clean.

    Returns:
        ``text`` without any bracketed group; an unmatched ``[`` is left untouched.
    """
    # Raw string: '\[' is an invalid escape in a normal literal and triggers a
    # DeprecationWarning (SyntaxWarning on recent Python versions).
    return re.sub(r'\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    """Strip HTML markup first, then drop every square-bracketed group."""
    return remove_between_square_brackets(strip_html(text))

In [None]:
#Denoise the working description column of both data sets
for frame in (train_df, test_df):
    frame['description_modified'] = frame['description_modified'].apply(denoise_text)

In [None]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    """Strip every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: String to clean.
        remove_digits: Accepted for interface compatibility only; it is not
            consulted, so digits are always kept (same as before this fix).

    Returns:
        ``text`` with punctuation and other special characters removed.
    """
    # The upper-case range must be 'A-Z'. The former 'A-z' range also covered
    # '[', '\', ']', '^', '_' and '`', so those characters were never removed.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

In [None]:
#Strip special characters from the working description column of both data sets
for frame in (train_df, test_df):
    frame['description_modified'] = frame['description_modified'].apply(remove_special_characters)

In [None]:
#Stemming step (currently disabled; the calls are kept below for reference)

#train_df['description_modified']=train_df['description_modified'].apply(simple_stemmer)
#test_df['description_modified']=test_df['description_modified'].apply(simple_stemmer)

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer

# Tokenizer that remove_stopwords uses to split a text into tokens
tokenizer=ToktokTokenizer()
# English stopwords from the NLTK corpus
stopword_list= stopwords.words('english')

# Alternative (disabled): a small hand-picked stopword list
#stopword_list = ['in', 'of', 'at', 'a', 'the']

# Set built from the same stopwords
#stop=set(stopwords.words('english'))
stop = set(stopword_list)
print(stop)

#removing the stopwords
def remove_stopwords(text, is_case=False):
    """Remove stopwords from ``text``.

    The text is split with the notebook-level ``tokenizer``, each token is
    stripped of surrounding whitespace, and tokens found in the stopword
    collection are dropped.

    Args:
        text: String to filter.
        is_case: When true, tokens are compared as written; otherwise they are
            lower-cased before the comparison.

    Returns:
        The remaining tokens joined with single spaces.
    """
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    # `stop` is the set built from `stopword_list`: same membership result,
    # but a constant-time lookup instead of a list scan for every token.
    if is_case:
        kept = [token for token in tokens if token not in stop]
    else:
        kept = [token for token in tokens if token.lower() not in stop]
    return ' '.join(kept)

In [None]:
#Drop stopwords from the working description column of both data sets
for frame in (train_df, test_df):
    frame['description_modified'] = frame['description_modified'].apply(remove_stopwords)

## Observe the sequence length distribution

In [None]:
# Number of whitespace-separated tokens in each cleaned description
length_stats = [len(tokens) for tokens in map(str.split, train_df['description_modified'])]

# How many descriptions have each length
length_stats_serie = pd.Series(length_stats)
length_stats_serie.value_counts()

In [None]:
#Inspect the long descriptions to see whether the important information sits in the first or the last sentences
#for sen in train_df["description"]:
#    if len(sen)>120:
#        print(train_df)

In [None]:
import statistics as st

# Spread, average and 70th percentile of the description lengths
stdev, mean = st.stdev(length_stats), st.mean(length_stats)
quantile = np.quantile(length_stats, 0.7)

print(stdev, mean, quantile, sep='\n')

## Take the best sequence length for the next part

In [None]:
 # CHOSEN sequence length
# Alternative (disabled): derive the length from the 0.7 quantile computed earlier
#CHOSEN_SEQ_LEN = int(quantile)

# Fixed sequence length used when building the model and tokenising the training data
CHOSEN_SEQ_LEN = 45

# Creation Of The XLNet Model

In [None]:
from transformers import TFXLNetModel, XLNetTokenizer

In [None]:
# Identifier of the pretrained model. The library needs this ID to download the weights and initialize the architecture.
# All supported identifiers are listed here:
# https://huggingface.co/transformers/pretrained_models.html
xlnet_model = 'xlnet-large-cased'
# Tokenizer loaded for the same identifier
xlnet_tokenizer = XLNetTokenizer.from_pretrained(xlnet_model)

In [None]:
def create_xlnet(mname, shape = 120, num_classes = 28):
    """Build an XLNet-based text classifier (the returned model is NOT compiled).

    The pretrained XLNet main block is followed by a classification head made of
    two Dense(1024, relu) + Dropout(0.1) stages and a softmax output layer.

    Args:
        mname: Identifier of the pretrained weights, passed to
            ``TFXLNetModel.from_pretrained``.
        shape: Length of the token-id sequences fed to the model.
        num_classes: Number of units of the softmax output layer.

    Returns:
        A Keras ``Model`` taking int32 token ids of length ``shape`` and
        producing ``num_classes`` softmax scores per example.
    """
    # Define token ids as inputs
    word_inputs = Input(shape=(shape,), name='word_inputs', dtype='int32')

    # Load the pretrained XLNet block and take the first element of its output.
    # NOTE(review): only token ids are fed in, so no attention mask is applied to
    # padded positions - consider adding a mask input.
    encoder = TFXLNetModel.from_pretrained(mname)
    x = encoder(word_inputs)[0]

    # Keep only the last position along the sequence axis, then drop that axis
    x = tf.squeeze(x[:, -1:, :], axis=1)

    # Two identical hidden stages
    for _ in range(2):
        x = Dense(units=1024, activation='relu')(x)
        x = Dropout(0.1)(x)

    # Final output
    outputs = Dense(num_classes, activation='softmax', name='outputs')(x)

    # Assemble the model; compilation is left to the caller
    model = Model(inputs=[word_inputs], outputs=[outputs])

    return model

In [None]:
# Build the classifier for the chosen sequence length and show its layers
xlnet = create_xlnet(mname=xlnet_model, shape=CHOSEN_SEQ_LEN)
xlnet.summary()

# Final Work on Data

## Split Train Val

In [None]:
# Shuffle the labelled data and hold out 20% of it for validation.
# NOTE(review): assumes train_df and train_label rows are paired by position - confirm.
SEED = 42         # fixed seed so the split is reproducible across runs
NUM_CLASSES = 28  # must match the 28-unit softmax output built in create_xlnet

X_train, X_val, y_train, y_val = train_test_split(
    train_df['description_modified'], train_label['Category'],
    shuffle=True, test_size=0.2, random_state=SEED)

# Fix the one-hot width explicitly: when it is inferred per split, a split that
# lacks the highest class id gets fewer columns than the model has outputs.
y_train = to_categorical(y_train, num_classes=NUM_CLASSES)
y_val = to_categorical(y_val, num_classes=NUM_CLASSES)

## Transform the input

In [None]:
def get_inputs(data, tokenizer, max_len=120):
    """Encode texts into fixed-length arrays using the tokenizer provided.

    Args:
        data: Iterable of strings to encode.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Length every encoded sequence is padded or truncated to.

    Returns:
        A tuple ``(input_ids, attention_mask, token_type_ids)`` of NumPy arrays
        with one row per text.
    """
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    # Truncation is requested explicitly so over-long texts are always cut to
    # `max_len` and every row has the same length.
    encoded = [
        tokenizer.encode_plus(
            text,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
        )
        for text in data
    ]
    inp_tok = np.array([enc['input_ids'] for enc in encoded])
    attention_mask = np.array([enc['attention_mask'] for enc in encoded])
    segments = np.array([enc['token_type_ids'] for enc in encoded])
    return inp_tok, attention_mask, segments

def warmup(epoch, lr):
    """Learning-rate schedule that raises the rate slowly up to a ceiling.

    Increasing the learning rate gradually tends to give better convergence.
    As fine-tuning only runs for a few epochs it is not crucial.

    Args:
        epoch: Index of the current epoch (unused; part of the scheduler signature).
        lr: Current learning rate.

    Returns:
        ``lr`` increased by 1e-6, capped at 2e-5.
    """
    # `min`, not `max`: with `max` any rate below the target jumped straight to
    # 2e-5 and then kept growing by 1e-6 every epoch without bound.
    return min(lr + 1e-6, 2e-5)

# Compile the model

In [None]:
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
xlnet.compile(
    optimizer=Adam(learning_rate=2e-5),
    loss='categorical_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
)

# Training

In [None]:
# Encode the training texts at the sequence length the model was built for
inp_tok, ids, segments = get_inputs(X_train, xlnet_tokenizer, max_len=CHOSEN_SEQ_LEN)

Create Checkpoints

In [None]:
# Checkpoint file name template with {epoch} and {val_accuracy} placeholders.
# NOTE(review): {val_accuracy} presumably requires validation data during fit - confirm.
filepath=OUTPUT_PATH+"/model_xlnet-{epoch:02d}-{val_accuracy:.3f}.hdf5"

In [None]:
# Save the full model (not only the weights) at the end of every epoch.
# `save_frequency` is not a ModelCheckpoint argument; the option is `save_freq`,
# where 'epoch' saves once per epoch (an integer would count batches instead).
checkpoint_callback = ModelCheckpoint(
    filepath, monitor='val_accuracy', verbose=1,
    save_best_only=False, save_weights_only=False,
    save_freq='epoch')

Create the different callbacks

In [None]:
# Stop training once val_accuracy has failed to improve by at least 0.02 for
# 4 consecutive epochs, and restore the best weights seen.
early_stopping_callback = EarlyStopping(monitor='val_accuracy', 
                  patience=4, min_delta=0.02, 
                  restore_best_weights=True)

In [None]:
# Scheduler callback driven by the `warmup` function
lr_scheduler = LearningRateScheduler(warmup, verbose=0)

In [None]:
# Reduce the learning rate after 2 epochs without a val_accuracy improvement of
# at least 0.001, never going below min_lr.
# NOTE(review): factor=1e-6 looks unusually small; if the new rate is lr * factor,
# any reduction from 2e-5 lands directly on min_lr - confirm this is intended.
reducelr_callback = ReduceLROnPlateau(monitor='val_accuracy',
                      factor=1e-6, patience=2, verbose=0, 
                      mode='auto', min_delta=0.001, cooldown=0, min_lr=1e-6)

In [None]:
# Only early stopping is registered; the checkpoint, scheduler and
# reduce-on-plateau callbacks defined earlier are not used for this run.
callbacks = [early_stopping_callback]

In [None]:
# Encode the validation split at the model's sequence length so that Keras can
# compute `val_accuracy`, the metric the callbacks monitor. Without validation
# data that metric never exists and early stopping cannot act.
val_tok = get_inputs(X_val, xlnet_tokenizer, CHOSEN_SEQ_LEN)[0]

hist = xlnet.fit(x=inp_tok, y=y_train,
                 validation_data=(val_tok, y_val),
                 epochs=2,
                 batch_size=32,
                 callbacks=callbacks)

# Testing

In [None]:
# Encode the validation texts at the sequence length the model was built with.
# Omitting it falls back to get_inputs' default of 120, which does not match the
# model's input length (CHOSEN_SEQ_LEN).
inp_tok, ids, segments = get_inputs(X_val, xlnet_tokenizer, CHOSEN_SEQ_LEN)

In [None]:
# Predicted class = index of the highest score in each output row
preds = xlnet.predict(inp_tok, verbose=True).argmax(axis=1)

# Analyse the performance

In [None]:
# Per-class report comparing the held-out validation labels with the predictions
from sklearn.metrics import classification_report

print(classification_report(y_val.argmax(axis=1), preds))

# Submission File Generation

In [None]:
# Encode the test texts at the sequence length the model was built with.
# Omitting it falls back to get_inputs' default of 120, which does not match the
# model's input length (CHOSEN_SEQ_LEN).
inp_tok, ids, segments = get_inputs(test_df['description_modified'], xlnet_tokenizer, CHOSEN_SEQ_LEN)

# Predicted class = index of the highest score in each output row
predictions = np.argmax(xlnet.predict(inp_tok), axis=1)

In [None]:
# Attach the predicted class to each test row and export the two submission columns
test_df["Category"] = predictions
results_file = test_df.loc[:, ["Id", "Category"]]
results_file.to_csv(f"{OUTPUT_PATH}/results_xlnet.csv", index=False)