# Data and Libraries

In [None]:
# Import modules
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import  Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tqdm import tqdm
import matplotlib.pyplot as plt

# Print the library versions in use so a run can be tied to its environment.
print("TensorFlow Version:",tf.__version__)
print("Hub version: ",hub.__version__)
# Let pandas display up to 1000 characters per cell before truncating.
pd.set_option('display.max_colwidth',1000)


In [None]:
!pip install transformers

# Loading data

In [None]:
# Input directory of the competition data and directory for generated files.
DATA_PATH = "/kaggle/input/defi-ia-insa-toulouse"
OUTPUT_PATH = "/kaggle/working"

# Train/test records are stored as JSON, the training labels as CSV.
train_df = pd.read_json(f"{DATA_PATH}/train.json")
test_df = pd.read_json(f"{DATA_PATH}/test.json")
train_label = pd.read_csv(f"{DATA_PATH}/train_label.csv")

# Analyse data


In [None]:
# Preview the first rows of the raw `description` column.
train_df.description.head()

In [None]:
# Number of missing values in each column of the training frame.
train_df.isnull().sum()

In [None]:
# Check the target class balance: number of training rows per `Category` value.
train_label["Category"].value_counts()

# Preprocessing

In [None]:
# Store a lower-cased copy of every description in `description_lower`;
# the raw `description` column is left untouched.
train_df["description_lower"] = train_df["description"].apply(lambda text: text.lower())
test_df["description_lower"] = test_df["description"].apply(lambda text: text.lower())

In [None]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
import re,string,unicodedata

In [None]:
#Tokenization of text
# Word tokenizer read (as a global) by remove_stopwords.
# NOTE(review): the name `tokenizer` is rebound to the BERT tokenizer in the
# model-creation cell, so remove_stopwords would pick up that object if it
# were called after that cell has run.
tokenizer=ToktokTokenizer()
#Setting English stopwords
#stopword_list=nltk.corpus.stopwords.words('english')

# Short hand-written stop-word list used in place of NLTK's English list
# (the NLTK variant is kept commented out above).
stopword_list = ['in', 'of', 'at', 'a', 'the']

Functions to strip HTML markup, text enclosed in square brackets, and special characters (if present)

In [None]:
#Removing the html strips
def strip_html(text):
    """Return the text content of `text` after parsing it as HTML."""
    return BeautifulSoup(text, "html.parser").get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    """Remove every ``[...]`` span (brackets included) from `text`.

    Args:
        text: Input string.

    Returns:
        `text` with each bracketed span replaced by the empty string.
    """
    # Raw string: "\[" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    """Strip HTML markup, then drop square-bracketed spans, from `text`."""
    return remove_between_square_brackets(strip_html(text))

In [None]:
# Denoise the lower-cased descriptions of both data sets in place.
for frame in (train_df, test_df):
    frame['description_lower'] = frame['description_lower'].apply(denoise_text)

In [None]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    """Drop every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: Input string.
        remove_digits: Kept for interface compatibility. It is not consulted:
            digits are always preserved, exactly as before.

    Returns:
        The filtered string.
    """
    # `A-Z`, not `A-z`: the wider range also covers "[", "\", "]", "^", "_"
    # and "`" (they sit between "Z" and "a" in ASCII), so those characters
    # used to survive the filter.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [None]:
# Strip special characters from the cleaned description column of both sets.
train_df['description_lower'] = train_df['description_lower'].map(remove_special_characters)
test_df['description_lower'] = test_df['description_lower'].map(remove_special_characters)

In [None]:
#Stemming the text
def simple_stemmer(text):
    """Porter-stem each whitespace-separated word of `text` and re-join with spaces."""
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

In [None]:
#Apply function on review column
#train_df['description_lower']=train_df['description_lower'].apply(simple_stemmer)
#test_df['description_lower']=test_df['description_lower'].apply(simple_stemmer)

Remove StopWords

In [None]:
#set stopwords to english
#stop=set(stopwords.words('english'))
# Set version of the custom stop-word list, printed for inspection.
# Note: remove_stopwords filters against `stopword_list`, not this set.
stop = set(stopword_list)
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    """Return `text` with the tokens found in `stopword_list` removed.

    The text is split with the global `tokenizer`. When `is_lower_case` is
    false each token is lower-cased before the stop-word lookup; the token
    itself is kept in its original case.
    """
    kept_tokens = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        lookup_key = token if is_lower_case else token.lower()
        if lookup_key not in stopword_list:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
#Apply function on review column
#train_df['description_lower']=train_df['description_lower'].apply(remove_stopwords)
#test_df['description_lower']=test_df['description_lower'].apply(remove_stopwords)

### Observe the sequence length distribution

In [None]:
# Word count of every cleaned training description, then how often each
# count occurs.
length_stats = list(map(lambda text: len(text.split()), train_df['description_lower']))

length_stats_serie = pd.Series(length_stats)
length_stats_serie.value_counts()

In [None]:
import statistics as st

# Sample standard deviation, mean and 90% quantile of the word counts.
stdev = st.stdev(length_stats)
mean = st.mean(length_stats)
quantile = np.quantile(length_stats, 0.9)

print(stdev, mean, quantile, sep="\n")

# Tokenization

In [None]:
# Sequence length used when encoding sentences and as the model input width.
# can be up to 512 for BERT
max_length = 128

# Encoding train and test dataset

In [None]:
def convert_sentences(sentences, bert_tokenizer, max_len=None):
    """Encode sentences into fixed-length input ids and attention masks.

    Args:
        sentences: Iterable of strings to encode.
        bert_tokenizer: Tokenizer exposing a Hugging Face style ``encode_plus``.
        max_len: Length every encoded sequence is padded/truncated to.
            Defaults to the notebook-level ``max_length`` when omitted.

    Returns:
        ``[input_ids, attention_masks]``: two NumPy arrays with one row per
        sentence.
    """
    if max_len is None:
        max_len = max_length

    input_ids = []
    attention_masks = []

    for sent in sentences:
        # `padding='max_length'` replaces the deprecated `pad_to_max_length`;
        # `truncation=True` makes the cut-off explicit so that every row has
        # exactly `max_len` entries and the rows stack into a 2-D array.
        bert_inp = bert_tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        input_ids.append(bert_inp['input_ids'])
        attention_masks.append(bert_inp['attention_mask'])

    return [np.asarray(input_ids), np.asarray(attention_masks)]

# Creation of the model

In [None]:
import transformers
from transformers import TFBertModel, BertTokenizerFast, BertConfig

# Name of the pretrained checkpoint shared by config, tokenizer and model.
model_name = 'bert-large-uncased'

# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# NOTE: this rebinds the global name `tokenizer`, which earlier held the
# ToktokTokenizer read by remove_stopwords.
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config, do_lower_case=True)

# Load the pretrained TF BERT model with the config above
# (output_hidden_states is False, as set on `config`).
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

In [None]:
# Take the first layer of the pretrained wrapper model to use as a Keras layer.
bert = transformer_model.layers[0]

# Build your model inputs. Two inputs are declared, in the same order as the
# [input_ids, attention_masks] pair returned by convert_sentences, so that the
# attention mask is actually consumed and padded positions are masked.
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
inputs = [input_ids, attention_mask]

# Load the Transformers BERT model as a layer in a Keras model.
# Element [1] of the BERT outputs is used as the sentence representation
# (presumably the pooled output, as the dropout layer's name suggests).
bert_model = bert({'input_ids': input_ids, 'attention_mask': attention_mask})[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
# training=False keeps this dropout layer inactive, including during fit().
pooled_output = dropout(bert_model, training=False)


# Then build your model output: softmax over 28 units
# (presumably one per job category - confirm against the label set).
job = Dense(units=28, name='job', activation='softmax')(pooled_output)

# And combine it all in a model object
model = Model(inputs=inputs, outputs=job, name='BERT_job_prediction')

In [None]:
# Print the layer-by-layer summary of the assembled Keras model.
model.summary()

In [None]:
# Training hyper-parameters (epochs and batch size are consumed by model.fit).
learning_rate = 3e-5

number_of_epochs = 1

batch_size = 8

# Inverse-time decay: lr = learning_rate / (1 + 0.01 * step). This is the
# schedule the legacy `decay=0.01` optimizer argument applied; it is expressed
# as a schedule object because current Keras optimizers reject `decay`.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=learning_rate,
    decay_steps=1,
    decay_rate=0.01)

# classifier Adam recommended
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, epsilon=1e-08,
                                    clipnorm=1.0)


# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(optimizer=optimizer, 
              loss='categorical_crossentropy', 
              metrics=['accuracy', tf.keras.metrics.Precision(),
                       tf.keras.metrics.Recall()])

# Fine Tuning

## Creating Checkpoints

In [None]:
#for THE BASE BERT
#filepath=OUTPUT_PATH+"/model_bert768-{epoch:02d}-{val_accuracy:.3f}.hdf5"

#FOR THE LARGE BERT
# Fixed file name with no epoch placeholder, so each save overwrites the same file.
filepath=OUTPUT_PATH+"/bert-checkpoint.hdf5"

In [None]:
# Save the full model (architecture + weights) to `filepath` after every epoch.
# `save_freq` is the actual ModelCheckpoint argument; the previous
# `save_frequency` keyword is not one. 'epoch' keeps the per-epoch saving that
# was in effect (an integer here would instead mean "every N batches").
checkpoint_callback = ModelCheckpoint(
    filepath, monitor='val_accuracy', verbose=1,
    save_best_only=False, save_weights_only=False,
    save_freq='epoch')

## Training

In [None]:
# Shuffle and hold out 10% of the training data for validation.
# random_state makes the split reproducible across runs.
# Assumes train_df and train_label rows are in the same order - TODO confirm.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['description_lower'], train_label['Category'],
    shuffle=True, test_size=0.10, random_state=42)

# Encode the texts as [input_ids, attention_masks].
X_train = convert_sentences(X_train, tokenizer)
X_val = convert_sentences(X_val, tokenizer)

# Fix the one-hot width from the full label set so that the train and
# validation matrices always have the same number of columns, even when a
# split does not contain the highest class id.
num_classes = int(train_label['Category'].max()) + 1
y_train = to_categorical(y_train, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)

In [None]:
# Train the model, validating after every epoch and writing a checkpoint
# through the callback.
fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=number_of_epochs,
    batch_size=batch_size,
    verbose=1,
    validation_freq=1,
    callbacks=[checkpoint_callback],
)
history = model.fit(X_train, y_train, **fit_options)

## Save the model

In [None]:
# Save the trained model

# Written as a single .h5 file under OUTPUT_PATH; the load cell below reads
# the same path. (The model built above uses 'bert-large-uncased'.)
model.save(OUTPUT_PATH+'/nlp_model_job_prediction_bert_classification.h5') 

In [None]:
# Evaluate the trained model on the held-out validation split
# (loss plus the metrics given to compile).
model.evaluate(X_val, y_val)

# Analyze the performance

In [None]:
# Reload the model that was saved to OUTPUT_PATH by the save cell.
# NOTE(review): custom_objects registers hub.KerasLayer, but the model built
# in the cells visible here wraps a layer taken from the transformers
# TFBertModel and uses no hub.KerasLayer - confirm this mapping is what
# deserialisation needs.
from tensorflow.keras.models import load_model
new_model = load_model(OUTPUT_PATH+'/nlp_model_job_prediction_bert_classification.h5',custom_objects={'KerasLayer':hub.KerasLayer})

In [None]:
# Encode the test descriptions, score them, and keep the index of the
# highest-scoring class for each row.
X_test2 = convert_sentences(test_df['description_lower'], tokenizer)
test_scores = new_model.predict(X_test2)
predictions = test_scores.argmax(axis=1)

## File Generation

In [None]:
# Attach the predicted class ids and write the Id/Category submission file.
test_df["Category"] = predictions
baseline_file = test_df[["Id","Category"]]
# Build the path from OUTPUT_PATH, like the checkpoint and saved-model paths,
# instead of repeating the directory as a literal.
baseline_file.to_csv(OUTPUT_PATH+"/baseline.csv", index=False)