In [None]:
%%capture
!pip install textblob pip install googletrans

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
### -------- Chargement des libraries ------- 
import os
from time import sleep

import tensorflow as tf

# Ce dont nous avons besoin depuis tensorflow.keras

from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy,BinaryCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

from transformers import TFDistilBertModel, DistilBertConfig

# Et pandas pour l'importation de donnees + sklearn pour le decoupage des donnees.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from tqdm import tqdm

## Instanciation TPU 

In [None]:
# detect and init the TPU
# Resolve the TPU cluster, connect this runtime to it and initialise the TPU
# system; the three calls must run in this order.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)

# instantiate a distribution strategy
# (the model is later built and compiled inside `tpu_strategy.scope()`)
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

## Importation des données 

In [None]:
### --------- Importation et prétraitement des données --------- 
## Importation des données

# Root folder of the competition files (trailing slash matters: paths below
# are built by plain string concatenation).
path = '/kaggle/input/defi-ia-insa-toulouse/'

# Dict built from column '0' of categories_string.csv: {row index -> value}.
# It is used below as the Category-id -> job-name lookup.
names=pd.read_csv(path+'categories_string.csv')['0'].to_dict()

df_X = pd.read_json(path + 'train.json')
df_label=pd.read_csv(path + 'train_label.csv')

# Join features and labels on their shared column(s) (no explicit key is given;
# presumably 'Id' - TODO confirm), then drop the identifier.
data=pd.merge(df_X, df_label).drop(['Id'], axis = 1)

## Fonctions utiles
### Fairness (Disparate Impact)

In [None]:
def macro_disparate_impact(dataset,dico_jobs):
    """
    Compute the per-job and macro-averaged disparate impact of a dataset.

    For every job, the disparate impact is
    max(#M, #F) / min(#M, #F), computed from the row counts per gender.

    Parameters
    ----------
    dataset : DataFrame with at least the columns ``Category`` (job id) and
        ``gender`` (values 'M' / 'F'), e.g.

         Id Category gender
         0  1        F
         1  7        M
         2  1        M

    dico_jobs : dict mapping a Category id to the job name used as label.

    Returns
    -------
    (Series, float)
        The disparate impact per job (indexed by job name) and its mean.

    Notes
    -----
    A job with no row for one of the genders has a missing count; max/min
    skip missing values, so such a job gets a ratio of 1.0.
    """
    # Use the mapping that was passed in rather than a notebook-level global.
    jobs = dataset.Category.map(dico_jobs).rename('job')
    people = pd.concat((jobs, dataset.gender), axis='columns')
    counts = people.groupby(['job', 'gender']).size().unstack('gender')
    by_gender = counts[['M', 'F']]
    counts['disparate_impact'] = by_gender.max(axis='columns') / by_gender.min(axis='columns')
    return (counts['disparate_impact'], counts['disparate_impact'].mean())

In [None]:
# DI_Global is the (per-job series, mean) tuple; index 1 is the macro average.
DI_Global=macro_disparate_impact(data,names)
print(DI_Global[1])

### Nettoyage de Texte

In [None]:
import re
import unidecode
def cleanText(string: str, punctuations=r'''!()-[]{};:'"\,<>./?@#$%^&*_~''', stop_words=['the', 'a', 'and', 'is', 'be', 'will','on'])->str:
    """Normalise a raw description before tokenization.

    Steps, in order: transliterate to ASCII, drop URLs, drop bracketed
    spans, drop parenthesised upper-case acronyms and numbers, replace any
    remaining digit run by ``#number``, collapse runs of dots, expand
    ``Dr.`` to ``Doctor`` and squeeze whitespace.

    Parameters
    ----------
    string : text to clean.
    punctuations, stop_words : currently unused; kept so existing calls
        that pass them keep working.

    Returns
    -------
    str
        The cleaned text, case preserved.
    """
    string = unidecode.unidecode(string)

    # URLs
    string = re.sub(r'https?://\S+|www\.\S+', '', string)

    # Bracketed spans such as "[1]" or "[citation needed]". The match is
    # non-greedy so that each pair is removed on its own; a greedy match would
    # also delete all the text between the first "[" and the last "]".
    string = re.sub(r'\[.*?\]', '', string)
    # Stray unmatched brackets
    string = re.sub(r'[\[\]]', '', string)

    # "(ABC)" acronyms and "(123)" numbers, then any remaining digit run
    string = re.sub(r'\([A-Z]+\)', '', string)
    string = re.sub(r'\([0-9]+\)', '', string)
    string = re.sub(r'[0-9]+', '#number', string)

    # "..." -> "."
    string = re.sub(r'\.+', '.', string)

    string = re.sub(r'Dr\.', 'Doctor', string)

    # Collapse whitespace
    string = re.sub(r'\s+', ' ', string).strip()

    return string

### Augmentation du dataset
#### Calcul des classes à augmenter

In [None]:
def compute_classe2augment(df):
    """Return the under-represented classes that should be augmented.

    Rows are counted per ``Category``; a class is kept when its number of
    descriptions is at or below the 12% quantile of those counts. The
    result is sorted by description count and carries a ``name`` column
    looked up in the notebook-level ``names`` mapping.
    """
    per_class = df.groupby(["Category"]).count().sort_values(by="description")
    per_class["name"] = per_class.index.map(names)
    threshold = per_class.description.quantile(0.12)
    is_rare = per_class.description <= threshold
    return per_class[is_rare]

In [None]:
# Classes whose description count is at or below the 12% quantile.
classes2augment=compute_classe2augment(data)
print(classes2augment)

#### Découpage des descriptions en phrases

In [None]:
# Whitespace that follows "." or "?" is a sentence boundary, unless it follows
# a dotted abbreviation like "e.g." or a title like "Mr." / "Dr.".
# Raw string: the pattern contains backslash escapes that are not valid
# Python string escapes.
_SENTENCE_BOUNDARY = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s")


def split_desc(txt):
    """Split a text into sentences.

    Parameters
    ----------
    txt : anything convertible with ``str()``.

    Returns
    -------
    numpy.ndarray
        1-D array of stripped sentence strings (at least one element).
    """
    sentences = _SENTENCE_BOUNDARY.split(str(txt))
    return np.array([sentence.strip() for sentence in sentences])

#### Interversion des phrases

In [None]:
def augment_swap(txt):
    """Return `txt` with its sentences in a random order.

    The text is split with `split_desc`, shuffled in place with NumPy's
    global random state, and re-assembled.
    """
    sentences = split_desc(txt)
    np.random.shuffle(sentences)
    # split_desc consumes the whitespace between sentences, so put a space
    # back; joining on '' would glue them together ("end.Start").
    return ' '.join(sentences)

In [None]:
#### Swap Sentences
def swap_sentences(dataset,k_list,frac_new=0.5):
    """Build sentence-shuffled copies of descriptions for the given classes.

    For each class id in `k_list`, a fraction `frac_new` of that class's
    rows is sampled without replacement and each sampled description is
    passed through `augment_swap`.

    Returns a list of ``{'description': ..., 'Category': ...}`` records.
    """
    records = []
    for k in k_list:
        # Rows of class k, sampled without replacement
        class_rows = dataset[dataset.Category == k].reset_index(drop=True)
        picked = class_rows.sample(frac=frac_new, replace=False)
        descriptions = picked['description']
        for position in tqdm(range(len(descriptions))):
            shuffled = augment_swap(descriptions.iloc[position])
            records.append({'description': shuffled, 'Category': k})

    return records

In [None]:
### TEST : Swap Sentences

#df_count=compute_classe2augment(data)
#new_desc_swap=swap_sentences(data,k_list=df_count.index,frac_new=0.5)

#### Augmentation en CrossOver 

In [None]:
def augment_crossover(desc_list,k=None):
    """Build one synthetic description by mixing sentences of several.

    From every description in `desc_list`, ``int(0.1 * n) + 1`` of its `n`
    sentences are drawn at random without replacement; all drawn
    sentences are concatenated in iteration order.

    Parameters
    ----------
    desc_list : iterable of description texts (expected to belong to the
        same class).
    k : unused; kept for backward compatibility.

    Returns
    -------
    str
        The assembled text ('' when `desc_list` is empty).
    """
    picked = []
    for desc in desc_list:
        sentences = split_desc(desc)
        n_keep = int(0.1 * len(sentences)) + 1
        picked.extend(np.random.choice(sentences, size=n_keep, replace=False))
    # Separate sentences with a space; plain concatenation glued the end of
    # one sentence to the start of the next.
    return ' '.join(picked)

In [None]:
def cross_over(dataset,k_list,num_new=10,frac_crossover=0.5):
    """Generate cross-over descriptions for the given classes.

    For each class id in `k_list`, `num_new` synthetic descriptions are
    produced; each one is built by `augment_crossover` from a fresh sample
    (fraction `frac_crossover`, without replacement) of that class's
    descriptions.

    Returns a list of ``{'description': ..., 'Category': ...}`` records.
    """
    new_text = []

    for k in k_list:
        # Rows of the class to augment
        class_rows = dataset[dataset.Category == k].reset_index(drop=True)

        # The loop variable must not reuse the name `num_new`: doing so
        # overwrites the parameter, and every class after the first then gets
        # one sample fewer than the previous one.
        for _ in tqdm(range(num_new)):
            sampled = class_rows.sample(frac=frac_crossover, replace=False)
            new_phrase = augment_crossover(sampled.description)
            new_text.append({'description': new_phrase, 'Category': k})

    return new_text

In [None]:
### TEST : CrossOver

# Work from `data`: the train/validation split is only created in a later
# cell, so `train_data` does not exist yet on a fresh kernel.
df_count=compute_classe2augment(data)
# Number of descriptions each rare class is missing to reach 1000.
df_count.description=1000-df_count.description

new_desc=[]
for k in df_count.index:
    num=int(df_count.description[k])
    # Delegate to cross_over so that each synthetic text is built from the
    # descriptions of class k (passing a whole DataFrame to augment_crossover
    # would iterate over its column names instead of its rows).
    new_desc.extend(cross_over(data,[k],num_new=max(num,0)))

#### Augmentation par Traduction

In [None]:
from deep_translator import GoogleTranslator, MyMemoryTranslator
from textblob import TextBlob

def translate_en_fr_deeptrans(desc_list):
    """Translate a batch of descriptions to simplified Chinese via MyMemory.

    Despite the function name, only this forward translation is performed;
    no translation back to English or to French is applied.

    Parameters
    ----------
    desc_list : list of str to translate.

    Returns
    -------
    The value returned by ``MyMemoryTranslator.translate_batch``.
    """
    translator_chinese = MyMemoryTranslator(source='auto', target='chinese (simplified)')
    translated_list = translator_chinese.translate_batch(desc_list)
    # Pause after each batch (presumably to stay under the service's rate
    # limit - TODO confirm).
    sleep(10)
    return translated_list

def translate_en_fr_TextBlob(desc):
    """Round-trip a description through simplified Chinese with TextBlob.

    The text is translated to 'zh-CN' and then back to 'en'. The result
    is returned as the TextBlob object produced by the second translation.
    """
    to_chinese = TextBlob(desc).translate(to='zh-CN')
    return to_chinese.translate(to='en')

In [None]:
def DoubleTranslation_DeepTrans(dataset,k_list,frac_new=0.5):
    """Build translated copies of descriptions for the given classes.

    For each class id in `k_list`, a fraction `frac_new` of that class's
    rows is sampled without replacement and the sampled descriptions are
    sent as one batch to `translate_en_fr_deeptrans`.

    Returns a list of ``{'description': ..., 'Category': ...}`` records,
    one per translated description, in the same format as the other
    augmentation helpers.
    """
    new_text = []
    for k in k_list:
        # Rows of class k, sampled without replacement
        sampled = dataset[dataset.Category == k].reset_index(drop=True).sample(frac=frac_new, replace=False)
        translated = translate_en_fr_deeptrans(sampled.description.to_list())
        # Keep the results: one record per translated text (they used to be
        # printed and discarded, so the function always returned []).
        new_text.extend({'description': phrase, 'Category': k} for phrase in translated)
    return new_text

In [None]:
#### Double Translation (TextBlob)
def DoubleTranslation_Textblob(dataset,k_list,frac_new=0.5):
    """Build back-translated copies of descriptions for the given classes.

    For each class id in `k_list`, a fraction `frac_new` of that class's
    rows is sampled without replacement and each sampled description is
    passed through `translate_en_fr_TextBlob`, one request per second.

    Returns a list of ``{'description': str, 'Category': ...}`` records.
    """
    new_text = []
    for k in k_list:
        # Rows of class k, sampled without replacement
        sampled = dataset[dataset.Category == k].reset_index(drop=True).sample(frac=frac_new, replace=False)
        for i in tqdm(range(len(sampled))):
            # Throttle: one translation request per second
            sleep(1)
            single_desc = sampled.iloc[i]['description']
            # The translator returns a TextBlob; store plain text so the
            # records can be concatenated with the other descriptions.
            new_phrase = str(translate_en_fr_TextBlob(single_desc))
            new_text.append({'description': new_phrase, 'Category': k})

    return new_text

In [None]:
# Show the first description, then its zh-CN -> en round-trip translation.
print(data.iloc[0]['description'])
print("\n\n")
print(translate_en_fr_TextBlob((data.iloc[0]['description'])))

In [None]:
data['gender'] = pd.Categorical(data['gender'])
data['Category'] = pd.Categorical(data['Category'])
# Clean every description before splitting, so that the train and validation
# sets share the same preprocessing.
data['description']=data.description.apply(cleanText)
# Train / validation split (60% / 40%). The seed is fixed so that re-running
# the notebook yields the same split and validation scores stay comparable.
train_data, valid_data = train_test_split(data, test_size = 0.4, random_state = 42)

In [None]:
# Display the Category-id -> job-name mapping.
names

In [None]:
## Augment_Swap
df_count=compute_classe2augment(data)
# Start from train_data: new_train_data is not defined yet when the notebook
# is run top to bottom.
# NOTE(review): the cross-over cell below rebuilds new_train_data from
# train_data as well, so the rows added here are replaced by that cell.
new_desc_swap=swap_sentences(train_data,k_list=df_count.index,frac_new=0.2)
new_train_data=pd.concat([train_data,pd.DataFrame(new_desc_swap)])

In [None]:
## Augment_CrossOver
# Rare classes are computed on the full dataset, but the synthetic texts are
# built from training rows only: 100 per rare class, each mixing sentences
# from a 70% sample of that class.
df_count=compute_classe2augment(data)
new_desc_cross_over=cross_over(train_data,df_count.index,num_new=100,frac_crossover=0.7)
# new_train_data = original training rows + generated rows (the generated
# rows only have 'description' and 'Category').
new_train_data=pd.concat([train_data,pd.DataFrame(new_desc_cross_over)])

In [None]:
## Augment_Translation
# Back-translate 20% of the training rows of each rare class.
# The result is only kept in new_desc_translated; it is not appended to
# new_train_data (the concat below is commented out).
df_count=compute_classe2augment(data)
new_desc_translated=DoubleTranslation_Textblob(train_data,df_count.index,frac_new=0.2)
#new_train_data=pd.concat([train_data,pd.DataFrame(new_desc_translated)])

In [None]:
# Display the augmented training set.
new_train_data

In [None]:
# Sanity check: sizes of the train / validation splits and a preview of the
# training rows.
print(train_data.shape)
#print(new_train_data.shape)


print(valid_data.shape)

print(train_data.head())

## Instanciation du modèle

In [None]:
### --------- Setup BERT ---------- #
from transformers import DistilBertTokenizer, RobertaTokenizer,DistilBertConfig,TFRobertaModel,RobertaConfig

# Sequence length (in tokens) used for padding/truncation and for the model's
# input layers.
MAX_LENGTH=300
distil_bert = 'distilbert-base-cased' # Pick any desired pre-trained model
roberta = 'roberta-large'

# Defining DistilBERT tokenizer.
# The checkpoint is the *cased* one, so the input must not be lower-cased:
# lower-casing would discard the case information the vocabulary was built with.
Distiltokenizer = DistilBertTokenizer.from_pretrained(distil_bert, do_lower_case=False, add_special_tokens=True,max_length=MAX_LENGTH, pad_to_max_length=True)
# RoBERTa tokenizer, the one used for the train / validation / test inputs below.
RobTokenizer=RobertaTokenizer.from_pretrained(roberta, do_lower_case=True, add_special_tokens=True,max_length=MAX_LENGTH, pad_to_max_length=True)

In [None]:
def tokenize(sentences, tokenizer):
    """Encode texts into fixed-length model inputs.

    Every sentence is encoded with ``tokenizer.encode_plus`` using special
    tokens, ``max_length=MAX_LENGTH`` and padding to that length.

    Returns
    -------
    (input_ids, attention_masks, token_type_ids)
        Three int32 NumPy arrays with one row per sentence.
    """
    encodings = [
        tokenizer.encode_plus(text, add_special_tokens=True, max_length=MAX_LENGTH, pad_to_max_length=True,
                              return_attention_mask=True, return_token_type_ids=True)
        for text in tqdm(sentences)
    ]
    fields = ('input_ids', 'attention_mask', 'token_type_ids')
    return tuple(
        np.asarray([encoding[field] for encoding in encodings], dtype='int32')
        for field in fields
    )

In [None]:
###############
## Autre Model
###############

def create_model_bis(BertModel='distilbert-base-uncased',max_length=MAX_LENGTH):
    """Build a classifier on top of the first-token embedding of a RoBERTa model.

    The checkpoint `BertModel` is loaded with ``TFRobertaModel``; the hidden
    state at position 0 goes through batch normalisation, two
    sigmoid Dense layers (256 and 128 units, each followed by 30% dropout)
    and a 28-way softmax.

    Inputs of the returned Keras model: ``[input_ids, attention_mask]``,
    both int32 of shape ``(max_length,)``.

    Note: a later cell of this notebook defines another function with the
    same name, which replaces this one when cells are run in order.
    """
    backbone = TFRobertaModel.from_pretrained(BertModel)

    # Model inputs
    token_ids = Input(shape=(max_length,), name='input_token', dtype='int32')
    attention_mask = Input(shape=(max_length,), name='masked_token', dtype='int32')

    # First output of the transformer is the sequence of hidden states;
    # keep only position 0 of each sequence.
    sequence_output = backbone(token_ids, attention_mask=attention_mask)[0]
    features = BatchNormalization()(sequence_output[:, 0, :])

    for units in (256, 128):
        features = Dense(units, activation='sigmoid')(features)
        features = Dropout(0.3)(features)
    probabilities = Dense(28, activation='softmax')(features)

    return Model(inputs=[token_ids, attention_mask], outputs=probabilities)

In [None]:
def create_model_bis(BertModel='distilbert-base-uncased',max_length=300):
    """Build a CNN classifier over the last three hidden layers of a RoBERTa model.

    The checkpoint `BertModel` is loaded with ``TFRobertaModel`` and
    ``output_hidden_states=True``. The hidden states of the last three layers
    are concatenated on the feature axis, batch-normalised and fed to a
    stack of 1-D convolutions / max-poolings, then to a 32-unit ReLU Dense
    layer, 20% dropout and a 28-way softmax.

    Inputs of the returned Keras model: ``[input_ids, attention_mask]``,
    both int32 of shape ``(max_length,)``.

    Notes
    -----
    * This definition replaces the earlier function of the same name when
      the notebook is run in order.
    * NOTE(review): the layers are now wired as one sequential chain; weights
      saved from a differently wired version will not load into this model.
    """
    transformer = TFRobertaModel.from_pretrained(BertModel, output_hidden_states=True)

    # Model inputs
    input_ids_in = Input(shape=(max_length,), name='input_token', dtype='int32')
    input_masks_in = Input(shape=(max_length,), name='masked_token', dtype='int32')

    # Index 2 of the transformer output holds the per-layer hidden states.
    hidden_states = transformer(input_ids_in, attention_mask=input_masks_in)[2]

    merged = tf.keras.layers.concatenate([hidden_states[i] for i in (-3, -2, -1)])
    # Normalise the concatenated features. (The normalisation used to be
    # applied to an undefined tensor, and a second layer was called with no
    # input at all, so building the model raised an error.)
    X = tf.keras.layers.BatchNormalization()(merged)

    # Each layer consumes the previous layer's output; the first convolution
    # used to be discarded because the second one was applied to `merged` too.
    X = tf.keras.layers.Conv1D(filters=6, kernel_size=4, strides=3, padding='valid', activation='relu')(X)
    X = tf.keras.layers.Conv1D(filters=6, kernel_size=10, strides=3, padding='valid', activation='relu')(X)

    X = tf.keras.layers.MaxPool1D(pool_size=2)(X)
    X = tf.keras.layers.Conv1D(filters=6, kernel_size=4, strides=1, padding='valid', activation='relu')(X)
    X = tf.keras.layers.MaxPool1D(pool_size=2)(X)
    X = tf.keras.layers.Conv1D(filters=128, kernel_size=4, strides=1, padding='valid', activation='relu')(X)
    X = tf.keras.layers.Flatten()(X)

    X = tf.keras.layers.Dense(32, activation='relu')(X)
    X = tf.keras.layers.Dropout(0.2)(X)
    X = tf.keras.layers.Dense(28, activation='softmax')(X)
    model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

    return model


In [None]:
# Shuffle all rows (sample(frac=1)) and renumber the index from 0.
new_train_data=new_train_data.sample(frac=1).reset_index(drop=True)

In [None]:
# Display the shuffled augmented training set.
new_train_data

In [None]:
# Tokenization / Creation des inputs : TrainSet
# NOTE(review): this tokenizes train_data, not new_train_data, so the
# augmented rows built above are not part of the model inputs - confirm that
# this is intended.
input_ids,input_masks,input_segments=tokenize(train_data.description, RobTokenizer)

In [None]:
# Tokenization / Creation des inputs : ValidationSet
# Same encoding as the training set, applied to the validation descriptions.
valid_input_ids,valid_input_masks,valid_input_segments=tokenize(valid_data.description, RobTokenizer)

In [None]:
#########################
### Outils pour le réseau
#########################
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint

optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# Sparse variant: the targets are class ids, not one-hot vectors.
loss = tf.keras.losses.SparseCategoricalCrossentropy()
# 'val_f1' is not a built-in Keras metric: it is written into the epoch logs
# by the Metrics callback defined below.
earlyStopping=EarlyStopping(monitor='val_f1',patience=5, verbose=1,mode='max')

# Weights-only checkpoints, written only when val_f1 improves; the file name
# embeds the epoch number and the val_f1 value.
filepath="Model_{epoch:02d}_{val_f1}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_f1', verbose=1, save_weights_only=True,save_best_only=True,mode='max')

In [None]:
### Important for the TPU: build and compile the model inside the strategy scope
with tpu_strategy.scope():
    model = create_model_bis(BertModel=roberta)
    model.summary()
    model.compile(optimizer=optimizer,loss=loss,metrics=['acc']) # No point in adding the F1 score here (it is computed by the Metrics callback)

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score

class Metrics(tf.keras.callbacks.Callback):
    """Keras callback computing macro F1 / recall / precision on a validation set.

    At the end of every epoch the model predicts the validation inputs and
    the three scores are printed and written into the epoch `logs` under
    the keys 'val_f1', 'val_recall' and 'val_precision'.

    Parameters
    ----------
    valid_data : pair ``(inputs, targets)``; `inputs` is passed as-is to
        ``model.predict`` and `targets` holds either class ids or one-hot
        rows (a 2-D array with more than one column is arg-maxed).
    """

    def __init__(self, valid_data):
        super(Metrics, self).__init__()
        self.validation_data = valid_data

    def on_epoch_end(self, epoch, logs=None):
        # Only substitute a dict when none was given: a real (even empty) dict
        # must be updated in place, otherwise the values added here never
        # reach whoever passed it in.
        if logs is None:
            logs = {}
        inputs, val_targ = self.validation_data[0], self.validation_data[1]
        val_predict = np.argmax(self.model.predict(inputs), -1)
        # One-hot targets -> class ids
        if len(val_targ.shape) == 2 and val_targ.shape[1] != 1:
            val_targ = np.argmax(val_targ, -1)

        _val_f1 = f1_score(val_targ, val_predict, average='macro',zero_division=0)
        _val_recall = recall_score(val_targ, val_predict, average='macro',zero_division=0)
        _val_precision = precision_score(val_targ, val_predict, average='macro',zero_division=0)

        logs['val_f1'] = _val_f1
        logs['val_recall'] = _val_recall
        logs['val_precision'] = _val_precision
        print(" — val_f1: %f — val_precision: %f — val_recall: %f" % (_val_f1, _val_precision, _val_recall))

In [None]:
###########################
### Apprentissage du Réseau
###########################
BATCH_SIZE=64
EPOCHS=50
# Metrics is listed before earlyStopping and checkpoint because it is the
# callback that writes 'val_f1' into the logs, which the other two monitor
# (this relies on Keras invoking callbacks in list order).
history=model.fit([input_ids,input_masks],train_data.Category,
                  batch_size=BATCH_SIZE,epochs=EPOCHS,
                  callbacks=[Metrics(valid_data=([valid_input_ids,valid_input_masks],valid_data.Category)),earlyStopping,checkpoint],
                  verbose=1,shuffle=True)

## Pour le model 3 UNIQUEMENT
## model.fit([input_ids,input_masks],y={'gender': train_data.gender, 'jobs': train_data.Category},verbose=2,validation_split=0.3,batch_size=BATCH_SIZE,epochs=EPOCHS)

In [None]:
from sklearn.metrics import f1_score

def Load_Model(path):
    """Rebuild the network with `create_model_bis` on the `roberta`
    checkpoint and load the weights stored at `path` into it.

    Returns the model with the loaded weights.
    """
    restored = create_model_bis(BertModel=roberta)
    restored.load_weights(path)
    return restored

def predict_classes(new_model,valid_inputs):
    """Return the predicted class id for each input row.

    `valid_inputs` is forwarded to ``new_model.predict``; the class is the
    index of the highest score on the last axis.
    """
    scores = new_model.predict(valid_inputs)
    return scores.argmax(axis=-1)

def inspect_results(y_pred,y_true):
    """Print the macro-averaged F1 score of predictions against the truth.

    Parameters
    ----------
    y_pred : predicted class ids.
    y_true : true class ids.

    The score uses the same settings as the Metrics callback
    (``average='macro'``, ``zero_division=0``).
    """
    # TODO: show the misclassified examples, which classes they are confused
    # with, and what those descriptions look like.
    # f1_score expects (y_true, y_pred); the default 'binary' average raises
    # an error on multiclass labels, hence the explicit macro average.
    print(f1_score(y_true, y_pred, average='macro', zero_division=0))
    

In [None]:
# Restore a saved checkpoint. The file name follows the
# "Model_{epoch:02d}_{val_f1}.hdf5" pattern used by the ModelCheckpoint
# callback and must exist in the working directory.
new_model=Load_Model(path="./Model_08_0.7986206856061193.hdf5")

In [None]:
# Predicted class ids of the restored model on the validation inputs.
classes=predict_classes(new_model,[valid_input_ids,valid_input_masks])

In [None]:
### --------- Importation et prétraitement des données de test --------- 
## Importation des données

path = '/kaggle/input/defi-ia-insa-toulouse/'
df_X_test = pd.read_json(path + 'test.json')

data_test=df_X_test.drop(['Id'], axis = 1)

data_test['gender'] = pd.Categorical(data_test['gender'].replace({'M': 0, 'F': 1}))
# Apply the same text cleaning as for the training / validation descriptions;
# without it the model is fed test text in a different form from the one it
# was trained on.
data_test['description'] = data_test.description.apply(cleanText)

In [None]:
# Sanity check: number of test rows and columns.
print(data_test.shape)

In [None]:
# Encode the test descriptions with the same tokenizer as the training set.
input_ids_test,input_masks_test,input_segments_test=tokenize(data_test.description, RobTokenizer)

In [None]:
# NOTE(review): this predicts with `model` (the in-memory model from the
# training cell), not with `new_model` restored from the checkpoint above.
# EarlyStopping is configured without restore_best_weights, so confirm which
# weights are meant to produce the submission.
proba_test=model.predict([input_ids_test,input_masks_test])

In [None]:
# Predicted class id = index of the highest score on the last axis.
classesproba_test=proba_test.argmax(axis=-1)

In [None]:
# Sanity check: one prediction per test row is expected.
print(classesproba_test.shape)

In [None]:
# Submission frame with a single 'Category' column; rows keep the order of
# the predictions.
# NOTE(review): the test 'Id' values are not included - confirm that the
# default 0..n-1 index matches the ids expected by the competition.
submis=pd.DataFrame(classesproba_test,columns =['Category'])
print(submis.shape)
print(submis.head())

In [None]:
# Write the submission with pandas defaults: the index is written as an
# unnamed first column, and the file name has no '.csv' extension.
# NOTE(review): confirm this matches the required submission format.
submis.to_csv('sub031220')