In [None]:
!pip install bert-for-tf2
!pip install transformers
!pip install sentencepiece

In [8]:
import numpy as np
import pandas as pd
import os
import math

try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import transformers # Modèle Transformers sur lesquels reposent BERT+ variantess.
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.model_selection import train_test_split
import bert
import tensorflow_addons as tfa
from sklearn.metrics import f1_score

In [2]:
# Mount Google Drive at /content/drive (requires the google.colab module);
# the data files read below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Folder on the mounted drive that holds the data files.
path = '/content/drive/MyDrive/data_sd/'

# Features come from a JSON file, labels from a CSV file.
df_X = pd.read_json(os.path.join(path, 'train.json'))
df_label = pd.read_csv(os.path.join(path, 'train_label.csv'))

# Join features and labels on their shared columns, discard the 'Id' and
# 'gender' columns, and store 'Category' with a categorical dtype.
df = df_X.merge(df_label).drop(columns=['Id', 'gender'])
df['Category'] = pd.Categorical(df['Category'])

In [16]:
def load_bert_tokenizer(path, trainable = False):
  """
  Build a BERT ``FullTokenizer`` from a TensorFlow Hub module.

  The module at ``path`` is loaded as a ``hub.KerasLayer``; its vocabulary
  file and lower-casing flag are read from the resolved object and handed
  to the tokenizer constructor.

  Args:
    path: handle (URL or path) of the hub module to load.
    trainable: forwarded to ``hub.KerasLayer``.

  Returns:
    A ``bert.bert_tokenization.FullTokenizer`` instance.
  """
  tokenizer_cls = bert.bert_tokenization.FullTokenizer
  resolved = hub.KerasLayer(path, trainable=trainable).resolved_object
  vocab_path = resolved.vocab_file.asset_path.numpy()
  lower_case = resolved.do_lower_case.numpy()
  return tokenizer_cls(vocab_path, lower_case)

def tokenize_description(text):
  """
  Convert a text into the list of vocabulary ids of its tokens.

  Relies on the module-level ``tokenizer`` object, which must be created
  before this function is called.

  Args:
    text: the string to tokenize.

  Returns:
    The ids returned by ``tokenizer.convert_tokens_to_ids`` for the tokens
    produced by ``tokenizer.tokenize(text)``.
  """
  # The body must share the docstring's indentation; the former deeper
  # indent on the return statement made the definition an IndentationError.
  return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))


def data_to_tf(X, Y = None, BATCH_SIZE = 32):
  """
  Turn descriptions (and optional labels) into a batched tf.data.Dataset.

  Args:
    X: iterable of description strings (for example a pandas Series, not a
      whole DataFrame); each item is passed to ``tokenize_description``.
    Y: optional iterable of integer labels, one per description. When
      ``None``, every sample gets the dummy label ``0``, which yields a
      dataset meant for prediction rather than training.
    BATCH_SIZE: number of samples per batch.

  Returns:
    A dataset of ``(token_ids, label)`` batches, both declared as
    ``tf.int32``; token id sequences are padded per batch.

  Raises:
    ValueError: if ``Y`` is given but does not hold exactly one label per
      description (previously the extra items were silently dropped).
  """
  token_ids = [tokenize_description(description) for description in X]
  if Y is None:
    labels = [0] * len(token_ids)
  else:
    labels = list(Y)
    if len(labels) != len(token_ids):
      raise ValueError(
          "X and Y must have the same length, got %d descriptions and %d labels"
          % (len(token_ids), len(labels)))
  data = list(zip(token_ids, labels))

  # Sequences have different lengths, so the dataset is fed from a generator
  # and padded batch by batch instead of being built from one dense tensor.
  processed_dataset = tf.data.Dataset.from_generator(lambda: data, output_types=(tf.int32, tf.int32))
  batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))
  return batched_dataset


In [None]:
# TF Hub handle of the bert_en_uncased_L-12_H-768_A-12 module whose
# vocabulary and lower-casing flag configure the tokenizer.
BERT_HUB_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"
tokenizer = load_bert_tokenizer(BERT_HUB_URL)

In [28]:
# Fixed seed: re-running this cell reproduces the same 80/20 split, so a
# validation set built from an earlier run can never overlap the rows the
# model is (re)trained on.
SPLIT_SEED = 42
trainset, validset = train_test_split(df, shuffle=True, train_size=0.8,
                                      random_state=SPLIT_SEED)

# Tokenized, batched training data with its labels.
trainset_tf = data_to_tf(trainset['description'], trainset['Category'])

In [29]:
# What follows is pure copy-paste.
class TEXT_MODEL(tf.keras.Model):
    """Text classifier built from an embedding and three parallel 1-D convolutions.

    Token ids are embedded, convolved with kernel sizes 2, 3 and 4, and each
    convolution output is reduced with a global max pool. The three pooled
    vectors are concatenated and passed through a dense layer, dropout and a
    final classification layer.

    Args:
        vocabulary_size: input dimension of the embedding layer.
        embedding_dimensions: output dimension of the embedding layer.
        cnn_filters: number of filters of each of the three convolutions.
        dnn_units: number of units of the hidden dense layer.
        model_output_classes: number of target classes. ``2`` builds a single
            sigmoid unit, any other value one softmax unit per class.
        dropout_rate: rate of the dropout layer applied after the dense layer.
        training: unused; kept so existing constructor calls stay valid.
        name: name given to the Keras model.
    """

    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)

        # Three convolutions over the same embedded sequence; they differ
        # only in kernel size (2, 3 and 4 tokens).
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        # A single pooling layer instance is reused for all three branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            # Binary case: one probability per sample.
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            # Multi-class case: one probability per class.
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: batch of token id sequences fed to the embedding layer.
            training: forwarded to the dropout layer. Defaults to ``None`` so
                the method can also be invoked without the flag.

        Returns:
            The output of the final dense layer.
        """
        # NOTE(review): with padding="valid" and a kernel size of 4, a batch
        # whose padded length is below 4 presumably cannot be convolved;
        # confirm against the data.
        embedded = self.embedding(inputs)
        branches = (self.cnn_layer1, self.cnn_layer2, self.cnn_layer3)
        pooled = [self.pool(conv(embedded)) for conv in branches]

        features = tf.concat(pooled, axis=-1)  # (batch_size, 3 * cnn_filters)
        features = self.dense_1(features)
        features = self.dropout(features, training=training)
        return self.last_dense(features)

In [30]:
# Size of the tokenizer vocabulary; used as the embedding's vocabulary size.
VOCAB_LENGTH = len(tokenizer.vocab)
# Number of target classes handed to the model.
OUTPUT_CLASSES = 28

# Tunable hyperparameters
EMB_DIM = 200        # embedding dimension
CNN_FILTERS = 100    # filters per convolution
DNN_UNITS = 256      # units of the hidden dense layer
DROPOUT_RATE = 0.2   # dropout rate
NB_EPOCHS = 2        # training epochs

In [31]:
# Collect the constructor arguments in one mapping, then build the model.
model_config = {
    "vocabulary_size": VOCAB_LENGTH,
    "embedding_dimensions": EMB_DIM,
    "cnn_filters": CNN_FILTERS,
    "dnn_units": DNN_UNITS,
    "model_output_classes": OUTPUT_CLASSES,
    "dropout_rate": DROPOUT_RATE,
}
text_model = TEXT_MODEL(**model_config)

In [32]:
# Pick the loss and metric that match the output layer: the binary pair for
# two classes, the sparse categorical pair otherwise.
if OUTPUT_CLASSES == 2:
    loss_name, metric_name = "binary_crossentropy", "accuracy"
else:
    loss_name, metric_name = ("sparse_categorical_crossentropy",
                              "sparse_categorical_accuracy")

text_model.compile(loss=loss_name,
                   optimizer="adam",
                   metrics=[metric_name])

In [None]:
# Train on the batched training dataset for NB_EPOCHS epochs.
text_model.fit(trainset_tf, epochs=NB_EPOCHS)

Epoch 1/2
   2536/Unknown - 169s 67ms/step - loss: 0.9853 - sparse_categorical_accuracy: 0.7205

In [24]:
 # Batched validation descriptions; no labels are passed, so data_to_tf
 # attaches the dummy label 0 to every sample.
 validset_tf = data_to_tf(validset['description'])

In [25]:
# Model outputs for every validation sample, one row per sample.
res = text_model.predict(validset_tf)
# Index of the highest output in each row, i.e. the predicted class.
res1 = list(np.argmax(res, axis=1))

In [27]:
# scikit-learn metrics take (y_true, y_pred) in that order. Both sides are
# converted to plain NumPy arrays so the metric does not have to deal with
# a pandas categorical column or a Python list of NumPy scalars.
# NOTE(review): the recorded run of this cell ended in a TypeError; the
# categorical dtype is a possible cause - confirm on a fresh run.
y_true = validset['Category'].to_numpy()
y_pred = np.asarray(res1)
f1_score(y_true, y_pred, average='macro')

TypeError: ignored