### Classificateur de texte basique (intro to Tf_hub) - FAST 

Avec la base de données IMDB (50 000 avis sur des films, positifs ou négatifs)

Reprise du notebook associé *Intro_TextClassification* avec l'essentiel (en code)

In [1]:
import os 
import re 
import string

import tensorflow as tf

In [6]:
# Root directory of the extracted aclImdb dataset. Set the ACLIMDB_DIR
# environment variable to run the notebook on another machine; when it is
# not set, the original author's local path is used.
path = os.environ.get(
    "ACLIMDB_DIR",
    "/Users/erwan/Programmes/Stage/dlexperiments/Erwan/Text_Classification/datasets/aclImdb",
)
batch_size = 32
seed = 42

# Training set: 80% of the files found under train/.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(path, 'train'),
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

# Validation set: the remaining 20% of train/ (same seed and split as above).
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(path, 'train'),
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

# Test set:
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(path, 'test'),
    batch_size=batch_size)
    

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [5]:
def custom_standardization(input_data):
    """Lowercase the text, replace '<br />' tags with a space and drop punctuation.

    Args:
        input_data: text to standardize, in a form accepted by the
            `tf.strings` ops (presumably a string tensor -- TODO confirm
            against the caller).

    Returns:
        The result of the last `tf.strings.regex_replace` call: the
        lowercased text without '<br />' tags or punctuation characters.
    """
    lowercase = tf.strings.lower(input_data)
    # NOTE: an argument-less `tf.strings.strip()` call used to sit here; it
    # raised a TypeError and its result was never used, so it was removed.
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')

    # Character class built from every character in string.punctuation.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    return tf.strings.regex_replace(stripped_html, punctuation_pattern, '')

In [9]:
vocab_size = 10000  # upper bound on the vocabulary size (max_tokens)
max_length = 200    # length of every vectorized sequence

# Only the review text is needed to build the vocabulary; labels are dropped.
train_text_only = raw_train_ds.map(lambda review, _label: review)

tokenizer_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
)
tokenizer_layer.adapt(train_text_only)

In [10]:
# Fetch the vocabulary once instead of rebuilding the list for every lookup.
vocabulary = tokenizer_layer.get_vocabulary()

# Show which token a few sample indices map to.
for token_id in (10, 80, 6458):
    print(f"{token_id} ---> ", vocabulary[token_id])
print('Vocabulary size: {}'.format(len(vocabulary)))

10 --->  i
80 --->  other
6458 --->  expectation
Vocabulary size: 10000


In [11]:
# Tokenization function applied to each (text, label) element of a dataset
def vectorize_text(text, label):
    """Vectorize `text` with `tokenizer_layer` and pass `label` through unchanged.

    Returns:
        A `(vectorized_text, label)` tuple.
    """
    # Add a trailing axis to the text before handing it to the layer.
    text_column = tf.expand_dims(text, axis=-1)
    return tokenizer_layer(text_column), label

In [12]:
# Vectorize the three datasets and get them ready for training.
### Without AUTOTUNE, training takes roughly 2 s longer on the first epoch
### and about 1 s longer on every following epoch, even with a dataset
### this small and a model this simple.
AUTOTUNE = tf.data.AUTOTUNE

# Same pipeline for each split: tokenize, cache, then prefetch.
train_ds, val_ds, test_ds = (
    raw_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
    for raw_ds in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [13]:
# Iterator over the prepared training dataset, used to pull a sample batch.
train_ds_iterator = iter(train_ds)

In [14]:
# Pull one batch and display the shapes of its inputs and of its labels.
text, label = next(train_ds_iterator)
(text.shape, label.shape)

(TensorShape([32, 200]), TensorShape([32]))

In [10]:
embedding_dim = 150  # Size of the vector used to represent each token

# Classifier that outputs raw logits: the final Dense layer has no activation.
logits_model = tf.keras.Sequential()
logits_model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
logits_model.add(tf.keras.layers.Dropout(0.2))
logits_model.add(tf.keras.layers.GlobalAveragePooling1D())
logits_model.add(tf.keras.layers.Dropout(0.2))
# activation=None by default, so the output is not normalized
logits_model.add(tf.keras.layers.Dense(1))

logits_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    # from_logits=True: the loss receives the raw Dense output.
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    # The accuracy threshold is applied to the raw output as well, hence 0.0.
    metrics=tf.metrics.BinaryAccuracy(threshold=0.0),
)

logits_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 150)         1500000   
                                                                 
 dropout (Dropout)           (None, None, 150)         0         
                                                                 
 global_average_pooling1d (G  (None, 150)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_1 (Dropout)         (None, 150)               0         
                                                                 
 dense (Dense)               (None, 1)                 151       
                                                                 
Total params: 1,500,151
Trainable params: 1,500,151
Non-trainable params: 0
______________________________________________

In [17]:
# Same architecture as the logits model, but the output layer applies a
# sigmoid, so this model returns values in (0, 1) instead of raw logits.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),  # embedding_dim is set in the logits-model cell
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')  # The output is normalized here
])

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    # Sigmoid outputs are always > 0, so a 0.0 threshold would count every
    # prediction as positive. 0.5 is the decision boundary for probabilities.
    metrics=tf.metrics.BinaryAccuracy(threshold=0.5)
)

model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, None, 150)         1500000   
                                                                 
 dropout_10 (Dropout)        (None, None, 150)         0         
                                                                 
 global_average_pooling1d_5   (None, 150)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_11 (Dropout)        (None, 150)               0         
                                                                 
 dense_5 (Dense)             (None, 1)                 151       
                                                                 
Total params: 1,500,151
Trainable params: 1,500,151
Non-trainable params: 0
____________________________________________

In [15]:
# With an embedding dimension of 150, the model overfits beyond 8 epochs.
epochs = 8
logits_history = logits_model.fit(
    x=train_ds,
    epochs=epochs,
    validation_data=val_ds,
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [18]:
# With an embedding dimension of 150, the model overfits beyond 8 epochs.
epochs = 8
history = model.fit(
    x=train_ds,
    epochs=epochs,
    validation_data=val_ds,
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [17]:
# Evaluate the sigmoid model on the vectorized test set, then report the
# loss and the accuracy metric it was compiled with.
test_metrics = model.evaluate(test_ds)
loss, accuracy = test_metrics

print("Erreur : {}".format(loss))
print("Précision: {}".format(accuracy))

Erreur : 0.32067352533340454
Précision: 0.8668000102043152


In [18]:
# Model export: prepend the vectorization layer so the exported model can
# be fed raw strings directly.
# `model` already ends with a sigmoid Dense layer, so no extra
# Activation('sigmoid') is appended: applying the sigmoid twice would squash
# every output into (0.5, 0.73) and turn all predictions into positives.
export_model = tf.keras.Sequential([
  tokenizer_layer,
  model,
])

# Compiled only so that evaluate() can report loss and accuracy; the
# exported model is not trained any further in this notebook.
export_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

In [19]:
# The vectorization layer is part of export_model, so the dataset is passed
# in its raw string form.
export_metrics = export_model.evaluate(raw_test_ds)
loss, accuracy = export_metrics
print(accuracy)

0.8668000102043152
