## C'est parti pour faire du Transformer

In [1]:
import math
import os

import tensorflow as tf

### Datasets

In [2]:
def build_datasets(
    data_dir='/Users/erwan/Programmes/Stage/Text_Classification/aclImdb',
    batch_size=32,
    seed=1,
    validation_split=0.2,
):
    """Load the train / validation / test text datasets from a directory tree.

    The function reads `<data_dir>/train` (split into training and validation
    subsets) and `<data_dir>/test`, then caches and prefetches each dataset.

    Args:
        data_dir: Root folder containing the `train` and `test` sub-folders.
            Defaults to the location originally hard-coded in this notebook.
        batch_size: Number of examples per batch for all three datasets.
        seed: Seed shared by the training and validation calls.
        validation_split: Fraction of `<data_dir>/train` held out for validation.

    Returns:
        A `(train_ds, val_ds, test_ds)` tuple of cached, prefetched datasets.
    """
    train_dir = os.path.join(data_dir, 'train')
    test_dir = os.path.join(data_dir, 'test')

    # The training and validation subsets are carved out of the same folder;
    # both calls must share the same seed and split fraction so that the two
    # subsets are complementary.
    split_kwargs = dict(
        batch_size=batch_size,
        validation_split=validation_split,
        seed=seed,
    )

    raw_train_ds = tf.keras.utils.text_dataset_from_directory(
        train_dir,
        subset='training',
        **split_kwargs,
    )

    raw_val_ds = tf.keras.utils.text_dataset_from_directory(
        train_dir,
        subset='validation',
        **split_kwargs,
    )

    raw_test_ds = tf.keras.utils.text_dataset_from_directory(
        test_dir,
        batch_size=batch_size,
    )

    # Cache after the first pass and overlap input loading with training.
    AUTOTUNE = tf.data.AUTOTUNE
    train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)
    val_ds = raw_val_ds.cache().prefetch(buffer_size=AUTOTUNE)
    test_ds = raw_test_ds.cache().prefetch(buffer_size=AUTOTUNE)

    return train_ds, val_ds, test_ds

### Modèle - Partie Encoder avec la Functional API

In [79]:
def my_First_Encoder(tokenizer, hidden_dim):
    """Build a minimal single-head, single-block encoder for text classification.

    The model is assembled with the Keras Functional API: raw strings are
    tokenized, embedded, passed through one scaled dot-product self-attention
    step with a residual connection and layer normalization, then through a
    two-layer feed-forward network, averaged over the sequence axis and
    projected to a single output unit.

    Args:
        tokenizer: Callable layer that maps a batch of strings to integer token
            ids and exposes `get_vocabulary()` (the notebook passes a
            `TextVectorization` layer).
        hidden_dim: Width used for the embedding, the query/key/value
            projections and the feed-forward layers.

    Returns:
        A `tf.keras.Model` taking a 1-D string tensor and returning one
        un-activated score (logit) per example, shape `(batch, 1)`.
    """
    # Layers
    embedding_layer = tf.keras.layers.Embedding(
        input_dim=len(tokenizer.get_vocabulary()),
        output_dim=hidden_dim,
    )

    query_layer = tf.keras.layers.Dense(hidden_dim)
    key_layer = tf.keras.layers.Dense(hidden_dim)
    value_layer = tf.keras.layers.Dense(hidden_dim)

    norm_layer = tf.keras.layers.LayerNormalization()
    linear_attention_layer = tf.keras.layers.Dense(hidden_dim)

    linear_feed_forward_layer = tf.keras.layers.Dense(hidden_dim, activation='relu')
    feed_forward_output_layer = tf.keras.layers.Dense(hidden_dim)

    pooling_layer = tf.keras.layers.GlobalAveragePooling1D(name='pooling')
    classification_layer = tf.keras.layers.Dense(1)

    # Model
    # Embedding (no positional encoding yet, so the embedding carries no
    # position information).
    text_input = tf.keras.Input(shape=(), dtype=tf.string, name="input")
    tokenized_outputs = tokenizer(text_input)
    embedded_outputs = embedding_layer(tokenized_outputs)

    # Attention zone
    q_attention = query_layer(embedded_outputs)
    k_attention = key_layer(embedded_outputs)
    v_attention = value_layer(embedded_outputs)

    # Scores have shape (batch, query position, key position).
    attention_scores = tf.matmul(q_attention, k_attention, transpose_b=True)
    attention_scores /= math.sqrt(hidden_dim)

    # Normalize over the key axis (the last one) so that, for every query
    # position, the weights applied to the value vectors sum to 1. Using
    # axis=1 would normalize across queries instead.
    # NOTE(review): padded positions are not masked here nor in the pooling
    # step — presumably the tokenizer pads short texts; confirm and add a mask.
    attention_probs = tf.nn.softmax(attention_scores, axis=-1)
    z = tf.matmul(attention_probs, v_attention)

    attention_output = linear_attention_layer(z)
    # Residual connection around the attention step, then layer normalization.
    norm_output = norm_layer(embedded_outputs + attention_output)

    # Feed Forward
    linear = linear_feed_forward_layer(norm_output)
    ff_output = feed_forward_output_layer(linear)

    # Classification: average over the sequence axis, then one output unit.
    pooled_encoder_output = pooling_layer(ff_output)
    logits = classification_layer(pooled_encoder_output)

    return tf.keras.Model(text_input, logits)

### Mise en oeuvre

In [70]:
# Build the three datasets (training, validation, test) used by the cells below.
train_ds, val_ds, test_ds = build_datasets()

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [71]:
# Tokenizer configuration: vocabulary cap and fixed number of tokens per text.
vocab_size = 10000
output_length = 100

tokenizer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=output_length
)

# Learn the vocabulary from the training texts only (labels are dropped).
tokenizer.adapt(train_ds.map(lambda text, label: text))

In [86]:
# Instantiate the encoder with the adapted tokenizer and a hidden size of 5.
simple_encoder = my_First_Encoder(
    tokenizer=tokenizer,
    hidden_dim=5,
)

In [81]:
# Print the layer-by-layer structure and parameter counts of the model.
simple_encoder.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None,)]            0           []                               
                                                                                                  
 text_vectorization_5 (TextVect  (None, 100)         0           ['input[0][0]']                  
 orization)                                                                                       
                                                                                                  
 embedding_16 (Embedding)       (None, 100, 5)       50000       ['text_vectorization_5[3][0]']   
                                                                                                  
 dense_80 (Dense)               (None, 100, 5)       30          ['embedding_16[0][0]']     

In [64]:
# Take the first batch and isolate its first text to smoke-test the model.
batch_text, batch_label = next(iter(train_ds))
f_text = batch_text[0]
# NOTE(review): `input` shadows the Python builtin of the same name; it is
# kept because a later cell reads this variable.
input = tf.expand_dims(f_text, -1)  # add an axis so the scalar string has shape (1,)

f_text.shape, input.shape, input

(TensorShape([]),
 TensorShape([1]),
 <tf.Tensor: shape=(1,), dtype=string, numpy=
 array([b"Kurosawa is a proved humanitarian. This movie is totally about people living in poverty. You will see nothing but angry in this movie. It makes you feel bad but still worth. All those who's too comfortable with materialization should spend 2.5 hours with this movie."],
       dtype=object)>)

In [82]:
# Sanity check: one input text should produce one score, i.e. shape (1, 1).
simple_encoder(input).shape

TensorShape([1, 1])

### Entraînement

In [83]:
# The model's last layer has no activation, so the loss consumes raw logits.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
metric = tf.keras.metrics.BinaryAccuracy()

simple_encoder.compile(
    loss=loss_fn,
    optimizer=optimizer,
    # `compile` documents `metrics` as a list, so wrap the single metric.
    metrics=[metric],
)

In [84]:
# Pour TensorBoard
nump_train = '001'
logdir = "logs/scalars/" + nump_train
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)

In [85]:
# Entraînement 
epochs=5

history = simple_encoder.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=[tensorboard_callback],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
