In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import tqdm

from sklearn.utils import shuffle

### Initialisation de wandb

In [2]:
import wandb

# Authenticate this kernel session with Weights & Biases before any run or
# sweep is created further down.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33merwanlbv[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
%env "WANDB_NOTEBOOK_NAME" "PureEmbdeding with Sweep and Tensorflow"

env: "WANDB_NOTEBOOK_NAME"="PureEmbdeding with Sweep and Tensorflow"


### Récupération de la base de données

In [4]:
# Location of the French tweets CSV. The default is the original local path;
# set the FRENCH_TWEETS_CSV environment variable to use another copy without
# editing this cell.
path = os.environ.get(
    "FRENCH_TWEETS_CSV",
    "/Users/erwan/Programmes/Stage/dlexperiments/Erwan/Text_Classification/datasets/Tweeter/french_tweets.csv",
)

df = pd.read_csv(path)
# Fixed seed so this shuffled copy is identical on every Restart & Run All.
shuff_df = shuffle(df, random_state=42)

In [5]:
# Keep only the first `df_size` shuffled rows and show the class balance.
df_size = 1000
small_df = shuff_df.iloc[:df_size]
(len(small_df), small_df['label'].value_counts())

(1000,
 1    515
 0    485
 Name: label, dtype: int64)

### Construction du tokenizer & des ensembles de données 

In [7]:
def build_tokenizer_and_datasets(df, config, seed=None):
    """Build the text tokenizer and the batched train/validation/test datasets.

    Args:
        df: DataFrame with a 'text' column (inputs) and a 'label' column
            (targets).
        config: object exposing ``global_ds_size``, ``train_split``,
            ``val_split``, ``batch_size``, ``vocab_size`` and ``max_length``
            as attributes.
        seed: optional ``random_state`` forwarded to the row shuffle. The
            default ``None`` keeps the shuffle unseeded; pass an int to get
            the same split on every call.

    Returns:
        A tuple ``(tokenizer_layer, train_ds, val_ds, test_ds)``. The three
        datasets yield batches of ``(token_ids, label)`` and are cached and
        prefetched. The test set is whatever remains after the train and
        validation splits.
    """
    print("------------")

    def create_ds(df, size):
        # Shuffle the rows once in pandas, then keep the first `size` rows.
        shuffled_df = shuffle(df, random_state=seed)[:size]
        text_seq = shuffled_df['text']
        target_seq = shuffled_df['label']
        ds = tf.data.Dataset.from_tensor_slices((text_seq, target_seq))

        return ds

    ds = create_ds(df, config.global_ds_size)
    ds_size = len(ds)
    print(f" Ensemble de données créé, taille : {ds_size}")
    print("------------")


    train_size = int(config.train_split * ds_size)
    val_size = int(config.val_split * ds_size)
    test_size = ds_size - train_size - val_size
    print(f" Taille des ensembles de données : {train_size}, {val_size}")

    # No tf.data shuffle here on purpose. The rows are already shuffled above,
    # and the former `ds.shuffle(1)` discarded its result, so it did nothing.
    # A real shuffle placed before take/skip reshuffles on each iteration by
    # default, which would let the three splits overlap.
    str_train_ds = ds.take(train_size).batch(config.batch_size)
    str_val_ds = ds.skip(train_size).take(val_size).batch(config.batch_size)
    str_test_ds = ds.skip(train_size + val_size).batch(config.batch_size)

    print("Fin du chargement des bases de données")
    # Exact sample counts; batch count * batch size overstates them whenever
    # the last batch is partial.
    print(train_size, val_size, test_size)
    print("------------")

    tokenizer_layer = tf.keras.layers.TextVectorization(
        standardize='lower_and_strip_punctuation',
        split='whitespace',
        max_tokens=config.vocab_size,
        output_sequence_length=config.max_length,
    )

    # Fit the vocabulary on the training texts only, so nothing from the
    # validation or test splits leaks into the tokenizer.
    tokenizer_layer.adapt(str_train_ds.map(lambda text, label: text))
    print("Fin de l'entraînement du tokenizer")
    print("------------")


    # Turn each batch of raw strings into a batch of token ids.
    def tokenize_text(text, label):
        # `text` arrives already batched; add a trailing axis so each element
        # is a one-string row before vectorization.
        text = tf.expand_dims(text, -1)
        res = tokenizer_layer(text)

        return res, label

    train_ds = str_train_ds.map(tokenize_text)
    val_ds = str_val_ds.map(tokenize_text)
    test_ds = str_test_ds.map(tokenize_text)

    AUTOTUNE = tf.data.AUTOTUNE

    train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
    val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
    test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

    print("Fin de la préparation des bases de donneés")
    print(train_size + val_size + test_size)
    print("-----------")

    return tokenizer_layer, train_ds, val_ds, test_ds

### Construction du modèle 

In [8]:
def build_model(config):
    """Assemble the embedding-average classifier described by `config`.

    Reads ``vocab_size``, ``embedding_dim``, ``drop1`` and ``drop2`` from
    `config`. The returned Sequential model is not compiled and ends in a
    single-unit Dense layer with no activation, so it outputs raw logits.
    """
    keras_layers = tf.keras.layers

    stack = [
        keras_layers.Embedding(config.vocab_size, config.embedding_dim),
        keras_layers.Dropout(config.drop1),
        # Average the token embeddings over the sequence axis.
        keras_layers.GlobalAveragePooling1D(),
        keras_layers.Dropout(config.drop2),
        # Default activation is None: the output is an unnormalized logit.
        keras_layers.Dense(1),
    ]

    return tf.keras.Sequential(stack)

### Définition des callbacks

In [9]:
def build_callback(config, name):
    """Create the Keras callbacks for one training run.

    Args:
        config: object exposing ``callbacks_log_dir`` (directory prefix for
            all runs) and ``early_stopping_min_delta``.
        name: run name; used as the per-run sub-directory.

    Returns:
        A list with the TensorBoard, ModelCheckpoint and EarlyStopping
        callbacks, in that order.
    """

    # Per-run directory: <callbacks_log_dir><name>
    log_dir = config.callbacks_log_dir + name
    # Checkpoints go inside the run directory. The run name is already part
    # of `log_dir`; appending it a second time produced a sibling
    # "<name><name>/models" path outside the run's own folder.
    log_model = log_dir + '/models'

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir,
        histogram_freq=1
    )

    # Save weights only when validation accuracy beats both the previous best
    # and the 0.60 starting threshold.
    model_checkpoint_callbacks = tf.keras.callbacks.ModelCheckpoint(
        filepath=log_model,
        save_weights_only=True,
        monitor='val_binary_accuracy',
        mode='max',
        save_best_only=True,
        initial_value_threshold=0.60,
        verbose=1
    )

    # Stop after 5 epochs without an improvement of at least
    # `early_stopping_min_delta`. The best weights are not restored.
    early_stopping_callbacks = tf.keras.callbacks.EarlyStopping(
        monitor='val_binary_accuracy',
        min_delta=config.early_stopping_min_delta,
        patience=5,
        verbose=1,
        mode='auto',
        baseline=None,
        restore_best_weights=False
    )

    CALLBACKS = [tensorboard_callback, model_checkpoint_callbacks, early_stopping_callbacks]

    return CALLBACKS

### Configuration de Sweep

In [10]:
import yaml
import pprint

In [16]:
# Sweep definition file, resolved relative to the notebook's working directory.
sweep_config_path = "sweep_config.yaml"

# NOTE(review): this uses PyYAML's FullLoader; if the file only contains
# plain mappings and scalars, yaml.safe_load would be the stricter choice.
with open(sweep_config_path) as file:
    sweep_config = yaml.full_load(file)

pprint.pprint(sweep_config)

{'method': 'random',
 'metric': {'goal': 'maximize', 'name': 'val_binary_acc'},
 'parameters': {'batch_size': {'value': 50},
                'callbacks_log_dir': {'value': 'logs/run/'},
                'drop1': {'values': [0.2, 0.3, 0.4]},
                'drop2': {'values': [0.2, 0.3, 0.4]},
                'early_stopping_min_delta': {'value': 0.001},
                'embedding_dim': {'values': [100, 200, 500]},
                'epochs': {'value': 5},
                'global_ds_size': {'value': 1000},
                'lr': {'value': '8e-4'},
                'max_length': {'value': 200},
                'train_split': {'value': 0.7},
                'val_split': {'value': 0.15},
                'vocab_size': {'values': [15000, 20000, 30000]}}}


### Entraînement

In [20]:
def sweep_train():
  """Run one sweep trial: build the data, model and callbacks, train, report.

  Hyperparameters are read from ``wandb.config`` and the data comes from the
  notebook-level ``df``. Per-epoch Keras metrics and the best validation
  accuracy are sent to W&B so the sweep has a metric to compare runs on.
  """

  with wandb.init():

    wandb.config.architecture_name = "PureEmbedding"
    wandb.config.dataset_name = "tweeter-fr"

    tokenizer, train_ds, val_ds, test_ds = build_tokenizer_and_datasets(
      df=df,
      config=wandb.config
    )

    callbacks = build_callback(wandb.config, wandb.run.name)

    model = build_model(wandb.config)

    # The YAML value '8e-4' is loaded as a string; make sure the optimizer
    # receives a number whatever the config layer hands back.
    optimizer = tf.keras.optimizers.Adam(learning_rate=float(wandb.config.lr))
    # The model outputs raw logits, hence from_logits=True.
    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    acc_metric = tf.keras.metrics.BinaryAccuracy()

    model.compile(
      optimizer=optimizer,
      loss=loss_fn,
      metrics=[acc_metric]
    )

    print(f"Entraînement lancé \n - Nom : {wandb.run.name} \n - Configuration : {wandb.config} \n\n")

    history = model.fit(
      train_ds,
      validation_data=val_ds,
      epochs=wandb.config.epochs,
      callbacks=callbacks,
    )

    # Forward every per-epoch Keras metric to W&B under its Keras name.
    metric_names = list(history.history)
    for epoch, values in enumerate(zip(*history.history.values()), start=1):
      epoch_metrics = {key: float(value) for key, value in zip(metric_names, values)}
      wandb.log({"epoch": epoch, **epoch_metrics})

    # Record the best validation accuracy reached during the run.
    print(history.history)
    best_val_acc = float(max(history.history['val_binary_accuracy']))
    print(best_val_acc)
    wandb.config.best_val_bin_acc = best_val_acc
    # 'val_binary_acc' is the metric name declared in the sweep
    # configuration; without this log call the sweep has nothing to maximize.
    wandb.log({"val_binary_acc": best_val_acc})

In [18]:
# Register the sweep described by `sweep_config` in the "Remote-PureEmb-tf"
# project; the returned id is what the agent cell uses to fetch trials.
sweep_id = wandb.sweep(sweep_config, project="Remote-PureEmb-tf")

Create sweep with ID: xn3q70sq
Sweep URL: https://wandb.ai/erwanlbv/Remote-PureEmb-tf/sweeps/xn3q70sq


In [21]:
# Launch an agent in this kernel that calls `sweep_train` for 4 trials of the
# sweep created above.
wandb.agent(sweep_id, function=sweep_train, count=4)

[34m[1mwandb[0m: Agent Starting Run: ccsal6db with config:
[34m[1mwandb[0m: 	batch_size: 50
[34m[1mwandb[0m: 	callbacks_log_dir: logs/run/
[34m[1mwandb[0m: 	drop1: 0.4
[34m[1mwandb[0m: 	drop2: 0.2
[34m[1mwandb[0m: 	early_stopping_min_delta: 0.001
[34m[1mwandb[0m: 	embedding_dim: 500
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	global_ds_size: 1000
[34m[1mwandb[0m: 	lr: 0.0008
[34m[1mwandb[0m: 	max_length: 200
[34m[1mwandb[0m: 	train_split: 0.7
[34m[1mwandb[0m: 	val_split: 0.15
[34m[1mwandb[0m: 	vocab_size: 20000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


------------
 Ensemble de données créé, taille : 1000
------------
 Taille des ensembles de données : 700, 150
Fin du chargement des bases de données
700 150 150
------------
Fin de l'entraînement du tokenizer
------------
Fin de la préparation des bases de donneés
1000
-----------
Entraînement lancé 
 - Nom : summer-sweep-5 
 - Configuration : {'batch_size': 50, 'callbacks_log_dir': 'logs/run/', 'drop1': 0.4, 'drop2': 0.2, 'early_stopping_min_delta': 0.001, 'embedding_dim': 500, 'epochs': 5, 'global_ds_size': 1000, 'lr': 0.0008, 'max_length': 200, 'train_split': 0.7, 'val_split': 0.15, 'vocab_size': 20000, 'architecture_name': 'PureEmbedding', 'dataset_name': 'tweeter-fr'} 


Epoch 1/5
Epoch 1: val_binary_accuracy did not improve from 0.60000
Epoch 2/5
Epoch 2: val_binary_accuracy did not improve from 0.60000
Epoch 3/5
Epoch 3: val_binary_accuracy did not improve from 0.60000
Epoch 4/5
Epoch 4: val_binary_accuracy did not improve from 0.60000
Epoch 5/5
Epoch 5: val_binary_accuracy did

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

[34m[1mwandb[0m: Agent Starting Run: hc8fr947 with config:
[34m[1mwandb[0m: 	batch_size: 50
[34m[1mwandb[0m: 	callbacks_log_dir: logs/run/
[34m[1mwandb[0m: 	drop1: 0.4
[34m[1mwandb[0m: 	drop2: 0.2
[34m[1mwandb[0m: 	early_stopping_min_delta: 0.001
[34m[1mwandb[0m: 	embedding_dim: 500
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	global_ds_size: 1000
[34m[1mwandb[0m: 	lr: 0.0008
[34m[1mwandb[0m: 	max_length: 200
[34m[1mwandb[0m: 	train_split: 0.7
[34m[1mwandb[0m: 	val_split: 0.15
[34m[1mwandb[0m: 	vocab_size: 15000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


------------
 Ensemble de données créé, taille : 1000
------------
 Taille des ensembles de données : 700, 150
Fin du chargement des bases de données
700 150 150
------------
Fin de l'entraînement du tokenizer
------------
Fin de la préparation des bases de donneés
1000
-----------
Entraînement lancé 
 - Nom : fresh-sweep-6 
 - Configuration : {'batch_size': 50, 'callbacks_log_dir': 'logs/run/', 'drop1': 0.4, 'drop2': 0.2, 'early_stopping_min_delta': 0.001, 'embedding_dim': 500, 'epochs': 5, 'global_ds_size': 1000, 'lr': 0.0008, 'max_length': 200, 'train_split': 0.7, 'val_split': 0.15, 'vocab_size': 15000, 'architecture_name': 'PureEmbedding', 'dataset_name': 'tweeter-fr'} 


Epoch 1/5
Epoch 1: val_binary_accuracy did not improve from 0.60000
Epoch 2/5
Epoch 2: val_binary_accuracy did not improve from 0.60000
Epoch 3/5
Epoch 3: val_binary_accuracy did not improve from 0.60000
Epoch 4/5
Epoch 4: val_binary_accuracy did not improve from 0.60000
Epoch 5/5
Epoch 5: val_binary_accuracy did 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

[34m[1mwandb[0m: Agent Starting Run: yz8u7nfm with config:
[34m[1mwandb[0m: 	batch_size: 50
[34m[1mwandb[0m: 	callbacks_log_dir: logs/run/
[34m[1mwandb[0m: 	drop1: 0.2
[34m[1mwandb[0m: 	drop2: 0.2
[34m[1mwandb[0m: 	early_stopping_min_delta: 0.001
[34m[1mwandb[0m: 	embedding_dim: 200
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	global_ds_size: 1000
[34m[1mwandb[0m: 	lr: 0.0008
[34m[1mwandb[0m: 	max_length: 200
[34m[1mwandb[0m: 	train_split: 0.7
[34m[1mwandb[0m: 	val_split: 0.15
[34m[1mwandb[0m: 	vocab_size: 20000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


------------
 Ensemble de données créé, taille : 1000
------------
 Taille des ensembles de données : 700, 150
Fin du chargement des bases de données
700 150 150
------------
Fin de l'entraînement du tokenizer
------------
Fin de la préparation des bases de donneés
1000
-----------
Entraînement lancé 
 - Nom : peachy-sweep-7 
 - Configuration : {'batch_size': 50, 'callbacks_log_dir': 'logs/run/', 'drop1': 0.2, 'drop2': 0.2, 'early_stopping_min_delta': 0.001, 'embedding_dim': 200, 'epochs': 5, 'global_ds_size': 1000, 'lr': 0.0008, 'max_length': 200, 'train_split': 0.7, 'val_split': 0.15, 'vocab_size': 20000, 'architecture_name': 'PureEmbedding', 'dataset_name': 'tweeter-fr'} 


Epoch 1/5
Epoch 1: val_binary_accuracy did not improve from 0.60000
Epoch 2/5
Epoch 2: val_binary_accuracy did not improve from 0.60000
Epoch 3/5
Epoch 3: val_binary_accuracy did not improve from 0.60000
Epoch 4/5
Epoch 4: val_binary_accuracy did not improve from 0.60000
Epoch 5/5
Epoch 5: val_binary_accuracy did

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

[34m[1mwandb[0m: Agent Starting Run: p8s7641q with config:
[34m[1mwandb[0m: 	batch_size: 50
[34m[1mwandb[0m: 	callbacks_log_dir: logs/run/
[34m[1mwandb[0m: 	drop1: 0.4
[34m[1mwandb[0m: 	drop2: 0.3
[34m[1mwandb[0m: 	early_stopping_min_delta: 0.001
[34m[1mwandb[0m: 	embedding_dim: 100
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	global_ds_size: 1000
[34m[1mwandb[0m: 	lr: 0.0008
[34m[1mwandb[0m: 	max_length: 200
[34m[1mwandb[0m: 	train_split: 0.7
[34m[1mwandb[0m: 	val_split: 0.15
[34m[1mwandb[0m: 	vocab_size: 30000
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


------------
 Ensemble de données créé, taille : 1000
------------
 Taille des ensembles de données : 700, 150
Fin du chargement des bases de données
700 150 150
------------
Fin de l'entraînement du tokenizer
------------
Fin de la préparation des bases de donneés
1000
-----------
Entraînement lancé 
 - Nom : leafy-sweep-8 
 - Configuration : {'batch_size': 50, 'callbacks_log_dir': 'logs/run/', 'drop1': 0.4, 'drop2': 0.3, 'early_stopping_min_delta': 0.001, 'embedding_dim': 100, 'epochs': 5, 'global_ds_size': 1000, 'lr': 0.0008, 'max_length': 200, 'train_split': 0.7, 'val_split': 0.15, 'vocab_size': 30000, 'architecture_name': 'PureEmbedding', 'dataset_name': 'tweeter-fr'} 


Epoch 1/5
Epoch 1: val_binary_accuracy did not improve from 0.60000
Epoch 2/5
Epoch 2: val_binary_accuracy did not improve from 0.60000
Epoch 3/5
Epoch 3: val_binary_accuracy did not improve from 0.60000
Epoch 4/5
Epoch 4: val_binary_accuracy did not improve from 0.60000
Epoch 5/5
Epoch 5: val_binary_accuracy did 

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…