In [None]:
# prelude
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard

from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

from pathlib import Path
# Directory the notebook is run from; the asset paths in this notebook are built from it.
k_Current_dir = Path.cwd()
# Sub-directory (relative to k_Current_dir) that holds the dataset, the saved figures and the saved model.
k_AssetsDir = "assets"
# Fixed sequence length in tokens: used both as the model's input shape and as the tokenizer's max_length.
k_sms_max_len = 100

# Regarding the warning : TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
# Solution ? 
# ! NOT TESTED YET - Does it apply to Jupyter "in" VSCode ?
# https://saturncloud.io/blog/importerror-iprogress-not-found-please-update-jupyter-and-ipywidgets-although-it-is-installed/
# conda install -c conda-forge ipywidgets
# jupyter nbextension enable --py widgetsnbextension

In [2]:
# -----------------------------------------------------------------------------
# drop empty cols and duplicates, rename cols...
def cleaner(df):
    """Clean the raw SMS spam frame in place and return it.

    Steps, in order:
      1. drop the three "Unnamed: N" columns,
      2. drop duplicated rows,
      3. lower-case the column names and replace "/" with "_",
      4. rename "v1" -> "labels" and "v2" -> "texts",
      5. encode the labels as integers ("ham" -> 0, "spam" -> 1).

    Args:
        df: DataFrame with at least the columns "v1", "v2", "Unnamed: 2",
            "Unnamed: 3" and "Unnamed: 4". It is modified in place.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: if one of the "Unnamed: N" columns is missing.
    """
    # Dropped one at a time, exactly like three separate drop() calls.
    for empty_col in ("Unnamed: 2", "Unnamed: 3", "Unnamed: 4"):
        df.drop(columns=empty_col, inplace=True)

    df.drop_duplicates(inplace=True)

    # Normalise the header: lower-case first, then make it identifier-friendly.
    df.columns = df.columns.str.lower().str.replace("/", "_")

    df.rename(columns={"v1": "labels", "v2": "texts"}, inplace=True)

    # Any label that is neither "ham" nor "spam" is mapped to NaN by Series.map.
    label_codes = {"ham": 0, "spam": 1}
    df["labels"] = df["labels"].map(label_codes)

    return df

In [3]:
# -----------------------------------------------------------------------------
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into the assets directory.

    The file is written to <k_Current_dir>/<k_AssetsDir>/<fig_id>.<fig_extension>.

    Args:
        fig_id: base name of the file, without extension.
        tight_layout: when truthy, plt.tight_layout() is called before saving.
            Note that this is the second positional parameter, so the
            extension is best passed by keyword.
        fig_extension: file extension, also passed to savefig as the format.
        resolution: dpi passed to savefig.

    Returns:
        None.
    """
    stem = k_Current_dir / k_AssetsDir / fig_id
    target = Path(f"{stem}.{fig_extension}")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [None]:

# ! 210 minutes...

# -----------------------------------------------------------------------------
# Read the raw CSV (cp1252-encoded) from the assets directory and clean it in one go.
df = cleaner(pd.read_csv(k_Current_dir / k_AssetsDir / "spam.csv", encoding="cp1252"))
# Plain Python lists: labels go to train_test_split, texts go to the tokenizer.
labels, texts = df['labels'].tolist(), df['texts'].tolist()

# BERT - Bidirectional Encoder Representations from Transformers
# It uses multiple inputs (input_ids & attention_mask), so the model is built
# with the functional API: tf.keras.Sequential([...]) cannot express two inputs.

# Define both inputs
# input_ids      = id of each token as defined in the pre-trained model's vocabulary
# attention_mask = indicates which positions of the sequence the model should take into account
# "Hello, how are you?"
# [7592, 1010, 2129, 2024, 2017, 1029]        input_ids
# [7592, 1010, 2129, 2024, 2017, 1029, 0, 0]  input_ids with padding
# [   1,    1,    1,    1,    1,    1, 0, 0]  attention_masks with padding
# Both inputs have the fixed length k_sms_max_len, the same value given to the tokenizer as max_length.
input_ids       = Input(shape=(k_sms_max_len,), dtype=tf.int32, name="input_ids")
attention_masks = Input(shape=(k_sms_max_len,), dtype=tf.int32, name="attention_mask")

# Display the shapes of the data
# (kept commented out: encoded_data is only created further down in this cell)
# print(f"Shape of input_ids : {encoded_data['input_ids'].shape}")
# print(f"Shape of attention_mask : {encoded_data['attention_mask'].shape}")

# Load the pretrained model
# 12 layers, 768 hidden nodes, 12 attention heads, 110M parameters
# bert-base-uncased : "cat" and "CAT" are treated the same
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
# Freeze every layer of BERT: only the dense head added below is trained
for layer in bert_model.layers:
    layer.trainable = False

# ! ATTENTION
# If we want to freeze everything EXCEPT the last 2 transformer blocks:
# BERT base is made up of 12 stacked transformer blocks.
# Each block is made up of sub-layers, including attention mechanisms and feed-forward networks.
# The selected blocks have to be switched back to trainable = True after the freeze above.
# NOTE(review): untested sketch - the attribute path to the encoder blocks must be
# confirmed against the installed transformers version before enabling it.
# for layer in bert_model.bert.encoder.layer[-2:]:
#     layer.trainable = True

# First element of BERT's output; presumably the per-token hidden states
# with shape (batch, k_sms_max_len, hidden) - TODO confirm
embeddings = bert_model(input_ids, attention_mask=attention_masks)[0]

# Keep only the vector at position 0 of every sequence (the [CLS] token)
cls_token = embeddings[:, 0, :]

# Add a "custom" dense layer with sigmoid activation on top of BERT:
# a single unit, matching the 0/1 labels produced by cleaner()
output = Dense(1, activation='sigmoid')(cls_token)

# Define the model: two inputs (ids and mask), one sigmoid output
model = Model(inputs=[input_ids, attention_masks], outputs=output)

model.summary()

# Write a diagram of the model (tensor shapes included) into the assets directory.
path = k_Current_dir / k_AssetsDir / 'bert_base_uncased_arch.png'
tf.keras.utils.plot_model(model, to_file=path, show_shapes=True)

# Encode the SMS texts with the BERT tokenizer.
# DONE : make a test with bert-base-uncased then bert-base-cased and compare
#   uncased : all tokens are lower-cased
#   cased   : the model takes the case into account
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Every sequence is padded or truncated to exactly k_sms_max_len tokens,
# which is the length the two Input layers of the model expect.
tokenizer_options = dict(
    max_length=k_sms_max_len,
    padding='max_length',
    truncation=True,
    return_tensors='tf',
)
encoded_data = tokenizer(texts, **tokenizer_options)


# Split ids, masks and labels in a single call so the three stay row-aligned:
# 80% for training, 20% held out.
# NOTE(review): the split is not stratified; if the ham/spam classes are
# imbalanced, passing stratify=labels may be worth considering.
split_parts = train_test_split(
    encoded_data['input_ids'].numpy(),
    encoded_data['attention_mask'].numpy(),
    labels,
    test_size=0.2,
    random_state=42,
)

# Turn every part of the split into a tensor, keeping the order returned by
# train_test_split (train/test pair for each input array, in input order).
(
    X_train_ids, X_test_ids,
    X_train_mask, X_test_mask,
    y_train, y_test,
) = [tf.convert_to_tensor(part) for part in split_parts]

# Group the encoded inputs by name for the training call.
X_train = dict(input_ids=X_train_ids, attention_mask=X_train_mask)
X_test = dict(input_ids=X_test_ids, attention_mask=X_test_mask)

# Stop training when val_loss stops improving (patience of 3 epochs) and
# restore the best weights seen. 'val_accuracy' could be monitored instead.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Multiply the learning rate by 0.2 when val_loss plateaus (patience of 2 epochs),
# never going below 1e-7.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-7)

# Keep only the best model (lowest val_loss) on disk, in the assets directory.
path = k_Current_dir / k_AssetsDir / 'bert_base_uncased_best_model.h5'
checkpoint = ModelCheckpoint(path, monitor='val_loss', save_best_only=True, mode='min')

tensorboard = TensorBoard(log_dir='logs', histogram_freq=1)

# Tell the reader how to follow the run in TensorBoard.
print(
    "\n\n--------------------------------------------------",
    "Once the model runs, open a terminal, make sure you are in the directory of the project and type in : ",
    "tensorboard --logdir=logs",
    "Then visit the URL",
    sep="\n",
)

# Explicit metric names keep the history keys stable ("recall", not "recall_1", ...).
tracked_metrics = [
    tf.keras.metrics.Recall(name="recall"),
    tf.keras.metrics.Precision(name="precision"),
    "accuracy",
]
model.compile(
    optimizer=Adam(learning_rate=3e-5),
    loss='binary_crossentropy',
    metrics=tracked_metrics,
)

# Inputs are passed as [ids, mask], the same order as the model's inputs.
train_inputs = [X_train['input_ids'], X_train['attention_mask']]
val_inputs = [X_test['input_ids'], X_test['attention_mask']]

# The held-out split doubles as validation data for the callbacks.
history = model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_test),
    batch_size=32,
    epochs=50,
    callbacks=[early_stopping, reduce_lr, checkpoint, tensorboard],
)


In [None]:
# Render the model diagram (with tensor shapes) and write it to the assets
# directory; as the last expression of the cell its result is also displayed.
path = k_Current_dir / k_AssetsDir / 'bert_base_uncased_arch.png'
tf.keras.utils.plot_model(model, to_file=path, show_shapes=True)

In [None]:
# Training vs. validation loss per epoch, as recorded in the fit history.
plt.plot(history.history["loss"], color="b", label="Train Loss")
plt.plot(history.history["val_loss"], color="r", label="Val Loss")
plt.ylabel("Values")
plt.xlabel("Epochs")
plt.title("Bert Base Uncased : Loss")
plt.legend()
plt.ylim(0,1)
# The extension is passed by keyword: the second positional parameter of
# save_fig is tight_layout, so a positional "png" would be bound to that flag
# instead of the file format.
save_fig("bert_base_uncased_loss", fig_extension="png")
plt.show()

# Values of the last (up to) 10 epochs, rounded to 6 decimals.
display([[round(f, 6) for f in history.history['loss'][-10:]]])
display([[round(f, 6) for f in history.history['val_loss'][-10:]]])


In [None]:
# Training vs. validation accuracy per epoch, as recorded in the fit history.
plt.plot(history.history["accuracy"], color="b", label="Train Accuracy")
plt.plot(history.history["val_accuracy"], color="r", label="Val Accuracy")
plt.ylabel("Values")
plt.xlabel("Epochs")
plt.title("Bert Base Uncased : Accuracy")
plt.legend()
plt.ylim(0,1)
# The extension is passed by keyword: the second positional parameter of
# save_fig is tight_layout, so a positional "png" would be bound to that flag
# instead of the file format.
save_fig("bert_base_uncased_accuracy", fig_extension="png")
plt.show()

# Values of the last (up to) 10 epochs, rounded to 6 decimals.
display([[round(f, 6) for f in history.history['accuracy'][-10:]]])
display([[round(f, 6) for f in history.history['val_accuracy'][-10:]]])


In [None]:
# Training vs. validation precision per epoch, as recorded in the fit history.
plt.plot(history.history["precision"], color="b", label="Train Precision")
plt.plot(history.history["val_precision"], color="r", label="Val Precision")
plt.ylabel("Values")
plt.xlabel("Epochs")
plt.title("Bert Base Uncased : Precision")
plt.legend()
plt.ylim(0,1)
# The extension is passed by keyword: the second positional parameter of
# save_fig is tight_layout, so a positional "png" would be bound to that flag
# instead of the file format.
save_fig("bert_base_uncased_precision", fig_extension="png")
plt.show()

# Values of the last (up to) 10 epochs, rounded to 6 decimals.
display([[round(f, 6) for f in history.history['precision'][-10:]]])
display([[round(f, 6) for f in history.history['val_precision'][-10:]]])

In [None]:
# Training vs. validation recall per epoch, as recorded in the fit history.
plt.plot(history.history["recall"], color="b", label="Train Recall")
plt.plot(history.history["val_recall"], color="r", label="Val Recall")
plt.ylabel("Values")
plt.xlabel("Epochs")
plt.title("Bert Base Uncased : Recall")
plt.legend()
plt.ylim(0,1)
# The extension is passed by keyword: the second positional parameter of
# save_fig is tight_layout, so a positional "png" would be bound to that flag
# instead of the file format.
save_fig("bert_base_uncased_recall", fig_extension="png")
plt.show()

# Values of the last (up to) 10 epochs, rounded to 6 decimals.
display([[round(f, 6) for f in history.history['recall'][-10:]]])
display([[round(f, 6) for f in history.history['val_recall'][-10:]]])

In [10]:
def f1_calculus(name, rec, prec):
    """Compute element-wise F1 scores from recall and precision values.

    F1 = 2 * recall * precision / (recall + precision + epsilon), where
    epsilon is the Keras backend epsilon; it keeps the denominator non-zero
    when recall and precision are both 0.

    Args:
        name: name of the single column of the returned DataFrame.
        rec: sequence of recall values.
        prec: sequence of precision values, same length as rec.

    Returns:
        A DataFrame with one column, `name`, holding the F1 values.
    """
    recall = np.array(rec)
    precision = np.array(prec)
    denominator = recall + precision + tf.keras.backend.epsilon()

    scores = pd.DataFrame()
    scores[name] = 2 * recall * precision / denominator
    return scores

In [None]:
# F1 is not tracked by Keras here, so it is derived per epoch from the
# recorded recall and precision (training and validation separately).
df_tmp = f1_calculus("f1", history.history["recall"], history.history["precision"])
df_val_tmp = f1_calculus("val_f1", history.history["val_recall"], history.history["val_precision"])

plt.plot(df_tmp["f1"], color="b", label="Train F1")
plt.plot(df_val_tmp["val_f1"], color="r", label="Val F1")
plt.ylabel("Values")
plt.xlabel("Epochs")
plt.title("Bert Base Uncased : F1")
plt.legend()
plt.ylim(0,1)
# The extension is passed by keyword: the second positional parameter of
# save_fig is tight_layout, so a positional "png" would be bound to that flag
# instead of the file format.
save_fig("bert_base_uncased_f1", fig_extension="png")
plt.show()

# F1 values of the last (up to) 10 epochs.
display(df_tmp.tail(10))
display(df_val_tmp.tail(10))

### <span style="color:orange"><b>Comments :</b></span>

* Precision : 0.99 vs 0.98
* Recall    : 0.95 vs 0.89 
* F1        : 0.97 vs 0.93
* With `bert-base-uncased` we are able to keep the Precision while improving the Recall by 6% (& F1 by 4%)
* However 
    * the number of parameters grows from 8_000 to about 110 million (the BERT layers are frozen in this notebook, so only the final dense layer is actually trained)
    * the training time grows from 12 sec to 2H

 