# AT&T Spam Detector - BertForSequenceClassification - bert-base-uncased

## <span style="color:orange"><b>DOESN'T WORK</b></span>

This code uses:
* TFBertForSequenceClassification (the TensorFlow variant of BertForSequenceClassification)
* bert-base-uncased (case-insensitive: "cat" & "CAT" are treated the same)


In [None]:
# prelude

import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input
from transformers import TFBertForSequenceClassification, BertTokenizer # 'TF' for TensorFlow models
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import numpy as np

from pathlib import Path
# Base directory for every file this notebook reads or writes: the current working directory.
k_Current_dir = Path.cwd()
# Sub-directory of k_Current_dir that holds spam.csv and receives the generated images / model file.
k_AssetsDir = "assets"
# Number of tokens each SMS is padded or truncated to by the tokenizer.
k_sms_max_len = 100
# Seed handed to train_test_split.
k_random_state  = 42
# Fraction of the samples held out by train_test_split.
k_test_size     = 0.3


In [2]:
# -----------------------------------------------------------------------------
# drop empty cols and duplicates, rename cols...
def cleaner(df):
    """Normalise the raw SMS spam DataFrame in place and return it.

    Steps, in order:
      1. drop the ``Unnamed: 2`` .. ``Unnamed: 4`` columns when they exist,
      2. drop fully duplicated rows,
      3. lower-case the column names and replace ``/`` with ``_``,
      4. rename ``v1`` -> ``labels`` and ``v2`` -> ``texts``,
      5. encode ``labels`` as integers (``ham`` -> 0, ``spam`` -> 1).

    Args:
        df: DataFrame holding at least the columns ``v1`` (label) and
            ``v2`` (message text). It is modified in place.

    Returns:
        The same DataFrame object, cleaned. Any label other than ``"ham"``
        or ``"spam"`` becomes NaN, because ``Series.map`` leaves unmapped
        values empty.
    """
    # errors="ignore" keeps the function usable on files that do not carry
    # the extra unnamed columns (previously a KeyError).
    df.drop(
        columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"],
        errors="ignore",
        inplace=True,
    )

    df.drop_duplicates(inplace=True)

    # "/" is a literal character here, so no regex engine is needed.
    df.columns = df.columns.str.lower().str.replace("/", "_", regex=False)

    df.rename(columns={"v1": "labels", "v2": "texts"}, inplace=True)

    df["labels"] = df["labels"].map({"ham": 0, "spam": 1})

    return df

In [None]:
# Read the raw SMS corpus, clean it, then expose the two columns as plain Python lists.
csv_path = k_Current_dir / k_AssetsDir / "spam.csv"
df = cleaner(pd.read_csv(csv_path, encoding="cp1252"))
labels = df["labels"].tolist()
texts = df["texts"].tolist()

# Symbolic Keras inputs sized to the padded sequence length.
# NOTE(review): neither tensor is referenced again in this cell - confirm they are still needed.
input_ids = Input(
    shape=(k_sms_max_len,),
    dtype=tf.int32,
    name="input_ids",
)
attention_masks = Input(
    shape=(k_sms_max_len,),
    dtype=tf.int32,
    name="attention_mask",
)

# TensorFlow model with a 2-class classification head.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Looping over `model.layers` would freeze every layer, including the
# classification head, leaving the whole model untrainable - not a good idea.
# Smarter: freeze only BERT's internal encoder layers so that the
# classification head can still be trained.
for layer in model.bert.encoder.layer:
    layer.trainable = False

model.summary()

# Save a diagram of the model next to the other assets.
path = k_Current_dir / k_AssetsDir / 'TFBertForSequenceClassification_arch.png'
tf.keras.utils.plot_model(model, path, show_shapes=True)

# bert-base-uncased : cat & CAT are equal
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize every SMS to a fixed length of k_sms_max_len tokens.
encoded_data = tokenizer(
    texts,
    max_length=k_sms_max_len,
    padding='max_length',       # sequences will be padded according the value of the parameter max_length
    truncation=True,
    return_tensors='tf'         # "tf" for TensorFlow
)

# train_test_split works on array-likes, hence the .numpy() round trip.
# stratify=labels keeps the ham/spam proportion identical in the train and
# test parts; without it a random split of an imbalanced corpus can leave the
# validation set with too few spam samples for stable precision/recall.
X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    encoded_data['input_ids'].numpy(),
    encoded_data['attention_mask'].numpy(),
    labels,
    test_size=k_test_size,
    random_state=k_random_state,
    stratify=labels,
)

# convert the datasets into tensors
X_train_ids     = tf.convert_to_tensor(X_train_ids)
X_test_ids      = tf.convert_to_tensor(X_test_ids)
X_train_mask    = tf.convert_to_tensor(X_train_mask)
X_test_mask     = tf.convert_to_tensor(X_test_mask)
y_train         = tf.convert_to_tensor(y_train)
y_test          = tf.convert_to_tensor(y_test)

# Gather encoded data into dictionaries for training
X_train = {'input_ids': X_train_ids, 'attention_mask': X_train_mask}
X_test = {'input_ids': X_test_ids, 'attention_mask': X_test_mask}

# Stop once the validation loss has not improved for 3 epochs and roll back
# to the best weights seen so far.
early_stopping = EarlyStopping(
    monitor='val_loss',           # can be 'val_accuracy' if needed
    patience=3,
    restore_best_weights=True
)

# Shrink the learning rate when the validation loss plateaus.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,       # reduction factor of learning rate
    patience=2,
    min_lr=1e-7       # minimal value for learning rate
)

# Keep the best epoch on disk.
# save_weights_only=True: the Hugging Face model is a subclassed Keras model,
# and Keras cannot write a subclassed model as a *full* model to an HDF5
# (.h5) file - only its weights. Without this flag the callback fails the
# first time it tries to save.
# NOTE(review): the file name says "cased" although the model is
# bert-base-uncased; kept unchanged in case something else reads this path.
path = Path(f"{k_Current_dir/k_AssetsDir/'bert_base_cased_best_model.h5'}")
checkpoint = ModelCheckpoint(
    path,                       # model's path
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    mode='min'
)

tensorboard = TensorBoard(log_dir='logs', histogram_freq=1)
print("\n\n--------------------------------------------------")
print("Once the model runs, open a terminal, make sure you are in the directory of the project and type in : ")
print("tensorboard --logdir=logs")
print("Then visit the URL")

# The model is loaded with num_labels=2, so it emits two raw logits per
# sample, while the targets are integer class ids (0 = ham, 1 = spam).
# 'binary_crossentropy' and the stock Recall/Precision metrics expect a single
# probability in [0, 1] with the same shape as the target, so they cannot be
# applied to that output. Use a sparse categorical loss on logits, and turn
# the logits into a predicted class id before updating the binary metrics.
class _LogitsRecall(tf.keras.metrics.Recall):
    """Recall of class 1, computed from the argmax of the 2-class logits."""

    def update_state(self, y_true, y_pred, sample_weight=None):
        predicted_class = tf.argmax(y_pred, axis=-1)
        return tf.keras.metrics.Recall.update_state(
            self, tf.reshape(y_true, [-1]), predicted_class, sample_weight
        )


class _LogitsPrecision(tf.keras.metrics.Precision):
    """Precision of class 1, computed from the argmax of the 2-class logits."""

    def update_state(self, y_true, y_pred, sample_weight=None):
        predicted_class = tf.argmax(y_pred, axis=-1)
        return tf.keras.metrics.Precision.update_state(
            self, tf.reshape(y_true, [-1]), predicted_class, sample_weight
        )


model.compile(
    optimizer=Adam(learning_rate=3e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    # name=... keeps the history keys stable ("recall", not "recall_1", ...)
    metrics=[_LogitsRecall(name="recall"), _LogitsPrecision(name="precision"), "accuracy"],
)

# Inputs are passed as [input_ids, attention_mask] for both the training and
# the validation data.
train_inputs = [X_train['input_ids'], X_train['attention_mask']]
val_inputs = [X_test['input_ids'], X_test['attention_mask']]

# Train for at most 50 epochs; the callbacks handle early stopping, learning
# rate reduction, checkpointing and TensorBoard logging.
history = model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_test),
    epochs=50,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr, checkpoint, tensorboard],
)


In [None]:
# Write a diagram of the model into the assets directory.
path = k_Current_dir / k_AssetsDir / 'bert_base_uncased_arch.png'
tf.keras.utils.plot_model(model, to_file=path, show_shapes=True)

In [None]:
# -----------------------------------------------------------------------------
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into the assets directory.

    Args:
        fig_id: File name without extension.
        tight_layout: When truthy, call ``plt.tight_layout()`` before saving.
        fig_extension: Used both as the file suffix and as the savefig format.
        resolution: Passed to ``plt.savefig`` as ``dpi``.
    """
    stem = k_Current_dir / k_AssetsDir / fig_id
    target = Path(f"{stem}.{fig_extension}")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [None]:
# Training vs. validation loss, one point per epoch.
plt.plot(history.history["loss"], color="b", label="Train Loss")
plt.plot(history.history["val_loss"], color="r", label="Val Loss")
plt.ylabel("Values")
plt.xlabel("Epochs")
plt.title("Bert Base Uncased : Loss")
plt.legend()
plt.ylim(0,1)
# Pass the extension by keyword: the second positional parameter of save_fig
# is tight_layout, so a positional "png" was never used as the extension.
save_fig("bert_base_uncased_loss", fig_extension="png")
plt.show()

# Values of the last (at most) ten epochs, rounded to 6 decimals.
display([[round(f, 6) for f in history.history['loss'][-10:]]])
display([[round(f, 6) for f in history.history['val_loss'][-10:]]])


In [None]:
# Training vs. validation accuracy, one point per epoch.
plt.plot(history.history["accuracy"], color="b", label="Train Accuracy")
plt.plot(history.history["val_accuracy"], color="r", label="Val Accuracy")
plt.ylabel("Values")
plt.xlabel("Epochs")
plt.title("Bert Base Uncased : Accuracy")
plt.legend()
plt.ylim(0,1)
# Pass the extension by keyword: the second positional parameter of save_fig
# is tight_layout, so a positional "png" was never used as the extension.
save_fig("bert_base_uncased_accuracy", fig_extension="png")
plt.show()

# Values of the last (at most) ten epochs, rounded to 6 decimals.
display([[round(f, 6) for f in history.history['accuracy'][-10:]]])
display([[round(f, 6) for f in history.history['val_accuracy'][-10:]]])


In [None]:
# Training vs. validation precision, one point per epoch.
plt.plot(history.history["precision"], color="b", label="Train Precision")
plt.plot(history.history["val_precision"], color="r", label="Val Precision")
plt.ylabel("Values")
plt.xlabel("Epochs")
plt.title("Bert Base Uncased : Precision")
plt.legend()
plt.ylim(0,1)
# Pass the extension by keyword: the second positional parameter of save_fig
# is tight_layout, so a positional "png" was never used as the extension.
save_fig("bert_base_uncased_precision", fig_extension="png")
plt.show()

# Values of the last (at most) ten epochs, rounded to 6 decimals.
display([[round(f, 6) for f in history.history['precision'][-10:]]])
display([[round(f, 6) for f in history.history['val_precision'][-10:]]])

In [None]:
# Training vs. validation recall, one point per epoch.
plt.plot(history.history["recall"], color="b", label="Train Recall")
plt.plot(history.history["val_recall"], color="r", label="Val Recall")
plt.ylabel("Values")
plt.xlabel("Epochs")
plt.title("Bert Base Uncased : Recall")
plt.legend()
plt.ylim(0,1)
# Pass the extension by keyword: the second positional parameter of save_fig
# is tight_layout, so a positional "png" was never used as the extension.
save_fig("bert_base_uncased_recall", fig_extension="png")
plt.show()

# Values of the last (at most) ten epochs, rounded to 6 decimals.
display([[round(f, 6) for f in history.history['recall'][-10:]]])
display([[round(f, 6) for f in history.history['val_recall'][-10:]]])

In [10]:
def f1_calculus(name, rec, prec):
    """Return a one-column DataFrame with the F1 score for each (recall, precision) pair.

    F1 is the harmonic mean ``2 * rec * prec / (rec + prec)``. Where
    ``rec + prec`` is zero the score is defined as 0.0, so no divide-by-zero
    warning is emitted and no epsilon has to be added to the denominator
    (an epsilon slightly lowers every score, e.g. perfect recall and
    precision would not give exactly 1.0).

    Args:
        name: Name of the resulting column.
        rec: Sequence of recall values.
        prec: Sequence of precision values, aligned with ``rec``.

    Returns:
        ``pandas.DataFrame`` with a single column ``name`` and a default
        integer index.
    """
    recall = np.atleast_1d(np.asarray(rec, dtype=float))
    precision = np.atleast_1d(np.asarray(prec, dtype=float))

    numerator = 2.0 * recall * precision
    denominator = recall + precision

    # Divide only where the denominator is non-zero; other entries keep the
    # 0.0 they were initialised with.
    f1 = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return pd.DataFrame({name: f1})

In [None]:
# Derive per-epoch F1 scores from the recorded recall / precision histories.
df_tmp = f1_calculus("f1", history.history["recall"], history.history["precision"])
df_val_tmp = f1_calculus("val_f1", history.history["val_recall"], history.history["val_precision"])

plt.plot(df_tmp["f1"], color="b", label="Train F1")
plt.plot(df_val_tmp["val_f1"], color="r", label="Val F1")
plt.ylabel("Values")
plt.xlabel("Epochs")
plt.title("Bert Base Uncased : F1")
plt.legend()
plt.ylim(0,1)
# Pass the extension by keyword: the second positional parameter of save_fig
# is tight_layout, so a positional "png" was never used as the extension.
save_fig("bert_base_uncased_f1", fig_extension="png")
plt.show()

# Last ten epochs of each series.
display(df_tmp.tail(10))
display(df_val_tmp.tail(10))