In [None]:

# Read the raw SMS corpus (Windows-1252 encoded CSV) and pass it straight
# through the project's cleaner() helper.
df = cleaner(
    pd.read_csv(k_Current_dir / k_AssetsDir / "spam.csv", encoding="cp1252")
)

# Pull the two columns used downstream out as plain Python lists.
labels, texts = (df[column].tolist() for column in ('labels', 'texts'))

# BERT - Bidirectional Encoder Representations from Transformers
# It uses multiple inputs (input_ids & attention_mask),
# so we cannot use model = tf.keras.Sequential([...])

# Define both inputs
# input_ids = ids of the tokens as defined in the vocabulary of the pre-trained model
# attention_mask indicates which positions of the sequence the model should take into account (1 = real token, 0 = padding)
# "Hello, how are you?"
# [7592, 1010, 2129, 2024, 2017, 1029]        input_ids
# [7592, 1010, 2129, 2024, 2017, 1029, 0, 0]  input_ids with padding
# [   1,    1,    1,    1,    1,    1, 0, 0]  attention_masks with padding
# input_ids       = Input(shape=(k_sms_max_len,), dtype=tf.int32, name="input_ids")
# attention_masks = Input(shape=(k_sms_max_len,), dtype=tf.int32, name="attention_mask")

# Display the shapes of the data
# print(f"Forme des input_ids : {encoded_data['input_ids'].shape}")
# print(f"Forme des attention_mask : {encoded_data['attention_mask'].shape}")

# Pretrained TensorFlow checkpoint pulled from Hugging Face
# bert-base : 12 layers, 768 hidden nodes, 12 attention heads, 110M parameters
# uncased   : "cat" and "CAT" are treated as the same token
# Alternative (bare encoder, no classification head):
# bert_model = TFBertModel.from_pretrained('bert-base-uncased')
bert_model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,       # two output classes; TensorFlow flavour of the model
)
# Freeze all trainable parameters from all the layers of BERT model
# for layer in bert_model.layers:
#     layer.trainable = False

# ! ATTENTION
# If we want to freeze everything EXCEPT the last 2 layers:
# BERT base is made up of 12 stacked transformer layers.
# Each transformer layer is made up of sub-layers, including attention mechanisms and feed-forward neural networks.
# So before "unfreezing" the last layers, some research might be required in order to unfreeze them correctly
# (freeze all layers first, then set trainable = True on the last two; the attribute path below must be checked against the loaded model class).
# for layer in bert_model.encoder.layer[-2:]:
#     layer.trainable = True

# embeddings = bert_model(input_ids, attention_mask=attention_masks)[0]

# Get the CLS token from the embeddings
# cls_token = embeddings[:, 0, :]

# Add a "custom" dense layer with sigmoid activation to BERT
# output = Dense(1, activation='sigmoid')(cls_token)

# Define the model
# model = Model(inputs=[input_ids, attention_masks], outputs=output)

# model.summary()

# path = Path(f"{k_Current_dir/k_AssetsDir/'bert_base_uncased_arch.png'}")
# tf.keras.utils.plot_model(model, path, show_shapes=True)

# Encode the SMS with the BERT tokenizer
# DONE : make a test with bert-base-uncased then bert-base-cased and compare
# uncased : the model does not take the case into account
# cased   : the model takes the case into account
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Every message is padded / truncated to exactly k_sms_max_len tokens.
encoded_data = tokenizer(
    texts,
    max_length=k_sms_max_len,
    padding='max_length',       # sequences are padded up to max_length
    truncation=True,
    return_tensors='tf'         # "tf" for TensorFlow
)

# One single, seeded split for the raw texts, the encoded inputs and the labels.
# Why: the previous extra split of (texts, labels) had no random_state, so its
# result changed on every run and its rows did not correspond to the rows of
# X_train_* / X_test_*. Splitting everything in one call guarantees that
# texts_train[i], X_train_ids[i], X_train_mask[i] and y_train[i] describe the
# same SMS. The encoded partitions are unchanged (same test_size / random_state).
(
    texts_train, texts_eval,
    X_train_ids, X_test_ids,
    X_train_mask, X_test_mask,
    y_train, y_test,
) = train_test_split(
    texts,
    encoded_data['input_ids'].numpy(),
    encoded_data['attention_mask'].numpy(),
    labels,
    test_size=0.2,
    random_state=42,
)

# Keep the plain-Python label lists available under their historical names
# (y_train / y_test are rebound to tensors just below).
labels_train, labels_eval = y_train, y_test

# Convert the datasets into tensors
X_train_ids     = tf.convert_to_tensor(X_train_ids)
X_test_ids      = tf.convert_to_tensor(X_test_ids)
X_train_mask    = tf.convert_to_tensor(X_train_mask)
X_test_mask     = tf.convert_to_tensor(X_test_mask)
y_train         = tf.convert_to_tensor(y_train)
y_test          = tf.convert_to_tensor(y_test)

# Gather encoded data into dictionaries for training
X_train = {'input_ids': X_train_ids, 'attention_mask': X_train_mask}
X_test = {'input_ids': X_test_ids, 'attention_mask': X_test_mask}

# -----------------------------------------------------------------------------
# Training: callbacks, compile and fit
#
# The network trained here is the Hugging Face TFBertForSequenceClassification
# instance held in `bert_model`. It returns two raw logits per SMS (num_labels=2),
# which drives the choice of loss and metrics below.
# NOTE(review): assumes y_train / y_test hold integer class ids (0 / 1) and that
# class 1 is the positive class for recall / precision - TODO confirm against cleaner().
# -----------------------------------------------------------------------------

# `model` was never defined in this cell (the functional-API construction is
# commented out), which raised a NameError on a fresh kernel.
model = bert_model


class _ArgmaxPrecision(tf.keras.metrics.Precision):
    """Precision computed on the predicted class id (argmax over the logits)."""

    def update_state(self, y_true, y_pred, sample_weight=None):
        # The stock metric expects probabilities shaped like y_true; convert
        # the (batch, 2) logits into a flat vector of 0/1 predictions first.
        predicted = tf.cast(tf.argmax(y_pred, axis=-1), tf.float32)
        return super().update_state(tf.reshape(y_true, [-1]), predicted, sample_weight)


class _ArgmaxRecall(tf.keras.metrics.Recall):
    """Recall computed on the predicted class id (argmax over the logits)."""

    def update_state(self, y_true, y_pred, sample_weight=None):
        predicted = tf.cast(tf.argmax(y_pred, axis=-1), tf.float32)
        return super().update_state(tf.reshape(y_true, [-1]), predicted, sample_weight)


# Stop when the validation loss has not improved for 3 epochs and roll back
# to the best weights seen so far.
early_stopping = EarlyStopping(
    monitor='val_loss',           # can be 'val_accuracy' if needed
    patience=3,
    restore_best_weights=True
)

# Reduce the learning rate when the validation loss stops improving;
# helps to converge more quickly to a minimum.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,       # reduction factor of learning rate
    patience=2,
    min_lr=1e-7       # minimal value for learning rate
)

# Best checkpoint on disk. A subclassed (Hugging Face) model cannot be
# serialised as a full model to HDF5, so only the weights are written.
# Restore them with model.load_weights(path) on a freshly built model.
path = k_Current_dir / k_AssetsDir / 'bert_base_uncased_best_model.h5'
checkpoint = ModelCheckpoint(
    str(path),                  # weights file path
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    mode='min'
)

tensorboard = TensorBoard(log_dir='logs', histogram_freq=1)
print("\n\n--------------------------------------------------")
print("Once the model runs, open a terminal, make sure you are in the directory of the project and type in : ")
print("tensorboard --logdir=logs")
print("Then visit the URL")

model.compile(
    optimizer=Adam(learning_rate=3e-5),
    # Integer labels + 2 raw logits -> sparse categorical cross-entropy on logits
    # ('binary_crossentropy' expects a single probability per sample).
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    # Explicit names keep the log keys stable (recall, not recall_1, ...).
    metrics=[
        _ArgmaxRecall(name="recall"),
        _ArgmaxPrecision(name="precision"),
        tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
    ],
)

# The dictionaries are keyed by the model's argument names
# ('input_ids', 'attention_mask'), so they can be fed as-is.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=50,
    callbacks=[early_stopping, reduce_lr, checkpoint, tensorboard]
)


In [None]:

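# # ! 210 minutes... (presumably the training time of this variant - TODO confirm)
# # NOTE: this whole cell is commented out. It keeps, for reference, the earlier
# # functional-API variant: frozen TFBertModel + CLS token + Dense(1, sigmoid) head.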

# # -----------------------------------------------------------------------------
# df = pd.read_csv(k_Current_dir / k_AssetsDir / "spam.csv", encoding="cp1252")
# df = cleaner(df)
# labels = df['labels'].tolist()
# texts = df['texts'].tolist()

# # BERT - Bidirectional Encoder Representations from Transformers
# # It uses multiple inputs (input_ids & attention_mask),
# # so we cannot use model = tf.keras.Sequential([...])

# # Define both inputs
# # input_ids = ids of the tokens as defined in the vocabulary of the pre-trained model
# # attention_mask indicates which positions of the sequence the model should take into account (1 = real token, 0 = padding)
# # "Hello, how are you?"
# # [7592, 1010, 2129, 2024, 2017, 1029]        input_ids
# # [7592, 1010, 2129, 2024, 2017, 1029, 0, 0]  input_ids with padding
# # [   1,    1,    1,    1,    1,    1, 0, 0]  attention_masks with padding
# input_ids       = Input(shape=(k_sms_max_len,), dtype=tf.int32, name="input_ids")
# attention_masks = Input(shape=(k_sms_max_len,), dtype=tf.int32, name="attention_mask")

# # Display the shapes of the data
# # print(f"Forme des input_ids : {encoded_data['input_ids'].shape}")
# # print(f"Forme des attention_mask : {encoded_data['attention_mask'].shape}")

# # Load TensorFlow pretrained model from Hugging Face 
# # 12-layers, 768-hidden-nodes, 12-attention-heads, 110M parameters
# # bert-base-uncased : cat & CAT are the same
# bert_model = TFBertModel.from_pretrained('bert-base-uncased')
# # Freeze all trainable parameters from all the layers of BERT model
# for layer in bert_model.layers:
#     layer.trainable = False

# # ! ATTENTION
# # If we want to freeze everything EXCEPT the last 2 layers:
# # BERT base is made up of 12 stacked transformer layers.
# # Each transformer layer is made up of sub-layers, including attention mechanisms and feed-forward neural networks.
# # So before "unfreezing" the last layers, some research might be required in order to unfreeze them correctly
# # (freeze all layers first, then set trainable = True on the last two; the attribute path below must be checked against the loaded model class).
# # for layer in bert_model.encoder.layer[-2:]:
# #     layer.trainable = True

# embeddings = bert_model(input_ids, attention_mask=attention_masks)[0]

# # Get the CLS token from the embeddings
# cls_token = embeddings[:, 0, :]

# # Add a "custom" dense layer with sigmoid activation to BERT
# output = Dense(1, activation='sigmoid')(cls_token)

# # Define the model
# model = Model(inputs=[input_ids, attention_masks], outputs=output)

# model.summary()

# path = Path(f"{k_Current_dir/k_AssetsDir/'bert_base_uncased_arch.png'}")
# tf.keras.utils.plot_model(model, path, show_shapes=True)

# # encode sms with BERT tokenizer 
# # DONE : make a test with bert-base-uncased then bert-base-cased and compare
# # uncased : the model does not take the case into account 
# # cased   : the model takes the case into account
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    
# encoded_data = tokenizer(
#     texts,
#     max_length=k_sms_max_len,
#     padding='max_length',       # sequences will be padded according the value of the parameter max_length
#     truncation=True,
#     return_tensors='tf'         # "tf" for TensorFlow
# )


# X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
#     encoded_data['input_ids'].numpy(), 
#     encoded_data['attention_mask'].numpy(), 
#     labels, 
#     test_size=0.2, 
#     random_state=42
# )

# # convert the datasets into tensors
# X_train_ids     = tf.convert_to_tensor(X_train_ids)
# X_test_ids      = tf.convert_to_tensor(X_test_ids)
# X_train_mask    = tf.convert_to_tensor(X_train_mask)
# X_test_mask     = tf.convert_to_tensor(X_test_mask)
# y_train         = tf.convert_to_tensor(y_train)
# y_test          = tf.convert_to_tensor(y_test)

# # Gather encoded data into dictionaries for training
# X_train = {'input_ids': X_train_ids, 'attention_mask': X_train_mask}
# X_test = {'input_ids': X_test_ids, 'attention_mask': X_test_mask}

# early_stopping = EarlyStopping(
#     monitor='val_loss',           # can be 'val_accuracy' if needed 
#     patience=3,          
#     restore_best_weights=True  
# )

# # Reduces the learning rate when it stops improving
# # helps to converge more quickly to a minimum
# reduce_lr = ReduceLROnPlateau(
#     monitor='val_loss', 
#     factor=0.2,       # reduction factor of learning rate
#     patience=2,       
#     min_lr=1e-7       # minimal value for learning rate
# )

# path = Path(f"{k_Current_dir/k_AssetsDir/'bert_base_uncased_best_model.h5'}")
# checkpoint = ModelCheckpoint(
#     path,                       # model's path
#     monitor='val_loss', 
#     save_best_only=True, 
#     mode='min'
# )

# tensorboard = TensorBoard(log_dir='logs', histogram_freq=1)
# print(f"\n\n--------------------------------------------------")
# print(f"Once the model runs, open a terminal, make sure you are in the directory of the project and type in : ")
# print(f"tensorboard --logdir=logs")
# print(f"Then visit the URL")

# model.compile(
#     optimizer=Adam(learning_rate=3e-5), 
#     loss='binary_crossentropy', 
#     metrics=[tf.keras.metrics.Recall(name="recall"), tf.keras.metrics.Precision(name="precision"), "accuracy"],       # name=... avoid recall_1 for example
# )

# history = model.fit(
#     [X_train['input_ids'], X_train['attention_mask']],
#     y_train,
#     validation_data=([X_test['input_ids'], X_test['attention_mask']], y_test),
#     batch_size = 32,
#     epochs = 50,
#     callbacks=[early_stopping, reduce_lr, checkpoint, tensorboard]  
# )
