# This notebook uses the stratified train/validation split instead of just a purely random split. And it also incorporates the KeyWord Argument from the Kaggle Dataset that I previously ignored

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/certification/BaltimoreCyberTrustRoot.crt.pem


In [2]:
import numpy as np
import pandas as pd
import random
import os
import re
from transformers import set_seed, BertTokenizer, TFBertForSequenceClassification, BertConfig
import tensorflow as tf
%pip install evaluate
import evaluate
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import json

# Install necessary packages for Azure SQL connection
%pip install mysql-connector-python 
%pip install PyMySQL

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)
set_seed(42)
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Load the training data
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
kaggle_test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Split the data into 75% training and 25% validation sets
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42, stratify=train_data['target'])

# Clean the text data
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'\d+', '', text)      # Remove numbers
    text = re.sub(r'[^\w\s#]', '', text)  # Remove punctuation except hashtags
    text = text.lower()                  # Convert to lowercase
    return text

train_data['clean_text'] = train_data['text'].apply(clean_text)
val_data['clean_text'] = val_data['text'].apply(clean_text)
kaggle_test_data['clean_text'] = kaggle_test_data['text'].apply(clean_text)

# Function to combine keyword and text
def combine_keyword_and_text(row):
    keyword = str(row['keyword']) if pd.notna(row['keyword']) else ''
    text = row['clean_text']
    return '[CLS] ' + keyword + ' [SEP] ' + text + ' [SEP]'

# Apply the function to combine keyword and text
train_data['combined_text'] = train_data.apply(combine_keyword_and_text, axis=1)
val_data['combined_text'] = val_data.apply(combine_keyword_and_text, axis=1)
kaggle_test_data['combined_text'] = kaggle_test_data.apply(combine_keyword_and_text, axis=1)

# Tokenize the text data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(texts):
    return tokenizer(
        texts.tolist(),
        max_length=64,
        padding=True,
        truncation=True,
        return_tensors='tf'
    )

# Encode the combined text data
train_encodings = tokenize_texts(train_data['combined_text'])
val_encodings = tokenize_texts(val_data['combined_text'])
kaggle_test_encodings = tokenize_texts(kaggle_test_data['combined_text'])

train_labels = tf.convert_to_tensor(train_data['target'].values)
val_labels = tf.convert_to_tensor(val_data['target'].values)

# Load the F1 metric from the evaluate library
metric = evaluate.load("f1", trust_remote_code=True)

def compute_metrics(predictions, labels):
    predictions = np.argmax(predictions, axis=1)
    f1 = metric.compute(predictions=predictions, references=labels)['f1']
    accuracy = accuracy_score(labels, predictions)
    return {'f1': f1, 'accuracy': accuracy}

def create_tf_dataset(encodings, labels, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((encodings, labels))
    return dataset.shuffle(10000).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

# Define precision and recall metrics outside of the custom metric function
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()

def f1_score(y_true, y_pred):
    # Convert logits to predicted labels
    y_pred = tf.argmax(y_pred, axis=1)
    
    # Ensure true labels are in integer format
    y_true = tf.cast(y_true, tf.int64)
    
    # Update the state of precision and recall
    precision.update_state(y_true, y_pred)
    recall.update_state(y_true, y_pred)
    
    # Compute precision and recall values
    precision_result = precision.result()
    recall_result = recall.result()
    
    # Compute F1 score
    f1 = 2 * ((precision_result * recall_result) / (precision_result + recall_result + tf.keras.backend.epsilon()))
    
    return f1

strategy = tf.distribute.MirroredStrategy()

# Directory to save models
model_save_dir = './saved_models'
os.makedirs(model_save_dir, exist_ok=True)

# Track top 3 models
top_n_models = []

def objective(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    num_epochs = trial.suggest_int("num_epochs", 3, 10)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
    lr_scheduler_type = trial.suggest_categorical("lr_scheduler_type", ["constant", "linear", "cosine", "cosine_with_restarts"])

    num_gpus = strategy.num_replicas_in_sync

    train_dataset = create_tf_dataset(dict(train_encodings), train_labels, batch_size // num_gpus)
    val_dataset = create_tf_dataset(dict(val_encodings), val_labels, batch_size // num_gpus)
    kaggle_test_dataset = tf.data.Dataset.from_tensor_slices(dict(kaggle_test_encodings)).batch(batch_size // num_gpus).prefetch(tf.data.experimental.AUTOTUNE)

    with strategy.scope():
        config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2, hidden_dropout_prob=dropout_rate)
        model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

        if lr_scheduler_type == "linear":
            lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
                initial_learning_rate=learning_rate,
                decay_steps=10000,
                end_learning_rate=0.0,
                power=1.0
            )
        elif lr_scheduler_type == "cosine":
            lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
                initial_learning_rate=learning_rate,
                decay_steps=10000
            )
        elif lr_scheduler_type == "cosine_with_restarts":
            lr_schedule = tf.keras.optimizers.schedules.CosineDecayRestarts(
                initial_learning_rate=learning_rate,
                first_decay_steps=1000
            )
        else:
            lr_schedule = learning_rate

        optimizer = tf.keras.optimizers.experimental.AdamW(
            learning_rate=lr_schedule,
            weight_decay=weight_decay,
            epsilon=1e-8
        )

        model.compile(optimizer=optimizer, 
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
                      metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy'), f1_score])

    model.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset, verbose=1)

    # Evaluate on validation set
    val_loss, val_accuracy, val_f1_score = model.evaluate(val_dataset, verbose=1)
    print(f"f1 score: {val_f1_score} and accuracy: {val_accuracy}")
    
    avg_score = (val_accuracy + val_f1_score) / 2

    if len(top_n_models) < 3 or avg_score > min(top_n_models, key=lambda x: x[1])[1]:  # Top-3 method
        model_save_path = os.path.join(model_save_dir, f"{studyName}_model_trial_{trial.number}_avg_score_{avg_score:.4f}")
        model.save(model_save_path, save_format="tf")
        top_n_models.append((trial.number, avg_score))
        top_n_models.sort(key=lambda x: x[1], reverse=True)
        if len(top_n_models) > 3:
            top_n_models.pop()

        # Fine-tune the model on the validation dataset
        fine_tune_encodings = tokenize_texts(val_data['combined_text'])
        fine_tune_labels = tf.convert_to_tensor(val_data['target'].values)
        fine_tune_dataset = tf.data.Dataset.from_tensor_slices((
            dict(fine_tune_encodings),
            fine_tune_labels
        )).batch(batch_size // num_gpus).prefetch(tf.data.experimental.AUTOTUNE)

        # Calculate the ratio of training data size to epochs
        training_data_size = len(train_data)
        fine_tune_data_size = len(val_data)
        fine_tune_epochs = max(1, round((fine_tune_data_size / training_data_size) * num_epochs))

        model.fit(fine_tune_dataset, epochs=fine_tune_epochs, verbose=1)

        # Make predictions on the Kaggle test dataset
        kaggle_test_predictions = model.predict(kaggle_test_dataset).logits
        kaggle_test_predicted_labels = tf.argmax(kaggle_test_predictions, axis=1).numpy()

        # Create a submission DataFrame
        submission = pd.DataFrame({'id': kaggle_test_data['id'], 'target': kaggle_test_predicted_labels})
        
        # Save the submission
        submission_file = f"{studyName}_model_trial_{trial.number}_avg_score_{avg_score:.4f}_f1_{val_f1_score}_accuracy_{val_accuracy}" + '_submission.csv'  # Corrected naming convention
        submission_path = os.path.join(model_save_dir, submission_file)
        submission.to_csv(submission_path, index=False)
        print(f"Predictions saved for model: {model_save_path}")

    return avg_score

# Define your Optuna study, using the MySQL connection string
optuna_storage = 'mysql+pymysql://<username>:<password>@<host>/<database>?ssl_ca=<path_to_CA_cert>&ssl_verify_cert=true'

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
db_password = user_secrets.get_secret("DB_PASSWORD")# This uses the secrets inside of Kaggle so I don't have to explicitly type my password out in code

# Example with your details (replace '<password>' with your real password and '<database>' with your database name)
optuna_storage = f'mysql+pymysql://MichaelAzure:{db_password}@kaggle-third-sql.mysql.database.azure.com/kaggle_disaster_database?ssl_ca=/kaggle/input/certification&ssl_verify_cert=true'

studyName = 'disaster_keyword_6'
study = optuna.create_study(study_name=studyName, # name of the study
                            storage=optuna_storage,  # URL for the mySQL schema
                            direction='maximize', # maximize the log loss
                            load_if_exists=True, # makes it so that if the study_name already exists in the schema, then it will append the new trials with the old trials and essentially resume the study. It will also remember the previous trials so it really is resuming the study
                            )

study.optimize(objective, n_trials=10)

print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


2024-06-11 22:52:13.361549: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-11 22:52:13.361671: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-11 22:52:13.496975: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  pid, fd = os.forkpty()


Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m591.7 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2
Note: you may need to restart the kernel to use updated packages.


  pid, fd = os.forkpty()


Collecting mysql-connector-python
  Downloading mysql_connector_python-8.4.0-cp310-cp310-manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading mysql_connector_python-8.4.0-cp310-cp310-manylinux_2_17_x86_64.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: mysql-connector-python
Successfully installed mysql-connector-python-8.4.0
Note: you may need to restart the kernel to use updated packages.
Collecting PyMySQL
  Downloading PyMySQL-1.1.1-py3-none-any.whl.metadata (4.4 kB)
Downloading PyMySQL-1.1.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.0/45.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMySQL
Successfully installed PyMySQL-1.1.1
Note: you may need to restart the kernel to use updated packages.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

[I 2024-06-11 22:53:26,694] Using an existing study with name 'disaster_keyword_6' instead of creating a new one.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Cause: for/else statement not yet supported


I0000 00:00:1718146580.118448     127 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
f1 score: 0.7692841291427612 and accuracy: 0.832457959651947
Epoch 1/3
Epoch 2/3
Epoch 3/3
Predictions saved for model: ./saved_models/disaster_keyword_6_model_trial_28_avg_score_0.8009


[I 2024-06-11 23:30:54,667] Trial 28 finished with value: 0.8008710443973541 and parameters: {'learning_rate': 1.0907557221213543e-06, 'batch_size': 64, 'num_epochs': 10, 'dropout_rate': 0.1551681394301923, 'weight_decay': 0.04658928851031885, 'lr_scheduler_type': 'cosine'}. Best is trial 22 with value: 0.8558858633041382.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9
f1 score: 0.6612130999565125 and accuracy: 0.5703781247138977
Epoch 1/3
Epoch 2/3
Epoch 3/3
Predictions saved for model: ./saved_models/disaster_keyword_6_model_trial_35_avg_score_0.6158


[I 2024-06-12 00:11:54,207] Trial 35 finished with value: 0.6157956123352051 and parameters: {'learning_rate': 8.998152186733024e-05, 'batch_size': 32, 'num_epochs': 9, 'dropout_rate': 0.10381097108071544, 'weight_decay': 0.09227624797251477, 'lr_scheduler_type': 'constant'}. Best is trial 22 with value: 0.8558858633041382.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
f1 score: 0.7341610789299011 and accuracy: 0.8251050710678101
Epoch 1/3
Epoch 2/3
Epoch 3/3
Predictions saved for model: ./saved_models/disaster_keyword_6_model_trial_39_avg_score_0.7796


[I 2024-06-12 00:56:15,632] Trial 39 finished with value: 0.7796330749988556 and parameters: {'learning_rate': 2.878485452417547e-05, 'batch_size': 32, 'num_epochs': 10, 'dropout_rate': 0.10339350592848691, 'weight_decay': 0.006416297762913817, 'lr_scheduler_type': 'constant'}. Best is trial 22 with value: 0.8558858633041382.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
f1 score: 0.7659012079238892 and accuracy: 0.819327712059021
Epoch 1/3
Epoch 2/3
Epoch 3/3
Predictions saved for model: ./saved_models/disaster_keyword_6_model_trial_50_avg_score_0.7926


[I 2024-06-12 01:33:55,706] Trial 50 finished with value: 0.7926144599914551 and parameters: {'learning_rate': 5.752135041619165e-05, 'batch_size': 32, 'num_epochs': 8, 'dropout_rate': 0.2253162753772589, 'weight_decay': 0.05594104240633256, 'lr_scheduler_type': 'linear'}. Best is trial 22 with value: 0.8558858633041382.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
f1 score: 0.7827867269515991 and accuracy: 0.8125
Epoch 1/2
Epoch 2/2
Predictions saved for model: ./saved_models/disaster_keyword_6_model_trial_55_avg_score_0.7976


[I 2024-06-12 02:06:59,632] Trial 55 finished with value: 0.7976433634757996 and parameters: {'learning_rate': 5.904408661565012e-05, 'batch_size': 32, 'num_epochs': 7, 'dropout_rate': 0.14239165345178362, 'weight_decay': 0.03949359513890614, 'lr_scheduler_type': 'linear'}. Best is trial 22 with value: 0.8558858633041382.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5


[W 2024-06-12 02:07:53,254] Trial 61 failed with parameters: {'learning_rate': 1.488443982498887e-05, 'batch_size': 32, 'num_epochs': 5, 'dropout_rate': 0.2922823018976862, 'weight_decay': 0.014678714877120858, 'lr_scheduler_type': 'cosine_with_restarts'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_34/1721709195.py", line 171, in objective
    model.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset, verbose=1)
  File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 1229, in fit
    return super().fit(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/tf_keras/src/utils/traceback_utils.py", line 65, in error_handler
    return fn(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/tf_keras/src/engine/training.py", 

KeyboardInterrupt: 