In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file in it
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/certification/BaltimoreCyberTrustRoot.crt.pem
/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
import numpy as np
import pandas as pd
import random
import os
import re
from transformers import set_seed, BertTokenizer, TFBertForSequenceClassification, BertConfig
import tensorflow as tf
%pip install evaluate
import evaluate
import optuna
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
import json

# Install the MySQL client drivers used to reach the Azure-hosted MySQL database (Optuna storage)
%pip install mysql-connector-python 
%pip install PyMySQL

# Reproducibility: request deterministic TF ops and seed every RNG with one shared value
os.environ['TF_DETERMINISTIC_OPS'] = '1'
SEED = 42
random.seed(SEED)         # Python stdlib RNG
np.random.seed(SEED)      # NumPy global RNG
tf.random.set_seed(SEED)  # TensorFlow global seed
set_seed(SEED)            # transformers helper

# Load the competition CSVs: the training set (has a `target` column) and the Kaggle test set to predict for
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
kaggle_test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Hold out 30% of the training rows. Note that `train_data` is rebound here: from this
# point on it is the remaining 70%, not the full CSV.
train_data, val_test_data = train_test_split(train_data, test_size=0.3, random_state=42)

# Halve the held-out 30% into a validation set and a local test set (about 15% of the rows each)
val_data, split_test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)

# Clean the text data
def clean_text(text):
    """Normalise a raw tweet into the lower-cased text that is fed to the tokenizer.

    Steps, in order: strip URLs (any token starting with ``http``, matched
    case-insensitively so ``HTTP://...`` is removed too), strip ``@mentions``,
    strip digit runs, drop every character that is not a word character,
    whitespace or ``#``, then lower-case. Whitespace is not collapsed, so
    removed tokens can leave doubled or trailing spaces behind.

    Args:
        text: The raw tweet. Anything that is not a ``str`` (``None``, the
            float NaN pandas uses for empty cells, ...) is treated as missing.

    Returns:
        str: The cleaned text, or ``''`` when ``text`` is not a string.
    """
    # re.sub raises TypeError on non-strings; a missing cell simply has no text.
    if not isinstance(text, str):
        return ''

    # URLs are matched case-insensitively because lower-casing only happens at the end.
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)  # Remove URLs
    text = re.sub(r'@\w+', '', text)                          # Remove mentions
    text = re.sub(r'\d+', '', text)                           # Remove numbers
    text = re.sub(r'[^\w\s#]', '', text)                      # Remove punctuation except hashtags
    return text.lower()

# Add a `clean_text` column to each of the four frames, using the same cleaning function
for frame in (train_data, val_data, split_test_data, kaggle_test_data):
    frame['clean_text'] = frame['text'].apply(clean_text)

# Tokenizer for the `bert-base-uncased` checkpoint (used by tokenize_texts below)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(texts):
    """Encode a batch of cleaned tweets with the module-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()``; the notebook passes the
            ``clean_text`` column of a DataFrame.

    Returns:
        The tokenizer's output for the whole batch, requested as TensorFlow
        tensors with padding and truncation enabled and ``max_length=64``.
    """
    tokenizer_options = {
        'max_length': 64,
        'padding': True,
        'truncation': True,
        'return_tensors': 'tf',
    }
    sentences = texts.tolist()
    return tokenizer(sentences, **tokenizer_options)

# Only the Kaggle test set is tokenized up front. Train and validation rows are combined
# and re-split per cross-validation fold inside objective(), which does its own tokenizing
# (including the split test set), so the line below stays disabled.
# split_test_encodings = tokenize_texts(split_test_data['clean_text'])
kaggle_test_encodings = tokenize_texts(kaggle_test_data['clean_text'])

# Label tensors are likewise built inside objective(); these module-level versions are unused.
# train_labels = tf.convert_to_tensor(train_data['target'].values)
# val_labels = tf.convert_to_tensor(val_data['target'].values)
# split_test_labels = tf.convert_to_tensor(split_test_data['target'].values)

# Load the F1 metric from the evaluate library (used by compute_metrics)
metric = evaluate.load("f1", trust_remote_code=True)

def compute_metrics(predictions, labels):
    """Score per-class model outputs against reference labels.

    Args:
        predictions: 2-D array-like of per-class scores; the predicted class
            for each row is the argmax along axis 1.
        labels: Reference labels, one per row of ``predictions``.

    Returns:
        dict: ``{'f1': ..., 'accuracy': ...}`` where F1 comes from the
        module-level ``metric`` object and accuracy from ``accuracy_score``.
    """
    predicted_classes = np.argmax(predictions, axis=1)
    f1_result = metric.compute(predictions=predicted_classes, references=labels)
    return {
        'f1': f1_result['f1'],
        'accuracy': accuracy_score(labels, predicted_classes),
    }

def create_tf_dataset(encodings, labels, batch_size):
    """Build a shuffled, batched, prefetching ``tf.data`` pipeline.

    Args:
        encodings: Model inputs to slice along the first axis.
        labels: Targets sliced in step with ``encodings``.
        batch_size: Number of examples per emitted batch.

    Returns:
        The dataset after ``shuffle(10000)``, ``batch(batch_size)`` and
        ``prefetch(AUTOTUNE)``, applied in that order.
    """
    pipeline = tf.data.Dataset.from_tensor_slices((encodings, labels))
    pipeline = pipeline.shuffle(10000)
    pipeline = pipeline.batch(batch_size)
    return pipeline.prefetch(tf.data.experimental.AUTOTUNE)

# Module-level precision and recall metric objects, defined outside of the custom metric function.
# They are stateful, and nothing in this notebook calls reset_state() on them, so any code
# that calls update_state() on them keeps accumulating for the lifetime of the kernel.
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()

def f1_score(y_true, y_pred):
    """Batch-level F1 for the positive class (label 1), usable as a Keras metric function.

    The value is computed only from the tensors passed in. It deliberately does
    not touch the module-level ``precision`` / ``recall`` metric objects: those
    are never reset, so updating them here mixed counts from training and
    validation batches, earlier epochs, earlier folds and earlier trials into
    every reported number.

    Args:
        y_true: Ground-truth class ids for the batch, any numeric dtype, shaped
            ``(batch,)`` or ``(batch, 1)``.
        y_pred: Per-class logits; the predicted class is the argmax over axis 1.

    Returns:
        Scalar float32 tensor ``2 * P * R / (P + R + epsilon)`` for this batch,
        where P and R are 0 when their denominator is 0.
    """
    # Convert logits to predicted labels
    predicted = tf.argmax(y_pred, axis=1)

    # Flatten and cast the labels so they compare element-wise with the predictions
    actual = tf.cast(tf.reshape(y_true, [-1]), tf.int64)

    predicted_positive = tf.equal(predicted, 1)
    actual_positive = tf.equal(actual, 1)

    def _count(mask):
        return tf.reduce_sum(tf.cast(mask, tf.float32))

    true_positives = _count(tf.logical_and(predicted_positive, actual_positive))
    false_positives = _count(tf.logical_and(predicted_positive, tf.logical_not(actual_positive)))
    false_negatives = _count(tf.logical_and(tf.logical_not(predicted_positive), actual_positive))

    # divide_no_nan yields 0 for an empty denominator (no predicted / no actual positives)
    precision_result = tf.math.divide_no_nan(true_positives, true_positives + false_positives)
    recall_result = tf.math.divide_no_nan(true_positives, true_positives + false_negatives)

    # Compute F1 score; epsilon guards the 0/0 case when both precision and recall are 0
    f1 = 2 * ((precision_result * recall_result) / (precision_result + recall_result + tf.keras.backend.epsilon()))

    return f1

# Distribution strategy shared by every trial: objective() reads
# strategy.num_replicas_in_sync and builds/compiles each model inside strategy.scope().
strategy = tf.distribute.MirroredStrategy()

# Directory to save models (objective() also writes the submission CSVs here)
model_save_dir = './saved_models'
os.makedirs(model_save_dir, exist_ok=True)

# Running leaderboard of the best fold scores seen so far, as (trial_number, avg_score) tuples.
# objective() keeps it sorted best-first and trimmed to three entries.
top_n_models = []

def _build_lr_schedule(lr_scheduler_type, learning_rate):
    """Translate the sampled scheduler name into a value for the optimizer's ``learning_rate``.

    Returns a Keras schedule for "linear", "cosine" and "cosine_with_restarts",
    and the bare float (constant rate) for anything else.

    NOTE(review): the decay horizons (10000 / 1000 steps) are fixed numbers, not
    derived from the number of training steps in a fold. If a fold runs far
    fewer steps than that, every choice behaves almost like "constant" --
    confirm against len(train fold) / batch_size * num_epochs.
    """
    schedules = tf.keras.optimizers.schedules
    if lr_scheduler_type == "linear":
        return schedules.PolynomialDecay(
            initial_learning_rate=learning_rate,
            decay_steps=10000,
            end_learning_rate=0.0,
            power=1.0
        )
    if lr_scheduler_type == "cosine":
        return schedules.CosineDecay(
            initial_learning_rate=learning_rate,
            decay_steps=10000
        )
    if lr_scheduler_type == "cosine_with_restarts":
        return schedules.CosineDecayRestarts(
            initial_learning_rate=learning_rate,
            first_decay_steps=1000
        )
    return learning_rate


def _build_dataset(encodings, labels, batch_size):
    """Wrap tokenizer output (plus labels, unless ``labels`` is None) in a batched, prefetching dataset.

    No shuffling is applied, so iteration order matches the input row order.
    """
    features = dict(encodings)
    tensors = features if labels is None else (features, labels)
    dataset = tf.data.Dataset.from_tensor_slices(tensors)
    return dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)


def objective(trial):
    """Optuna objective: 3-fold fine-tuning of ``bert-base-uncased`` for one hyperparameter set.

    For each fold of the combined train + validation rows, a fresh model is
    trained on the fold's training part (the validation part is passed to
    ``fit`` as ``validation_data``) and then scored on ``split_test_data``.
    The fold score is the mean of F1 and accuracy.

    When a fold score enters the running top three (module-level
    ``top_n_models``), the model is saved under ``model_save_dir``, trained
    further on the fold's validation rows plus ``split_test_data``, and used to
    write a Kaggle submission CSV next to the saved model.

    Because the score is measured on ``split_test_data`` and that same data is
    later trained on, the returned value is a model-selection signal, not an
    unbiased estimate of generalisation.

    Args:
        trial: The Optuna trial supplying hyperparameters; its study name and
            number are used in the output file names.

    Returns:
        float: Mean fold score across the three folds.

    Raises:
        optuna.exceptions.TrialPruned: If the sampled batch size is not
            divisible by the number of replicas in ``strategy``.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [32])
    num_epochs = trial.suggest_int("num_epochs", 2, 4)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
    lr_scheduler_type = trial.suggest_categorical("lr_scheduler_type", ["constant", "linear", "cosine", "cosine_with_restarts"])

    # Ensure batch_size is evenly divisible by the number of GPUs.
    # With Keras fit/predict under MirroredStrategy the dataset batch is the GLOBAL
    # batch and is split across replicas, so datasets below are batched with
    # batch_size itself; dividing by num_gpus again would shrink the effective batch.
    num_gpus = strategy.num_replicas_in_sync
    if batch_size % num_gpus != 0:
        raise optuna.exceptions.TrialPruned(f"Batch size {batch_size} not divisible by number of GPUs {num_gpus}")

    # Take the study name from the trial rather than from a notebook-level global
    # that is only assigned after this function is defined.
    study_name = trial.study.study_name

    kfold = KFold(n_splits=3, shuffle=True, random_state=42)
    fold_scores = []

    combined_train_val_data = pd.concat([train_data, val_data])

    # Fold-invariant inputs: built once per trial instead of once per fold.
    y_test = split_test_data['target'].values
    split_test_dataset = _build_dataset(
        tokenize_texts(split_test_data['clean_text']),
        tf.convert_to_tensor(y_test),
        batch_size,
    )
    kaggle_test_dataset = _build_dataset(kaggle_test_encodings, None, batch_size)

    for fold, (train_index, val_index) in enumerate(kfold.split(combined_train_val_data)):
        train_fold_data = combined_train_val_data.iloc[train_index]
        val_fold_data = combined_train_val_data.iloc[val_index]

        train_dataset_fold = _build_dataset(
            tokenize_texts(train_fold_data['clean_text']),
            tf.convert_to_tensor(train_fold_data['target'].values),
            batch_size,
        )
        val_dataset_fold = _build_dataset(
            tokenize_texts(val_fold_data['clean_text']),
            tf.convert_to_tensor(val_fold_data['target'].values),
            batch_size,
        )

        # Model, optimizer and compile step must be created under the strategy scope.
        with strategy.scope():
            config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2, hidden_dropout_prob=dropout_rate)
            model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

            optimizer = tf.keras.optimizers.experimental.AdamW(
                learning_rate=_build_lr_schedule(lr_scheduler_type, learning_rate),
                weight_decay=weight_decay,
                epsilon=1e-8
            )

            model.compile(optimizer=optimizer,
                          loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                          metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy'), f1_score])

        model.fit(train_dataset_fold, epochs=num_epochs, validation_data=val_dataset_fold, verbose=1)

        # Evaluate on split_test_dataset (unshuffled, so predictions line up with y_test)
        predictions = model.predict(split_test_dataset).logits
        metrics = compute_metrics(predictions, y_test)
        f1 = metrics['f1']
        accuracy = metrics['accuracy']

        avg_score = (f1 + accuracy) / 2
        fold_scores.append(avg_score)  # This is average from f1 and accuracy

        if len(top_n_models) < 3 or avg_score > min(top_n_models, key=lambda x: x[1])[1]:  # Top-3 method
            run_tag = f"{study_name}_model_trial_{trial.number}_fold_{fold}_avg_score_{avg_score:.4f}"

            # The saved model is the one that produced avg_score, i.e. before the extra training below.
            # NOTE(review): models evicted from the top three are not removed from disk.
            model_save_path = os.path.join(model_save_dir, run_tag)
            model.save(model_save_path, save_format="tf")
            top_n_models.append((trial.number, avg_score))
            top_n_models.sort(key=lambda x: x[1], reverse=True)
            if len(top_n_models) > 3:
                top_n_models.pop()

            # Fine-tune the model on the validation fold and split_test_dataset
            fine_tune_data = pd.concat([val_fold_data, split_test_data])
            fine_tune_dataset = _build_dataset(
                tokenize_texts(fine_tune_data['clean_text']),
                tf.convert_to_tensor(fine_tune_data['target'].values),
                batch_size,
            )

            # Scale the epoch count by the size of the fine-tune set relative to the
            # fold's training set, with a floor of one epoch.
            training_data_size = len(train_fold_data)
            fine_tune_data_size = len(fine_tune_data)
            fine_tune_epochs = max(1, round((fine_tune_data_size / training_data_size) * num_epochs))

            model.fit(fine_tune_dataset, epochs=fine_tune_epochs, verbose=1)

            # Make predictions on the Kaggle test dataset
            kaggle_test_predictions = model.predict(kaggle_test_dataset).logits
            kaggle_test_predicted_labels = tf.argmax(kaggle_test_predictions, axis=1).numpy()

            # Create a submission DataFrame
            submission = pd.DataFrame({'id': kaggle_test_data['id'], 'target': kaggle_test_predicted_labels})

            # Save the submission alongside the saved model, under the same run tag
            submission_file = run_tag + '_submission.csv'
            submission_path = os.path.join(model_save_dir, submission_file)
            submission.to_csv(submission_path, index=False)
            print(f"Predictions saved for model: {model_save_path}")

    return np.mean(fold_scores)

# Define the Optuna study, persisted in MySQL through a SQLAlchemy URL of the form:
#   mysql+pymysql://<username>:<password>@<host>/<database>?ssl_ca=<path_to_CA_cert>&ssl_verify_cert=true
from urllib.parse import quote

from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
db_password = user_secrets.get_secret("DB_PASSWORD")  # Read from Kaggle secrets so the password never appears in the notebook

# The password is percent-encoded before it goes into the URL: characters such as
# '@', ':' or '/' would otherwise be parsed as URL structure. This assumes the
# secret holds the raw password, not an already-encoded one.
# NOTE(review): ssl_ca is given the dataset directory, while the input listing shows the
# certificate as /kaggle/input/certification/BaltimoreCyberTrustRoot.crt.pem -- confirm
# that certificate verification really uses it.
optuna_storage = (
    f'mysql+pymysql://MichaelAzure:{quote(db_password, safe="")}'
    '@kaggle-third-sql.mysql.database.azure.com/kaggle_disaster_database'
    '?ssl_ca=/kaggle/input/certification&ssl_verify_cert=true'
)

studyName = 'disaster_test_mike2_4'
study = optuna.create_study(study_name=studyName,  # name of the study
                            storage=optuna_storage,  # URL for the MySQL schema
                            direction='maximize',  # objective returns the mean of (F1 + accuracy) / 2 over folds; higher is better
                            load_if_exists=True,  # if study_name already exists in the schema, resume it: new trials are appended to the stored ones
                            )

study.optimize(objective, n_trials=6)

print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


2024-06-07 23:54:43.033012: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-07 23:54:43.033067: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-07 23:54:43.034542: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  pid, fd = os.forkpty()


Note: you may need to restart the kernel to use updated packages.


  pid, fd = os.forkpty()


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


[I 2024-06-07 23:55:30,988] A new study created in RDB with name: disaster_test_mike2_4
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Cause: for/else statement not yet supported


I0000 00:00:1717804699.395988    3134 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/3
Epoch 3/3
Epoch 1/2
Epoch 2/2
Predictions saved for model: ./saved_models/disaster_test_mike2_4_model_trial_0_fold_0_avg_score_0.8226


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/2
Epoch 2/2
Predictions saved for model: ./saved_models/disaster_test_mike2_4_model_trial_0_fold_1_avg_score_0.8036


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/2
Epoch 2/2
Predictions saved for model: ./saved_models/disaster_test_mike2_4_model_trial_0_fold_2_avg_score_0.8226


[I 2024-06-08 00:47:21,461] Trial 0 finished with value: 0.8162878717078342 and parameters: {'learning_rate': 1.3989036829307529e-05, 'batch_size': 32, 'num_epochs': 3, 'dropout_rate': 0.30662536221018866, 'weight_decay': 0.029868057651167724, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.8162878717078342.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2
Predictions saved for model: ./saved_models/disaster_test_mike2_4_model_trial_1_fold_0_avg_score_0.8110


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2


[W 2024-06-08 01:02:18,548] Trial 1 failed with parameters: {'learning_rate': 1.3442320804616691e-05, 'batch_size': 32, 'num_epochs': 2, 'dropout_rate': 0.4277489225349599, 'weight_decay': 0.08795068900394121, 'lr_scheduler_type': 'linear'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_3071/1113082598.py", line 200, in objective
    model.fit(train_dataset_fold, epochs=num_epochs, validation_data=val_dataset_fold, verbose=1)
  File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 1229, in fit
    return super().fit(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/tf_keras/src/utils/traceback_utils.py", line 65, in error_handler
    return fn(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/tf_keras/src/engine/training.py", lin

KeyboardInterrupt: 

In [None]:
# # After hyperparameter tuning, use the saved top models to make predictions on the Kaggle test dataset

# # Directory where models are saved
# saved_model_dir = './saved_models'

# # Get the list of saved models
# saved_models = [f for f in os.listdir(saved_model_dir) if os.path.isdir(os.path.join(saved_model_dir, f))]

# # Load the Kaggle test dataset
# kaggle_test_dataset = tf.data.Dataset.from_tensor_slices(dict(kaggle_test_encodings)).batch(study.best_trial.params['batch_size'])

# # Iterate over the saved models
# for model_dir in saved_models:
#     model_path = os.path.join(saved_model_dir, model_dir)
#     loaded_model = tf.keras.models.load_model(model_path, custom_objects={"f1_score": f1_score})
    
#     # Make predictions
#     kaggle_test_predictions = loaded_model.predict(kaggle_test_dataset)
#     kaggle_test_logits = kaggle_test_predictions['logits']
#     kaggle_test_predicted_labels = tf.argmax(kaggle_test_logits, axis=1).numpy()

#     # Create a submission DataFrame
#     submission = pd.DataFrame({'id': kaggle_test_data['id'], 'target': kaggle_test_predicted_labels})
    
#     # Save the submission
#     submission_file = 'Submission: ' + model_dir + '_.csv'
#     submission_path = os.path.join(saved_model_dir, submission_file)
#     submission.to_csv(submission_path, index=False)
    
#     print(f"Predictions saved for model: {model_dir}")


In [None]:
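# # After hyperparameter tuning, train the final model on the combined training and validation sets.
# # NOTE: this disabled draft uses `config`, `optimizer`, `split_test_encodings` and `split_test_labels`, none of which is defined at notebook level; rebuild them here before re-enabling the cell.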
# final_train_data = pd.concat([train_data, val_data])

# final_train_encodings = tokenize_texts(final_train_data['clean_text'])
# final_train_labels = tf.convert_to_tensor(final_train_data['target'].values)
# final_train_dataset = tf.data.Dataset.from_tensor_slices((
#     dict(final_train_encodings),
#     final_train_labels
# )).batch(study.best_trial.params['batch_size']).prefetch(tf.data.experimental.AUTOTUNE)

# with strategy.scope():
#     final_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
#     final_model.compile(optimizer=optimizer, 
#                         loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
#                         metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy'), f1_score])
    
# final_model.fit(final_train_dataset, epochs=study.best_trial.params['num_epochs'], verbose=1)

# # Evaluate on the split test set
# split_test_dataset = tf.data.Dataset.from_tensor_slices((
#     dict(split_test_encodings),
#     split_test_labels
# )).batch(study.best_trial.params['batch_size']).prefetch(tf.data.experimental.AUTOTUNE)

# split_test_predictions = final_model.predict(split_test_dataset).logits
# split_test_y_val = np.concatenate([y.numpy() for _, y in split_test_dataset], axis=0)
# split_test_metrics = compute_metrics(split_test_predictions, split_test_y_val)

# print(f"Split Test F1 Score: {split_test_metrics['f1']}")
# print(f"Split Test Accuracy: {split_test_metrics['accuracy']}")

# # Predictions on the Kaggle test dataset
# kaggle_test_dataset = tf.data.Dataset.from_tensor_slices(dict(kaggle_test_encodings)).batch(study.best_trial.params['batch_size'])
# kaggle_test_predictions = final_model.predict(kaggle_test_dataset).logits
# kaggle_test_predicted_labels = tf.argmax(kaggle_test_predictions, axis=1).numpy()

# # Create a submission DataFrame
# submission = pd.DataFrame({'id': kaggle_test_data['id'], 'target': kaggle_test_predicted_labels})
# submission.to_csv('submission.csv', index=False)