In [None]:
import os
import numpy as np
import pandas as pd
import re
import json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import set_seed, BertTokenizer, TFBertForSequenceClassification, BertConfig
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import optuna

# Set random seeds for reproducibility
# The same seed (42) is applied to NumPy, TensorFlow and the transformers helper.
np.random.seed(42)
tf.random.set_seed(42)
set_seed(42)
# Ask TensorFlow (via its environment switch) to use deterministic ops.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Initialize TPU
# Resolve the TPU cluster, connect to it, initialise the TPU system and build
# the distribution strategy. A ValueError raised by any of these steps is
# treated as "no TPU available": a message is printed and the script exits.
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    # Number of replicas the strategy keeps in sync (one per TPU core).
    tpu_cores = strategy.num_replicas_in_sync
    print(f"TPU cores available: {tpu_cores}")
except ValueError:
    print("TPU not found")
    raise SystemExit
    
# Fixed batch-size and learning-rate configuration.
# NOTE(review): tpu_cores is hard-coded to 8 here, replacing the value read
# from the TPU strategy earlier in the file -- confirm this is intentional.
batch_size_per_core = 32
tpu_cores = 8
batch_size = tpu_cores * batch_size_per_core  # global batch across all cores
base_learning_rate = 1e-5
# The scale factor is batch_size divided by (per-core batch * cores), i.e. 1.0
# with the values above, so learning_rate equals base_learning_rate.
learning_rate = (batch_size / (batch_size_per_core * tpu_cores)) * base_learning_rate


# Read the competition CSVs: the labelled training rows and the Kaggle test rows.
kaggle_test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

# Hold out 25% of the labelled rows for validation, stratified on 'target'
# and seeded so the split is reproducible.
train_data, val_data = train_test_split(
    train_data,
    stratify=train_data['target'],
    test_size=0.25,
    random_state=42,
)

# Clean the text data
def clean_text(text):
    """Normalise a tweet before tokenisation.

    Removes, in this order: URLs (``http`` up to the next whitespace),
    ``@mentions``, runs of digits, and every character that is not a word
    character, whitespace or ``#``. The result is lower-cased. Whitespace is
    never collapsed, so gaps left by removed tokens remain in the output.
    """
    removal_patterns = (
        r'http\S+',    # URLs
        r'@\w+',       # mentions
        r'\d+',        # numbers
        r'[^\w\s#]',   # punctuation except hashtags
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text.lower()

# Add a 'clean_text' column (the cleaned 'text' column) to every split.
for _frame in (train_data, val_data, kaggle_test_data):
    _frame['clean_text'] = _frame['text'].apply(clean_text)

# Shared BERT tokenizer (uncased vocabulary) used by the tokenization helpers
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(texts):
    """Tokenize a collection of strings with the module-level ``tokenizer``.

    ``texts`` must provide ``.tolist()``. Every sequence is truncated or padded
    to exactly 64 tokens and the encodings are returned as TensorFlow tensors.
    """
    tokenizer_options = {
        'max_length': 64,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'tf',
    }
    return tokenizer(texts.tolist(), **tokenizer_options)

# Define your TFRecord parsing and loading functions
def parse_tfrecord_fn(example, include_target=True):
    """Decode one serialized ``tf.train.Example`` into text (and label).

    The record must contain a scalar int64 ``id`` and a scalar string
    ``clean_text``; a scalar int64 ``target`` is also required when
    ``include_target`` is true.

    Returns:
        ``(clean_text, target)`` when ``include_target`` is true, otherwise
        just ``clean_text``. The ``id`` feature is parsed but not returned.
    """
    schema = {
        'id': tf.io.FixedLenFeature([], tf.int64),
        'clean_text': tf.io.FixedLenFeature([], tf.string),
    }
    if include_target:
        schema['target'] = tf.io.FixedLenFeature([], tf.int64)

    parsed = tf.io.parse_single_example(example, schema)
    text = tf.strings.reduce_join(parsed['clean_text'])

    return (text, parsed['target']) if include_target else text

def load_tfrecord_dataset(file_pattern, batch_size, include_target=True, repeat=True):
    """Build a batched, prefetched dataset of parsed TFRecord examples.

    Args:
        file_pattern: path or glob of TFRecord files to read.
        batch_size: number of examples per batch.
        include_target: forwarded to ``parse_tfrecord_fn``; when true each
            element is ``(text, target)``, otherwise just ``text``.
        repeat: when true the batched dataset repeats indefinitely.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    autotune = tf.data.experimental.AUTOTUNE

    def _parse(serialized):
        return parse_tfrecord_fn(serialized, include_target)

    # Read up to four files concurrently, then parse -> batch -> (repeat) -> prefetch.
    records = tf.data.Dataset.list_files(file_pattern).interleave(
        tf.data.TFRecordDataset, cycle_length=4
    )
    batches = records.map(_parse, num_parallel_calls=autotune).batch(batch_size)
    if repeat:
        batches = batches.repeat()
    return batches.prefetch(autotune)

# Batched text pipelines over the pre-generated TFRecord files. The three
# labelled sets use the default repeat=True; the Kaggle test set is unlabelled
# and read in a single pass.
train_tfrecord_dataset = load_tfrecord_dataset(
    '/kaggle/input/tfrecord-disaster/train_data.tfrecord',
    batch_size,
)
val_tfrecord_dataset = load_tfrecord_dataset(
    '/kaggle/input/tfrecord-disaster/val_data.tfrecord',
    batch_size,
)
fine_tune_tfrecord_dataset = load_tfrecord_dataset(
    '/kaggle/input/tfrecord-disaster/fine_tune_data.tfrecord',
    batch_size,
)
kaggle_test_tfrecord_dataset = load_tfrecord_dataset(
    '/kaggle/input/tfrecord-disaster/kaggle_test_data.tfrecord',
    batch_size,
    repeat=False,
    include_target=False,
)

# Tokenize datasets
def tokenize_tfrecord_dataset(dataset, include_target=True):
    """Convert a batched dataset of cleaned-text strings into BERT inputs.

    Tokenization runs in Python through ``tf.py_function`` using the
    module-level ``tokenizer``; every sequence is truncated or padded to 64
    tokens.

    Args:
        dataset: batched ``tf.data.Dataset`` yielding ``(text, target)`` when
            ``include_target`` is true, otherwise ``text``.
        include_target: whether the elements carry a label.

    Returns:
        A dataset yielding ``{'input_ids', 'attention_mask'}`` feature dicts,
        paired with ``target`` when ``include_target`` is true. The attention
        mask is emitted alongside the ids so the model can ignore the padding
        positions introduced by ``padding='max_length'``.
    """
    max_length = 64

    def tokenize_fn(text, target=None):
        # Executed eagerly inside tf.py_function, so .numpy() is available and
        # yields the batch as raw UTF-8 byte strings.
        encodings = tokenizer(
            [raw.decode('utf-8') for raw in text.numpy()],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='tf'
        )
        features = (encodings['input_ids'], encodings['attention_mask'])
        if include_target:
            return features + (target,)
        return features

    def map_fn(text, target=None):
        if include_target:
            input_ids, attention_mask, target = tf.py_function(
                tokenize_fn, [text, target], [tf.int32, tf.int32, tf.int64]
            )
            target.set_shape([None])
        else:
            input_ids, attention_mask = tf.py_function(
                tokenize_fn, [text], [tf.int32, tf.int32]
            )
        # py_function erases static shapes; restore them for downstream layers.
        input_ids.set_shape([None, max_length])
        attention_mask.set_shape([None, max_length])
        features = {'input_ids': input_ids, 'attention_mask': attention_mask}
        if include_target:
            return features, target
        return features

    return dataset.map(map_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Swap every text pipeline for its tokenized counterpart; only the Kaggle test
# pipeline is unlabelled.
kaggle_test_tfrecord_dataset = tokenize_tfrecord_dataset(
    kaggle_test_tfrecord_dataset,
    include_target=False,
)
fine_tune_tfrecord_dataset = tokenize_tfrecord_dataset(fine_tune_tfrecord_dataset)
val_tfrecord_dataset = tokenize_tfrecord_dataset(val_tfrecord_dataset)
train_tfrecord_dataset = tokenize_tfrecord_dataset(train_tfrecord_dataset)

# Define and compile your model
class CustomBertForSequenceClassification(tf.keras.Model):
    """Keras model that exposes only the float32 logits of a wrapped classifier."""

    def __init__(self, model):
        """Store ``model``, the classifier whose output has a ``.logits`` attribute."""
        super().__init__()
        self.model = model

    def call(self, inputs):
        """Run the wrapped model on ``inputs`` and return its logits as float32."""
        return tf.cast(self.model(inputs).logits, tf.float32)

# Output folder for saved artefacts; created if it does not exist yet.
model_save_dir = './saved_models'
os.makedirs(model_save_dir, exist_ok=True)

# JSON file that persists the top-5 leaderboard between runs
top_predictions_file = 'top_5_predictions.json'

# Start with an empty leaderboard unless a previous run left one on disk
top_predictions = []
if os.path.exists(top_predictions_file):
    with open(top_predictions_file, 'r') as leaderboard_fh:
        top_predictions = json.load(leaderboard_fh)

# Function to save top predictions
def save_top_predictions(pre_fine_tuning_file, post_fine_tuning_file, val_accuracy, model_number):
    """Record a trial's submission files in the top-5 leaderboard.

    The new entry is appended to the module-level ``top_predictions`` list,
    which is then re-sorted by descending ``val_accuracy``. If more than five
    entries remain, the last (lowest-accuracy) one is dropped and its two
    submission files are deleted from disk when they exist. Finally the list
    is written to ``top_predictions_file`` as JSON.
    """
    global top_predictions

    top_predictions.append({
        'model_number': model_number,
        'val_accuracy': val_accuracy,
        'pre_fine_tuning_file': pre_fine_tuning_file,
        'post_fine_tuning_file': post_fine_tuning_file
    })
    top_predictions = sorted(top_predictions, key=lambda entry: entry['val_accuracy'], reverse=True)

    # Keep at most five entries; clean up the files of the evicted one.
    if len(top_predictions) > 5:
        evicted = top_predictions.pop()
        for field in ('pre_fine_tuning_file', 'post_fine_tuning_file'):
            stale_path = evicted[field]
            if not os.path.exists(stale_path):
                print(f"File {stale_path} does not exist and cannot be removed.")
                continue
            os.remove(stale_path)
            print(f"File {stale_path} has been removed.")

    # Persist the leaderboard so later runs can pick it up.
    with open(top_predictions_file, 'w') as out:
        json.dump(top_predictions, out, indent=4)
    
# Whole batches per pass over each split (integer division discards any
# partial final batch).
validation_steps = len(val_data) // batch_size
steps_per_epoch = len(train_data) // batch_size
# The fine-tuning step count is also derived from the validation-set size.
fine_tune_steps_per_epoch = len(val_data) // batch_size

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: train BERT with sampled hyper-parameters.

    Trains on the training TFRecords with checkpointing and early stopping,
    evaluates on the validation TFRecords and returns the validation accuracy.
    When the trial would enter the top-5 leaderboard it also writes a Kaggle
    submission, continues training on the fine-tune TFRecords, writes a second
    submission and records both through ``save_top_predictions``.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        The validation accuracy reported by ``model.evaluate``.
    """
    # Hyper-parameters sampled for this trial
    num_epochs = trial.suggest_int("num_epochs", 1, 20)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
    lr_scheduler_type = trial.suggest_categorical("lr_scheduler_type", ["constant", "linear", "cosine", "cosine_with_restarts"])
    gradient_clip_norm = trial.suggest_float("gradient_clip_norm", 0.0, 1.0)

    # Model, optimizer and metric variables are created under the strategy scope.
    with strategy.scope():
        precision = tf.keras.metrics.Precision()
        recall = tf.keras.metrics.Recall()

        def f1_score_custom(y_true, y_pred):
            # NOTE(review): precision/recall are only ever updated here and are
            # not reset anywhere in this function, so the value appears to
            # accumulate over every batch seen (training and validation).
            y_pred = tf.argmax(y_pred, axis=1)
            y_true = tf.cast(y_true, tf.int64)
            precision.update_state(y_true, y_pred)
            recall.update_state(y_true, y_pred)
            precision_result = precision.result()
            recall_result = recall.result()
            f1 = 2 * ((precision_result * recall_result) / (precision_result + recall_result + tf.keras.backend.epsilon()))
            return f1

        config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2, hidden_dropout_prob=dropout_rate)
        base_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
        model = CustomBertForSequenceClassification(base_model)

        # NOTE(review): the decay horizons below are fixed step counts and are
        # not derived from steps_per_epoch * num_epochs -- confirm intended.
        if lr_scheduler_type == "linear":
            lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
                initial_learning_rate=learning_rate,
                decay_steps=10000,
                end_learning_rate=0.0,
                power=1.0
            )
        elif lr_scheduler_type == "cosine":
            lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
                initial_learning_rate=learning_rate,
                decay_steps=10000
            )
        elif lr_scheduler_type == "cosine_with_restarts":
            lr_schedule = tf.keras.optimizers.schedules.CosineDecayRestarts(
                initial_learning_rate=learning_rate,
                first_decay_steps=1000
            )
        else:
            lr_schedule = learning_rate

        optimizer = tf.keras.optimizers.AdamW(
            learning_rate=lr_schedule,
            weight_decay=weight_decay,
            epsilon=1e-8,
            clipnorm=gradient_clip_norm
        )

        model.compile(
            optimizer=optimizer,
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy'), f1_score_custom],
            steps_per_execution=1
        )

    checkpoint_filepath = './best_model.keras'
    checkpoint_callback = ModelCheckpoint(
        filepath=checkpoint_filepath,
        monitor='val_accuracy',
        save_best_only=True,
        save_weights_only=False,
        mode='max',
        verbose=1
    )

    early_stopping_callback = EarlyStopping(
        monitor='val_accuracy',
        patience=3,  # Number of epochs to wait for improvement before stopping
        mode='max',
        verbose=1,
        restore_best_weights=True  # Restore model weights from the epoch with the best validation accuracy
    )

    model.fit(
        train_tfrecord_dataset,
        epochs=num_epochs,
        validation_data=val_tfrecord_dataset,
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        callbacks=[checkpoint_callback, early_stopping_callback]
    )

    val_loss, val_accuracy, val_f1_score = model.evaluate(val_tfrecord_dataset, steps=validation_steps, verbose=1)
    print(f"f1 score: {val_f1_score} and accuracy: {val_accuracy}")

    avg_score = (val_accuracy + val_f1_score) / 2

    # Enough steps to cover every test row, including a partial last batch.
    test_steps = (len(kaggle_test_data) + batch_size - 1) // batch_size

    def write_submission(stage):
        """Predict the Kaggle test set and write a submission CSV; return its path."""
        # The wrapper's call() returns the logits tensor itself, so predict()
        # yields a plain array -- there is no ``.logits`` attribute to read.
        test_logits = model.predict(kaggle_test_tfrecord_dataset, steps=test_steps)
        # Ensure the prediction length matches the test data length
        predicted_labels = tf.argmax(test_logits, axis=1).numpy()[:len(kaggle_test_data)]
        submission_path = os.path.join(model_save_dir, f"{studyName}_model_trial_{trial.number}_accuracy_{val_accuracy:.4f}_avg_score_{avg_score:.4f}_f1_{val_f1_score:.4f}_{stage}_fine_tuning_submission.csv")
        submission = pd.DataFrame({'id': kaggle_test_data['id'], 'target': predicted_labels})
        submission.to_csv(submission_path, index=False)
        return submission_path

    # Only produce submissions when this trial would enter the top-5 leaderboard.
    if len(top_predictions) < 5 or val_accuracy > min(top_predictions, key=lambda x: x['val_accuracy'])['val_accuracy']:
        pre_fine_tuning_predictions_file = write_submission('pre')

        # Scale the fine-tuning epochs by the validation/training size ratio.
        training_data_size = len(train_data)
        fine_tune_data_size = len(val_data)
        fine_tune_epochs = max(1, round((fine_tune_data_size / training_data_size) * num_epochs))

        model.fit(fine_tune_tfrecord_dataset, epochs=fine_tune_epochs, steps_per_epoch=fine_tune_steps_per_epoch, verbose=1)

        post_fine_tuning_predictions_file = write_submission('post')

        save_top_predictions(pre_fine_tuning_predictions_file, post_fine_tuning_predictions_file, val_accuracy, trial.number)

    return val_accuracy

# Define the Optuna study
# Storage URL template:
#   mysql+pymysql://<username>:<password>@<host>/<database>?ssl_ca=<path_to_CA_cert>&ssl_verify_cert=true
# The database password is read from Kaggle secrets rather than hard-coded.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
db_password = user_secrets.get_secret("DB_PASSWORD")
optuna_storage = f'mysql+pymysql://MichaelAzure:{db_password}@kaggle-third-sql.mysql.database.azure.com/kaggle_disaster_database?ssl_ca=/kaggle/input/certification&ssl_verify_cert=true'

# load_if_exists=True resumes the stored study of the same name if there is one.
studyName = 'disaster_tfrecord_BERT_checkpoint_0'
study = optuna.create_study(study_name=studyName, storage=optuna_storage, direction='maximize', load_if_exists=True)
study.optimize(objective, n_trials=100)

# Access the best trial
best_trial = study.best_trial

print("Best trial:")
print(f"  Value: {best_trial.value}")
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

# best_trial.number is the trial's index within the study, not an epoch count.
# Only the configured maximum is known here; early stopping may have ended
# training sooner.
print(f"Best trial number: {best_trial.number}")
print(f"Max epochs configured: {best_trial.params.get('num_epochs')}")

# Load the best model
# The checkpoint path is local to objective(), so it is restated here.
# NOTE(review): every trial writes to this same path with a fresh
# ModelCheckpoint, so the file appears to hold the most recent trial's best
# epoch rather than the best trial's model -- confirm before relying on it.
checkpoint_filepath = './best_model.keras'
best_model = tf.keras.models.load_model(checkpoint_filepath, custom_objects={'CustomBertForSequenceClassification': CustomBertForSequenceClassification})

# Use the best model for predictions
kaggle_test_predictions = best_model.predict(kaggle_test_tfrecord_dataset)
kaggle_test_predicted_labels = tf.argmax(kaggle_test_predictions, axis=1).numpy()[:len(kaggle_test_data)]
submission = pd.DataFrame({'id': kaggle_test_data['id'], 'target': kaggle_test_predicted_labels})
submission.to_csv('submission.csv', index=False)


In [None]:
import os
import numpy as np
import pandas as pd
import re
import json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import set_seed, BertTokenizer, TFBertForSequenceClassification, BertConfig
import optuna

# Set random seeds for reproducibility
# The same seed (42) is applied to NumPy, TensorFlow and the transformers helper.
np.random.seed(42)
tf.random.set_seed(42)
set_seed(42)
# Ask TensorFlow (via its environment switch) to use deterministic ops.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Initialize TPU
# Resolve the TPU cluster, connect to it, initialise the TPU system and build
# the distribution strategy. A ValueError raised by any of these steps is
# treated as "no TPU available": a message is printed and the script exits.
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    # Number of replicas the strategy keeps in sync (one per TPU core).
    tpu_cores = strategy.num_replicas_in_sync
    print(f"TPU cores available: {tpu_cores}")
except ValueError:
    print("TPU not found")
    raise SystemExit

import tensorflow.keras.callbacks
# Fixed batch-size and learning-rate configuration.
# NOTE(review): tpu_cores is hard-coded to 8 here, replacing the value read
# from the TPU strategy earlier in the file -- confirm this is intentional.
batch_size_per_core = 32
tpu_cores = 8
batch_size = tpu_cores * batch_size_per_core  # global batch across all cores
base_learning_rate = 1e-5
# The scale factor is batch_size divided by (per-core batch * cores), i.e. 1.0
# with the values above, so learning_rate equals base_learning_rate.
learning_rate = (batch_size / (batch_size_per_core * tpu_cores)) * base_learning_rate


# Read the competition CSVs: the labelled training rows and the Kaggle test rows.
kaggle_test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

# Hold out 25% of the labelled rows for validation, stratified on 'target'
# and seeded so the split is reproducible.
train_data, val_data = train_test_split(
    train_data,
    stratify=train_data['target'],
    test_size=0.25,
    random_state=42,
)

# Clean the text data
def clean_text(text):
    """Normalise a tweet before tokenisation.

    Removes, in this order: URLs (``http`` up to the next whitespace),
    ``@mentions``, runs of digits, and every character that is not a word
    character, whitespace or ``#``. The result is lower-cased. Whitespace is
    never collapsed, so gaps left by removed tokens remain in the output.
    """
    removal_patterns = (
        r'http\S+',    # URLs
        r'@\w+',       # mentions
        r'\d+',        # numbers
        r'[^\w\s#]',   # punctuation except hashtags
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text.lower()


# Add a 'clean_text' column (the cleaned 'text' column) to every split.
for _frame in (train_data, val_data, kaggle_test_data):
    _frame['clean_text'] = _frame['text'].apply(clean_text)

# Shared BERT tokenizer (uncased vocabulary) used by the tokenization helpers
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(texts):
    """Tokenize a collection of strings with the module-level ``tokenizer``.

    ``texts`` must provide ``.tolist()``. Every sequence is truncated or padded
    to exactly 64 tokens and the encodings are returned as TensorFlow tensors.
    """
    tokenizer_options = {
        'max_length': 64,
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': 'tf',
    }
    return tokenizer(texts.tolist(), **tokenizer_options)

# Define your TFRecord parsing and loading functions
def parse_tfrecord_fn(example, include_target=True):
    """Decode one serialized ``tf.train.Example`` into text (and label).

    The record must contain a scalar int64 ``id`` and a scalar string
    ``clean_text``; a scalar int64 ``target`` is also required when
    ``include_target`` is true.

    Returns:
        ``(clean_text, target)`` when ``include_target`` is true, otherwise
        just ``clean_text``. The ``id`` feature is parsed but not returned.
    """
    schema = {
        'id': tf.io.FixedLenFeature([], tf.int64),
        'clean_text': tf.io.FixedLenFeature([], tf.string),
    }
    if include_target:
        schema['target'] = tf.io.FixedLenFeature([], tf.int64)

    parsed = tf.io.parse_single_example(example, schema)
    text = tf.strings.reduce_join(parsed['clean_text'])

    return (text, parsed['target']) if include_target else text

def load_tfrecord_dataset(file_pattern, batch_size, include_target=True, repeat=True):
    """Build a batched, prefetched dataset of parsed TFRecord examples.

    Args:
        file_pattern: path or glob of TFRecord files to read.
        batch_size: number of examples per batch.
        include_target: forwarded to ``parse_tfrecord_fn``; when true each
            element is ``(text, target)``, otherwise just ``text``.
        repeat: when true the batched dataset repeats indefinitely.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    autotune = tf.data.experimental.AUTOTUNE

    def _parse(serialized):
        return parse_tfrecord_fn(serialized, include_target)

    # Read up to four files concurrently, then parse -> batch -> (repeat) -> prefetch.
    records = tf.data.Dataset.list_files(file_pattern).interleave(
        tf.data.TFRecordDataset, cycle_length=4
    )
    batches = records.map(_parse, num_parallel_calls=autotune).batch(batch_size)
    if repeat:
        batches = batches.repeat()
    return batches.prefetch(autotune)

# Batched text pipelines over the pre-generated TFRecord files. The three
# labelled sets use the default repeat=True; the Kaggle test set is unlabelled
# and read in a single pass.
train_tfrecord_dataset = load_tfrecord_dataset(
    '/kaggle/input/tfrecord-disaster/train_data.tfrecord',
    batch_size,
)
val_tfrecord_dataset = load_tfrecord_dataset(
    '/kaggle/input/tfrecord-disaster/val_data.tfrecord',
    batch_size,
)
fine_tune_tfrecord_dataset = load_tfrecord_dataset(
    '/kaggle/input/tfrecord-disaster/fine_tune_data.tfrecord',
    batch_size,
)
kaggle_test_tfrecord_dataset = load_tfrecord_dataset(
    '/kaggle/input/tfrecord-disaster/kaggle_test_data.tfrecord',
    batch_size,
    repeat=False,
    include_target=False,
)

# Tokenize datasets
def tokenize_tfrecord_dataset(dataset, include_target=True):
    """Convert a batched dataset of cleaned-text strings into BERT inputs.

    Tokenization runs in Python through ``tf.py_function`` using the
    module-level ``tokenizer``; every sequence is truncated or padded to 64
    tokens.

    Args:
        dataset: batched ``tf.data.Dataset`` yielding ``(text, target)`` when
            ``include_target`` is true, otherwise ``text``.
        include_target: whether the elements carry a label.

    Returns:
        A dataset yielding ``{'input_ids', 'attention_mask'}`` feature dicts,
        paired with ``target`` when ``include_target`` is true. The attention
        mask is emitted alongside the ids so the model can ignore the padding
        positions introduced by ``padding='max_length'``.
    """
    max_length = 64

    def tokenize_fn(text, target=None):
        # Executed eagerly inside tf.py_function, so .numpy() is available and
        # yields the batch as raw UTF-8 byte strings.
        encodings = tokenizer(
            [raw.decode('utf-8') for raw in text.numpy()],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='tf'
        )
        features = (encodings['input_ids'], encodings['attention_mask'])
        if include_target:
            return features + (target,)
        return features

    def map_fn(text, target=None):
        if include_target:
            input_ids, attention_mask, target = tf.py_function(
                tokenize_fn, [text, target], [tf.int32, tf.int32, tf.int64]
            )
            target.set_shape([None])
        else:
            input_ids, attention_mask = tf.py_function(
                tokenize_fn, [text], [tf.int32, tf.int32]
            )
        # py_function erases static shapes; restore them for downstream layers.
        input_ids.set_shape([None, max_length])
        attention_mask.set_shape([None, max_length])
        features = {'input_ids': input_ids, 'attention_mask': attention_mask}
        if include_target:
            return features, target
        return features

    return dataset.map(map_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Swap every text pipeline for its tokenized counterpart; only the Kaggle test
# pipeline is unlabelled.
kaggle_test_tfrecord_dataset = tokenize_tfrecord_dataset(
    kaggle_test_tfrecord_dataset,
    include_target=False,
)
fine_tune_tfrecord_dataset = tokenize_tfrecord_dataset(fine_tune_tfrecord_dataset)
val_tfrecord_dataset = tokenize_tfrecord_dataset(val_tfrecord_dataset)
train_tfrecord_dataset = tokenize_tfrecord_dataset(train_tfrecord_dataset)

# Define and compile your model
class CustomBertForSequenceClassification(tf.keras.Model):
    """Keras model that exposes only the float32 logits of a wrapped classifier."""

    def __init__(self, model):
        """Store ``model``, the classifier whose output has a ``.logits`` attribute."""
        super().__init__()
        self.model = model

    def call(self, inputs):
        """Run the wrapped model on ``inputs`` and return its logits as float32."""
        return tf.cast(self.model(inputs).logits, tf.float32)

# Output folder for saved artefacts; created if it does not exist yet.
model_save_dir = './saved_models'
os.makedirs(model_save_dir, exist_ok=True)

# JSON file that persists the top-5 leaderboard between runs
top_predictions_file = 'top_5_predictions.json'

# Start with an empty leaderboard unless a previous run left one on disk
top_predictions = []
if os.path.exists(top_predictions_file):
    with open(top_predictions_file, 'r') as leaderboard_fh:
        top_predictions = json.load(leaderboard_fh)

# Function to save top predictions
def save_top_predictions(pre_fine_tuning_file, post_fine_tuning_file, val_accuracy, model_number):
    """Record a trial's submission files in the top-5 leaderboard.

    The new entry is appended to the module-level ``top_predictions`` list,
    which is then re-sorted by descending ``val_accuracy``. If more than five
    entries remain, the last (lowest-accuracy) one is dropped and its two
    submission files are deleted from disk when they exist. Finally the list
    is written to ``top_predictions_file`` as JSON.
    """
    global top_predictions

    top_predictions.append({
        'model_number': model_number,
        'val_accuracy': val_accuracy,
        'pre_fine_tuning_file': pre_fine_tuning_file,
        'post_fine_tuning_file': post_fine_tuning_file
    })
    top_predictions = sorted(top_predictions, key=lambda entry: entry['val_accuracy'], reverse=True)

    # Keep at most five entries; clean up the files of the evicted one.
    if len(top_predictions) > 5:
        evicted = top_predictions.pop()
        for field in ('pre_fine_tuning_file', 'post_fine_tuning_file'):
            stale_path = evicted[field]
            if not os.path.exists(stale_path):
                print(f"File {stale_path} does not exist and cannot be removed.")
                continue
            os.remove(stale_path)
            print(f"File {stale_path} has been removed.")

    # Persist the leaderboard so later runs can pick it up.
    with open(top_predictions_file, 'w') as out:
        json.dump(top_predictions, out, indent=4)
    
# Whole batches per pass over each split (integer division discards any
# partial final batch).
validation_steps = len(val_data) // batch_size
steps_per_epoch = len(train_data) // batch_size
# The fine-tuning step count is also derived from the validation-set size.
fine_tune_steps_per_epoch = len(val_data) // batch_size

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: train BERT with sampled hyper-parameters.

    Trains on the training TFRecords for the sampled number of epochs (no
    callbacks are active in this variant), evaluates on the validation
    TFRecords and returns the validation accuracy. When the trial would enter
    the top-5 leaderboard it also writes a Kaggle submission, continues
    training on the fine-tune TFRecords, writes a second submission and
    records both through ``save_top_predictions``.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        The validation accuracy reported by ``model.evaluate``.
    """
    # Hyper-parameters sampled for this trial
    num_epochs = trial.suggest_int("num_epochs", 1, 20)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
    lr_scheduler_type = trial.suggest_categorical("lr_scheduler_type", ["constant", "linear", "cosine", "cosine_with_restarts"])
    gradient_clip_norm = trial.suggest_float("gradient_clip_norm", 0.0, 1.0)

    # Model, optimizer and metric variables are created under the strategy scope.
    with strategy.scope():
        precision = tf.keras.metrics.Precision()
        recall = tf.keras.metrics.Recall()

        def f1_score_custom(y_true, y_pred):
            # NOTE(review): precision/recall are only ever updated here and are
            # not reset anywhere in this function, so the value appears to
            # accumulate over every batch seen (training and validation).
            y_pred = tf.argmax(y_pred, axis=1)
            y_true = tf.cast(y_true, tf.int64)
            precision.update_state(y_true, y_pred)
            recall.update_state(y_true, y_pred)
            precision_result = precision.result()
            recall_result = recall.result()
            f1 = 2 * ((precision_result * recall_result) / (precision_result + recall_result + tf.keras.backend.epsilon()))
            return f1

        config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2, hidden_dropout_prob=dropout_rate)
        base_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
        model = CustomBertForSequenceClassification(base_model)

        # NOTE(review): the decay horizons below are fixed step counts and are
        # not derived from steps_per_epoch * num_epochs -- confirm intended.
        if lr_scheduler_type == "linear":
            lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
                initial_learning_rate=learning_rate,
                decay_steps=10000,
                end_learning_rate=0.0,
                power=1.0
            )
        elif lr_scheduler_type == "cosine":
            lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
                initial_learning_rate=learning_rate,
                decay_steps=10000
            )
        elif lr_scheduler_type == "cosine_with_restarts":
            lr_schedule = tf.keras.optimizers.schedules.CosineDecayRestarts(
                initial_learning_rate=learning_rate,
                first_decay_steps=1000
            )
        else:
            lr_schedule = learning_rate

        optimizer = tf.keras.optimizers.AdamW(
            learning_rate=lr_schedule,
            weight_decay=weight_decay,
            epsilon=1e-8,
            clipnorm=gradient_clip_norm
        )

        model.compile(
            optimizer=optimizer,
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy'), f1_score_custom],
            steps_per_execution=1
        )

    # Checkpointing and early stopping are disabled in this variant:
#     checkpoint_filepath = './best_model.keras'
#     checkpoint_callback = ModelCheckpoint(
#         filepath=checkpoint_filepath,
#         monitor='val_accuracy',
#         save_best_only=True,
#         save_weights_only=False,
#         mode='max',
#         verbose=1
#     )

#     early_stopping_callback = EarlyStopping(
#         monitor='val_accuracy',
#         patience=3,  # Number of epochs to wait for improvement before stopping
#         mode='max',
#         verbose=1,
#         restore_best_weights=True  # Restore model weights from the epoch with the best validation accuracy
#     )

    model.fit(
        train_tfrecord_dataset,
        epochs=num_epochs,
        validation_data=val_tfrecord_dataset,
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps,
        #callbacks=[checkpoint_callback, early_stopping_callback]
    )

    val_loss, val_accuracy, val_f1_score = model.evaluate(val_tfrecord_dataset, steps=validation_steps, verbose=1)
    print(f"f1 score: {val_f1_score} and accuracy: {val_accuracy}")

    avg_score = (val_accuracy + val_f1_score) / 2

    # Enough steps to cover every test row, including a partial last batch.
    test_steps = (len(kaggle_test_data) + batch_size - 1) // batch_size

    def write_submission(stage):
        """Predict the Kaggle test set and write a submission CSV; return its path."""
        # The wrapper's call() returns the logits tensor itself, so predict()
        # yields a plain array -- there is no ``.logits`` attribute to read.
        test_logits = model.predict(kaggle_test_tfrecord_dataset, steps=test_steps)
        # Ensure the prediction length matches the test data length
        predicted_labels = tf.argmax(test_logits, axis=1).numpy()[:len(kaggle_test_data)]
        submission_path = os.path.join(model_save_dir, f"{studyName}_model_trial_{trial.number}_accuracy_{val_accuracy:.4f}_avg_score_{avg_score:.4f}_f1_{val_f1_score:.4f}_{stage}_fine_tuning_submission.csv")
        submission = pd.DataFrame({'id': kaggle_test_data['id'], 'target': predicted_labels})
        submission.to_csv(submission_path, index=False)
        return submission_path

    # Only produce submissions when this trial would enter the top-5 leaderboard.
    if len(top_predictions) < 5 or val_accuracy > min(top_predictions, key=lambda x: x['val_accuracy'])['val_accuracy']:
        pre_fine_tuning_predictions_file = write_submission('pre')

        # Scale the fine-tuning epochs by the validation/training size ratio.
        training_data_size = len(train_data)
        fine_tune_data_size = len(val_data)
        fine_tune_epochs = max(1, round((fine_tune_data_size / training_data_size) * num_epochs))

        model.fit(fine_tune_tfrecord_dataset, epochs=fine_tune_epochs, steps_per_epoch=fine_tune_steps_per_epoch, verbose=1)

        post_fine_tuning_predictions_file = write_submission('post')

        save_top_predictions(pre_fine_tuning_predictions_file, post_fine_tuning_predictions_file, val_accuracy, trial.number)

    return val_accuracy

# Define the Optuna study
# Storage URL template:
#   mysql+pymysql://<username>:<password>@<host>/<database>?ssl_ca=<path_to_CA_cert>&ssl_verify_cert=true
# The database password is read from Kaggle secrets rather than hard-coded.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
db_password = user_secrets.get_secret("DB_PASSWORD")
optuna_storage = f'mysql+pymysql://MichaelAzure:{db_password}@kaggle-third-sql.mysql.database.azure.com/kaggle_disaster_database?ssl_ca=/kaggle/input/certification&ssl_verify_cert=true'

# load_if_exists=True resumes the stored study of the same name if there is one.
studyName = 'disaster_tfrecord_BERT_checkpoint_4'
study = optuna.create_study(study_name=studyName, storage=optuna_storage, direction='maximize', load_if_exists=True)
study.optimize(objective, n_trials=100)

# Access the best trial
best_trial = study.best_trial

print("Best trial:")
print(f"  Value: {best_trial.value}")
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

# best_trial.number is the trial's index within the study, not an epoch count.
# The epoch count comes from the sampled 'num_epochs' parameter instead.
print(f"Best trial number: {best_trial.number}")
print(f"Number of Epochs Run: {best_trial.params.get('num_epochs')}")

# # Load the best model
# best_model = tf.keras.models.load_model(checkpoint_filepath, custom_objects={'CustomBertForSequenceClassification': CustomBertForSequenceClassification})

# # Use the best model for predictions
# kaggle_test_predictions = best_model.predict(kaggle_test_tfrecord_dataset)
# kaggle_test_predicted_labels = tf.argmax(kaggle_test_predictions, axis=1).numpy()[:len(kaggle_test_data)]
# submission = pd.DataFrame({'id': kaggle_test_data['id'], 'target': kaggle_test_predicted_labels})
# submission.to_csv('submission.csv', index=False)
