# This notebook uses the Optuna hyperparameter trials saved in a SQL table, retrains the best configurations, and writes submission files from those results

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_paths = (os.path.join(dirname, filename) for filename in filenames)
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import random
import os
import re
from transformers import set_seed, BertTokenizer, TFBertForSequenceClassification, BertConfig
import tensorflow as tf
from datasets import load_metric
import optuna
from sklearn.model_selection import train_test_split

# Install necessary packages for Azure SQL connection
%pip install mysql-connector-python 
%pip install PyMySQL

# Reproducibility: set the TF_DETERMINISTIC_OPS flag and pin every random
# number generator in use (transformers, TensorFlow, stdlib, NumPy) to seed 42.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed(42)
tf.random.set_seed(42)
random.seed(42)
np.random.seed(42)

###############################################################################
# Read the competition's test and train CSV files
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

# Clean the text data
def clean_text(text):
    """Normalise one tweet: strip URLs, @mentions, digits and punctuation, then lowercase.

    The patterns are applied in the order listed; '#' is kept so hashtags survive.
    Whitespace left behind by removed tokens is not collapsed.

    Args:
        text: the raw tweet as a string.

    Returns:
        The cleaned, lowercased string.
    """
    strip_patterns = (
        r'http\S+',    # URLs
        r'@\w+',       # @mentions
        r'\d+',        # runs of digits
        r'[^\w\s#]',   # anything that is not a word char, whitespace or '#'
    )
    cleaned = text
    for pattern in strip_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.lower()

# Store the cleaned tweet text next to the raw text in both frames
train_data['clean_text'] = train_data['text'].map(clean_text)
test_data['clean_text'] = test_data['text'].map(clean_text)

###############################################################################
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(texts):
    """Tokenize a collection of strings with the module-level `tokenizer`.

    Args:
        texts: an object exposing `.tolist()` (here a pandas Series of cleaned tweets).

    Returns:
        Whatever `tokenizer(...)` returns for TensorFlow tensors, padded and
        truncated with max_length=64.
    """
    tokenizer_options = dict(
        max_length=64,
        padding=True,
        truncation=True,
        return_tensors='tf',
    )
    return tokenizer(texts.tolist(), **tokenizer_options)

# Tokenize the cleaned text of both splits
train_encodings = tokenize_texts(train_data['clean_text'])
test_encodings = tokenize_texts(test_data['clean_text'])

###############################################################################
# Pair every encoded training example with its target label
train_labels = tf.convert_to_tensor(train_data['target'].to_numpy())
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))

# Hold out the first 20% of the rows for validation; the remainder stays in training.
# NOTE(review): the split is positional (no shuffle) — confirm the row order of train.csv is not meaningful.
val_size = int(len(train_data) * 0.2)
val_dataset, train_dataset = train_dataset.take(val_size), train_dataset.skip(val_size)

In [None]:
import numpy as np
import pandas as pd
import random
import os
import re
from transformers import set_seed, BertTokenizer, TFBertForSequenceClassification, BertConfig
import tensorflow as tf
from datasets import load_metric
import optuna
from sklearn.model_selection import train_test_split

# Install necessary packages for Azure SQL connection
%pip install mysql-connector-python 
%pip install PyMySQL

# Reproducibility: set the TF_DETERMINISTIC_OPS flag and pin every random
# number generator in use (transformers, TensorFlow, stdlib, NumPy) to seed 42.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed(42)
tf.random.set_seed(42)
random.seed(42)
np.random.seed(42)

###############################################################################
# Read the competition's test and train CSV files
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

# Clean the text data
def clean_text(text):
    """Normalise one tweet: strip URLs, @mentions, digits and punctuation, then lowercase.

    The patterns are applied in the order listed; '#' is kept so hashtags survive.
    Whitespace left behind by removed tokens is not collapsed.

    Args:
        text: the raw tweet as a string.

    Returns:
        The cleaned, lowercased string.
    """
    strip_patterns = (
        r'http\S+',    # URLs
        r'@\w+',       # @mentions
        r'\d+',        # runs of digits
        r'[^\w\s#]',   # anything that is not a word char, whitespace or '#'
    )
    cleaned = text
    for pattern in strip_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.lower()

# Store the cleaned tweet text next to the raw text in both frames
train_data['clean_text'] = train_data['text'].map(clean_text)
test_data['clean_text'] = test_data['text'].map(clean_text)

###############################################################################
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_texts(texts):
    """Tokenize a collection of strings with the module-level `tokenizer`.

    Args:
        texts: an object exposing `.tolist()` (here a pandas Series of cleaned tweets).

    Returns:
        Whatever `tokenizer(...)` returns for TensorFlow tensors, padded and
        truncated with max_length=64.
    """
    tokenizer_options = dict(
        max_length=64,
        padding=True,
        truncation=True,
        return_tensors='tf',
    )
    return tokenizer(texts.tolist(), **tokenizer_options)

# Tokenize the cleaned text of both splits
train_encodings = tokenize_texts(train_data['clean_text'])
test_encodings = tokenize_texts(test_data['clean_text'])

###############################################################################
# Pair every encoded training example with its target label
train_labels = tf.convert_to_tensor(train_data['target'].to_numpy())
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))

# Hold out the first 20% of the rows for validation; the remainder stays in training.
# NOTE(review): the split is positional (no shuffle) — confirm the row order of train.csv is not meaningful.
val_size = int(len(train_data) * 0.2)
val_dataset, train_dataset = train_dataset.take(val_size), train_dataset.skip(val_size)

###############################################################################
# F1 metric object from the `datasets` library, shared by compute_metrics below
metric = load_metric("f1")

def compute_metrics(predictions, labels):
    """Score class predictions against reference labels with the module-level F1 `metric`.

    Args:
        predictions: 2-D array of per-class scores; the arg-max along axis 1 is
            taken as the predicted class.
        labels: reference class labels.

    Returns:
        The result of `metric.compute(...)` (read by the caller via its 'f1' key).
    """
    predicted_classes = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predicted_classes, references=labels)

def create_tf_dataset(encodings, labels, batch_size):
    """Build a shuffled, batched and prefetched tf.data pipeline from encodings and labels.

    Args:
        encodings: model inputs, sliced along their first dimension.
        labels: labels aligned with `encodings`.
        batch_size: number of examples per batch.

    Returns:
        A `tf.data.Dataset` of (encodings, labels) batches, shuffled with a
        buffer of 10000 elements.
    """
    pairs = tf.data.Dataset.from_tensor_slices((encodings, labels))
    shuffled = pairs.shuffle(10000)
    batched = shuffled.batch(batch_size)
    return batched.prefetch(tf.data.experimental.AUTOTUNE)

# These stateful metric objects are retained only so that any code still referencing the
# `precision` / `recall` globals keeps working; f1_score below no longer updates them.
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()

def f1_score(y_true, y_pred):
    """Stateless per-batch F1 for the positive class, usable in `model.compile(metrics=[...])`.

    The previous version updated module-level Precision/Recall objects that were
    never reset, so the reported value accumulated across batches, epochs,
    training and validation passes, and successive trials. This version derives
    F1 only from the batch it is given.

    Note: a per-batch F1 is only a training-time indicator, not the exact
    dataset-level F1 (which `compute_metrics` computes from all predictions).

    Args:
        y_true: integer class labels (0 or 1) for the batch.
        y_pred: per-class scores of shape (batch, num_classes); arg-max over
            axis 1 gives the predicted class.

    Returns:
        A scalar float tensor with the batch F1 score.
    """
    predicted = tf.cast(tf.argmax(y_pred, axis=1), tf.float32)
    # Flatten so labels of shape (batch,) and (batch, 1) are handled alike
    actual = tf.cast(tf.reshape(y_true, [-1]), tf.float32)

    true_positives = tf.reduce_sum(actual * predicted)
    false_positives = tf.reduce_sum((1.0 - actual) * predicted)
    false_negatives = tf.reduce_sum(actual * (1.0 - predicted))

    # epsilon keeps every division defined when a batch has no positives
    eps = tf.keras.backend.epsilon()
    precision_result = true_positives / (true_positives + false_positives + eps)
    recall_result = true_positives / (true_positives + false_negatives + eps)
    f1 = 2 * ((precision_result * recall_result) / (precision_result + recall_result + eps))
    return f1


strategy = tf.distribute.MirroredStrategy()


def _build_lr_schedule(lr_scheduler_type, learning_rate):
    """Return the `learning_rate` argument for the optimizer: a schedule object or a plain float.

    NOTE(review): the step counts (10000 / 1000) are fixed rather than derived
    from the number of training steps — confirm they suit the dataset size.
    """
    if lr_scheduler_type == "linear":
        return tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=learning_rate,
            decay_steps=10000,
            end_learning_rate=0.0,
            power=1.0
        )
    if lr_scheduler_type == "cosine":
        return tf.keras.optimizers.schedules.CosineDecay(
            initial_learning_rate=learning_rate,
            decay_steps=10000
        )
    if lr_scheduler_type == "cosine_with_restarts":
        return tf.keras.optimizers.schedules.CosineDecayRestarts(
            initial_learning_rate=learning_rate,
            first_decay_steps=1000
        )
    # "constant": keep the sampled learning rate unchanged
    return learning_rate


def objective(trial):
    """Optuna objective: fine-tune BERT with the sampled hyperparameters and return validation F1.

    Args:
        trial: the `optuna` trial used to sample hyperparameters.

    Returns:
        The 'f1' value computed by `compute_metrics` on the validation split.

    Raises:
        optuna.exceptions.TrialPruned: if the sampled batch size is not
            divisible by the number of replicas in `strategy`.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    num_epochs = trial.suggest_int("num_epochs", 2, 2)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    # Still sampled so the study's search space is unchanged.
    # NOTE(review): weight_decay is not applied to the optimizer — wire it in or drop it from the search.
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
    lr_scheduler_type = trial.suggest_categorical("lr_scheduler_type", ["constant", "linear", "cosine", "cosine_with_restarts"])

    # Ensure batch_size is evenly divisible by the number of GPUs
    num_gpus = strategy.num_replicas_in_sync
    if batch_size % num_gpus != 0:
        raise optuna.exceptions.TrialPruned(f"Batch size {batch_size} not divisible by number of GPUs {num_gpus}")

    # NOTE(review): Keras `fit` under a tf.distribute strategy treats the dataset batch as the
    # global batch — confirm that dividing by num_gpus here is intended.
    train_dataset_tuned = train_dataset.batch(batch_size // num_gpus).prefetch(tf.data.experimental.AUTOTUNE)
    val_dataset_tuned = val_dataset.batch(batch_size // num_gpus).prefetch(tf.data.experimental.AUTOTUNE)

    with strategy.scope():
        config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2, hidden_dropout_prob=dropout_rate)
        model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

        # Build the schedule first and hand it to the optimizer; previously the schedule was
        # created after the optimizer and never used, so lr_scheduler_type had no effect.
        lr_schedule = _build_lr_schedule(lr_scheduler_type, learning_rate)
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, epsilon=1e-8)

        model.compile(optimizer=optimizer,
                      loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy'), f1_score])

    model.fit(train_dataset_tuned, epochs=num_epochs, validation_data=val_dataset_tuned, verbose=1)

    # Score the whole validation split at once; labels are re-read from the same batched
    # dataset so they line up with the prediction order.
    predictions = model.predict(val_dataset_tuned).logits
    y_val = np.concatenate([y.numpy() for _, y in val_dataset_tuned], axis=0)
    metrics = compute_metrics(predictions, y_val)
    return metrics['f1']

# Template of the MySQL connection string Optuna expects. This placeholder value is
# overwritten below by the real connection string before it is ever used.
optuna_storage = 'mysql+pymysql://<username>:<password>@<host>/<database>?ssl_ca=<path_to_CA_cert>&ssl_verify_cert=true'

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# Read the database password from Kaggle secrets so it never appears in the notebook
db_password = user_secrets.get_secret("DB_PASSWORD")

# Actual connection string; the password is interpolated from the secret read above.
# NOTE(review): ssl_ca points at /kaggle/input/certification — confirm this is the CA certificate file itself.
optuna_storage = f'mysql+pymysql://MichaelAzure:{db_password}@kaggle-third-sql.mysql.database.azure.com/kaggle_disaster_database?ssl_ca=/kaggle/input/certification&ssl_verify_cert=true'

study = optuna.create_study(study_name='disaster_5', # name of the study
                            storage=optuna_storage,  # connection URL of the MySQL database that stores the trials
                            direction='maximize', # maximize the value returned by objective (the validation F1)
                            load_if_exists=True, # if a study with this name already exists in the storage, load it and append new trials to it (i.e. resume the study)
                            )

# Run 20 trials of the objective
study.optimize(objective, n_trials=20)

# Report the best trial stored in the study and its hyperparameters
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


In [None]:
# ###############################################################################
# best_params = study.best_trial.params


# train_dataset_final = train_dataset.batch(best_params['batch_size']).prefetch(tf.data.experimental.AUTOTUNE)
# val_dataset_final = val_dataset.batch(best_params['batch_size']).prefetch(tf.data.experimental.AUTOTUNE)

# strategy = tf.distribute.MirroredStrategy()
# with strategy.scope():
#     config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2, hidden_dropout_prob=best_params['dropout_rate'])
#     model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
    
#     optimizer = tf.keras.optimizers.Adam(learning_rate=best_params['learning_rate'], epsilon=1e-8, weight_decay=best_params['weight_decay'])
    
#     lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
#         initial_learning_rate=best_params['learning_rate'],
#         decay_steps=best_params['warmup_steps'] + len(train_dataset_final) * best_params['num_epochs'],
#         end_learning_rate=0.0
#     )
    
#     model.compile(optimizer=optimizer, 
#                   loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
#                   metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
    
#     history = model.fit(train_dataset_final, epochs=best_params['num_epochs'], validation_data=val_dataset_final, verbose=1)

# ###############################################################################
# test_dataset = tf.data.Dataset.from_tensor_slices((
#     dict(test_encodings)
# )).batch(best_params['batch_size'])

# predictions = model.predict(test_dataset).logits
# predicted_labels = tf.argmax(predictions, axis=1).numpy()

# # Create a submission DataFrame
# submission = pd.DataFrame({'id': test_data['id'], 'target': predicted_labels})
# submission.to_csv('submission_2_kaggle.csv', index=False)

In [None]:
import pandas as pd

# Trials exported to CSV (one row per trial parameter, judging by the columns used later)
trials = pd.read_csv("/kaggle/input/trials2/trials.csv")
#trials.describe()

# Split the table into one DataFrame per study, keyed by 'study_name'
grouped = trials.groupby('study_name')
dfs = dict(tuple(grouped))

# Within every study, order the rows from highest to lowest 'value'
sorted_dfs = {}
for study_name, study_frame in dfs.items():
    sorted_dfs[study_name] = study_frame.sort_values(by='value', ascending=False)

disaster4_sorted = sorted_dfs["disaster_4"]

In [None]:
# Columns that identify a trial (and carry its score) and the hyperparameters expected per trial
trial_columns = ['study_id', 'study_name', 'trial_id', 'trial_number', 'value']
param_columns = ['batch_size', 'dropout_rate', 'learning_rate', 'lr_scheduler_type', 'num_epochs', 'weight_decay']

# Keep only the relevant columns in case the export contains extra ones
df_filtered = disaster4_sorted[['study_id', 'study_name', 'trial_id', 'trial_number', 'param_name', 'param_value', 'value']]

# Pivot the long table (one row per trial parameter) into one row per trial
df_pivoted = df_filtered.pivot_table(index=trial_columns,
                                     columns='param_name',
                                     values='param_value',
                                     aggfunc='first').reset_index()

# Select the hyperparameter columns by name instead of relabelling them by position:
# positional renaming silently mislabels columns if the study's parameter set differs.
missing_params = sorted(set(param_columns) - set(df_pivoted.columns))
if missing_params:
    raise KeyError(f"Hyperparameters missing from the trials table: {missing_params}")
df_pivoted = df_pivoted[trial_columns + param_columns]
df_pivoted.columns.name = None  # drop the leftover 'param_name' label on the column axis

# Sort the pivoted table by value (best trial first)
disaster4_final = df_pivoted.sort_values('value', ascending=False)

# batch_size and lr_scheduler_type are used as list indices later and num_epochs as an epoch count, so they must be integers
disaster4_final = disaster4_final.astype({'batch_size': int, 'lr_scheduler_type': int, 'num_epochs': int})

disaster4_final.head()

In [None]:
import numpy as np
import pandas as pd
import random
import os
import re
from transformers import set_seed, BertTokenizer, TFBertForSequenceClassification, BertConfig
import tensorflow as tf
from datasets import load_metric
import optuna
from sklearn.model_selection import train_test_split

# Install necessary packages for Azure SQL connection
%pip install mysql-connector-python 
%pip install PyMySQL

# Reproducibility: set the TF_DETERMINISTIC_OPS flag and pin every random
# number generator in use (transformers, TensorFlow, stdlib, NumPy) to seed 42.
os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed(42)
tf.random.set_seed(42)
random.seed(42)
np.random.seed(42)

###############################################################################
# Read the competition's test and train CSV files
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')

# Clean the text data
def clean_text(text):
    """Normalise one tweet: strip URLs, @mentions, digits and punctuation, then lowercase.

    The patterns are applied in the order listed; '#' is kept so hashtags survive.
    Whitespace left behind by removed tokens is not collapsed.

    Args:
        text: the raw tweet as a string.

    Returns:
        The cleaned, lowercased string.
    """
    strip_patterns = (
        r'http\S+',    # URLs
        r'@\w+',       # @mentions
        r'\d+',        # runs of digits
        r'[^\w\s#]',   # anything that is not a word char, whitespace or '#'
    )
    cleaned = text
    for pattern in strip_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.lower()

# Store the cleaned tweet text next to the raw text in both frames
train_data['clean_text'] = train_data['text'].map(clean_text)
test_data['clean_text'] = test_data['text'].map(clean_text)

###############################################################################

# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_texts(texts):
    """Tokenize a collection of strings with the module-level `tokenizer`.

    Args:
        texts: an object exposing `.tolist()` (here a pandas Series of cleaned tweets).

    Returns:
        Whatever `tokenizer(...)` returns for TensorFlow tensors, padded and
        truncated with max_length=64.
    """
    tokenizer_options = dict(
        max_length=64,
        padding=True,
        truncation=True,
        return_tensors='tf',
    )
    return tokenizer(texts.tolist(), **tokenizer_options)
###############################################################################


# Tokenize the cleaned text of both splits
final_train_encodings = tokenize_texts(train_data['clean_text'])
final_test_encodings = tokenize_texts(test_data['clean_text'])

# Pair every encoded training example with its target label
final_train_labels = tf.convert_to_tensor(train_data['target'].to_numpy())
train_dataset = tf.data.Dataset.from_tensor_slices((dict(final_train_encodings), final_train_labels))

# Hold out the first 20% of the rows for validation; the remainder stays in training.
# NOTE(review): the split is positional (no shuffle) — confirm the row order of train.csv is not meaningful.
val_size = int(len(train_data) * 0.2)
val_dataset, train_dataset = train_dataset.take(val_size), train_dataset.skip(val_size)
    
###############################################################################

strategy = tf.distribute.MirroredStrategy()

def f1_score(y_true, y_pred):
    """Stateless per-batch F1 for the positive class, usable in `model.compile(metrics=[...])`.

    The previous version updated module-level Precision/Recall objects that were
    never reset, so the reported value accumulated across batches, epochs,
    training and validation passes, and successive models in the loop below.
    This version derives F1 only from the batch it is given.

    Note: a per-batch F1 is only a training-time indicator, not the exact
    dataset-level F1.

    Args:
        y_true: integer class labels (0 or 1) for the batch.
        y_pred: per-class scores of shape (batch, num_classes); arg-max over
            axis 1 gives the predicted class.

    Returns:
        A scalar float tensor with the batch F1 score.
    """
    predicted = tf.cast(tf.argmax(y_pred, axis=1), tf.float32)
    # Flatten so labels of shape (batch,) and (batch, 1) are handled alike
    actual = tf.cast(tf.reshape(y_true, [-1]), tf.float32)

    true_positives = tf.reduce_sum(actual * predicted)
    false_positives = tf.reduce_sum((1.0 - actual) * predicted)
    false_negatives = tf.reduce_sum(actual * (1.0 - predicted))

    # epsilon keeps every division defined when a batch has no positives
    eps = tf.keras.backend.epsilon()
    precision_result = true_positives / (true_positives + false_positives + eps)
    recall_result = true_positives / (true_positives + false_negatives + eps)
    f1 = 2 * ((precision_result * recall_result) / (precision_result + recall_result + eps))
    return f1

# These stateful metric objects are retained only so that any code still referencing the
# `precision` / `recall` globals keeps working; f1_score above no longer updates them.
precision = tf.keras.metrics.Precision()
recall = tf.keras.metrics.Recall()


# Retrain the two best trials of the study and write one submission file per model
for i in range(2):

    # Row i of the score-sorted table is the i-th best trial
    trial_row = disaster4_final.iloc[i]
    batch_size_index = int(trial_row["batch_size"])
    dropout_rate = trial_row["dropout_rate"]
    learning_rate = trial_row["learning_rate"]
    # Cast like batch_size_index: this value is used as a list index below
    lr_scheduler_type_index = int(trial_row["lr_scheduler_type"])
    num_epochs = int(trial_row["num_epochs"])
    # NOTE(review): weight_decay is only printed; it is not applied to the optimizer
    weight_decay = trial_row["weight_decay"]
    score = trial_row["value"]


    # We have to convert categorical values to their real values. For example, batch_size was categorical during hyperparameter tuning
    # So, the values of batch_size are like 0, 1, and 2, which represent the index of the list [16, 32, 64]. So, we have to convert that to actual values
    batch_size_category = [16, 32, 64]
    batch_size = batch_size_category[batch_size_index]

    lr_scheduler_type_category = ["constant", "linear", "cosine", "cosine_with_restarts"]
    lr_scheduler_type = lr_scheduler_type_category[lr_scheduler_type_index]


    print(f"Model {i} with score: {score}, batch_size: {batch_size}, dropout_rate: {dropout_rate}, learning_rate: {learning_rate}, lr_scheduler_type: {lr_scheduler_type}, num_epochs: {num_epochs}, and weight_decay: {weight_decay}")


#     # Combine training and validation data
#     combined_dataset = train_dataset.concatenate(val_dataset).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

    # NOTE(review): Keras `fit` under a tf.distribute strategy treats the dataset batch as the
    # global batch — confirm that dividing by num_gpus here is intended.
    num_gpus = strategy.num_replicas_in_sync
    final_train_dataset_tuned = train_dataset.batch(batch_size // num_gpus).prefetch(tf.data.experimental.AUTOTUNE)
    final_val_dataset_tuned = val_dataset.batch(batch_size // num_gpus).prefetch(tf.data.experimental.AUTOTUNE)

    # Train the final model
    with strategy.scope():
        config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2, hidden_dropout_prob=dropout_rate)
        final_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

        # Build the learning-rate schedule before the optimizer so it can be passed in;
        # previously the schedule was created afterwards and never used.
        if lr_scheduler_type == "linear":
            lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
                initial_learning_rate=learning_rate,
                decay_steps=10000,
                end_learning_rate=0.0,
                power=1.0
            )
        elif lr_scheduler_type == "cosine":
            lr_schedule = tf.keras.optimizers.schedules.CosineDecay(
                initial_learning_rate=learning_rate,
                decay_steps=10000
            )
        elif lr_scheduler_type == "cosine_with_restarts":
            lr_schedule = tf.keras.optimizers.schedules.CosineDecayRestarts(
                initial_learning_rate=learning_rate,
                first_decay_steps=1000
            )
        else:
            lr_schedule = learning_rate

        optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, epsilon=1e-8)

        final_model.compile(optimizer=optimizer,
                            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                            metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy'), f1_score])

        final_model.fit(final_train_dataset_tuned, epochs=num_epochs, validation_data = final_val_dataset_tuned, verbose=1)


    # Use the encodings built in this cell (final_test_encodings) rather than
    # `test_encodings`, which only exists if an earlier cell has been run.
    test_dataset = tf.data.Dataset.from_tensor_slices((
        dict(final_test_encodings)
    )).batch(batch_size)

    predictions = final_model.predict(test_dataset).logits
    predicted_labels = tf.argmax(predictions, axis=1).numpy()

    # Create a submission DataFrame
    submission = pd.DataFrame({'id': test_data['id'], 'target': predicted_labels})
    submission.to_csv(f'submission_{i}_kaggle.csv', index=False)
    # f-string so the model index is actually interpolated into the message
    print(f"Submitted Model {i}")
    
    
###############################################################################
    
#     # create predictions
#     predictions = model.predict(test_values_dataset, verbose = 1)

#     # Assuming 'test_values' DataFrame has an 'id' column and is in the same order as 'test_values_dataset'
#     prediction_df = pd.DataFrame(predictions, columns=['blank', 'monkey_prosimian', 'civet_genet', 'rodent', 'antelope_duiker', 'hog', 'bird', 'leopard']) # make sure the column order matches the order of the tfrecord

#     # Insert the 'id' column
#     prediction_df.insert(0, 'id', test_values['id'].values)

#     # Ensure 'id' is the first column
#     prediction_df = prediction_df[['id'] + [col for col in prediction_df.columns if col != 'id']]

#     # Define the new order of the columns
#     new_order = ['id', 'antelope_duiker', 'bird', 'blank', 'civet_genet', 'hog', 'leopard', 'monkey_prosimian', 'rodent'] 

#     # Reorder the columns
#     prediction_df = prediction_df.reindex(columns=new_order) # reorder the columns since that's how the drivendata competition wants it

#     # Save the DataFrame to a CSV file
#     prediction_df.to_csv('submission_ResNet50_test.csv', index=False) # make sure to change file name to represent the model number
#     print("Successfully submitted!")
    