<a href="https://colab.research.google.com/github/Coffinbrain/lessons/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# -*- coding: utf-8 -*-
"""
This script integrates Optuna with Simple Transformers for automated hyperparameter tuning
on a sentiment analysis task using BERT, focusing on a custom evaluation function for multiclass compatibility.
"""

# Ensure Optuna and Simple Transformers are installed
# !pip install simpletransformers optuna

import os
import sqlite3
import pandas as pd
import optuna
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel

# Enable detailed error messages for CUDA
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Load data from an SQLite database
conn = sqlite3.connect('/content/drive/MyDrive/IMDB_Movies_2021.db')  # Adjust this path to your database file
try:
    df = pd.read_sql_query("SELECT REVIEW, RATING FROM REVIEWS", conn)
finally:
    # Release the database handle even when the query raises
    conn.close()

# Data preprocessing
# Drop rows missing either field: a missing REVIEW would otherwise be turned
# into a literal placeholder string by astype(str) below and be trained on as
# if it were real review text. copy() makes the column assignments that follow
# operate on an independent frame.
df = df.dropna(subset=['REVIEW', 'RATING']).copy()
df['RATING'] = df['RATING'].astype(float).round().astype(int) - 1  # Adjust labels to start from 0
df['REVIEW'] = df['REVIEW'].astype(str).str.lower()

# Split the data into training, testing, and evaluation sets:
# 30% of the rows go to test_df, then 10% of the remainder goes to eval_df.
# random_state is fixed so the same split is produced on every run.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
train_df, eval_df = train_test_split(train_df, test_size=0.1, random_state=42)

def custom_evaluation(model, eval_df):
    """Score ``model`` on ``eval_df`` and return its accuracy.

    Args:
        model: Object exposing ``predict(list_of_texts)`` that returns a
            pair; the first element is used as the predicted labels.
        eval_df: DataFrame with a ``REVIEW`` column (input texts) and a
            ``RATING`` column (reference labels).

    Returns:
        The value of ``accuracy_score`` comparing ``RATING`` to the
        predictions.
    """
    review_texts = eval_df['REVIEW'].tolist()
    # The second element returned by predict() is not needed here
    predicted_labels, _ = model.predict(review_texts)
    return accuracy_score(eval_df['RATING'], predicted_labels)

def objective(trial):
    """Optuna objective: train one BERT classifier and return its accuracy.

    Samples a learning rate, an epoch count and a batch size from the trial,
    trains a fresh ``bert-base-uncased`` classifier on ``train_df`` and scores
    it on ``eval_df`` with ``custom_evaluation``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Accuracy on ``eval_df``.
    """
    # Hyperparameters search space
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and samples from the same log-uniform range.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int('num_train_epochs', 2, 5)
    train_batch_size = trial.suggest_categorical('train_batch_size', [16, 32])

    # Model configuration
    model_args = {
        'learning_rate': learning_rate,
        'num_train_epochs': num_train_epochs,
        'train_batch_size': train_batch_size,
        'no_save': True,
        'overwrite_output_dir': True,
        'evaluate_during_training': False,
        'manual_seed': 42,  # Ensure reproducibility
    }

    # Size the classifier head from the largest label rather than the number
    # of distinct labels: if some rating value never occurs in the data,
    # nunique() would be smaller than the highest label index and that label
    # would fall outside the model's output range. For contiguous labels
    # starting at 0 both expressions give the same number.
    num_labels = int(df['RATING'].max()) + 1

    # Initialize and train the model
    model = ClassificationModel(
        'bert',
        'bert-base-uncased',
        num_labels=num_labels,
        args=model_args,
        use_cuda=True
    )
    model.train_model(train_df[['REVIEW', 'RATING']])

    # Evaluate the model using the custom evaluation function
    return custom_evaluation(model, eval_df)  # Optimize for accuracy

# Run the hyperparameter search; the objective returns accuracy, so maximize it
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)  # Adjust the number of trials based on your resources

# Report the best trial: its objective value followed by each sampled parameter
print("Best trial:")
trial_ = study.best_trial

print("Value:", trial_.value)
print("Params:")
for param_name, param_value in trial_.params.items():
    print(f"    {param_name}: {param_value}")


[I 2024-04-09 14:27:31,408] A new study created in memory with name: no-name-05c1515a-b1e1-415b-ab20-de89d8504d4f
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/210 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

[I 2024-04-09 14:29:59,414] Trial 0 finished with value: 0.3770053475935829 and parameters: {'learning_rate': 3.8955397962443634e-05, 'num_train_epochs': 3, 'train_batch_size': 16}. Best is trial 0 with value: 0.3770053475935829.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/210 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

[I 2024-04-09 14:32:11,119] Trial 1 finished with value: 0.37433155080213903 and parameters: {'learning_rate': 1.5543818391965666e-05, 'num_train_epochs': 3, 'train_batch_size': 16}. Best is trial 0 with value: 0.3770053475935829.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 5 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

[I 2024-04-09 14:35:45,823] Trial 2 finished with value: 0.39037433155080214 and parameters: {'learning_rate': 3.756818741556757e-05, 'num_train_epochs': 5, 'train_batch_size': 16}. Best is trial 2 with value: 0.39037433155080214.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/105 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/105 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/105 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/105 [00:00<?, ?it/s]

Running Epoch 5 of 5:   0%|          | 0/105 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

[I 2024-04-09 14:38:15,553] Trial 3 finished with value: 0.37967914438502676 and parameters: {'learning_rate': 1.801981299133474e-05, 'num_train_epochs': 5, 'train_batch_size': 32}. Best is trial 2 with value: 0.39037433155080214.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/105 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/105 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/105 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/105 [00:00<?, ?it/s]

Running Epoch 5 of 5:   0%|          | 0/105 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

[I 2024-04-09 14:40:46,321] Trial 4 finished with value: 0.38235294117647056 and parameters: {'learning_rate': 3.744493592052089e-05, 'num_train_epochs': 5, 'train_batch_size': 32}. Best is trial 2 with value: 0.39037433155080214.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 5 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

[I 2024-04-09 14:44:18,162] Trial 5 finished with value: 0.3770053475935829 and parameters: {'learning_rate': 4.652592589894852e-05, 'num_train_epochs': 5, 'train_batch_size': 16}. Best is trial 2 with value: 0.39037433155080214.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Running Epoch 1 of 3:   0%|          | 0/105 [00:00<?, ?it/s]

Running Epoch 2 of 3:   0%|          | 0/105 [00:00<?, ?it/s]

Running Epoch 3 of 3:   0%|          | 0/105 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

[I 2024-04-09 14:45:50,375] Trial 6 finished with value: 0.3770053475935829 and parameters: {'learning_rate': 2.9856598651460134e-05, 'num_train_epochs': 3, 'train_batch_size': 32}. Best is trial 2 with value: 0.39037433155080214.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 5 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

[I 2024-04-09 14:49:22,324] Trial 7 finished with value: 0.3850267379679144 and parameters: {'learning_rate': 2.142830623539477e-05, 'num_train_epochs': 5, 'train_batch_size': 16}. Best is trial 2 with value: 0.39037433155080214.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 2 of 2:   0%|          | 0/210 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

[I 2024-04-09 14:50:51,177] Trial 8 finished with value: 0.3983957219251337 and parameters: {'learning_rate': 2.9538879507921225e-05, 'num_train_epochs': 2, 'train_batch_size': 16}. Best is trial 8 with value: 0.3983957219251337.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 5e-5)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()


  0%|          | 0/6 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

Running Epoch 5 of 5:   0%|          | 0/210 [00:00<?, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

[I 2024-04-09 14:54:26,105] Trial 9 finished with value: 0.3850267379679144 and parameters: {'learning_rate': 1.2021209391131748e-05, 'num_train_epochs': 5, 'train_batch_size': 16}. Best is trial 8 with value: 0.3983957219251337.


Best trial:
Value: 0.3983957219251337
Params:
    learning_rate: 2.9538879507921225e-05
    num_train_epochs: 2
    train_batch_size: 16
