# Buidling our model from the [Distilbert base](https://huggingface.co/distilbert/distilbert-base-uncased)

In [None]:
import optuna
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
import homemade_functions as hf
from datasets import load_dataset, DatasetDict
import evaluate
import numpy as np
from huggingface_hub import notebook_login
import torch
import os
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
notebook_login()


Using device: cuda


### Load up dataset and tokenizer
 - Tokenize our dataset
 - Adjust our labels from 0-4 to 0-2 to fit our Negative, Neutral, Positive sentiments

In [None]:
dataset = load_dataset("yelp_review_full")  # Example dataset, replace with your dataset
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenized_datasets = dataset.map(lambda examples: hf.tokenize_function(tokenizer=tokenizer, text=examples, truncation=True), batched=True)
remapped_dataset = tokenized_datasets.map(hf.remap_labels)

## Create a reduced set (stratified) for use with hyperparamater training
 - for reduced computing requirements we are starting on a smaller set

In [None]:
train_size = 20000 # Adjust larger or smaller if you have more or less computing power and time to run the model.
test_size = 2000
strat_train = hf.stratified_dataset(remapped_dataset['train'], "label", train_size)
strat_test = hf.stratified_dataset(remapped_dataset['test'], "label", test_size)

reduced_dict = DatasetDict({
    "train": strat_train,
    "test": strat_test
})


## Define our Hyperparameter training

In [None]:
def objective(trial):
    epochs = trial.suggest_int("epochs", 1, 5) # lower and upper limits for how many epochs it will run
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True) #lower and upper limits for the learning rate, selected on a logarithmic scale
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-1, log=True) #lower and upper limits for the weight of decay selected on a logarithmic scale
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])  # Try these three batch sizes
    output_dir = f"../../Data/models/trial_{trial.number}"

    os.makedirs(output_dir, exist_ok=True)
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        logging_dir="../../data/models/optuna_logs",
        num_train_epochs = epochs, #1 These four are what we are testing hyperparmaters of
        per_device_train_batch_size=per_device_train_batch_size, #2
        learning_rate = learning_rate, #3
        weight_decay=weight_decay, #4
        per_device_eval_batch_size=64,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        seed=42,
        report_to="none",  
        push_to_hub=False, # going to push the final model not the paramter testing
        save_strategy="epoch",
        save_total_limit=2,
        run_name=f"lr_{trial.params.get('learning_rate', 'default')}_wd_{trial.params.get('weight_decay', 'default')}"
    )


    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=reduced_dict["train"],
        eval_dataset=reduced_dict["test"],
        compute_metrics=hf.compute_metrics,
    )

    trainer.train()
    trainer.save_model(output_dir)
    return trainer.evaluate()["eval_f1"]


### Perform our Hyperparamater testing

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)  # Adjust n_trials as needed

best_trial = study.best_trial

[I 2025-03-16 20:21:51,490] A new study created in memory with name: no-name-56e90d56-8760-4e90-9a6f-c57fd61fb89d
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8052,0.812311,0.622167,0.613762


[I 2025-03-16 20:33:54,085] Trial 0 finished with value: 0.613761785395665 and parameters: {'epochs': 1, 'learning_rate': 0.00016688043316444577, 'weight_decay': 0.002285598674742945, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 0.613761785395665.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5072,0.512399,0.776333,0.77596
2,0.3908,0.535264,0.793167,0.79303
3,0.2276,0.699742,0.787,0.787271


[I 2025-03-16 21:06:26,203] Trial 1 finished with value: 0.7930304696789476 and parameters: {'epochs': 3, 'learning_rate': 6.36203781967978e-05, 'weight_decay': 0.0020245029680676, 'per_device_train_batch_size': 16}. Best is trial 1 with value: 0.7930304696789476.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5468,0.500539,0.784167,0.783117
2,0.4226,0.525494,0.789667,0.790914
3,0.2725,0.675881,0.775833,0.772878
4,0.1413,0.865552,0.775667,0.776689
5,0.0703,1.17976,0.776167,0.776857


[I 2025-03-16 21:57:29,900] Trial 2 finished with value: 0.7909135740545861 and parameters: {'epochs': 5, 'learning_rate': 0.00010275273977469485, 'weight_decay': 2.4332073653249322e-05, 'per_device_train_batch_size': 32}. Best is trial 1 with value: 0.7930304696789476.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5326,0.502117,0.783333,0.782017
2,0.4427,0.490098,0.793667,0.792935
3,0.3604,0.51339,0.797167,0.795724
4,0.3073,0.536684,0.794,0.794324


[I 2025-03-16 22:38:25,797] Trial 3 finished with value: 0.7957243393907962 and parameters: {'epochs': 4, 'learning_rate': 1.5274336195202594e-05, 'weight_decay': 0.00016569751963498936, 'per_device_train_batch_size': 32}. Best is trial 3 with value: 0.7957243393907962.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7124,0.883934,0.631833,0.619623
2,0.719,0.718088,0.700167,0.697274
3,0.6263,0.742824,0.700667,0.696632
4,0.5484,0.689767,0.719667,0.723359
5,0.4733,0.701299,0.718333,0.718604


[I 2025-03-16 23:29:27,889] Trial 4 finished with value: 0.723358806781516 and parameters: {'epochs': 5, 'learning_rate': 0.00024995719147881726, 'weight_decay': 0.001175985528602067, 'per_device_train_batch_size': 32}. Best is trial 3 with value: 0.7957243393907962.


In [None]:
print(f"Best trial f1 Score: {best_trial.value}, params: {best_trial.params}")

Best trial: 0.7957243393907962, params: {'epochs': 4, 'learning_rate': 1.5274336195202594e-05, 'weight_decay': 0.00016569751963498936, 'per_device_train_batch_size': 32}


### Training our full model
 - Take the best model produced from hyperparameter training
 - Use the same hyperparamaters
 - Train on full dataset

In [None]:
best_model_path = f"../../data/models/trial_{best_trial.number}"
best_model = AutoModelForSequenceClassification.from_pretrained(best_model_path)
print(f"Best Model = Trial number: {best_trial.number}")

/content/drive/MyDrive/Colab Notebooks/Data/trial_3


### Set up our Final Model

In [None]:
fine_tuning_args = TrainingArguments(
    output_dir="../../data/models/fine_tuned_model",
    hub_model_id="FinchW/my-yelp-sentiment-model-finetuned",
    learning_rate=best_trial.params["learning_rate"] / 2,  # Reduce learning rate, want a very thorough training so have reduced the learning rate further
    num_train_epochs=best_trial.params["epochs"],  # Adjust epochs
    per_device_train_batch_size=best_trial.params["per_device_train_batch_size"],  # Adjust batch size
    per_device_eval_batch_size=64,
    weight_decay=best_trial.params["weight_decay"],
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="none", #or wandb
    push_to_hub=True,
    run_name="final_fine_tune"
)

fine_tuning_trainer = Trainer(
    model=best_model,
    args=fine_tuning_args,
    train_dataset=remapped_dataset["train"], # using the full dataset
    eval_dataset=remapped_dataset["test"],
    compute_metrics=hf.compute_metrics,
)





### Training

In [25]:
fine_tuning_trainer.train()

print("Fine-tuning complete!")

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3669,0.365137,0.8484,0.843585
2,0.3343,0.364499,0.84846,0.849493
3,0.3133,0.368921,0.84968,0.848121
4,0.2815,0.377285,0.85056,0.849173


No files have been modified since last commit. Skipping to prevent empty commit.


Fine-tuning complete!


### Always important to save.

In [None]:
save_path = '../../data/models/fine_tuned_model'  
# Create the directory if it doesn't exist
os.makedirs(save_path, exist_ok=True)

# Save the model
fine_tuning_trainer.save_model(save_path)

### Evaluating model

In [None]:
results = fine_tuning_trainer.evaluate()
print(results)

{'eval_loss': 0.36449944972991943, 'eval_accuracy': 0.84846, 'eval_f1': 0.8494926642060322, 'eval_runtime': 169.6017, 'eval_samples_per_second': 294.808, 'eval_steps_per_second': 4.611, 'epoch': 4.0}
