### Training bert-distilled on our dataset to see how we can do

In [1]:
from transformers import pipeline, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset, DatasetDict
import evaluate
import homemade_functions as hf
import pandas as pd
import numpy as np
import pprint as pp
import torch

from huggingface_hub import notebook_login

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\finch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\finch\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Using device: cpu


In [27]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

- load our dataset (unprocessed)

In [2]:
ds = load_dataset('yelp_review_full')

### Start building our model
 - start with the tokenizer

In [3]:
model_name = "distilbert-base-uncased"
model_path = f"distilbert/{model_name}"
tokenizer = AutoTokenizer.from_pretrained(model_path)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
base_name = "../data/models/yelp_sentiment_analysis"

In [4]:
def preprocess_function(examples, tokenizer):
    return tokenizer(examples['text'], truncation=True)

In [5]:
tokenized_datasets = ds.map(lambda examples: hf.preprocess_function(examples, tokenizer), batched=True)
remapped_dataset = tokenized_datasets.map(hf.remap_labels)

id2label = {
    0: "NEGATIVE",
    1: "NEUTRAL",
    2: "POSITIVE",
}
label2id = {
    "NEGATIVE": 0,
    "NEUTRAL": 1,
    "POSITIVE": 2
}
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3, id2label=id2label, label2id=label2id)

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
print(max(remapped_dataset['test']['label']))

2


 - Our labels are currently 0-4, letss adjust that to 0-2 for the 3 options (Positive, Neutral, Negative)

 - confirming our new max and min

In [7]:
learning_rate = 1.0028891024065868e-05
num_train_epochs = 3
per_device_train_batch_size = 16

In [8]:
train_size = 1000
test_size = 200

strat_train = hf.stratified_dataset(remapped_dataset['train'], "label", train_size)
strat_test = hf.stratified_dataset(remapped_dataset['test'], "label", test_size)

reduced_dict = DatasetDict({
    "train": strat_train,
    "test": strat_test
})


Hyperparameter Training

In [None]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=3)
run_name = hf.get_unique_filename(base_name)
repo_name = f"FinchW/hyper/{model_name}-{run_name}"
training_args = TrainingArguments(
    output_dir=f"../data/models/hyper/{run_name}",
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=False,
    run_name = run_name
)
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=reduced_dict["train"],
    eval_dataset=reduced_dict["test"],
    tokenizer=tokenizer,
    compute_metrics=hf.compute_metrics
)

  trainer = Trainer(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
best_run = trainer.hyperparameter_search(n_trials=5, direction="maximize")

[I 2025-03-14 12:43:08,934] A new study created in memory with name: no-name-48bb22c5-6f16-49be-888c-6da0dfac9766
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.64449,0.703333,0.703223
2,0.723700,0.670088,0.718333,0.710558
3,0.395700,0.939657,0.716667,0.716185
4,0.209800,1.41341,0.701667,0.709336
5,0.209800,1.398766,0.721667,0.725994


[I 2025-03-14 14:23:22,041] Trial 0 finished with value: 1.447661069464068 and parameters: {'learning_rate': 4.481479932933044e-05, 'num_train_epochs': 5, 'seed': 30, 'per_device_train_batch_size': 8}. Best is trial 0 with value: 1.447661069464068.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.77094,0.683333,0.662324


[I 2025-03-14 14:48:17,484] Trial 1 finished with value: 1.3456578257084142 and parameters: {'learning_rate': 2.78058570035639e-05, 'num_train_epochs': 1, 'seed': 29, 'per_device_train_batch_size': 64}. Best is trial 0 with value: 1.447661069464068.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.59309,0.74,0.736437
2,No log,0.58857,0.755,0.751619
3,No log,0.673134,0.721667,0.723802
4,No log,0.71176,0.745,0.746316


[I 2025-03-14 16:22:46,704] Trial 2 finished with value: 1.4913156778125194 and parameters: {'learning_rate': 4.701480004762605e-05, 'num_train_epochs': 4, 'seed': 30, 'per_device_train_batch_size': 32}. Best is trial 2 with value: 1.4913156778125194.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.097785,0.341667,0.315612


[I 2025-03-14 16:47:02,065] Trial 3 finished with value: 0.6572782409287381 and parameters: {'learning_rate': 1.0117200203699473e-06, 'num_train_epochs': 1, 'seed': 14, 'per_device_train_batch_size': 64}. Best is trial 2 with value: 1.4913156778125194.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.739599,0.69,0.669195


[I 2025-03-14 17:11:09,562] Trial 4 finished with value: 1.3591947712395487 and parameters: {'learning_rate': 2.718822791744087e-05, 'num_train_epochs': 1, 'seed': 34, 'per_device_train_batch_size': 64}. Best is trial 2 with value: 1.4913156778125194.


In [15]:
best_run.hyperparameters

{'learning_rate': 4.701480004762605e-05,
 'num_train_epochs': 4,
 'seed': 30,
 'per_device_train_batch_size': 32}

In [29]:
train_size = 10000
test_size = 2000

strat_train = hf.stratified_dataset(remapped_dataset['train'], "label", train_size)
strat_test = hf.stratified_dataset(remapped_dataset['test'], "label", test_size)

reduced_dict = DatasetDict({
    "train": strat_train,
    "test": strat_test
})


In [30]:
run_name = hf.get_unique_filename(base_name)
repo_name = f"FinchW/{model_name}-{run_name}"
training_args = TrainingArguments(
    output_dir=f"../data/models/{run_name}",
    learning_rate=best_run.hyperparameters['learning_rate'],
    per_device_train_batch_size=best_run.hyperparameters['per_device_train_batch_size'],
    per_device_eval_batch_size=64,
    num_train_epochs=best_run.hyperparameters['num_train_epochs'],
    eval_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=reduced_dict["train"],
    eval_dataset=reduced_dict["test"],
    tokenizer=tokenizer,
    compute_metrics=hf.compute_metrics
)

  trainer = Trainer(


In [31]:
trainer.train()
trainer.save_model(f"../data/models/{run_name}")

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6475,0.527855,0.768833,0.773058
2,0.4216,0.530732,0.78,0.782744
3,0.2587,0.650384,0.777333,0.777078
4,0.1489,0.813919,0.774,0.775322


No files have been modified since last commit. Skipping to prevent empty commit.


In [None]:
import evaluate
from pprint import pprint
results = trainer.evaluate()
pprint(results)

{'epoch': 4.0,
 'eval_accuracy': 0.78,
 'eval_f1': 0.7827441018383253,
 'eval_loss': 0.5307316780090332,
 'eval_runtime': 825.9801,
 'eval_samples_per_second': 7.264,
 'eval_steps_per_second': 0.114}
