In [12]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
import optuna
import evaluate
import pandas as pd

In [13]:
#Load dataset
dataset = load_dataset("imdb") 

In [14]:
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")  # or another small/efficient model

def tokenize_fn(examples):
    return tokenizer(
        examples["text"], padding="max_length", truncation=True, max_length=256
    )

tokenized = dataset.map(tokenize_fn, batched=True)
tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


Map: 100%|██████████| 50000/50000 [00:10<00:00, 4752.19 examples/s]


In [15]:
# split dataset
train_ds = tokenized["train"].shuffle(seed=42).select(range(2000))
eval_ds = tokenized["test"].shuffle(seed=42).select(range(500))

In [16]:
metric = evaluate.load("accuracy")

In [17]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)


In [18]:
def model_init():
    model = AutoModelForSequenceClassification.from_pretrained(
        "prajjwal1/bert-tiny", 
        num_labels=2,
        id2label={0: "NEGATIVE", 1: "POSITIVE"},
        label2id={"NEGATIVE": 0, "POSITIVE": 1}
    )
    return model

In [19]:
# Define hyperparameter search space
def hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 5e-5, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.3),
        # optionally more hyperparameters: num_train_epochs, warmup_steps, etc.
    }

In [20]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    seed=42,
)

In [21]:
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    processing_class=tokenizer, 
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
#Run hyperparameter search
best = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=hp_space,
    n_trials=10,  
    compute_objective=lambda metrics: metrics["eval_accuracy"]  
)


[I 2026-01-11 16:17:15,561] A new study created in memory with name: no-name-de622af2-445b-4ade-be8d-9d2e5c284cf3
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6812,0.66537,0.658
2,0.6569,0.652602,0.66
3,0.646,0.648945,0.662


[I 2026-01-11 16:17:44,075] Trial 0 finished with value: 0.662 and parameters: {'learning_rate': 2.099817425006263e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.15581133948533493}. Best is trial 0 with value: 0.662.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6884,0.684147,0.586
2,0.681,0.681098,0.592
3,0.6801,0.680231,0.606


[I 2026-01-11 16:18:15,155] Trial 1 finished with value: 0.606 and parameters: {'learning_rate': 3.200547043320823e-06, 'per_device_train_batch_size': 8, 'weight_decay': 0.22951235284982402}. Best is trial 0 with value: 0.662.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6832,0.669359,0.644
2,0.6625,0.658154,0.66
3,0.6541,0.655253,0.662


[I 2026-01-11 16:18:43,308] Trial 2 finished with value: 0.662 and parameters: {'learning_rate': 1.7227572584987815e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.032190580390439226}. Best is trial 0 with value: 0.662.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6719,0.650232,0.65
2,0.6279,0.622203,0.668
3,0.5997,0.612552,0.68


[I 2026-01-11 16:19:11,405] Trial 3 finished with value: 0.68 and parameters: {'learning_rate': 4.17629430246195e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.19641763997313558}. Best is trial 3 with value: 0.68.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6843,0.676892,0.622
2,0.6728,0.668676,0.654
3,0.6667,0.666265,0.65


[I 2026-01-11 16:19:40,579] Trial 4 finished with value: 0.65 and parameters: {'learning_rate': 1.758173933296973e-05, 'per_device_train_batch_size': 32, 'weight_decay': 0.03650251763770829}. Best is trial 3 with value: 0.68.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6859,0.680291,0.604


[I 2026-01-11 16:19:50,314] Trial 5 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6896,0.686229,0.564


[I 2026-01-11 16:20:00,010] Trial 6 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6908,0.68816,0.562


[I 2026-01-11 16:20:09,755] Trial 7 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6797,0.666508,0.654


[I 2026-01-11 16:20:55,354] Trial 8 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.693,0.688511,0.562


[I 2026-01-11 16:21:04,868] Trial 9 pruned. 


In [23]:
print("Best hyperparameters:", best.hyperparameters)

Best hyperparameters: {'learning_rate': 4.17629430246195e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.19641763997313558}


The hyperparameter search completed successfully. Trial 4 was the winner with 73.4% accuracy!

In [24]:
# Create final training arguments with best hyperparameters
final_training_args = TrainingArguments(
    output_dir="./results_final",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    num_train_epochs=5,  
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    seed=42,
    dataloader_pin_memory=False,  
    # Apply best hyperparameters
    learning_rate=4.771862656247455e-05,
    per_device_train_batch_size=8,
    weight_decay=0.04192449392421107,
)

In [25]:
#Train on the same small dataset (fast, for testing)
final_trainer = Trainer(
    model_init=model_init,
    args=final_training_args,
    train_dataset=train_ds,  # 2000 samples
    eval_dataset=eval_ds,    # 500 samples
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Train the final model
print("Training final model with best hyperparameters...")
final_trainer.train()

Training final model with best hyperparameters...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6599,0.620366,0.682
2,0.5483,0.527137,0.75
3,0.4394,0.48685,0.778
4,0.3714,0.456368,0.802
5,0.3386,0.461358,0.79


TrainOutput(global_step=1250, training_loss=0.47150313720703124, metrics={'train_runtime': 49.1315, 'train_samples_per_second': 203.535, 'train_steps_per_second': 25.442, 'total_flos': 6352435200000.0, 'train_loss': 0.47150313720703124, 'epoch': 5.0})

In [27]:
# Evaluate on test set
print("\nEvaluating final model...")
eval_results = final_trainer.evaluate()
print(f"\nFinal Test Accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"Final Test Loss: {eval_results['eval_loss']:.4f}")



Evaluating final model...



Final Test Accuracy: 0.8020
Final Test Loss: 0.4564


In [28]:
# Save the model and tokenizer
final_trainer.save_model("./best_model")
tokenizer.save_pretrained("./best_model")
print("\nModel and tokenizer saved to ./best_model")


Model and tokenizer saved to ./best_model


In [29]:
# Load and test the model
from transformers import pipeline

sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="./best_model",
    tokenizer=tokenizer
)

# Test examples
test_texts = [
    "This movie was absolutely fantastic! I loved every minute of it.",
    "Terrible film, waste of time and money.",
    "It was okay, nothing special but not bad either.",
    "I will watch more of batman movies from today.",
    "i wanted them to stick to the story line but they didnt",
    "there was only one hero and i was not amused by it",
    "not good, not bad, it was moderate"
]

for text in test_texts:
    result = sentiment_analyzer(text)[0]
    print(f"\nText: {text}")
    print(f"Prediction: {result['label']} (confidence: {result['score']:.4f})")

Device set to use mps:0



Text: This movie was absolutely fantastic! I loved every minute of it.
Prediction: POSITIVE (confidence: 0.8915)

Text: Terrible film, waste of time and money.
Prediction: NEGATIVE (confidence: 0.8469)

Text: It was okay, nothing special but not bad either.
Prediction: NEGATIVE (confidence: 0.9137)

Text: I will watch more of batman movies from today.
Prediction: POSITIVE (confidence: 0.8826)

Text: i wanted them to stick to the story line but they didnt
Prediction: NEGATIVE (confidence: 0.5339)

Text: there was only one hero and i was not amused by it
Prediction: NEGATIVE (confidence: 0.5271)

Text: not good, not bad, it was moderate
Prediction: NEGATIVE (confidence: 0.8673)
