In [None]:
# Load datasets (HuggingFace Datasets lib)
from datasets import load_dataset

# HuggingFace Transformers for tokenization, model, training
from transformers import (
    AutoTokenizer,                   # turns text into tokens
    TrainingArguments,               # config training params
    AutoModelForSequenceClassification, # pre-trained model for classification
    Trainer,                          # high-level trainer API
    EarlyStoppingCallback
)


In [None]:
# Hub authentication helper; no token is hard-coded in this cell.
from huggingface_hub import notebook_login

# Run the notebook login helper (presumably interactive — the credential is
# supplied at run time rather than stored in the notebook source).
notebook_login()

In [None]:
# Checkpoint to fine-tune (DistilBERT: small, fast BERT)
model_name = "distilbert-base-uncased"

# Tokenizer is loaded from the same checkpoint name so that vocabulary and
# preprocessing match the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sample-count knobs intended for quick experiments.
# NOTE(review): none of the visible cells reference these three values —
# confirm whether they are still meant to be applied to the splits.
train_subset, test_subset, val_subset = 1000, 500, 500


In [None]:
# Load IMDB dataset (movie reviews, pos/neg)
raw = load_dataset("imdb")

# Partition the dataset's 'train' split into 80% train / 20% hold-out.
# No sub-sampling happens here: every row of raw['train'] ends up in one of
# the three splits below. Fixed seeds keep the partition reproducible.
full_split = raw["train"].train_test_split(test_size=0.2, seed=42)
train_ds = full_split["train"].shuffle(seed=42)

# Halve the hold-out: its 'train' part is used for validation, its 'test'
# part is kept as the final test set.
holdout_split = full_split["test"].train_test_split(test_size=0.5, seed=42)
val_ds = holdout_split["train"]
test_ds = holdout_split["test"]


In [5]:
# Function to tokenize dataset samples
def tokenize(data):
    """Tokenize the ``'text'`` entry of ``data`` with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 256 tokens, and
    the encodings are requested as PyTorch tensors.

    Args:
        data: Mapping with a ``'text'`` entry (a single sample or a batch of
            samples, as passed in by ``Dataset.map``).

    Returns:
        Whatever ``tokenizer`` returns for that text with the options below.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
        "return_tensors": "pt",
    }
    return tokenizer(data['text'], **encode_options)

In [None]:
# Tokenize every split with the same settings (train, then test, then val):
# - batched=True: the tokenize function receives many samples per call (faster)
# - remove_columns: drop the raw 'text' column once it has been tokenized
train_tokenized, test_tokenized, val_tokenized = (
    split.map(tokenize, batched=True, remove_columns=["text"])
    for split in (train_ds, test_ds, val_ds)
)

In [None]:
# Build the sequence-classification model from the checkpoint named by
# model_name, configured for two output labels (pos/neg).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

In [None]:
import evaluate  # HuggingFace metrics library

# Load the accuracy metric once; calculate_metrics below calls
# accuracy.compute(...) on it for every evaluation pass.
accuracy = evaluate.load('accuracy')

# Function to calculate metrics during evaluation
def calculate_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` must support
            ``.argmax(axis=-1)``; the class with the highest score along the
            last axis is taken as the prediction.

    Returns:
        dict with a single ``"accuracy"`` entry, taken from the result of the
        module-level ``accuracy`` metric.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    scores = accuracy.compute(predictions=predicted_classes, references=labels)
    return {"accuracy": scores["accuracy"]}



In [None]:
# Training configuration
# - Evaluation and checkpointing both happen once per epoch; the two
#   strategies are kept identical because load_best_model_at_end=True needs
#   a checkpoint for every evaluation it compares.
# - The previous eval_steps=100 was removed: it only applies when
#   eval_strategy="steps" and contradicted the per-epoch schedule.
# - The best-model metric is spelled out (validation loss, lower is better)
#   because it also drives the EarlyStoppingCallback passed to the Trainer.
args = TrainingArguments(
    output_dir="model/distilbert-imdb",
    learning_rate=2e-5,
    num_train_epochs=5,                 # upper bound; early stopping may end sooner
    per_device_train_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=2,                 # keep at most two checkpoints on disk
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # same metric the default would pick
    greater_is_better=False,
)


# Stop training once the monitored metric has failed to improve for
# 2 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Trainer ties the model, data, metric function and callbacks together
# (handles training + evaluation).
trainer = Trainer(
    model=model,                        # the model to fine-tune
    args=args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,         # used for the per-epoch evaluation
    compute_metrics=calculate_metrics,  # evaluation metric function
    callbacks=[early_stopping],
    # NOTE(review): newer transformers releases reportedly rename this
    # argument to `processing_class` — confirm against the installed version.
    tokenizer=tokenizer,                # tokenizer (needed for saving model)
)


In [11]:
# Start fine-tuning the model on training data.
# Runs for at most num_train_epochs epochs; the early-stopping callback may end
# it sooner. Left as the last expression so the returned value is displayed.
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhamdiabdelkader[0m ([33mhamdiabdelkader-university-of-michigan[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2982,0.237374,0.9008
2,0.1868,0.281996,0.9088
3,0.1308,0.372448,0.9064


TrainOutput(global_step=3750, training_loss=0.1983719341913859, metrics={'train_runtime': 1408.2347, 'train_samples_per_second': 71.011, 'train_steps_per_second': 4.438, 'total_flos': 3974021959680000.0, 'train_loss': 0.1983719341913859, 'epoch': 3.0})

In [12]:
# Score the fine-tuned model on the held-out test split, then print every
# returned metric as "<name> <value>", one per line.
result = trainer.evaluate(test_tokenized)

for metric_name, metric_value in result.items():
    print(metric_name, metric_value)

eval_loss 0.24264472723007202
eval_accuracy 0.9028
eval_runtime 19.0285
eval_samples_per_second 131.382
eval_steps_per_second 16.449
epoch 3.0
