In [None]:
!pip install -q transformers datasets peft evaluate accelerate


In [None]:
import torch
import numpy as np

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

from peft import LoraConfig, get_peft_model, TaskType
import evaluate

In [None]:
# Load the dataset.
# No interactive prompt is involved; per the original note, this yields all
# 1000 rows in a single dataset object. split="train" asks for that one split
# only, so the result is not keyed by split name.
dataset_id = "shawhin/imdb-truncated"
dataset = load_dataset(dataset_id, split="train")

print(dataset)

In [None]:
# Create the train / test split.
# `dataset` was loaded with split="train", so it is already a single Dataset.
# Indexing it with ["train"] would look up a *column* named "train" and fail,
# so train_test_split is called on the dataset directly.
# The result is keyed by "train" and "test"; the fixed seed keeps the split
# reproducible across runs.
dataset = dataset.train_test_split(
    test_size=0.2,
    seed=42
)

train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [None]:
# Load the tokenizer and the base model.
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# The sequence-classification head is not part of the pretrained checkpoint and
# is initialised at random. Seed torch first so the baseline evaluation and the
# fine-tuning run start from the same weights on every Restart & Run All.
torch.manual_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2  # two output classes
)

In [None]:
# Tokenize AFTER splitting
def tokenize_function(examples):
    """Tokenize the "text" field of `examples`.

    Truncation is enabled and padding is disabled. Uses the module-level
    `tokenizer` and returns its output unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding=False)
    return encoded

# Tokenize each split on its own, in batched mode, dropping the raw "text"
# column from the result.
map_options = {"batched": True, "remove_columns": ["text"]}

tokenized_train = train_dataset.map(tokenize_function, **map_options)
tokenized_test = test_dataset.map(tokenize_function, **map_options)

In [None]:
# Tokenization function
# NOTE(review): this repeats the definition from the earlier tokenization cell;
# this later definition is the one bound to the name from here on.
def tokenize_function(examples):
    """Run the module-level tokenizer over examples["text"], truncating and not padding."""
    return tokenizer(examples["text"], truncation=True, padding=False)

In [None]:
# Apply tokenization to `dataset` in batched mode and drop the raw "text" column.
# The "train" and "test" entries of the result feed the Trainer cells below.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

In [None]:
# Data collator built around the tokenizer. Tokenization above used
# padding=False, so padding is left to this collator.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Evaluation metric
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for a (logits, labels) pair.

    The predicted class for each row is the argmax of the logits along axis 1;
    the result is whatever `accuracy_metric.compute` returns.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    return accuracy_metric.compute(predictions=predicted_classes, references=labels)

In [None]:
# Evaluate the base model (before training).
# report_to=[] disables wandb and all other loggers, so no prompt appears.
base_eval_args = TrainingArguments(
    output_dir="./base_eval",
    per_device_eval_batch_size=4,
    report_to=[],
    do_train=False,
    do_eval=True,
)

trainer_base = Trainer(
    model=model,
    args=base_eval_args,
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run the evaluation and show the metrics.
base_results = trainer_base.evaluate()
print("Base Model Evaluation:", base_results)

In [None]:
# Generate example predictions (before LoRA).
text_list = [
    "I absolutely loved this movie, it was amazing!",
    "This was the worst film I have ever seen."
]

inputs = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True)
# The Trainer used for the baseline evaluation may have moved the model to an
# accelerator. Put the inputs on the model's device so the forward pass does
# not fail with a device mismatch; on a CPU-only machine this is a no-op.
inputs = inputs.to(model.device)

# Make sure dropout is disabled, regardless of which cells ran before this one.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1)

print("Base model predictions:", predictions.tolist())

In [None]:
# LoRA (PEFT) configuration for a sequence-classification task.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    # presumably DistilBERT's query / value projection layers — confirm
    # against the model's module names
    target_modules=["q_lin", "v_lin"],
    task_type=TaskType.SEQ_CLS,
)

In [None]:
# Training arguments.
# `evaluation_strategy` is not accepted on this setup, so it is left out and
# do_eval=True is passed instead.
training_args = TrainingArguments(
    output_dir="./lora_distilbert_sentiment",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=2e-4,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=500,
    do_eval=True,
    report_to=[],  # disables W&B / all other loggers
)

In [None]:
# Train with LoRA.
# Wrap the base model with the adapters described by `lora_config`. Without
# this step the Trainer receives the plain base model, the LoRA configuration
# is never used, and the run is an ordinary full fine-tune.
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()  # sanity check: only a small fraction should be trainable

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model. Per the original note, this runs on CPU and takes a long time.
trainer.train()

# Evaluate the fine-tuned model
lora_results = trainer.evaluate()
print("LoRA Fine-tuned Model Evaluation:", lora_results)