In [1]:
#!pip install peft --quiet

# Libraries

In [2]:
# Smoke test: `import peft` raises ImportError when the package is missing,
# so the message below is only printed if the import succeeded.
import peft
print("PEFT is installed successfully!")

PEFT is installed successfully!


In [3]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import evaluate

In [4]:
from datasets import load_dataset

# Load the dataset registered under the name "imdb". The structure printed in
# the next cell shows `train`, `test` and `unsupervised` splits, each with
# `text` and `label` columns.
dataset = load_dataset("imdb")

# Loading Dataset

In [5]:
# Check the dataset structure
# Overall split layout first, then the first record of each labelled split.
print(dataset)
for split_title, split_key in (("Train", "train"), ("Test", "test")):
    print(f"Sample {split_title} Data:", dataset[split_key][0])

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})
Sample Train Data: {'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between ask

# Tokenization

In [6]:
# Checkpoint name shared by the tokenizer created here and by the
# classification model loaded further down, so the two stay in sync.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize the `text` column of a batch of examples.

    Args:
        batch: Mapping with a 'text' entry holding the raw review strings
            (this is what `dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the batch. Texts are truncated, and padded
        with the "max_length" strategy so every example has the same length.
    """
    review_texts = batch['text']
    encoded = tokenizer(review_texts, truncation=True, padding="max_length")
    return encoded

# Tokenize every split in batches; `map` returns a new DatasetDict with the
# tokenizer outputs added as extra columns.
dataset = dataset.map(tokenize, batched=True)

# Set format for PyTorch: expose only the columns the model consumes, as tensors.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Optional: small subsets for demo (speeds up training).
train_dataset = dataset['train'].shuffle(seed=42).select(range(2000))

# Shuffle the test split once and carve validation and test out of DISJOINT
# index ranges. Selecting range(500) twice from the same seeded shuffle would
# make the two subsets identical, so the reported "test" score would simply
# repeat the validation score.
shuffled_test = dataset['test'].shuffle(seed=42)
val_dataset = shuffled_test.select(range(500))
test_dataset = shuffled_test.select(range(500, 1000))

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

# Load Base Model with Classification Head

In [7]:
# Binary sentiment task: one output class each for negative and positive.
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Apply LoRA (PEFT)

In [1]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for sequence classification. `q_lin` / `v_lin` are the
# module names targeted for DistilBERT.
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["q_lin", "v_lin"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base classifier with the adapters. This needs the `model` created
# in the base-model cell, so that cell must have been run in this kernel first.
model = get_peft_model(model, lora_config)

NameError: name 'model' is not defined

In [None]:
import sys
# Install `evaluate` using the interpreter that runs this kernel
# (`sys.executable`), so the package lands in the environment the notebook
# imports from.
# NOTE(review): `import evaluate` already appears in the top import cell; on a
# fresh environment this install has to run before that cell.
!{sys.executable} -m pip install evaluate --quiet

# Define Metrics

In [None]:
import evaluate

# Load metrics
# Both metric objects are created once here and then reused by
# `compute_metrics` below each time it is called.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

# Function to compute metrics during training
def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and F1 scores.

    Args:
        eval_pred: A `(logits, labels)` pair; the predicted class is the
            argmax of `logits` over the last axis.

    Returns:
        dict with the keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)

    accuracy_result = accuracy.compute(predictions=predicted_classes, references=labels)
    f1_result = f1.compute(predictions=predicted_classes, references=labels)

    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }

In [None]:
import transformers
# Print the installed version so it can be compared with the minimum noted inline.
print(transformers.__version__)  # Should be 4.30+ (ideally latest)

In [None]:
import warnings
# Suppress warnings of category UserWarning from this point on; other warning
# categories are left untouched by this filter.
warnings.filterwarnings("ignore", category=UserWarning)

# Training

In [None]:
# Hyperparameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Output and logging
    output_dir="./results",
    report_to="none",
    logging_steps=10,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# The trainer ties together the LoRA-wrapped model, the data subsets and the
# metric function defined earlier in the notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# Evaluation on Test Set

In [None]:
# Score the trained model on the held-out subset and report the two metrics
# produced by `compute_metrics` (the trainer prefixes their keys with "eval_").
metrics = trainer.evaluate(test_dataset)
for report_label, metric_key in (
    ("Test Accuracy:", "eval_accuracy"),
    ("Test F1-Score:", "eval_f1"),
):
    print(report_label, metrics[metric_key])

# Test Predictions

In [None]:
def predict_sentiment(text):
    """Classify a single piece of text as "Positive" or "Negative".

    Args:
        text: Raw review string to classify.

    Returns:
        "Positive" when the highest-scoring class index is 1, otherwise
        "Negative".
    """
    # The tokenizer produces CPU tensors, while training may have moved the
    # model to an accelerator; send the inputs to wherever the weights live
    # to avoid a device-mismatch error.
    device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Run in eval mode (dropout off) and without gradient tracking so the
    # prediction is deterministic and builds no autograd graph. The previous
    # train/eval mode is restored afterwards so further training is unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        if was_training:
            model.train()

    pred = outputs.logits.argmax(-1).item()
    return "Positive" if pred == 1 else "Negative"

# Examples
for sample_review in (
    "This movie was fantastic, I loved it!",
    "The movie was boring and too long.",
):
    print(predict_sentiment(sample_review))