In [1]:
# Google Colab setup
!pip install transformers datasets peft accelerate bitsandbytes scikit-learn -q

# Check GPU availability
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

# Authenticate with Hugging Face to use Gemma
from huggingface_hub import login
login()  # You'll need a HF token with access to Gemma

CUDA available: True
GPU: NVIDIA GeForce GTX 1660 SUPER


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
# Import Dataset
import pandas as pd
import json
from datasets import load_dataset
from sklearn.model_selection import KFold

ds = load_dataset("dair-ai/emotion", split="test")
# ds = load_dataset("dair-ai/emotion", "unsplit", split="train")

# Keep a deterministic shuffle for reproducibility
df = ds.to_pandas().sample(frac=1.0, random_state=42).reset_index(drop=True)

df.head()

Unnamed: 0,text,label
0,i feel so dirty but after spending a day at th...,0
1,i could feel his breath on me and smell the sw...,1
2,i just want to feel loved by you,2
3,i have felt the need to write out my sometimes...,3
4,at a party i met a girl who drew me to her,3


In [2]:
df.shape

(2000, 2)

In [3]:
# format data
emotion_map = {
    0: "sadness",
    1: "joy",
    2: "love",
    3: "anger",
    4: "fear",
    5: "surprise"
}

df["label_name"] = df["label"].map(emotion_map)

df

Unnamed: 0,text,label,label_name
0,i feel so dirty but after spending a day at th...,0,sadness
1,i could feel his breath on me and smell the sw...,1,joy
2,i just want to feel loved by you,2,love
3,i have felt the need to write out my sometimes...,3,anger
4,at a party i met a girl who drew me to her,3,anger
...,...,...,...
1995,i help my daughter when she is feeling angry,3,anger
1996,i continue to write this i feel more and more ...,4,fear
1997,i felt so bad for the bad grade and feeling li...,0,sadness
1998,i never feel like anythings getting resolved w...,1,joy


In [4]:
# Prompt helpers
INSTRUCTION = "Classify the text using one of the six emotions."
EMOTIONS_TEXT = "Emotions: sadness, joy, love, anger, fear, surprise"


def build_conversation(text: str):
    return [
        {"role": "system", "content": f"{INSTRUCTION}\n{EMOTIONS_TEXT}"},
        {"role": "user", "content": f"Text: {text}"},
        {"role": "assistant", "content": "Emotion: "}
    ]


def build_target(label_name: str):
    return f"Emotion: {label_name}"


In [5]:
# Load Model
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

model_id="meta-llama/Llama-3.2-1B-Instruct"
device="cuda"

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token   

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.float32 if device == "cpu" else torch.bfloat16,
    device_map="auto"
)

model.config.use_cache = False

print(f"dtype: {model.dtype}")



dtype: torch.bfloat16


In [6]:
# Quick test to see if model loaded correctly
pipe = pipeline(
    "text-generation",
    model=model_id,
    device=device,
    dtype=torch.bfloat16
)

sample_text = "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake"
chat_prompt = tokenizer.apply_chat_template(
    build_conversation(sample_text),
    tokenize=False,
    add_generation_prompt=True
)

pipe(chat_prompt, max_new_tokens=25)


Device set to use cuda
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 08 Jan 2026\n\nClassify the text using one of the six emotions.\nEmotions: sadness, joy, love, anger, fear, surprise<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nText: i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEmotion:<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nBased on the text, I would classify the emotion as: Hope'}]

In [6]:
def generate_training_sample(conversation, target_response, max_length=512):
    """
    Generate a single training sample with proper label masking.

    For causal LM fine-tuning, we want:
    - input_ids: the full sequence (prompt + response)
    - labels: same as input_ids, but with -100 for tokens we don't want to compute loss on (the prompt)

    Args:
        conversation: list of message dicts with role/content (the prompt)
        target_response: the expected model output
        max_length: maximum sequence length

    Returns:
        dict with input_ids, attention_mask, labels
    """
    prompt_text = tokenizer.apply_chat_template(
        conversation,
        continue_final_message=True,
        tokenize=False,
        add_generation_prompt=False
    )

    full_text = prompt_text + target_response + tokenizer.eos_token

    full_encoding = tokenizer(
        full_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
        add_special_tokens=False  # chat_template already added special tokens
    )

    # Tokenize just the prompt to find where labels should start
    prompt_encoding = tokenizer(
        prompt_text,
        return_tensors="pt",
        add_special_tokens=False
    )
    prompt_length = prompt_encoding["input_ids"].shape[1]

    # Create labels: -100 for prompt tokens (ignore in loss), actual tokens for response
    labels = full_encoding["input_ids"].clone()
    labels[0, :prompt_length] = -100  # Mask the prompt
    labels[labels == tokenizer.pad_token_id] = -100  # Also mask padding
    labels[:, -1] = tokenizer.eos_token_id

    return {
        "input_ids": full_encoding["input_ids"],
        "attention_mask": full_encoding["attention_mask"],
        "labels": labels
    }

In [8]:
sample_text = "i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake"
test_conversation = build_conversation(sample_text)
test_expected_response = build_target("sadness")
test_sample = generate_training_sample(test_conversation, test_expected_response, max_length=128)
tokenizer.batch_decode(test_sample["input_ids"])

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 08 Jan 2026\n\nClassify the text using one of the six emotions.\nEmotions: sadness, joy, love, anger, fear, surprise<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nText: i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEmotion:Emotion: sadness<|eot_id|>']

In [9]:
out = model(input_ids=test_sample["input_ids"])

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
out.logits.shape

torch.Size([1, 91, 128256])

In [None]:
test_sample["labels"]

tensor([[  -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   2321,   6082,     25,  51978,
         128009]])

In [None]:
out.logits.shape
out.logits.view(-1, out.logits.shape[-1])

tensor([[ 2.8438,  3.5781,  7.0312,  ..., -1.2422, -1.2422, -1.2422],
        [-3.1719, -2.2188, -1.2500,  ...,  2.7656,  2.7656,  2.7656],
        [ 3.2500,  5.6875,  3.8750,  ..., -0.8047, -0.8047, -0.8047],
        ...,
        [ 3.4531,  5.0000,  4.2812,  ..., -0.9922, -0.9922, -0.9922],
        [ 9.0000,  9.6875,  4.7812,  ...,  1.2891,  1.2812,  1.2812],
        [-1.7422, -3.2656, -4.0625,  ...,  3.6250,  3.6094,  3.6094]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<ViewBackward0>)

In [7]:
from torch.utils.data import Dataset, DataLoader

MAX_LENGTH = 256


class EmotionChatDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, tokenizer, max_length: int = MAX_LENGTH):
        self.df = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        conversation = build_conversation(row["text"])
        target_response = build_target(row["label_name"])
        sample = generate_training_sample(conversation, target_response, max_length=self.max_length)
        # Remove batch dimension for DataLoader stacking
        return {k: v.squeeze(0) for k, v in sample.items()}


def make_dataloader(split_df: pd.DataFrame, batch_size: int, shuffle: bool = True):
    dataset = EmotionChatDataset(split_df, tokenizer, max_length=MAX_LENGTH)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=False)




In [8]:
# create lora config and init peft model
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
)

# Use init_peft_model() inside each fold
def init_peft_model():
    base_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        dtype=torch.float32 if device == "cpu" else torch.bfloat16,
        device_map="auto"
    )
    base_model.config.use_cache = False
    peft = get_peft_model(base_model, lora_config)
    peft.print_trainable_parameters()
    return peft

In [10]:
# Training configuration
from torch.optim import AdamW
from tqdm import tqdm

BATCH_SIZE = 1
LEARNING_RATE = 1e-5
NUM_EPOCHS = 1
GRADIENT_ACCUMULATION_STEPS = 8  # Effective batch size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
KFOLDS = 5


def evaluate_loss(model, dataloader):
    model.eval()
    total_loss = 0.0
    total_steps = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            total_loss += outputs.loss.item()
            total_steps += 1
    model.train()
    return total_loss / max(1, total_steps)


def train_one_fold(fold_idx, train_df, val_df):
    print(f"\nFold {fold_idx + 1}/{KFOLDS}")
    peft_model = init_peft_model()

    train_loader = make_dataloader(train_df, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = make_dataloader(val_df, batch_size=BATCH_SIZE, shuffle=False)

    optimizer = AdamW(
        filter(lambda p: p.requires_grad, peft_model.parameters()),
        lr=LEARNING_RATE,
        weight_decay=0.01
    )

    for epoch in range(NUM_EPOCHS):
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Fold {fold_idx + 1} Epoch {epoch + 1}/{NUM_EPOCHS}")

        optimizer.zero_grad()
        for step, batch in enumerate(progress_bar):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = peft_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS
            total_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS
            loss.backward()

            if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                optimizer.step()
                optimizer.zero_grad()

            progress_bar.set_postfix({"loss": f"{loss.item() * GRADIENT_ACCUMULATION_STEPS:.4f}"})

        train_loss = total_loss / max(1, len(train_loader))
        val_loss = evaluate_loss(peft_model, val_loader)
        print(f"Fold {fold_idx + 1} Epoch {epoch + 1} - Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    save_path = f"lora-emotion-fold{fold_idx + 1}"
    peft_model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    return {"train_loss": train_loss, "val_loss": val_loss}



In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()

# 5-fold cross validation
kf = KFold(n_splits=KFOLDS, shuffle=True, random_state=42)
fold_results = []

for fold_idx, (train_idx, val_idx) in enumerate(kf.split(df)):
    train_df = df.iloc[train_idx].reset_index(drop=True)
    val_df = df.iloc[val_idx].reset_index(drop=True)
    result = train_one_fold(fold_idx, train_df, val_df)
    fold_results.append(result)

avg_train_loss = sum(r["train_loss"] for r in fold_results) / len(fold_results)
avg_val_loss = sum(r["val_loss"] for r in fold_results) / len(fold_results)


Fold 1/5
trainable params: 3,407,872 || all params: 1,239,222,272 || trainable%: 0.2750


Fold 1 Epoch 1/1: 100%|██████████| 1600/1600 [10:25<00:00,  2.56it/s, loss=0.4963]


Fold 1 Epoch 1 - Train Loss: 1.9700 | Val Loss: 0.2577

Fold 2/5
trainable params: 3,407,872 || all params: 1,239,222,272 || trainable%: 0.2750


Fold 2 Epoch 1/1: 100%|██████████| 1600/1600 [58:45<00:00,  2.20s/it, loss=0.5980] 


Fold 2 Epoch 1 - Train Loss: 1.9634 | Val Loss: 0.2706

Fold 3/5
trainable params: 3,407,872 || all params: 1,239,222,272 || trainable%: 0.2750


Fold 3 Epoch 1/1: 100%|██████████| 1600/1600 [59:00<00:00,  2.21s/it, loss=0.3515] 


Fold 3 Epoch 1 - Train Loss: 1.9695 | Val Loss: 0.2749

Fold 4/5
trainable params: 3,407,872 || all params: 1,239,222,272 || trainable%: 0.2750


Fold 4 Epoch 1/1: 100%|██████████| 1600/1600 [59:14<00:00,  2.22s/it, loss=0.2639] 


Fold 4 Epoch 1 - Train Loss: 1.9608 | Val Loss: 0.2716

Fold 5/5
trainable params: 3,407,872 || all params: 1,239,222,272 || trainable%: 0.2750


Fold 5 Epoch 1/1:  30%|███       | 486/1600 [17:52<40:21,  2.17s/it, loss=3.0419]  

In [None]:
print(f"Average Train Loss: {avg_train_loss:.4f}")
print(f"Average Val Loss: {avg_val_loss:.4f}")
print(f"Per-fold results: {fold_results}")