In [36]:

!pip install -U transformers=="4.49.0" datasets

!pip install numpy
!pip install -U accelerate
!pip install scikit-learn
!pip install peft
!pip install bitsandbytes


In [None]:
import numpy as np
import torch
import torch.nn.functional as F

from sklearn.metrics import accuracy_score, f1_score

from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    pipeline,
)

from transformers import BitsAndBytesConfig

from peft import get_peft_model, LoraConfig, TaskType


Parent Model

In [4]:
import torch
from transformers import pipeline

# Smoke test of the parent checkpoint on two clearly negative headlines.
# Use the first GPU when one is available and fall back to CPU (-1) otherwise,
# the same device selection evaluate_model_pipeline uses.
# NOTE: the load log recorded for this cell reports `score.weight` as newly
# initialized, i.e. the classification head is untrained, so the labels printed
# here should not be read as meaningful sentiment predictions.
classifier = pipeline(
    "sentiment-analysis",
    model="Orkhan/llama-2-7b-absa",
    device=0 if torch.cuda.is_available() else -1,
)
print(classifier("Company misses earnings — shares drop sharply."))
print(classifier("Quarterly profits are bad."))


config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at Orkhan/llama-2-7b-absa and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

Device set to use cuda:0


[{'label': 'LABEL_0', 'score': 0.9849588871002197}]
[{'label': 'LABEL_0', 'score': 0.9930717349052429}]


In [37]:


def evaluate_model_pipeline(
    model_name,
    dataset,
    task,
    batch_size=32,
    raw_to_sentiment=None,
    inverse_label_map=None,
    fallback_sentiment="neutral",
):
    """Score a text-classification checkpoint on a labelled dataset.

    Each example's ``"text"`` is classified with a ``sentiment-analysis``
    pipeline, the predicted pipeline label is translated to a sentiment name,
    and the result is compared with the example's ``"label"`` translated
    through ``inverse_label_map``.

    Args:
        model_name: Hub id or local path handed to ``pipeline(model=...)``.
        dataset: Iterable of mappings with ``"text"`` and ``"label"`` keys.
        task: Free-form task name; only used in the printed summary.
        batch_size: Forwarded to the pipeline constructor.
        raw_to_sentiment: Mapping from pipeline label (e.g. ``"LABEL_0"``) to
            sentiment name. Defaults to ``LABEL_1 -> negative``,
            ``LABEL_0 -> positive``.
        inverse_label_map: Mapping from dataset label id to sentiment name.
            Defaults to ``0 -> negative, 1 -> neutral, 2 -> positive``.
        fallback_sentiment: Sentiment used when the pipeline emits a label
            that is missing from ``raw_to_sentiment``.

    Returns:
        ``(accuracy, weighted_f1)`` as computed by scikit-learn.

    Raises:
        KeyError: If an example's label id is missing from ``inverse_label_map``.
    """
    # NOTE(review): the defaults map pipeline LABEL_0 to "positive" but dataset
    # id 0 to "negative"; confirm both orientations against the model's
    # id2label and the dataset's label schema, or pass explicit maps.
    if raw_to_sentiment is None:
        raw_to_sentiment = {
            "LABEL_1": "negative",
            "LABEL_0": "positive",
        }
    if inverse_label_map is None:
        inverse_label_map = {0: "negative", 1: "neutral", 2: "positive"}

    classifier = pipeline(
        "sentiment-analysis",
        model=model_name,
        device=0 if torch.cuda.is_available() else -1,
        batch_size=batch_size,
        return_all_scores=False,
    )

    predictions, true_labels = [], []

    # The pipeline is invoked with one string per iteration, so every forward
    # pass sees a single example regardless of the configured batch_size.
    for example in dataset:
        result = classifier(example["text"])[0]
        predictions.append(raw_to_sentiment.get(result["label"], fallback_sentiment))
        true_labels.append(inverse_label_map[example["label"]])

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average="weighted")

    print(f"Model: {model_name}, Task: {task}, Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}")
    return accuracy, f1

# Baseline score of the parent model. `dataset` and `chosen_task` must already
# be defined in the kernel before this cell runs.
# Recorded result -> Model: Orkhan/llama-2-7b-absa, Task: sentiment_analysis, Accuracy: 0.5507, F1-Score: 0.5061
initial_accuracy, initial_f1 = evaluate_model_pipeline(
    model_name="Orkhan/llama-2-7b-absa",
    dataset=dataset,
    task=chosen_task,
)

Child Model

In [38]:
# Baseline score of the child model before distillation. This overwrites the
# `initial_accuracy` / `initial_f1` values produced for the parent model.
# NOTE(review): this evaluates "bigscience/bloomz-1b1" while the Config cell
# names "bigscience/bloom-1b1" as the student; confirm which one is intended.
# Recorded result -> Model: bigscience/bloomz-1b1, Task: sentiment_analysis, Accuracy: 0.3304, F1-Score: 0.3406
initial_accuracy, initial_f1 = evaluate_model_pipeline(
    model_name="bigscience/bloomz-1b1",
    dataset=dataset,
    task=chosen_task,
)

Config

In [None]:

# Central configuration for the distillation run.
TEACHER = "Orkhan/llama-2-7b-absa"   # parent checkpoint that provides soft targets
# NOTE(review): the child baseline cell evaluates "bigscience/bloomz-1b1";
# confirm whether the student should be bloom-1b1 or bloomz-1b1.
STUDENT = "bigscience/bloom-1b1"     # child checkpoint that gets fine-tuned
DATASET = "zeroshot/twitter-financial-news-sentiment"
OUTPUT_DIR = "./bloom1b_distilled"   # where the trained student is written
BATCH_SIZE = 4
EPOCHS = 3
LR = 5e-5
# Token budget per example used when tokenizing. The tokenization cell reads
# MAX_LEN, which was previously never defined; 128 is a starting point for
# short, tweet-length inputs and can be tuned.
MAX_LEN = 128
DEVICE = 0 if torch.cuda.is_available() else -1

# Short aliases for cells that refer to the abbreviated names.
OUTPUT = OUTPUT_DIR
BATCH = BATCH_SIZE

Data Augmentation

In [46]:
from datasets import Dataset

# Create the tokenizer before anything is tokenized (it used to be created
# only after its first use). The checkpoint's EOS token doubles as padding.
# NOTE(review): this tokenizer comes from TEACHER, yet the resulting token ids
# are also fed to the student; confirm the two models share a vocabulary,
# otherwise the student needs its own tokenization of the same texts.
tokenizer = AutoTokenizer.from_pretrained(TEACHER)
tokenizer.pad_token = tokenizer.eos_token

# Shuffle with a fixed seed and keep only the examples labelled 0 or 1, which
# matches the two-class student head.
raw_train = (
    load_dataset(DATASET, split="train")
    .shuffle(seed=42)
    .filter(lambda ex: ex["label"] in (0, 1))
)

texts = raw_train["text"]
teacher_labels = raw_train["label"]  # dataset labels, reused for the paraphrases

augmented_texts, augmented_labels = [], []

# `explain_pipe` must be a generation pipeline created before this cell runs.
# NOTE(review): depending on the pipeline type, "generated_text" may include
# the prompt itself; verify the paraphrase does not carry the instruction.
for text, label in zip(texts, teacher_labels):
    paraphrase = explain_pipe(
        f"Paraphrase this sentence without changing sentiment: {text}",
        max_new_tokens=64,
        do_sample=False
    )[0]["generated_text"]

    # Every original sentence is followed by its paraphrase with the same label.
    augmented_texts.extend([text, paraphrase])
    augmented_labels.extend([label, label])

# Keep the raw text/label columns so tokenization happens exactly once, below.
augmented = Dataset.from_dict({
    "text": augmented_texts,
    "label": augmented_labels,
})


def prep(batch):
    """Tokenize a batch of texts and store its labels under the `labels` key."""
    # MAX_LEN (token budget per example) must be defined before this cell runs.
    toks = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=MAX_LEN)
    toks["labels"] = batch["label"]
    return toks


# The label filter was already applied before augmentation, so no second
# filter is needed once the dataset is restricted to the tensor columns.
ds = augmented.map(prep, batched=True)
ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])



In [39]:

from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training

# The teacher only provides soft targets and is kept on the CPU.
teacher = AutoModelForSequenceClassification.from_pretrained(TEACHER).to("cpu")
teacher.eval()
# Sequence-classification heads need a pad token id to locate the last real
# token when a batch holds more than one example; reuse the id of the
# tokenizer created in the data-preparation cell when the config has none.
if teacher.config.pad_token_id is None:
    teacher.config.pad_token_id = tokenizer.pad_token_id

# 4-bit NF4 quantization with double quantization and bf16 compute.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    llm_int8_enable_fp32_cpu_offload=True,
)

student = AutoModelForSequenceClassification.from_pretrained(
    STUDENT,
    num_labels=2,
    quantization_config=quant_config,
    device_map="auto",
)

# Prepare the quantized model for training before attaching LoRA adapters:
# this upcasts the non-quantized parameters and turns on gradient
# checkpointing, replacing the bare gradient_checkpointing_enable() call.
student = prepare_model_for_kbit_training(student)

student = get_peft_model(
    student,
    LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=16)
)



In [None]:
@torch.no_grad()
def get_teacher_logits(inputs):
    """Run the module-level `teacher` on `inputs` without tracking gradients.

    `inputs` is unpacked as keyword arguments to the teacher; the `logits`
    attribute of its output is returned.
    """
    teacher_outputs = teacher(**inputs)
    return teacher_outputs.logits

def loss_fn(stu_logits, tea_logits, labels, temperature=2.0, alpha=0.5):
    """Distillation loss: weighted sum of a soft-target and a hard-label term.

    Args:
        stu_logits: Student logits, class scores along the last dimension.
        tea_logits: Teacher logits with the same shape as ``stu_logits``.
        labels: Class indices accepted by ``F.cross_entropy``.
        temperature: Softening temperature applied to both sets of logits
            (default 2.0, the previously hard-coded value).
        alpha: Weight of the soft-target term; the hard-label term receives
            ``1 - alpha`` (default 0.5, i.e. the previous equal weighting).

    Returns:
        Scalar float32 tensor ``alpha * soft + (1 - alpha) * hard``.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0")

    # Upcast so softmax / KL are computed in float32 even when the student
    # runs in reduced precision and the teacher does not.
    stu_logits = stu_logits.float()
    tea_logits = tea_logits.float()

    # KL divergence between the softened distributions, rescaled by T^2 so its
    # magnitude stays comparable across temperatures.
    soft = F.kl_div(
        F.log_softmax(stu_logits / temperature, dim=-1),
        F.softmax(tea_logits / temperature, dim=-1),
        reduction="batchmean",
    ) * (temperature ** 2)
    hard = F.cross_entropy(stu_logits, labels)
    return alpha * soft + (1.0 - alpha) * hard

class DistillTrainer(Trainer):
    """Trainer whose loss combines teacher soft targets with the hard labels."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the distillation loss for one batch.

        Args:
            model: The student model being trained.
            inputs: Batch mapping containing ``"labels"`` plus the tensors
                forwarded to both the student and the teacher.
            return_outputs: When true, return ``(loss, student_outputs)``,
                which is the form the evaluation loop unpacks.
            **kwargs: Extra arguments supplied by the Trainer (for example
                ``num_items_in_batch``); accepted and ignored.

        Returns:
            The loss tensor, or ``(loss, student_outputs)`` if
            ``return_outputs`` is true.
        """
        # Build a label-free view instead of popping, so the caller's batch
        # is left untouched.
        labels = inputs["labels"]
        model_inputs = {k: v for k, v in inputs.items() if k != "labels"}

        student_outputs = model(**model_inputs)

        # The teacher runs on the CPU: move a detached copy of the batch over,
        # then bring its logits back to wherever the student logits live.
        cpu_inputs = {k: v.detach().to("cpu") for k, v in model_inputs.items()}
        teacher_logits = get_teacher_logits(cpu_inputs).to(student_outputs.logits.device)

        loss = loss_fn(student_outputs.logits, teacher_logits, labels)
        return (loss, student_outputs) if return_outputs else loss

# Build the distillation trainer. The output directory and batch size come
# from the Config cell's OUTPUT_DIR / BATCH_SIZE; the names OUTPUT / BATCH
# used here before are not defined by that cell.
# NOTE(review): eval_dataset is the training set itself, so any evaluation run
# through this trainer measures fit on the training data, not generalization.
trainer = DistillTrainer(
    model=student,
    args=TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        bf16=True,
        save_total_limit=1,
    ),
    train_dataset=ds,
    eval_dataset=ds,
    data_collator=DataCollatorWithPadding(tokenizer),
)

trainer.train()
trainer.save_model(OUTPUT_DIR)
print("Model Saved", OUTPUT_DIR)

In [40]:
# Evaluate the distilled student from the directory the training cell saves to
# (OUTPUT_DIR), rather than a hard-coded path left over from another run.
# Result recorded earlier for the checkpoint at ./distilled-ultralight:
# Model: ./distilled-ultralight, Task: sentiment_analysis, Accuracy: 0.3517, F1-Score: 0.34167
# Note that this overwrites the `initial_accuracy` / `initial_f1` baselines.
initial_accuracy, initial_f1 = evaluate_model_pipeline(
    model_name=OUTPUT_DIR,
    dataset=dataset,
    task=chosen_task,
    batch_size=32
)
