In [None]:
!pip install -q transformers accelerate datasets bitsandbytes peft

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# ----------------------------
# 1. Load Phi-2 with 4-bit quantization
# ----------------------------
# Hub id of the base checkpoint; used for both the model and the tokenizer below.
model_name = "microsoft/phi-2"

# bitsandbytes settings: 4-bit NF4 weights with double quantization, and
# bfloat16 as the compute dtype.
# NOTE(review): the TrainingArguments cell later sets fp16=True while the
# compute dtype here is bfloat16 — confirm this mix is intended for the GPU in use.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized model; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token so batches can be padded. The recorded output
# confirms the pad token is "<|endoftext|>".
tokenizer.pad_token = tokenizer.eos_token  # important for batching

# Sanity prints (the recorded run reported `Model dtype: torch.float16`).
print(f"✅ Model loaded: {model_name}")
print(f"Pad token: {tokenizer.pad_token}")
print(f"Model dtype: {model.dtype}")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

✅ Model loaded: microsoft/phi-2
Pad token: <|endoftext|>
Model dtype: torch.float16


In [None]:
from transformers import DataCollatorForSeq2Seq

# ----------------------------
# 1. Adjust tokenizer for left padding
# ----------------------------
# Mutates the shared tokenizer: every later tokenization in this notebook
# (including the training data) will be padded on the left.
# NOTE(review): left padding is presumably chosen for generation; confirm it is
# also what is wanted for the training batches.
tokenizer.padding_side = "left"
# Repeats the assignment already made when the tokenizer was loaded.
tokenizer.pad_token = tokenizer.eos_token  # ensure pad exists

print(f"✅ Tokenizer padding side: {tokenizer.padding_side}")
print(f"✅ Pad token set to: {tokenizer.pad_token}")

# ----------------------------
# 2. Try encoding sample dialogues
# ----------------------------
# Two short hand-written dialogues used only to smoke-test batched encoding.
sample_dialogues = [
    "A: Hey, how are you?\nB: I'm good, thanks! And you?",
    "A: What's the agenda for today?\nB: We need to finish the project report."
]

# padding=True pads to the longest sample in the batch; max_length=64 only
# bounds truncation here.
encodings = tokenizer(
    sample_dialogues,
    padding=True,  # left padding will apply here
    truncation=True,
    max_length=64,
    return_tensors="pt"
)

# In the recorded run both samples encoded to 20 tokens, so the printed mask is
# all ones and no padding is actually visible in this check.
print("Input IDs shape:", encodings["input_ids"].shape)
print("Attention mask:\n", encodings["attention_mask"])


✅ Tokenizer padding side: left
✅ Pad token set to: <|endoftext|>
Input IDs shape: torch.Size([2, 20])
Attention mask:
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# ----------------------------
# 1. Prepare the 4-bit model for training + enable gradient checkpointing
# ----------------------------
# A quantized base model must be prepared before adapters are attached.
# prepare_model_for_kbit_training freezes the base weights, upcasts the
# remaining half-precision parameters (e.g. layer norms) to fp32 for stability,
# turns on gradient checkpointing, and makes the embedding outputs require
# grad so gradients can flow through checkpointed blocks to the LoRA weights.
# Doing it here (instead of calling enable_input_require_grads() after a
# training run) keeps the notebook correct under Restart & Run All.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
print("✅ Gradient checkpointing enabled")

# ----------------------------
# 2. Setup LoRA config
# ----------------------------
lora_config = LoraConfig(
    r=16,                # rank (low-rank dimension)
    lora_alpha=32,       # scaling factor
    target_modules=["q_proj", "v_proj"],  # key attention layers
    lora_dropout=0.05,
    bias="none",         # bias terms stay frozen
    task_type="CAUSAL_LM"
)

# ----------------------------
# 3. Wrap model with PEFT/LoRA
# ----------------------------
# NOTE: this rebinds `model` to the PEFT wrapper, so the cell is not idempotent;
# re-running it without reloading the base model would wrap the wrapper again.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


✅ Gradient checkpointing enabled
trainable params: 5,242,880 || all params: 2,784,926,720 || trainable%: 0.1883


In [None]:
from datasets import load_dataset

# ----------------------------
# 1. Load Samsum dataset
# ----------------------------
# Downloads (or reads from the local cache) every split of the SAMSum mirror.
dataset = load_dataset("knkarthick/samsum")

# Inspect the split sizes and one raw example. In the recorded run the columns
# are 'id', 'dialogue' and 'summary'.
print(dataset)
print(dataset["train"][0])

# ----------------------------
# 2. Preprocess with tokenizer
# ----------------------------
from datasets import DatasetDict

# Column names of the raw dataset that the preprocessing steps read.
DIALOG_COL  = "dialogue"
SUMMARY_COL = "summary"

# 1. Filter out bad rows
def is_good(e):
    """Return True when an example has usable dialogue and summary text.

    Args:
        e: Mapping for one dataset row; must support ``.get``.

    Returns:
        bool: ``True`` only if both ``DIALOG_COL`` and ``SUMMARY_COL`` are
        present, truthy, and non-blank after ``str(...).strip()``. Always a
        real ``bool`` (the previous version leaked the stripped summary
        string, or ``""``, as its return value).
    """
    for column in (DIALOG_COL, SUMMARY_COL):
        value = e.get(column)
        # Missing / None / empty values, and whitespace-only text, are rejected.
        if not value or not str(value).strip():
            return False
    return True

# Keep only rows accepted by is_good, in every split. In the recorded run train
# went from 14732 to 14731 rows; validation and test were unchanged.
# Rebinding `dataset` means the unfiltered data is no longer reachable afterwards.
dataset = dataset.filter(is_good)

# 2. Reformat into single training string
def format_sample(x):
    """Render one example as the single training string.

    The dialogue and summary columns are stringified and stripped, then laid
    out as ``<dialogue>\\n{dialogue}\\n<summary>\\n{summary}``.

    Args:
        x: Mapping for one row, indexed by ``DIALOG_COL`` and ``SUMMARY_COL``.

    Returns:
        dict: ``{"text": <formatted string>}``.
    """
    dialogue_text, summary_text = (
        str(x[column]).strip() for column in (DIALOG_COL, SUMMARY_COL)
    )
    pieces = ["<dialogue>", dialogue_text, "<summary>", summary_text]
    return {"text": "\n".join(pieces)}

# Replace the raw columns with the single formatted "text" column in each split
# (the train split's column names are used as the removal list for all splits).
processed_ds = dataset.map(format_sample, remove_columns=dataset["train"].column_names)

# Show the resulting splits and preview the first 300 characters of one sample.
print(processed_ds)
print(processed_ds["train"][0]["text"][:300])

# 3. Tokenize final text
def tokenize_function(batch):
    """Tokenize a batch of formatted strings into fixed-length rows.

    Args:
        batch: Mapping whose ``"text"`` entry holds the strings to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch; every
        row is truncated and padded to 640 tokens.
    """
    encode_options = {
        "max_length": 640,        # enough for dialogue + summary
        "truncation": True,
        "padding": "max_length",  # fixed width; padding side comes from the tokenizer
    }
    texts = batch["text"]
    return tokenizer(texts, **encode_options)

# Tokenize every split in batches and drop the raw "text" column.
tokenized_ds = processed_ds.map(tokenize_function, batched=True, remove_columns=["text"])

# Each row is padded to max_length, so printing it verbatim floods the output
# with hundreds of pad ids. Show the column names and lengths instead.
first_row = tokenized_ds["train"][0]
print({column: len(values) for column, values in first_row.items()})


DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})
{'id': '13818513', 'dialogue': "Amanda: I baked  cookies. Do you want some?\nJerry: Sure!\nAmanda: I'll bring you tomorrow :-)", 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}


Filter:   0%|          | 0/14732 [00:00<?, ? examples/s]

Filter:   0%|          | 0/818 [00:00<?, ? examples/s]

Filter:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/14731 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 14731
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 818
    })
    test: Dataset({
        features: ['text'],
        num_rows: 819
    })
})
<dialogue>
Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)
<summary>
Amanda baked cookies and will bring Jerry some tomorrow.


Map:   0%|          | 0/14731 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

{'input_ids': [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256

In [None]:
!pip install -q evaluate rouge_score


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
import evaluate
# Load the ROUGE metric (the recorded run downloaded its builder script here).
# The same load is repeated in the training cell that defines compute_metrics.
rouge = evaluate.load("rouge")

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import evaluate

# 1. Data collator (causal LM)
# mlm=False selects the causal-LM mode of the collator (no masked-token objective).
# NOTE(review): presumably the collator derives labels from input_ids and masks
# pad positions; since pad == eos in this notebook, verify that real EOS targets
# are not masked as well.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# 2. Training arguments (old version compatible)
# Effective batch per device = 4 sequences x 2 accumulation steps = 8.
# NOTE(review): do_eval=True is set but no evaluation schedule is configured,
# and the recorded training log only shows "Training Loss" — confirm whether
# evaluation (and compute_metrics) was expected to run during training.
# NOTE(review): fp16=True here vs. bfloat16 compute dtype in the quantization
# config — confirm the combination is intended.
training_args = TrainingArguments(
    output_dir="./phi2-samsum-fast",
    do_eval=True,
    logging_steps=50,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=0.3,       # quick run
    warmup_steps=50,
    fp16=True,
    save_total_limit=1,
    report_to=[]
)

# 3. Metric (ROUGE)
# Module-level metric object read by compute_metrics.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute teacher-forced ROUGE scores for a causal-LM evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` has shape
            ``(batch, seq, vocab)`` (or is a tuple whose first item does);
            ``labels`` has shape ``(batch, seq)`` and may contain ``-100`` at
            positions excluded from the loss.

    Returns:
        dict: Each ROUGE score scaled to 0-100 and rounded to two decimals.

    Notes:
        * Logits at position ``t`` predict token ``t + 1``, so predictions are
          shifted against labels before decoding.
        * ``-100`` is not a valid token id and cannot be decoded; ignored
          positions are replaced by the pad id in both arrays so they drop out
          under ``skip_special_tokens=True``.
        * This is argmax over teacher-forced logits, not free generation, so
          the numbers are only a rough proxy for summary quality.
        * NOTE(review): full-vocabulary logits for a whole eval set are large;
          consider reducing them to token ids before they are accumulated.
    """
    import numpy as np  # local import: numpy is not imported at file level

    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs alongside the logits.
        logits = logits[0]

    predictions = np.asarray(logits).argmax(-1)
    labels = np.asarray(labels)

    # Align "prediction for the next token" with "the next token".
    predictions = predictions[:, :-1]
    labels = labels[:, 1:]

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # Blank out positions the loss ignores, on both sides, before decoding.
    ignored = labels == -100
    predictions = np.where(ignored, pad_id, predictions)
    labels = np.where(ignored, pad_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v * 100, 2) for k, v in result.items()}

# 4. Trainer
# Fine-tune the LoRA-wrapped model on the tokenized train split; the validation
# split is attached for evaluation.
# NOTE(review): the recorded output echoes the `trainer = Trainer(` line, which
# looks like the tail of a warning raised by this call (possibly about the
# `tokenizer=` argument) — check against the installed transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 5. Train
# Recorded run: 553 optimizer steps (~0.3 epoch) in about 4489 s, with a mean
# training loss of about 2.21.
trainer.train()


  trainer = Trainer(


Step,Training Loss
50,2.4055
100,2.2318
150,2.1927
200,2.2348
250,2.1819
300,2.1613
350,2.1693
400,2.1936
450,2.1975
500,2.1692


TrainOutput(global_step=553, training_loss=2.214745863005654, metrics={'train_runtime': 4489.4784, 'train_samples_per_second': 0.984, 'train_steps_per_second': 0.123, 'total_flos': 4.50841086001152e+16, 'train_loss': 2.214745863005654, 'epoch': 0.3002986695628564})

In [None]:
def summarize(dialogue, model, tokenizer, max_new_tokens=128):
    """Generate a summary for one dialogue with the fine-tuned model.

    The prompt uses the same ``<dialogue>\\n...\\n<summary>\\n`` layout as the
    training text, and the model's continuation is returned as the summary.

    Args:
        dialogue: Raw dialogue text.
        model: Causal LM exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The generated summary, stripped of surrounding whitespace.

    Notes:
        * Switches ``model`` to eval mode as a side effect.
        * Sampling is enabled (temperature 0.7, top-p 0.9), so the output is
          not deterministic unless the caller seeds the RNG.
    """
    model.eval()
    prompt = f"<dialogue>\n{dialogue}\n<summary>\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the continuation. The returned sequence starts with the
    # prompt, and splitting the full text on "<summary>" would pick up markers
    # that appear in the dialogue or in a hallucinated follow-up example.
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # If the model keeps going and starts another training-style example, keep
    # only the text before the first marker.
    cut = len(generated)
    for marker in ("<dialogue>", "<summary>"):
        position = generated.find(marker)
        if position != -1:
            cut = min(cut, position)
    return generated[:cut].strip()

# Example test
# Same conversation as the first training example shown earlier (apart from
# spacing), so this is a smoke test rather than a held-out evaluation.
test_dialogue = """Amanda: I baked cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)"""

# summarize samples with do_sample=True and no seed is set here, so the printed
# text can differ between runs.
print("Predicted Summary:", summarize(test_dialogue, trainer.model, tokenizer))


Predicted Summary: Amanda baked cookies. Jerry wants some. Amanda will bring Jerry cookies tomorrow. :-)


In [None]:
def add_labels(batch):
    """Attach a ``labels`` column that duplicates ``input_ids``.

    Args:
        batch: Mutable mapping with an ``"input_ids"`` entry supporting ``.copy()``.

    Returns:
        The same ``batch`` object, now carrying ``"labels"``.
    """
    source_ids = batch["input_ids"]
    # Shallow copy: the outer container is new, the per-row sequences are shared.
    duplicated = source_ids.copy()
    batch["labels"] = duplicated
    return batch

# Add the `labels` column to every split.
# NOTE: this rebinds `tokenized_ds`, so re-running the cell simply recomputes
# the same labels from the unchanged input_ids.
tokenized_ds = tokenized_ds.map(add_labels, batched=True)

# Rows are padded to a fixed length, so print column lengths rather than the
# raw id lists (which would flood the cell output with pad ids).
first_row = tokenized_ds["train"][0]
print({column: len(values) for column, values in first_row.items()})



Map:   0%|          | 0/14731 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

{'input_ids': [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256

In [None]:
# Make the embedding outputs require grad on the already-wrapped model.
# NOTE(review): this runs only after the first trainer.train() call above;
# presumably it is needed together with gradient checkpointing — confirm it
# should not happen before the first training run instead.
model.enable_input_require_grads()


In [None]:
from transformers import DataCollatorForLanguageModeling

# Rebuilds the same causal-LM collator as in the first training cell.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Second Trainer over the same `model` object and the same `training_args`
# (hence the same output_dir), now fed the dataset that carries a `labels`
# column. Because `model` already holds the adapters trained above, this
# continues from those weights rather than starting fresh.
# NOTE(review): presumably the collator recomputes labels itself, which would
# make the precomputed `labels` column redundant — verify.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# The recorded run of this cell was interrupted (KeyboardInterrupt) before any
# loss was logged.
trainer.train()


  trainer = Trainer(


Step,Training Loss


KeyboardInterrupt: 