In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
model_name = "gpt2"  # keep it small for clarity

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-style models need this


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [3]:
model = AutoModelForCausalLM.from_pretrained(model_name)



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

### Dataset

In [4]:
import datasets

In [5]:
from datasets import load_dataset

In [7]:
dataset = load_dataset(
    "json",
    data_files="dataset.jsonl",
    split="train"
)

print(dataset[0])

Generating train split: 0 examples [00:00, ? examples/s]

{'question': 'What is the difference between compilation and interpretation?', 'answer': 'Compilation translates source code into machine code creating an executable file. Interpretation translates and executes code line by line without an executable.'}


In [8]:
dataset = dataset.train_test_split(
    test_size = 0.1 ,
    seed = 42
)

In [9]:
train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [10]:
def format_qa(example):
    prompt = f"Q: {example['question']}\nA:"
    answer = " " + example["answer"]  # leading space helps tokenization
    return prompt, answer


In [11]:
def tokenize_qa(example):
    prompt, answer = format_qa(example)

    prompt_ids = tokenizer(
        prompt,
        add_special_tokens=False
    ).input_ids

    answer_ids = tokenizer(
        answer + tokenizer.eos_token,
        add_special_tokens=False
    ).input_ids

    input_ids = prompt_ids + answer_ids

    labels = [-100] * len(prompt_ids) + answer_ids

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": [1] * len(input_ids)
    }


In [12]:
tokenized_dataset_train = train_dataset.map(
    tokenize_qa,
    remove_columns=train_dataset.column_names
)

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

In [13]:
tokenized_dataset_test = test_dataset.map(
    tokenize_qa,
    remove_columns=test_dataset.column_names
)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

### data collator

In [14]:
import torch
from torch.nn.utils.rnn import pad_sequence

class QACollator:
    def __init__(self, tokenizer):
        self.pad_id = tokenizer.pad_token_id

    def __call__(self, batch):
        input_ids = [torch.tensor(x["input_ids"]) for x in batch]
        labels = [torch.tensor(x["labels"]) for x in batch]
        attention_mask = [torch.tensor(x["attention_mask"]) for x in batch]

        input_ids = pad_sequence(
            input_ids, batch_first=True, padding_value=self.pad_id
        )

        labels = pad_sequence(
            labels, batch_first=True, padding_value=-100
        )

        attention_mask = pad_sequence(
            attention_mask, batch_first=True, padding_value=0
        )

        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }


In [15]:
data_collator = QACollator(tokenizer)

### LORA

In [16]:
from peft import LoraConfig, get_peft_model

In [17]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn"],  # GPT-2 attention projection
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)


In [18]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364




### Helper for Logging

In [20]:
from logger import ExperimentLogger
logger = ExperimentLogger("GPT-2 Experiment")

In [21]:
from transformers import TrainerCallback

class LossLoggerCallback(TrainerCallback):
    def __init__(self, logger):
        self.logger = logger

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:
            step = state.global_step
            for k, v in logs.items():
                self.logger.log(f"Step {step} | {k}: {v}")


### Training

In [22]:
from transformers import TrainingArguments

In [23]:
training_args = TrainingArguments(
    output_dir="./gpt-2-qa",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    learning_rate=1e-4,
    num_train_epochs=3,
    logging_steps=1,
    save_steps=100,
    fp16=False,  # GPT-2 CPU-safe
    report_to="none"
)

In [24]:
from transformers import Trainer

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_train,
    data_collator=data_collator,
    callbacks=[LossLoggerCallback(logger)]
)

In [26]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,4.222053
2,4.073438
3,3.001681
4,4.213805
5,4.429975
6,4.081128
7,3.817611
8,3.50351
9,4.204981
10,4.08022


TrainOutput(global_step=270, training_loss=3.474483780507688, metrics={'train_runtime': 15.7815, 'train_samples_per_second': 34.217, 'train_steps_per_second': 17.109, 'total_flos': 11999655419904.0, 'train_loss': 3.474483780507688, 'epoch': 3.0})

### Inference

In [27]:
model.eval()

prompt = "Q: What is the difference between compilation and interpretation?\nA:"

inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(model.device)

output = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7
)

print(tokenizer.decode(output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What is the difference between compilation and interpretation?
A: Compilation can be used to create code that is easier to read and understand, and may also be easier to debug.


### Experiment Log

In [28]:
logger.section("MODEL")
logger.log(f"Model name: {model_name}")
logger.log(f"Tokenizer vocab size: {tokenizer.vocab_size}")
logger.log(f"Pad token: {tokenizer.pad_token}")
logger.log(f"EOS token: {tokenizer.eos_token}")


In [29]:
logger.section("DATASET STATS")
logger.log(f"Train size: {len(train_dataset)}")
logger.log(f"Test size: {len(test_dataset)}")

In [30]:
def get_trainable_params_summary(model):
    trainable = 0
    total = 0
    for _, p in model.named_parameters():
        total += p.numel()
        if p.requires_grad:
            trainable += p.numel()
    return trainable, total

In [31]:
logger.section("LORA CONFIGURATION")
logger.log(str(lora_config))

trainable, total = get_trainable_params_summary(model)
logger.log(f"Trainable parameters: {trainable:,}")
logger.log(f"Total parameters: {total:,}")
logger.log(f"Trainable %: {100 * trainable / total:.4f}%")

In [32]:
logger.section("TRAINING ARGUMENTS")

for k, v in training_args.to_dict().items():
    logger.log(f"{k}: {v}")

In [33]:
logger.section("SAMPLE GENERATIONS")

test_questions = [
    "What is the difference between compilation and interpretation?",
    "Explain the concept of polymorphism."
]

model.eval()

for q in test_questions:
    prompt = f"Q: {q}\nA:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7
    )

    text = tokenizer.decode(output[0], skip_special_tokens=True)

    logger.log(f"Q: {q}")
    logger.log(f"OUTPUT:\n{text}")
    logger.log("-" * 40)


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


### Evalution on different scores

In [39]:
def generate_answers(model, tokenizer, questions, max_tokens=100):
    model.eval()
    outputs = []

    for q in questions:
        prompt = f"Q: {q}\nA:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=0.3
            )

        text = tokenizer.decode(out[0], skip_special_tokens=True)

        # Remove prompt from output
        answer = text.split("A:")[-1].strip()
        outputs.append(answer)

    return outputs

In [45]:
questions = []
for q in test_dataset['question']:
  questions.append(q)

In [47]:
references = []
for a in test_dataset['answer']:
  references.append(a)

In [49]:
predictions = generate_answers(model, tokenizer, questions)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [50]:
import evaluate
from bert_score import score


def compute_and_log_metrics(logger, predictions, references):
    logger.section("AUTOMATIC METRICS")

    # ---------------- BLEU ----------------
    bleu = evaluate.load("bleu")
    bleu_results = bleu.compute(
        predictions=predictions,
        references=[[ref] for ref in references],
        max_order=4
    )

    logger.log(f"BLEU-4: {bleu_results['bleu']:.4f}")

    # ---------------- ROUGE ----------------
    rouge = evaluate.load("rouge")
    rouge_results = rouge.compute(
        predictions=predictions,
        references=references
    )

    logger.log(f"ROUGE-L: {rouge_results['rougeL']:.4f}")

    # ---------------- BERTScore ----------------
    P, R, F1 = score(
        predictions,
        references,
        lang="en",
        verbose=False
    )

    logger.log(f"BERTScore F1: {F1.mean().item():.4f}")


In [51]:
# Log sample outputs
logger.section("VALIDATION OUTPUTS")
for q, pred in zip(questions[:5], predictions[:5]):
    logger.log(f"Q: {q}")
    logger.log(f"A: {pred}")
    logger.log("-" * 40)

In [54]:
# Log metrics
compute_and_log_metrics(logger, predictions, references)

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.weight            | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
pooler.dense.weight             | MISSING    | 
pooler.dense.bias               | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [None]:
import evaluate
from bert_score import score

def generate_answers(model, tokenizer, questions, max_tokens=100):
    model.eval()
    outputs = []

    for q in questions:
        prompt = f"Q: {q}\nA:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=0.3
            )

        text = tokenizer.decode(out[0], skip_special_tokens=True)

        # Remove prompt from output
        answer = text.split("A:")[-1].strip()
        outputs.append(answer)

    return outputs


def compute_and_log_metrics(logger, predictions, references):
    logger.section("AUTOMATIC METRICS")

    # ---------------- BLEU ----------------
    bleu = evaluate.load("bleu")
    bleu_results = bleu.compute(
        predictions=predictions,
        references=[[ref] for ref in references],
        max_order=4
    )

    logger.log(f"BLEU-4: {bleu_results['bleu']:.4f}")

    # ---------------- ROUGE ----------------
    rouge = evaluate.load("rouge")
    rouge_results = rouge.compute(
        predictions=predictions,
        references=references
    )

    logger.log(f"ROUGE-L: {rouge_results['rougeL']:.4f}")

    # ---------------- BERTScore ----------------
    P, R, F1 = score(
        predictions,
        references,
        lang="en",
        verbose=False
    )

    logger.log(f"BERTScore F1: {F1.mean().item():.4f}")

# validation data
questions = [q for q in test_dataset['question']]
references = [a for a in test_dataset['answer']]
predictions = generate_answers(model, tokenizer, questions)

# Log sample outputs
logger.section("VALIDATION OUTPUTS")
for q, pred in zip(questions[:5], predictions[:5]):
    logger.log(f"Q: {q}")
    logger.log(f"A: {pred}")
    logger.log("-" * 40)

# Log metrics
compute_and_log_metrics(logger, predictions, references)