In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [2]:
model_name = "microsoft/phi-2"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [6]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_name)
config.pad_token_id = tokenizer.pad_token_id

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    torch_dtype=torch.float16,
    device_map="auto"
)

Loading weights:   0%|          | 0/453 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

### Dataset

In [7]:
import datasets

In [8]:
from datasets import load_dataset

In [9]:
dataset = load_dataset(
    "json",
    data_files="dataset.jsonl",
    split="train"
)

print(dataset[0])

Generating train split: 0 examples [00:00, ? examples/s]

{'question': 'What is the difference between compilation and interpretation?', 'answer': 'Compilation translates source code into machine code creating an executable file. Interpretation translates and executes code line by line without an executable.'}


In [10]:
dataset = dataset.train_test_split(
    test_size = 0.1 ,
    seed = 42
)

In [11]:
train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [12]:
def format_qa(example):
    prompt = f"Q: {example['question']}\nA:"
    answer = " " + example["answer"]  # leading space helps tokenization
    return prompt, answer


In [13]:
def tokenize_qa(example):
    prompt, answer = format_qa(example)

    prompt_ids = tokenizer(
        prompt,
        add_special_tokens=False
    ).input_ids

    answer_ids = tokenizer(
        answer + tokenizer.eos_token,
        add_special_tokens=False
    ).input_ids

    input_ids = prompt_ids + answer_ids

    labels = [-100] * len(prompt_ids) + answer_ids

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": [1] * len(input_ids)
    }


In [14]:
tokenized_dataset_train = train_dataset.map(
    tokenize_qa,
    remove_columns=train_dataset.column_names
)

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

In [15]:
tokenized_dataset_test = test_dataset.map(
    tokenize_qa,
    remove_columns=test_dataset.column_names
)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

### data collator

In [16]:
import torch
from torch.nn.utils.rnn import pad_sequence

class QACollator:
    def __init__(self, tokenizer):
        self.pad_id = tokenizer.pad_token_id

    def __call__(self, batch):
        input_ids = [torch.tensor(x["input_ids"]) for x in batch]
        labels = [torch.tensor(x["labels"]) for x in batch]
        attention_mask = [torch.tensor(x["attention_mask"]) for x in batch]

        input_ids = pad_sequence(
            input_ids, batch_first=True, padding_value=self.pad_id
        )

        labels = pad_sequence(
            labels, batch_first=True, padding_value=-100
        )

        attention_mask = pad_sequence(
            attention_mask, batch_first=True, padding_value=0
        )

        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }


In [17]:
data_collator = QACollator(tokenizer)

### LORA

In [18]:
from peft import LoraConfig, get_peft_model

In [19]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "dense"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [20]:
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 5,242,880 || all params: 2,784,926,720 || trainable%: 0.1883


### Logging

In [21]:
from logger import ExperimentLogger
logger = ExperimentLogger("Phi-2 Experiment")

In [22]:
from transformers import TrainerCallback

class LossLoggerCallback(TrainerCallback):
    def __init__(self, logger):
        self.logger = logger

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:
            step = state.global_step
            for k, v in logs.items():
                self.logger.log(f"Step {step} | {k}: {v}")


In [23]:
logger.section("MODEL")
logger.log(f"Model name: {model_name}")
logger.log(f"Tokenizer vocab size: {tokenizer.vocab_size}")
logger.log(f"Pad token: {tokenizer.pad_token}")
logger.log(f"EOS token: {tokenizer.eos_token}")

logger.section("DATASET STATS")
logger.log(f"Train size: {len(train_dataset)}")
logger.log(f"Test size: {len(test_dataset)}")

def get_trainable_params_summary(model):
    trainable = 0
    total = 0
    for _, p in model.named_parameters():
        total += p.numel()
        if p.requires_grad:
            trainable += p.numel()
    return trainable, total

logger.section("LORA CONFIGURATION")
logger.log(str(lora_config))

trainable, total = get_trainable_params_summary(model)
logger.log(f"Trainable parameters: {trainable:,}")
logger.log(f"Total parameters: {total:,}")
logger.log(f"Trainable %: {100 * trainable / total:.4f}%")

### Training

In [24]:
from transformers import TrainingArguments

In [25]:
training_args = TrainingArguments(
    output_dir="./phi_2",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    num_train_epochs=3,
    bf16=True,
    save_strategy="no",
    logging_steps=10,
    save_steps=100,
    report_to="none"
)

In [26]:
logger.section("TRAINING ARGUMENTS")

for k, v in training_args.to_dict().items():
    logger.log(f"{k}: {v}")

In [27]:
from transformers import Trainer

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_train,
    data_collator=data_collator,
    callbacks=[LossLoggerCallback(logger)]
)

In [29]:
trainer.train()

Step,Training Loss
10,2.344043
20,2.030187
30,1.856002
40,1.667098
50,1.600446
60,1.549462


TrainOutput(global_step=69, training_loss=1.804689683775971, metrics={'train_runtime': 193.0677, 'train_samples_per_second': 2.797, 'train_steps_per_second': 0.357, 'total_flos': 373110742794240.0, 'train_loss': 1.804689683775971, 'epoch': 3.0})

### Inference

In [30]:
model.eval()

prompt = "Q: What is the difference between compilation and interpretation?\nA:"

inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(model.device)

output = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7
)

print(tokenizer.decode(output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Q: What is the difference between compilation and interpretation?
A: Compilation is a process where source code is translated into machine code, while interpretation translates code at runtime.


### Experiment Log

In [31]:
logger.section("SAMPLE GENERATIONS")

test_questions = [
    "What is the difference between compilation and interpretation?",
    "Explain the concept of polymorphism."
]

model.eval()

for q in test_questions:
    prompt = f"Q: {q}\nA:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7
    )

    text = tokenizer.decode(output[0], skip_special_tokens=True)

    logger.log(f"Q: {q}")
    logger.log(f"OUTPUT:\n{text}")
    logger.log("-" * 40)


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


### Evaluate

In [32]:
!pip install evaluate bert-score rouge_score

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=ede94d17049ebe5aff54966178873881215f29f00aa2535dc298bad214508486
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef546

In [33]:
import evaluate
from bert_score import score

def generate_answers(model, tokenizer, questions, max_tokens=100):
    model.eval()
    outputs = []

    for q in questions:
        prompt = f"Q: {q}\nA:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=0.3
            )

        text = tokenizer.decode(out[0], skip_special_tokens=True)

        # Remove prompt from output
        answer = text.split("A:")[-1].strip()
        outputs.append(answer)

    return outputs


def compute_and_log_metrics(logger, predictions, references):
    logger.section("AUTOMATIC METRICS")

    # ---------------- BLEU ----------------
    bleu = evaluate.load("bleu")
    bleu_results = bleu.compute(
        predictions=predictions,
        references=[[ref] for ref in references],
        max_order=4
    )

    logger.log(f"BLEU-4: {bleu_results['bleu']:.4f}")

    # ---------------- ROUGE ----------------
    rouge = evaluate.load("rouge")
    rouge_results = rouge.compute(
        predictions=predictions,
        references=references
    )

    logger.log(f"ROUGE-L: {rouge_results['rougeL']:.4f}")

    # ---------------- BERTScore ----------------
    P, R, F1 = score(
        predictions,
        references,
        lang="en",
        verbose=False
    )

    logger.log(f"BERTScore F1: {F1.mean().item():.4f}")

# validation data
questions = [q for q in test_dataset['question']]
references = [a for a in test_dataset['answer']]
predictions = generate_answers(model, tokenizer, questions)

# Log sample outputs
logger.section("VALIDATION OUTPUTS")
for q, pred in zip(questions[:5], predictions[:5]):
    logger.log(f"Q: {q}")
    logger.log(f"A: {pred}")
    logger.log("-" * 40)

# Log metrics
compute_and_log_metrics(logger, predictions, references)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

RobertaModel LOAD REPORT from: roberta-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
pooler.dense.weight             | MISSING    | 
pooler.dense.bias               | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.
