Install Required Dependencies

In [None]:
# Training stack (accelerate, peft, bitsandbytes, transformers, trl), the datasets loader,
# and the evaluate + rouge_score packages used by the metrics cell further down.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases — consider pinning.
!pip install -q accelerate peft bitsandbytes transformers trl datasets evaluate rouge_score
# NOTE(review): this requests the cu121 wheel index, while the runtime check below reports a +cu124 build — confirm which is intended.
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.9/511.9 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.5 MB/s[0m eta [36m0:

Import Required Libraries

In [None]:
import os
import torch
import random
import warnings
import evaluate
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from tqdm import tqdm
warnings.filterwarnings('ignore')
logging.set_verbosity(logging.CRITICAL)
# AutoModelForCausalLM: This class is specifically designed for causal language modeling, i.e. next-token prediction/generation, which is the standard for most large language models (LLMs) like Mistral, GPT, or Llama
# AutoTokenizer: This class automatically loads the correct tokenizer for a given pre-trained model
# NB: Different pre-trained models (like BERT, GPT-2, RoBERTa, etc.) use different tokenizers with unique vocabularies and rules
# SFTTrainer from trl (Transformer Reinforcement Learning): A trainer specifically designed for Supervised Fine-Tuning (SFT) of language models

In [None]:
# Report the PyTorch build and whether a CUDA device is visible to this runtime.
print(
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.6.0+cu124
CUDA available: True
GPU: Tesla T4


Authentication and Model Access

In [None]:
from huggingface_hub import login

# Never commit an access token to a notebook: anyone who can read the file can use it.
# (A token that has already been shared this way should be revoked.)
# The token is read from the HF_TOKEN environment variable; when it is unset,
# token=None makes login() fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Mount Google Drive at /content/drive (Colab-only module); the merged model is saved under this mount later.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Configure Model and Training Parameters

In [None]:
# Model configuration
# model_name: Hugging Face Hub id of the base checkpoint that gets fine-tuned.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# new_model: directory where the fine-tuned LoRA adapter and the tokenizer are saved once training completes.
new_model = "tinyllama-1.1b-lora-finetuned"

# LoRA hyperparameters (forwarded to LoraConfig further down)
lora_r = 8  # Rank of the adaptation matrices
lora_alpha = 16  # LoRA scaling parameter
lora_dropout = 0.05  # Dropout probability (LoraConfig.lora_dropout)

Set Up 4-bit Quantization

In [None]:
# Compute dtype paired with the 4-bit weights.
compute_dtype = torch.float16

# Quantization settings handed to from_pretrained().
# With 4 bits, each weight is limited to 2^4 = 16 possible values.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                   # activate 4-bit quantized weights
    bnb_4bit_quant_type="nf4",           # NormalFloat4 quantization type
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

Load Base Model and Tokenizer

In [None]:
# Load the base model with quantization (bnb_config) and let device_map="auto" decide weight placement.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint to run — confirm it is needed for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# The generation KV cache is switched off for training.
model.config.use_cache = False
model.config.pretraining_tp = 1 # No tensor parallelism

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# The EOS token is reused as the padding token, so pad and EOS share one id.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Pad at the end of each sequence
# For TinyLlama-1.1B-Chat-v1.0, the embedding dimension (hidden size) is 2048, i.e. each token is represented by a vector of size 2048

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Prepare Model for LoRA Training

In [None]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    A parameter counts as trainable when its ``requires_grad`` flag is set.
    The report also includes the trainable share as a percentage. Nothing is
    returned.
    """
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total}")

# Baseline count on the freshly loaded quantized model, before any LoRA adapter is attached.
print_trainable_parameters(model)

In [None]:
# Prepare the quantized model for k-bit training before adapters are attached.
model = prepare_model_for_kbit_training(model)

# Configure LoRA: rank, scaling and dropout come from the constants defined in the configuration cell.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM", # Autoregressive model: it uses everything generated so far to predict the next token
    # Modules that receive LoRA adapters: attention projections, MLP projections and the output head.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ]
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)
# This adds the small, trainable LoRA matrices (A and B) to the specified target_modules, while the original model weights stay frozen

In [None]:
# Count again now that the LoRA adapters are attached.
print_trainable_parameters(model)
# Original model weights: the pre-trained weights (W) are frozen, i.e. they are not updated during fine-tuning (requires_grad=False)
# LoRA adapter matrices: LoRA introduces two small matrices, A and B, whose parameters are trainable (requires_grad=True)
# This is why the trainable share is very small compared to the total number of parameters

trainable params: 6580224 || all params: 622186496 || trainable%: 1.0575967241821975


Prepare Training Dataset

In [None]:
# Alpaca instruction dataset, train split only.
dataset = load_dataset("tatsu-lab/alpaca", split="train")

# Show the schema and the (rows, columns) shape, then peek at the raw 'text' of the first five rows.
print(dataset, dataset.shape, sep="\n")
first_few_examples = dataset.select(range(5))
print(first_few_examples['text'])

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-a09b74b3ef9c3b(…):   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52002 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 52002
})
(52002, 4)
Column(['Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat are the three primary colors?\n\n### Response:\nThe three primary colors are red, blue, and yellow.', 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nDescribe the structure of an atom.\n\n### Response:\nAn atom is made up of a nucleus, which contains protons and neutrons, surrounded by 

In [None]:
# Format data for the TinyLlama chat template
def format_prompts(examples):
    """Render a batch of Alpaca rows as TinyLlama chat transcripts.

    Args:
        examples: batched mapping with parallel ``"instruction"``, ``"input"``
            and ``"output"`` sequences (the shape produced by
            ``Dataset.map(..., batched=True)``).

    Returns:
        ``{"text": [...]}`` with one transcript per row: a fixed system turn,
        a user turn and an assistant turn, each closed by ``</s>``. When a row
        has a non-empty ``input`` it is appended to the instruction after a
        blank line.
    """
    system_turn = "<|system|>\nYou are a helpful AI assistant.</s>\n"

    def render(instruction, input_text, output):
        # Rows without extra input use the bare instruction as the user message.
        user_msg = f"{instruction}\n\n{input_text}" if input_text else instruction
        return f"{system_turn}<|user|>\n{user_msg}</s>\n<|assistant|>\n{output}</s>"

    rows = zip(examples["instruction"], examples["input"], examples["output"])
    return {"text": [render(*row) for row in rows]}

# Overwrite the dataset's 'text' column with the chat-formatted transcript for every row.
dataset = dataset.map(format_prompts, batched=True) # Apply formatting

# dataset = dataset.remove_columns(['instruction', 'input', 'output']) # Keep only formatted text

# 60/40 train/test split with a fixed seed so the same rows land in each split on every run.
splits = dataset.train_test_split(test_size=0.4, seed=42)
train_split = splits["train"]
# Training only needs the formatted 'text' column.
train_split = train_split.remove_columns(['instruction', 'input', 'output'])
test_split = splits["test"] # Keep the original columns: evaluation rebuilds prompts and references from them

# Before formatting:
# {
#     "instruction": "What are the three primary colors?",
#     "input": "",
#     "output": "The three primary colors are red, blue, and yellow."
# }
# {
#     "instruction": "Translate the following phrase into French.",
#     "input": "I love you.",
#     "output": "Je t'aime."
# }
#
# After formatting:
# {
#     "text": "<|system|>\nYou are a helpful AI assistant.</s>\n<|user|>\nWhat are the three primary colors?</s>\n<|assistant|>\nThe three primary colors are red, blue, and yellow.</s>"
# }
# {
#     "text": "<|system|>\nYou are a helpful AI assistant.</s>\n<|user|>\nTranslate the following phrase into French.\n\nI love you.</s>\n<|assistant|>\nJe t'aime.</s>"
# }

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

In [None]:
# Schema and (rows, columns) shape of each split, one item per line.
print(train_split, train_split.shape, test_split, test_split.shape, sep="\n")

Dataset({
    features: ['text'],
    num_rows: 31201
})
(31201, 1)
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 20801
})
(20801, 4)


Configure Training Arguments

In [None]:
# Training arguments for the TinyLlama LoRA run.
# Checkpoints are written to ./results every 100 steps; metrics are logged every 50 steps.
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,  # Per-device micro-batch
    gradient_accumulation_steps=2,  # 4 x 2 = effective batch size of 8 per device
    optim="paged_adamw_32bit",
    save_steps=100,
    logging_steps=50,
    learning_rate=2e-4,
    weight_decay=0.01,  # Light regularization
    # NOTE(review): both mixed-precision flags are off although the quantization compute dtype is float16 — confirm this is intended.
    fp16=False, # Could be enabled on a T4 (FP16)
    bf16=False, # Could be enabled on an A100 (BF16)
    max_grad_norm=0.3,
    max_steps=-1,  # -1: run length is governed by num_train_epochs
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="none",  # Disable wandb for simplicity
    push_to_hub=False,
    dataloader_pin_memory=False,
)

In [None]:
# Initialize the trainer.
# `model` is already a PEFT model: the LoRA adapters were attached with get_peft_model()
# in the LoRA preparation cell. Passing `peft_config` here as well would ask the trainer
# to apply the same configuration a second time, which is redundant at best and is
# rejected by some TRL releases, so the wrapped model is handed over as-is.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_split,  # only the formatted 'text' column remains in this split
    args=training_arguments,
    processing_class=tokenizer
)

Training the model

In [None]:
# Start training (one epoch over train_split, as configured in training_arguments)
print("Starting training...")
trainer.train()

# Save the fine-tuned LoRA adapter and the tokenizer into the `new_model` directory,
# which the inference cell later passes to PeftModel.from_pretrained().
trainer.save_model(new_model)
tokenizer.save_pretrained(new_model)

print(f"Model saved to {new_model}")

Starting training...
{'loss': 1.3032, 'grad_norm': 1.329494833946228, 'learning_rate': 8.305084745762712e-05, 'num_tokens': 49212.0, 'mean_token_accuracy': 0.7049503976106644, 'epoch': 0.012818869375721062}
{'loss': 1.0717, 'grad_norm': 1.2611113786697388, 'learning_rate': 0.00016779661016949154, 'num_tokens': 98965.0, 'mean_token_accuracy': 0.7398751503229142, 'epoch': 0.025637738751442124}
{'loss': 1.008, 'grad_norm': 1.0375902652740479, 'learning_rate': 0.00019996686427563746, 'num_tokens': 145819.0, 'mean_token_accuracy': 0.7485605400800704, 'epoch': 0.038456608127163186}
{'loss': 1.0153, 'grad_norm': 0.9698510766029358, 'learning_rate': 0.00019977384648529314, 'num_tokens': 193664.0, 'mean_token_accuracy': 0.7481109154224396, 'epoch': 0.05127547750288425}
{'loss': 0.9865, 'grad_norm': 0.9542326927185059, 'learning_rate': 0.0001994088314905155, 'num_tokens': 241533.0, 'mean_token_accuracy': 0.755344632267952, 'epoch': 0.06409434687860531}
{'loss': 0.999, 'grad_norm': 0.857416927814

In [None]:
# # Test the fine-tuned model
# from peft import PeftModel

# # Load base model for inference
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16,
#     device_map="auto",
#     trust_remote_code=True
# )

# # Load the fine-tuned LoRA adapter
# model = PeftModel.from_pretrained(base_model, new_model)
# # It's designed to load a base pre-trained model and then inject or load the trained PEFT adapters, here LoRA adapters

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

# # Test inference with ChatML format
# def generate_response(prompt):
#     # Format using ChatML template for TinyLlama
#     formatted_prompt = f"<|system|>\nYou are a helpful AI assistant.</s>\n<|user|>\n{prompt}</s>\n<|assistant|>\n"

#     inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             max_new_tokens=256,
#             temperature=0.7,
#             do_sample=True,
#             top_k=50,
#             top_p=0.95,
#             pad_token_id=tokenizer.eos_token_id,
#             eos_token_id=tokenizer.eos_token_id,
#         )

#     # Decode and extract only the assistant's response
#     full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     assistant_response = full_response.split("<|assistant|>")[-1].strip()
#     return assistant_response

# # Test the model
# test_prompts = [
#     "Explain machine learning in simple terms.",
#     "Write a short story about a robot learning to paint.",
#     "What are the benefits of renewable energy?",
# ]

# for prompt in test_prompts:
#     print(f"\nPrompt: {prompt}")
#     response = generate_response(prompt)
#     print(f"Response: {response}")
#     print("-" * 50)

Testing the model

In [None]:
# Test the fine-tuned model
from peft import PeftModel

# Fresh tokenizer for inference: EOS doubles as the padding token, padding goes on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Sampling settings shared by every generate() call in this cell.
gen_kwargs = {
    "max_new_tokens": 256,
    "temperature": 0.7,
    "top_k": 50,
    "top_p": 0.95,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
}

# Chat prompt formatting
def build_chatml(user_msg, system_msg="You are a helpful AI assistant."):
    """Build the chat prompt for a single user message.

    The prompt is a system turn and a user turn, each closed by ``</s>``,
    followed by an open ``<|assistant|>`` turn for the model to complete.
    """
    system_turn = f"<|system|>\n{system_msg}</s>\n"
    user_turn = f"<|user|>\n{user_msg}</s>\n"
    return system_turn + user_turn + "<|assistant|>\n"

# Helper: generate one response from a given model
def generate_with_model(model, prompt):
    """Generate the assistant reply to a single user ``prompt``.

    Args:
        model: causal language model exposing ``parameters()`` and ``generate()``.
        prompt: raw user message; it is wrapped into the chat template by
            ``build_chatml`` before tokenization.

    Returns:
        The stripped text of the newly generated tokens only.
    """
    formatted = build_chatml(prompt)
    # Put the inputs on whatever device the model's weights live on.
    inputs = tokenizer(formatted, return_tensors="pt").to(next(model.parameters()).device)
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)
    # generate() returns the prompt tokens followed by the continuation. Decoding
    # only the continuation avoids locating the reply by splitting the decoded
    # text on "<|assistant|>", which returns the wrong span whenever the
    # generated text itself contains that marker.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

# Load the BASE model (without LoRA): half precision on GPU, full precision on CPU.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
base_model.eval()

# Load a second copy of the base model and attach the LoRA adapter saved in `new_model`.
lora_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
lora_model = PeftModel.from_pretrained(lora_model, new_model)
lora_model.eval()

# Test prompts
test_prompts = [
    "Explain what is machine learning in simple words.",
    "What are the four main types of energy resources?",
]

# Compare outputs: each prompt is answered by the base model, then by the LoRA model.
# Generation samples tokens (do_sample=True in gen_kwargs), so the text varies between runs.
print("=== Testing Base TinyLlama vs TinyLlama + LoRA ===")
for i, prompt in enumerate(test_prompts, 1):
    print(f"\n[{i}] Prompt: {prompt}")

    base_resp = generate_with_model(base_model, prompt)
    print("\n-- Base TinyLlama --")
    print(base_resp)

    lora_resp = generate_with_model(lora_model, prompt)
    print("\n-- TinyLlama + LoRA --")
    print(lora_resp)

    print("\n" + "-" * 50)

=== Testing Base TinyLlama vs TinyLlama + LoRA ===

[1] Prompt: Explain what is machine learning in simple words.

-- Base TinyLlama --
Machine learning is the process of teaching computers to perform tasks without being explicitly programmed. It involves the development of algorithms that can learn from data and improve their performance over time. Machine learning can be used in various applications, including:

1. Image and video recognition: Machine learning algorithms can be trained to identify objects, people, and objects in images and videos.

2. Natural language processing: Machine learning can be used to analyze text and understand its meaning.

3. Machine translation: Machine learning can be used to improve the accuracy and speed of machine translation.

4. Predictive maintenance: Machine learning can be used to identify potential issues in machinery and predict when maintenance is needed.

5. Fraud detection: Machine learning can be used to detect fraudulent transactions and

In [None]:
# Merge LoRA weights with base model for deployment.
# The training `model` holds 4-bit quantized base weights; merging the adapter into it
# is lossy and saves a quantized checkpoint. The base model is therefore reloaded in
# half precision (full precision on CPU), the adapter saved in `new_model` is attached,
# and the merge is done on that copy.
from peft import PeftModel

merge_base = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
merged_model = PeftModel.from_pretrained(merge_base, new_model).merge_and_unload()

# Save the merged model
merged_model.save_pretrained("tinyllama-1.1b-merged")
tokenizer.save_pretrained("tinyllama-1.1b-merged")

print("Merged model saved!")

# Push to Hugging Face Hub
# merged_model.push_to_hub("your-username/tinyllama-1.1b-finetuned")
# tokenizer.push_to_hub("your-username/tinyllama-1.1b-finetuned")

Merged model saved!


In [None]:
# Save the merged model in MyDrive.
# The evaluation and Gradio cells load the model from this same path.
save_dir = "/content/drive/MyDrive/Colab Notebooks/models/tinyllama-1.1b-merged"
os.makedirs(save_dir, exist_ok=True)  # no error if the directory already exists
merged_model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print("Merged model saved in MyDrive!")

Merged model saved in MyDrive!


 Evaluation Metrics

In [None]:
# ===== Load Pre-trained Evaluation Metrics =====
print("Loading evaluation metrics...")
perplexity_metric = evaluate.load("perplexity", module_type="metric")
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

# ===== Model Setup =====
# Hub id of the base model, local adapter directory, and the merged checkpoint on Drive.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
adapter_dir = "tinyllama-1.1b-lora-finetuned"
merged_dir = "/content/drive/MyDrive/Colab Notebooks/models/tinyllama-1.1b-merged"

# Tokenizer saved alongside the merged checkpoint; EOS is reused for padding.
tokenizer = AutoTokenizer.from_pretrained(merged_dir, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Sampling settings used for every evaluation generation.
gen_kwargs = dict(
    max_new_tokens=256,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

def build_chatml(user_msg, system_msg="You are a helpful AI assistant."):
    """Wrap one user message in the chat template, ending with an open assistant turn."""
    return (
        f"<|system|>\n{system_msg}</s>\n"
        f"<|user|>\n{user_msg}</s>\n"
        "<|assistant|>\n"
    )

def generate_with_model(model, prompt):
    """Generate the assistant reply to a single user ``prompt``.

    Args:
        model: causal language model exposing ``parameters()`` and ``generate()``.
        prompt: raw user message; it is wrapped into the chat template by
            ``build_chatml`` before tokenization.

    Returns:
        The stripped text of the newly generated tokens only.
    """
    formatted = build_chatml(prompt)
    # Put the inputs on whatever device the model's weights live on.
    inputs = tokenizer(formatted, return_tensors="pt").to(next(model.parameters()).device)
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)
    # generate() returns the prompt tokens followed by the continuation. Decoding
    # only the continuation avoids locating the reply by splitting the decoded
    # text on "<|assistant|>", which returns the wrong span whenever the
    # generated text itself contains that marker.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

# ===== Evaluation Functions =====
def evaluate_model_comprehensive(model, test_dataset, model_name_str, batch_size=100, perplexity_model_id=None):
    """
    Evaluate a model on instruction data using Hugging Face Evaluate metrics.

    One response is generated per test item with ``model`` and scored against
    the reference output with BLEU and ROUGE. Perplexity is computed by the
    ``perplexity`` metric over the chat-formatted prompt + reference text of at
    most the first 1000 items.

    Note that the perplexity metric is given a checkpoint id, not the ``model``
    object. When evaluating anything other than the merged checkpoint, pass the
    matching id via ``perplexity_model_id``; otherwise the perplexity describes
    ``merged_dir`` regardless of ``model``.

    Args:
        model: causal LM used for generation (forwarded to ``generate_with_model``).
        test_dataset: iterable of rows with ``"instruction"``, ``"input"`` and
            ``"output"`` keys.
        model_name_str: label used in log lines and in the result dict.
        batch_size: number of prompts per progress chunk. Prompts are still
            generated one at a time; this only groups the progress display.
        perplexity_model_id: checkpoint id/path scored by the perplexity metric.
            Defaults to the module-level ``merged_dir``.

    Returns:
        dict with ``model_name``, ``perplexity`` (``inf`` if the metric failed),
        ``bleu``/``rouge1``/``rouge2``/``rougeL`` as percentages, the first five
        ``predictions`` and ``references``, ``sample_size`` and
        ``perplexity_sample_size``.

    Raises:
        ValueError: if ``test_dataset`` yields no rows.
    """
    print(f"\n=== Evaluating {model_name_str} ===")

    if perplexity_model_id is None:
        perplexity_model_id = merged_dir

    # Create prompts and references from test dataset
    test_prompts = []
    references = []
    for item in test_dataset:
        if item["input"]:
            prompt = f"{item['instruction']}\n\n{item['input']}"
        else:
            prompt = item["instruction"]
        test_prompts.append(prompt)
        references.append(item["output"])

    if not test_prompts:
        raise ValueError("test_dataset is empty; nothing to evaluate")

    # Generate predictions chunk by chunk. The inner bar has a known total and is
    # cleared after each chunk so the cell output is not flooded.
    predictions = []
    print(f"Generating predictions for {len(test_prompts)} samples...")
    for i in tqdm(range(0, len(test_prompts), batch_size), desc="Generating"):
        batch_prompts = test_prompts[i:i+batch_size]
        for prompt in tqdm(batch_prompts, desc="Predicting", leave=False):
            predictions.append(generate_with_model(model, prompt))

    # 1. PERPLEXITY using Hugging Face Evaluate
    print("Computing perplexity...")

    # For perplexity, we need the full formatted text sequences (prompt + reference)
    formatted_sequences = [
        build_chatml(prompt) + ref + "</s>"
        for prompt, ref in zip(test_prompts, references)
    ]

    # Use a subset for perplexity (it's computationally expensive)
    ppl_subset_size = min(1000, len(formatted_sequences))
    ppl_sequences = formatted_sequences[:ppl_subset_size]

    try:
        ppl_results = perplexity_metric.compute(
            predictions=ppl_sequences,
            model_id=perplexity_model_id,  # checkpoint that the metric scores
            batch_size=16,
            add_start_token=False
        )
        perplexity_score = ppl_results["mean_perplexity"]
        print("Perplexity computed!")
    except Exception as e:
        # Best effort: a perplexity failure must not discard the generated
        # predictions, so report it and continue with BLEU/ROUGE.
        print(f"Perplexity computation failed: {e}")
        perplexity_score = float('inf')

    # 2. BLEU Score
    print("Computing BLEU...")
    bleu_results = bleu_metric.compute(
        predictions=predictions,
        references=[[ref] for ref in references]  # BLEU expects list of lists
    )

    print("BLEU computed!")

    # 3. ROUGE Scores
    print("Computing ROUGE...")
    rouge_results = rouge_metric.compute(
        predictions=predictions,
        references=references
    )

    print("ROUGE computed!")

    return {
        "model_name": model_name_str,
        "perplexity": perplexity_score,
        "bleu": bleu_results["bleu"] * 100,  # Convert to percentage
        "rouge1": rouge_results["rouge1"] * 100,
        "rouge2": rouge_results["rouge2"] * 100,
        "rougeL": rouge_results["rougeL"] * 100,
        "predictions": predictions[:5],  # First 5 for inspection
        "references": references[:5],
        "sample_size": len(predictions),
        "perplexity_sample_size": ppl_subset_size
    }

# ===== Load Models =====
# print("Loading base TinyLlama model...")
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
#     device_map="auto",
#     trust_remote_code=True,
#     low_cpu_mem_usage=True,
# )
# base_model.eval()

# The "LoRA" model evaluated here is the merged checkpoint saved on Drive (merged_dir),
# loaded as a plain causal LM, so no adapter is attached at this point.
print("Loading TinyLlama + LoRA model...")
lora_model = AutoModelForCausalLM.from_pretrained(
    merged_dir,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)
# lora_model = PeftModel.from_pretrained(lora_model, adapter_dir)
lora_model.eval()

# ===== Run Comprehensive Evaluation =====
print("Starting comprehensive evaluation on test dataset...")

# Seed the RNGs involved. Generation samples tokens (do_sample=True in gen_kwargs)
# from torch's generator, which random.seed() alone does not touch, so torch is
# seeded as well to make the predictions and their BLEU/ROUGE scores repeatable.
random.seed(42)
torch.manual_seed(42)

# Randomly sample 1000 examples from the test set (fewer if the split is smaller)
sampled_test = test_split.shuffle(seed=42).select(range(min(1000, len(test_split))))

# Evaluate base model
# base_results = evaluate_model_comprehensive(
#     base_model,
#     sampled_test,
#     "Base TinyLlama",
#     batch_size=50  # Adjust based on your GPU memory
# )

# Evaluate LoRA model
lora_results = evaluate_model_comprehensive(
    lora_model,
    sampled_test,
    "TinyLlama + LoRA",
    batch_size=50
)

Loading evaluation metrics...
Loading TinyLlama + LoRA model...
Starting comprehensive evaluation on test dataset...

=== Evaluating TinyLlama + LoRA ===
Generating predictions for 1000 samples...


Generating:   0%|          | 0/20 [00:00<?, ?it/s]
Predicting: 0it [00:00, ?it/s][A
Predicting: 1it [00:03,  3.23s/it][A
Predicting: 2it [00:12,  6.73s/it][A
Predicting: 3it [00:22,  8.25s/it][A
Predicting: 4it [00:46, 14.37s/it][A
Predicting: 5it [00:47,  9.47s/it][A
Predicting: 6it [00:47,  6.44s/it][A
Predicting: 7it [00:49,  4.81s/it][A
Predicting: 8it [00:53,  4.73s/it][A
Predicting: 9it [01:05,  6.90s/it][A
Predicting: 10it [01:06,  5.28s/it][A
Predicting: 11it [01:12,  5.43s/it][A
Predicting: 12it [01:24,  7.47s/it][A
Predicting: 13it [01:30,  6.91s/it][A
Predicting: 14it [01:42,  8.48s/it][A
Predicting: 15it [01:54,  9.53s/it][A
Predicting: 16it [01:54,  6.80s/it][A
Predicting: 17it [02:06,  8.37s/it][A
Predicting: 18it [02:07,  5.93s/it][A
Predicting: 19it [02:08,  4.57s/it][A
Predicting: 20it [02:11,  4.09s/it][A
Predicting: 21it [02:16,  4.19s/it][A
Predicting: 22it [02:19,  3.88s/it][A
Predicting: 23it [02:21,  3.48s/it][A
Predicting: 24it [02:22,  2

Computing perplexity...


  0%|          | 0/63 [00:00<?, ?it/s]

Perplexity computed!
Computing BLEU...
BLEU computed!
Computing ROUGE...
ROUGE computed!


In [None]:
# ===== Results Summary =====
def print_results(results):
    """Print the headline metrics of one evaluation result dict.

    ``results`` must provide ``model_name`` plus numeric ``perplexity``,
    ``bleu``, ``rouge1``, ``rouge2`` and ``rougeL`` entries. Perplexity is
    shown with three decimals (lower is better), the others with two.
    """
    print(f"\n{results['model_name']} Results:")
    rows = (
        ("Perplexity", "perplexity", ".3f"),
        ("BLEU Score", "bleu", ".2f"),
        ("ROUGE-1", "rouge1", ".2f"),
        ("ROUGE-2", "rouge2", ".2f"),
        ("ROUGE-L", "rougeL", ".2f"),
    )
    for label, key, spec in rows:
        print(f"  {label}: {results[key]:{spec}}")

# print_results(base_results)
# Only the merged LoRA model is reported; the base-model evaluation is commented out in the evaluation cell.
print_results(lora_results)


TinyLlama + LoRA Results:
  Perplexity: 4.298
  BLEU Score: 7.57
  ROUGE-1: 36.05
  ROUGE-2: 16.01
  ROUGE-L: 28.18


In [None]:
# # ===== Results Summary =====
# print("\n" + "=" * 50)
# print("COMPREHENSIVE EVALUATION RESULTS")
# print("=" * 50)

# print(f"\nDataset Information:")
# print(f"- Full dataset size: {len(dataset):,}")
# print(f"- Train set: {len(train_split):,} samples (60%)")
# print(f"- Test set: {len(test_split):,} samples (40%)")
# print(f"- Evaluation sample size: {base_results['sample_size']:,}")
# print(f"- Perplexity sample size: {base_results['perplexity_sample_size']:,}")

# def print_results(results):
#     print(f"\n{results['model_name']} Results:")
#     print(f"  Perplexity: {results['perplexity']:.3f} (lower is better)")
#     print(f"  BLEU Score: {results['bleu']:.2f}")
#     print(f"  ROUGE-1: {results['rouge1']:.2f}")
#     print(f"  ROUGE-2: {results['rouge2']:.2f}")
#     print(f"  ROUGE-L: {results['rougeL']:.2f}")

# print_results(base_results)
# print_results(lora_results)

# # Improvement analysis
# print(f"\n=== IMPROVEMENT ANALYSIS ===")
# ppl_improvement = "↓" if lora_results['perplexity'] < base_results['perplexity'] else "↑"
# print(f"Perplexity: {base_results['perplexity']:.3f} → {lora_results['perplexity']:.3f} {ppl_improvement}")
# print(f"BLEU: {base_results['bleu']:.2f} → {lora_results['bleu']:.2f} ({lora_results['bleu'] - base_results['bleu']:+.2f})")
# print(f"ROUGE-1: {base_results['rouge1']:.2f} → {lora_results['rouge1']:.2f} ({lora_results['rouge1'] - base_results['rouge1']:+.2f})")
# print(f"ROUGE-2: {base_results['rouge2']:.2f} → {lora_results['rouge2']:.2f} ({lora_results['rouge2'] - base_results['rouge2']:+.2f})")
# print(f"ROUGE-L: {base_results['rougeL']:.2f} → {lora_results['rougeL']:.2f} ({lora_results['rougeL'] - base_results['rougeL']:+.2f})")

# # Sample comparison
# print(f"\n=== SAMPLE COMPARISONS ===")
# for i in range(min(3, len(base_results['predictions']))):
#     print(f"\n--- Sample {i+1} ---")
#     print(f"Reference: {base_results['references'][i]}")
#     print(f"Base: {base_results['predictions'][i]}")
#     print(f"LoRA: {lora_results['predictions'][i]}")

Gradio Chatbot Interface

In [None]:
# 1) Install dependencies
!pip install -q gradio

# 2) Imports and Drive
import gradio as gr

from google.colab import drive
drive.mount('/content/drive')

# 3) Model Path
merged_dir = "/content/drive/MyDrive/Colab Notebooks/models/tinyllama-1.1b-merged"

device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Using device: {device}")

# 4) Load model & tokenizer (no PEFT needed for merged model)
model = AutoModelForCausalLM.from_pretrained(
    merged_dir,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto" if device == "cuda" else None,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
).to(device)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(merged_dir, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 5) Chat settings
BOT_NAME = "LlamaBot"
gen_kwargs = dict(
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

def build_chatml_from_messages(history_msgs, latest_user_message):
    """Render a conversation plus the newest user message as one chat prompt.

    Args:
        history_msgs: list of dicts with ``role`` and ``content`` keys, or
            ``None``/empty. Only ``user`` and ``assistant`` entries are
            rendered; any other role is skipped. A missing ``content`` is
            treated as an empty string.
        latest_user_message: the message to answer next.

    Returns:
        The system turn naming the bot, the rendered history, the latest user
        turn and an open ``<|assistant|>`` turn, with every closed turn ending
        in ``</s>``.
    """
    pieces = [f"<|system|>\nYou are {BOT_NAME}, a helpful AI assistant.</s>\n"]
    for msg in history_msgs or []:
        role = msg.get("role")
        content = msg.get("content", "")
        if role == "user":
            tag = "<|user|>"
        elif role == "assistant":
            tag = "<|assistant|>"
        else:
            continue
        pieces.append(f"{tag}\n{content}</s>\n")
    pieces.append(f"<|user|>\n{latest_user_message}</s>\n<|assistant|>\n")
    return "".join(pieces)

# Phrases that trigger the canned "what is your name" reply (matched case-insensitively as substrings).
NAME_TRIGGERS = [
    "what's your name", "what is your name", "who are you", "tell me your name", "your name"
]

def chat_fn(message, history):
    """Answer one chat turn for the Gradio ChatInterface.

    Args:
        message: latest user message (``None`` is treated as empty).
        history: previous turns as a list of ``{"role": ..., "content": ...}``
            dicts (the ``type="messages"`` format).

    Returns:
        The reply text: a fixed introduction when the message asks for the
        bot's name, otherwise the model's generated answer.
    """
    msg_lower = (message or "").lower()
    if any(t in msg_lower for t in NAME_TRIGGERS):
        return f"My name is {BOT_NAME}, your friendly AI assistant."

    prompt = build_chatml_from_messages(history, message)
    # Send the inputs to the device the model actually lives on rather than
    # assuming it matches the global `device` string.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)
    # generate() returns prompt + continuation; decode only the continuation so the
    # reply does not depend on splitting the decoded text on "<|assistant|>", which
    # goes wrong when the generated text contains that marker.
    prompt_len = inputs["input_ids"].shape[1]
    reply = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
    return reply

# 6) Gradio ChatInterface: chat_fn receives (message, history) with history in "messages" format.
ui = gr.ChatInterface(
    fn=chat_fn,
    type="messages",
    title=f"{BOT_NAME}: TinyLlama-1.1B LoRA Chatbot",
    description="Chatbot powered by TinyLlama-1.1B with LoRA adapters",
    examples=[
        "Hello!",
        "What's your name?",
        "Tell me a short story.",
    ],
    theme="soft",
)

# share=True publishes a temporary public URL; debug=True keeps the cell running so errors and logs stay visible.
ui.launch(share=True, debug=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://691c3453e4f47cfd65.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
# # 1. Install dependencies
# !pip install -q gradio accelerate peft bitsandbytes transformers torch

# # 2. Imports
# import torch
# import gradio as gr
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel

# # 3. Model & Tokenizer Loading
# model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# adapter_dir = "tinyllama-1.1b-lora-finetuned"

# # Load base model (quantized if you used 4-bit during training, otherwise omit quantization_config)
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16,
#     device_map="auto",
#     trust_remote_code=True,
#     low_cpu_mem_usage=True,
# )

# # Inject LoRA adapter
# model = PeftModel.from_pretrained(base_model, adapter_dir)
# model.eval()

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

# # 4. Inference Function
# def chat_with_tinyllama(user_message, chat_history=[]):
#     """
#     user_message: str, latest user query
#     chat_history: list of [user, bot] pairs for display
#     """
#     # Build ChatML prompt
#     system_msg = "<|system|>\nYou are a helpful AI assistant.</s>\n"
#     conversation = ""
#     for user, bot in chat_history:
#         conversation += f"<|user|>\n{user}</s>\n<|assistant|>\n{bot}</s>\n"
#     conversation += f"<|user|>\n{user_message}</s>\n<|assistant|>\n"

#     # Tokenize and generate
#     inputs = tokenizer(conversation, return_tensors="pt").to(model.device)
#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             max_new_tokens=128,
#             temperature=0.7,
#             top_p=0.9,
#             pad_token_id=tokenizer.eos_token_id,
#             eos_token_id=tokenizer.eos_token_id,
#         )
#     decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     # Extract the assistant’s completion
#     reply = decoded.split("<|assistant|>")[-1].strip()
#     # Append and return updated history
#     chat_history = chat_history + [(user_message, reply)]
#     return chat_history, chat_history

# # 5. Gradio Interface
# with gr.Blocks() as demo:
#     gr.Markdown("## TinyLlama-1.1B Chatbot (LoRA Fine-Tuned)")
#     chatbot = gr.Chatbot()
#     msg    = gr.Textbox(placeholder="Type your message here…", show_label=False)
#     clear  = gr.Button("Clear")

#     msg.submit(fn=chat_with_tinyllama, inputs=[msg, chatbot], outputs=[chatbot, chatbot])
#     clear.click(lambda: None, None, chatbot, queue=False)

# # 6. Launch
# demo.launch(share=True)