In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/domain/sleep_stress_dataset.json
/kaggle/input/final-llm/tinyllama-medical-assistant-merged/config.json
/kaggle/input/final-llm/tinyllama-medical-assistant-merged/tokenizer.json
/kaggle/input/final-llm/tinyllama-medical-assistant-merged/tokenizer_config.json
/kaggle/input/final-llm/tinyllama-medical-assistant-merged/chat_template.jinja
/kaggle/input/final-llm/tinyllama-medical-assistant-merged/model.safetensors
/kaggle/input/final-llm/tinyllama-medical-assistant-merged/special_tokens_map.json
/kaggle/input/final-llm/tinyllama-medical-assistant-merged/generation_config.json


In [2]:
!pip install --upgrade --force-reinstall \
    pyarrow==11.0.0 \
    datasets==2.13.1 \
    transformers==4.57.0 \
    tokenizers==0.22.0 \
    huggingface-hub==0.35.3 \
    accelerate==0.20.3 \
    safetensors==0.5.3 \
    peft==0.17.1

Collecting pyarrow==11.0.0
  Downloading pyarrow-11.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting datasets==2.13.1
  Downloading datasets-2.13.1-py3-none-any.whl.metadata (20 kB)
Collecting transformers==4.57.0
  Downloading transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers==0.22.0
  Downloading tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting huggingface-hub==0.35.3
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting accelerate==0.20.3
  Downloading accelerate-0.20.3-py3-none-any.whl.metadata (17 kB)
Collecting safetensors==0.5.3
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting peft==0.17.1
  Downloading peft-0.17.1-py3-non

In [3]:
import transformers, datasets, peft, accelerate, pyarrow
print("transformers:", transformers.__version__)
print("datasets:", datasets.__version__)
print("peft:", peft.__version__)
print("accelerate:", accelerate.__version__)
print("pyarrow:", pyarrow.__version__)

2025-10-06 04:29:34.591944: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759724974.941360      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759724975.046413      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


transformers: 4.52.4
datasets: 3.6.0
peft: 0.15.2
accelerate: 1.8.1
pyarrow: 19.0.1


In [4]:
import os
import json
import random
from pathlib import Path
from typing import List, Dict

import torch
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
)
from transformers.trainer_callback import TrainerCallback
from peft import LoraConfig, get_peft_model, PeftModel

In [5]:
PRUNED_MODEL_DIR = "/kaggle/input/final-llm/tinyllama-medical-assistant-merged"
DOMAIN_SINGLE_JSONL = "/kaggle/input/domain/sleep_stress_dataset.json" 
DOMAIN_TRAIN_JSONL = None
DOMAIN_VAL_JSONL = None
OUTPUT_DIR = "/kaggle/working/tinyllama-finetuned-specialized"
os.makedirs(OUTPUT_DIR, exist_ok=True)
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
BATCH_SIZE = 4               # per device
GRAD_ACCUM = 8
EPOCHS = 3
LR = 2e-5
MAX_LENGTH = 1024            # tokenization max length for prompt+response
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
UNLEARN_PROMPTS = [
    "What is the capital of France?",
    "Explain diabetic ketoacidosis treatment.",
    "How to build a bomb?",
    "Give me a detailed explanation of quantum field theory."
]
UNLEARN_REFUSAL_TEXT = "I'm sorry, I can only provide guidance on sleep and stress management."
FINAL_PRUNE_LAST_N_LAYERS = 4
FINAL_PRUNE_HEADS_FRACTION = 0.5
HF_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [7]:
def normalize_example_record(record: dict) -> dict:
    """Normalize dataset record to {'instruction':..., 'response':...}"""
    if "instruction" in record and ("output" in record or "response" in record):
        return {"instruction": record["instruction"], "response": record.get("output", record.get("response", ""))}
    if "prompt" in record and "response" in record:
        return {"instruction": record["prompt"], "response": record["response"]}
    if "text" in record:
        return {"instruction": record["text"], "response": ""}
    # fallback: treat whole record as instruction if only one field
    # choose first string field:
    for v in record.values():
        if isinstance(v, str):
            return {"instruction": v, "response": ""}
    raise ValueError("Record cannot be normalized, missing expected keys.")

In [8]:
def build_prompt(instruction: str, response: str = None) -> str:
    """Construct the full prompt used for SFT training and inference (matches your previous template)."""
    prompt = (
        "Instruction:\n"
        "Read the following patient scenario and provide a clear, practical answer.\n"
        "- Use numbered steps or bullet points ONLY if the situation requires multiple steps.\n"
        "- Otherwise, provide a single concise sentence.\n"
        "- Do NOT repeat the question.\n\n"
        f"Patient Scenario:\n{instruction}\n\nAnswer:\n"
    )
    if response is None:
        return prompt
    return prompt + response

In [9]:
def tokenize_and_mask(example_batch, tokenizer, max_length=MAX_LENGTH):
    """
    example_batch: dict with lists 'instruction' and 'response'
    Returns dict of input_ids, attention_mask, labels where
    prompt tokens are masked with -100 in labels.
    """
    instructions = example_batch["instruction"]
    responses = example_batch["response"]
    input_ids_list = []
    attention_mask_list = []
    labels_list = []

    for ins, resp in zip(instructions, responses):
        full_text = build_prompt(ins, resp)
        prompt_text = build_prompt(ins, "")
        full_tok = tokenizer(full_text, truncation=True, max_length=max_length, padding="max_length")
        prompt_tok = tokenizer(prompt_text, truncation=True, max_length=max_length, padding="max_length")

        input_ids = full_tok["input_ids"]
        attention_mask = full_tok["attention_mask"]
        # mask prompt tokens (where prompt_tok has non-pad tokens)
        labels = [-100] * len(input_ids)
        # find the first index where the response starts: compute number of non-pad tokens in prompt_tok
        if tokenizer.pad_token_id is not None:
            prompt_nonpad = [id_ for id_ in prompt_tok["input_ids"] if id_ != tokenizer.pad_token_id]
            prompt_len = len(prompt_nonpad)
        else:
            # if no pad token, assume prompt_tok length is length
            prompt_len = len(prompt_tok["input_ids"])
        # label only tokens after prompt_len (but ensure not to exceed length)
        for i in range(prompt_len, len(input_ids)):
            labels[i] = input_ids[i]
        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)
        labels_list.append(labels)

    return {"input_ids": input_ids_list, "attention_mask": attention_mask_list, "labels": labels_list}

In [10]:
print("DEVICE:", DEVICE)
print("Loading tokenizer & model from PRUNED_MODEL_DIR:", PRUNED_MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(PRUNED_MODEL_DIR, local_files_only=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
print("Tokenizer len:", len(tokenizer))
model = AutoModelForCausalLM.from_pretrained(PRUNED_MODEL_DIR, torch_dtype=torch.float16, local_files_only=True)
model.config.use_cache = False
print("Model loaded. num_layers:", getattr(model.config, "num_hidden_layers", None), "num_heads:", getattr(model.config, "num_attention_heads", None))

DEVICE: cuda
Loading tokenizer & model from PRUNED_MODEL_DIR: /kaggle/input/final-llm/tinyllama-medical-assistant-merged
Tokenizer len: 32000
Model loaded. num_layers: 22 num_heads: 32


In [11]:
if DOMAIN_TRAIN_JSONL and DOMAIN_VAL_JSONL and os.path.exists(DOMAIN_TRAIN_JSONL) and os.path.exists(DOMAIN_VAL_JSONL):
    print("Loading provided train and val jsonl files.")
    train_ds = load_dataset("json", data_files=DOMAIN_TRAIN_JSONL, split="train")
    val_ds = load_dataset("json", data_files=DOMAIN_VAL_JSONL, split="train")
else:
    if DOMAIN_SINGLE_JSONL and os.path.exists(DOMAIN_SINGLE_JSONL):
        print("Loading single domain JSONL and splitting train/val:", DOMAIN_SINGLE_JSONL)
        ds_all = load_dataset("json", data_files=DOMAIN_SINGLE_JSONL, split="train")
        # normalize records
        ds_all = ds_all.map(lambda r: normalize_example_record(r), remove_columns=ds_all.column_names)
        # shuffle and split 90/10
        ds_all = ds_all.shuffle(seed=42)
        split = ds_all.train_test_split(test_size=0.1, seed=42)
        train_ds = split["train"]
        val_ds = split["test"]
    else:
        print("No domain file found - creating a small toy dataset. Replace with your real JSONL file asap.")
        toy = [
            {"instruction":"Patient has trouble sleeping due to stress; suggest a nightly routine.","response":"Keep consistent bedtime, limit screens 1 hour before bed, do 10 minutes of breathing exercises."},
            {"instruction":"Patient has mild daytime anxiety affecting sleep; suggest quick stress-relief steps.","response":"Practice progressive muscle relaxation for 10 minutes, avoid caffeine in afternoon, do short walk."}
        ]
        train_ds = Dataset.from_list(toy * 50)  # replicate to have some training items
        val_ds = Dataset.from_list(toy * 5)
unlearn_examples = [{"instruction": q, "response": UNLEARN_REFUSAL_TEXT} for q in UNLEARN_PROMPTS]
unlearn_ds = Dataset.from_list(unlearn_examples)

Loading single domain JSONL and splitting train/val: /kaggle/input/domain/sleep_stress_dataset.json


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1593 [00:00<?, ? examples/s]

In [12]:
n_add = int(0.1 * len(train_ds))
if n_add > 0:
    # repeat unlearn examples to reach n_add
    times = (n_add // len(unlearn_examples)) + 1
    to_add = unlearn_examples * times
    to_add = to_add[:n_add]
    extra_ds = Dataset.from_list(to_add)
    train_ds = concatenate_datasets([train_ds, extra_ds]).shuffle(seed=42)

print("Train examples:", len(train_ds), "Val examples:", len(val_ds))
print("Tokenizing datasets... This may take a minute.")
# Use map with batched True
train_tok = train_ds.map(lambda batch: tokenize_and_mask(batch, tokenizer), batched=True, remove_columns=train_ds.column_names, batch_size=8)
val_tok = val_ds.map(lambda batch: tokenize_and_mask(batch, tokenizer), batched=True, remove_columns=val_ds.column_names, batch_size=8)
print("Tokenization complete. Examples:", len(train_tok), len(val_tok))
print("Preparing LoRA/PEFT configuration and Trainer...")
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Train examples: 1576 Val examples: 160
Tokenizing datasets... This may take a minute.


Map:   0%|          | 0/1576 [00:00<?, ? examples/s]

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Tokenization complete. Examples: 1576 160
Preparing LoRA/PEFT configuration and Trainer...
trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.4079


In [13]:
class StepProgressCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            step = state.global_step
            total = state.max_steps
            print(f"Training progress: Step {step}/{total} | Loss: {logs.get('loss', 'N/A')}")

In [14]:
def collate_fn(batch):
    import torch
    input_ids = torch.tensor([b["input_ids"] for b in batch], dtype=torch.long)
    attention_mask = torch.tensor([b["attention_mask"] for b in batch], dtype=torch.long)
    labels = torch.tensor([b["labels"] for b in batch], dtype=torch.long)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [15]:
training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "lora_checkpoints"),
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    fp16=True,
    logging_steps=10,  # Reduced for more frequent updates
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    remove_unused_columns=False,
    report_to="none",  # Disable wandb to avoid warnings
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=collate_fn,
    callbacks=[StepProgressCallback()],
)
print("Starting LoRA fine-tuning (adapters only)...")
trainer.train()
trainer.save_model(os.path.join(OUTPUT_DIR, "lora_adapters"))
print("LoRA adapters saved:", os.path.join(OUTPUT_DIR, "lora_adapters"))
print("Merging adapters into base model to create a standalone final model...")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting LoRA fine-tuning (adapters only)...


Epoch,Training Loss,Validation Loss
1,5.3468,6.096509
2,0.5922,0.642335
3,0.1192,0.182919


Training progress: Step 10/75 | Loss: 7.9445
Training progress: Step 20/75 | Loss: 5.3468




Training progress: Step 25/75 | Loss: N/A
Training progress: Step 30/75 | Loss: 3.1463
Training progress: Step 40/75 | Loss: 1.5757
Training progress: Step 50/75 | Loss: 0.5922




Training progress: Step 50/75 | Loss: N/A
Training progress: Step 60/75 | Loss: 0.2318
Training progress: Step 70/75 | Loss: 0.1192




Training progress: Step 75/75 | Loss: N/A
Training progress: Step 75/75 | Loss: N/A
LoRA adapters saved: /kaggle/working/tinyllama-finetuned-specialized/lora_adapters
Merging adapters into base model to create a standalone final model...


In [16]:
try:
    merged_model = model.merge_and_unload()
    final_model = merged_model
    print("merge_and_unload succeeded.")
except Exception as e:
    print("merge_and_unload failed:", e)
    # Alternative: load base model and merge from saved adapters
    peft_adapter_dir = os.path.join(OUTPUT_DIR, "lora_adapters")
    base_for_merge = AutoModelForCausalLM.from_pretrained(PRUNED_MODEL_DIR, torch_dtype=torch.float16, local_files_only=True)
    base_for_merge = PeftModel.from_pretrained(base_for_merge, peft_adapter_dir)
    final_model = base_for_merge.merge_and_unload()

merge_and_unload succeeded.


In [17]:
def safe_zero_prune(model, last_n_layers:int = FINAL_PRUNE_LAST_N_LAYERS, fraction:float = FINAL_PRUNE_HEADS_FRACTION):
    num_layers = model.config.num_hidden_layers
    num_heads = model.config.num_attention_heads
    start_layer = max(0, num_layers - last_n_layers)
    for li in range(start_layer, num_layers):
        layer = model.model.layers[li]
        attn = layer.self_attn
        # derive head_dim from q_proj weight shape
        hidden, total = attn.q_proj.weight.shape
        head_dim = total // num_heads
        heads_to_zero = int(num_heads * fraction)
        for h in range(heads_to_zero):
            idx = slice(h * head_dim, (h + 1) * head_dim)
            attn.q_proj.weight.data[:, idx] = 0.0
            attn.k_proj.weight.data[:, idx] = 0.0
            attn.v_proj.weight.data[:, idx] = 0.0
        print(f"Zeroed {heads_to_zero}/{num_heads} heads in layer {li}")
    return model

In [18]:
print("Applying final safe zero-pruning...")
final_model = safe_zero_prune(final_model, last_n_layers=FINAL_PRUNE_LAST_N_LAYERS, fraction=FINAL_PRUNE_HEADS_FRACTION)
final_pruned_dir = os.path.join(OUTPUT_DIR, "final_merged_pruned")
os.makedirs(final_pruned_dir, exist_ok=True)
final_model.save_pretrained(final_pruned_dir, safe_serialization=True)
tokenizer.save_pretrained(final_pruned_dir)
print("Final pruned model saved to:", final_pruned_dir)

Applying final safe zero-pruning...
Zeroed 16/32 heads in layer 18
Zeroed 16/32 heads in layer 19
Zeroed 16/32 heads in layer 20
Zeroed 16/32 heads in layer 21
Final pruned model saved to: /kaggle/working/tinyllama-finetuned-specialized/final_merged_pruned


In [19]:
from transformers import AutoTokenizer as HF_AutoTokenizer, AutoModelForCausalLM as HF_AutoModel

print("Loading HF TinyLlama for comparison (device_map='auto')...")
hf_tokenizer = HF_AutoTokenizer.from_pretrained(HF_MODEL_ID)
hf_model = HF_AutoModel.from_pretrained(HF_MODEL_ID, torch_dtype=torch.float16).to(DEVICE)

# load our final model (pruned & merged)
final_tokenizer = AutoTokenizer.from_pretrained(final_pruned_dir, local_files_only=True)
final_loaded = AutoModelForCausalLM.from_pretrained(final_pruned_dir, torch_dtype=torch.float16).to(DEVICE)



Loading HF TinyLlama for comparison (device_map='auto')...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [20]:
test_questions = [
    "Patient is experiencing mild insomnia and anxiety. Suggest a detailed nightly sleep routine.",
    "Patient has high blood sugar and frequent urination. List step-by-step management and lifestyle advice.",
    "Patient has mild headaches and fatigue. Provide a structured set of possible causes and recommendations.",
    "Patient complains of stress affecting work. Suggest a multi-step daily stress management plan."
]


In [21]:
def generate_answer(model, tokenizer, question, max_new_tokens=256):  # Keep or adjust as needed
    prompt = build_prompt(question)  # Use the updated prompt above
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,  # Cap at 120 to enforce brevity
            min_new_tokens=20,  # Ensure at least a short response
            temperature=0.5,  # Lower for focused, concise outputs (was 0.7)
            top_p=0.85,  # Slightly lower for less verbosity (was 0.9)
            do_sample=True,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id
        )
    text = tokenizer.decode(out[0], skip_special_tokens=True)
    input_tokens = len(inputs['input_ids'][0])
    output_tokens = len(out[0])
    print(f"Input tokens: {input_tokens}, Generated tokens: {output_tokens - input_tokens}")
    if "Answer:" in text:
        text = text.split("Answer:")[1].strip()
    return text


In [22]:
print("\n\n===== COMPARISON: Final specialized model vs HF TinyLlama =====\n")
for q in test_questions:
    print("-"*80)
    print("Question:\n", q)
    print("\nFinal specialized model response:\n")
    try:
        print(generate_answer(final_loaded, final_tokenizer, q))
    except Exception as e:
        print("Error generating from final model:", e)

    print("\nHF TinyLlama response:\n")
    try:
        print(generate_answer(hf_model, hf_tokenizer, q))
    except Exception as e:
        print("Error generating from HF model:", e)
    print("-"*80)

print("\nPipeline complete. Outputs under:", OUTPUT_DIR)



===== COMPARISON: Final specialized model vs HF TinyLlama =====

--------------------------------------------------------------------------------
Question:
 Patient is experiencing mild insomnia and anxiety. Suggest a detailed nightly sleep routine.

Final specialized model response:

Input tokens: 91, Generated tokens: 230
1. Establish a regular sleep schedule – go to bed at the same time every day for optimal sleep quality.
2. Create a relaxation routine before going to bed. This could be a meditation or deep breathing exercise.
3. Avoid caffeine, caffeine-based products like coffee and caffeine-containing drinks in the evening.
4. Limit your exposure to electronic devices and screens before bedtime.
5. Avoid excessive exercise and exercise late in the day.
6. Practice relaxation techniques such as deep breathing, deep breathing exercises or guided relaxation apps.
7. Avoid caffeine and caffeinated drinks after 8 PM.
8. Avoid alcohol and avoid consuming caffeine-containing foods wi

In [23]:
# import shutil

# zip_path = os.path.join(OUTPUT_DIR, "final_merged_pruned.zip")
# shutil.make_archive(zip_path.replace('.zip',''), 'zip', final_pruned_dir)
# print("Zipped model at:", zip_path)
