## Step 1: Install Dependencies

In [None]:
!pip install -q torch transformers datasets peft bitsandbytes accelerate pydantic

## Step 2: Mount Google Drive (to save model)

In [None]:
# Mount Google Drive at /content/drive; later cells write checkpoints and the
# final model under "/content/drive/My Drive/...".
from google.colab import drive
drive.mount('/content/drive')
print("âœ… Google Drive mounted")

## Step 3: Upload Training Data

Run this cell, then:
1. Click "Choose Files"
2. Select `ent_ai_triage/modelling/data/training_data.jsonl` from your project (the urgency-only JSONL, ~6.5k examples)
3. Upload (a few seconds)

In [None]:
from google.colab import files

import os

print("ðŸ“¤ Upload training_data.jsonl from your Mac...")
uploaded = files.upload()

# Verify: report every uploaded file together with its on-disk size in MB.
for filename in uploaded:
    size = os.path.getsize(filename)
    print(f"âœ… Uploaded: {filename} ({size/1024/1024:.1f}MB)")

# Step 4 opens 'training_data.jsonl' by its exact name. If the upload was saved
# under another name (for instance a re-upload may be stored as
# 'training_data (1).jsonl'), Step 4 would fail or silently read a stale file,
# so surface the mismatch here.
if "training_data.jsonl" not in uploaded:
    print(
        "WARNING: no file named 'training_data.jsonl' was uploaded "
        f"(got: {sorted(uploaded)}). Rename or re-upload it before running Step 4."
    )

## Step 4: Load & Prepare Data

In [None]:
import json
from datasets import Dataset

# Load JSONL: one JSON object per line.
# - Blank lines (e.g. an extra newline at the end of the file) are skipped,
#   because json.loads raises on an empty/whitespace-only string.
# - The encoding is pinned to UTF-8 so the result does not depend on the
#   platform's default locale encoding.
with open('training_data.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

print(f"âœ… Loaded {len(data)} training examples")

# Format as instruction-following (must match training_data.jsonl: instruction / input / output)
def format_prompt(example):
    """Render one record as a single instruction-following training string.

    Args:
        example: mapping with the keys ``instruction``, ``input`` and
            ``output`` (the output is one of: routine, semi-urgent, urgent).

    Returns:
        A dict with one key, ``text``, holding the three labelled sections
        separated by blank lines.

    Raises:
        KeyError: if any of the three keys is missing.
    """
    instruction = example["instruction"]
    input_text = example["input"]
    output_text = example["output"]
    sections = (
        f"Instruction: {instruction}",
        f"Input: {input_text}",
        f"Output: {output_text}",
    )
    return {"text": "\n\n".join(sections)}

# Build a Dataset from the loaded records and replace every original column
# with the single "text" column produced by format_prompt.
raw_dataset = Dataset.from_list(data)
dataset = raw_dataset.map(
    format_prompt,
    remove_columns=list(raw_dataset.column_names),
)

# Split 90/10; the fixed seed keeps the split the same across runs.
split = dataset.train_test_split(test_size=0.1, seed=42)
train_data, eval_data = split["train"], split["test"]

print(f"ðŸ“Š Train: {len(train_data)}, Eval: {len(eval_data)}")

## Step 5: Load Model with QLoRA (4-bit quantization)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model_name = "Qwen/Qwen2-0.5B"

print(f"ðŸ¤– Loading {model_name} with 4-bit quantization...")

# 4-bit quantization config: NF4 weights with double quantization,
# computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to EOS-as-padding only when the tokenizer defines no pad token.
# Overwriting an existing pad token would discard the tokenizer's own setting.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prepare for QLoRA training
model = prepare_model_for_kbit_training(model)

print("âœ… Model loaded and prepared for QLoRA")

## Step 6: Configure LoRA

In [None]:
# LoRA config: rank-8 adapters attached to the q_proj and v_proj modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

model = get_peft_model(model, lora_config)

# Count only the parameters that will receive gradients.
trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f"âœ… LoRA applied - trainable params: {trainable_params:,}")

## Step 7: Tokenize Data

In [None]:
def tokenize(examples):
    """Tokenize a batch of formatted prompts, truncating each to 1024 tokens.

    No padding is applied here. The language-modeling data collator passed to
    the Trainer in Step 8 pads each batch to its longest sequence, so padding
    every example to 1024 tokens up front would only add wasted computation.

    Args:
        examples: batch dict containing a "text" list of prompt strings.

    Returns:
        The tokenizer output for the batch (variable-length sequences).
    """
    return tokenizer(
        examples["text"],
        max_length=1024,  # long dialogue transcripts need more than 512
        truncation=True
    )

print("ðŸ”„ Tokenizing data...")
train_data = train_data.map(tokenize, batched=True, num_proc=2)
eval_data = eval_data.map(tokenize, batched=True, num_proc=2)
print("âœ… Tokenization complete")

## Step 8: Train!

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    # Checkpointing: written to Google Drive, at most two kept,
    # best checkpoint reloaded when training finishes.
    output_dir="/content/drive/My Drive/finetuned-ent-llm",
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Evaluation and logging cadence (in optimizer steps).
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=20,
    # Batch sizing.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    # Optimization schedule.
    num_train_epochs=1,
    learning_rate=2e-4,
    warmup_steps=50,
    weight_decay=0.01,
    seed=42,
)

# Causal-LM collation (mlm=False): no masked-language-modeling objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=eval_data,
)

print("ðŸš€ Starting training...")
trainer.train()

## Step 9: Save Model

In [None]:
from peft import PeftModel

output_dir = "/content/drive/My Drive/finetuned-ent-llm"
adapter_dir = f"{output_dir}/lora-adapter"

# Save only the trained LoRA adapter first. On a PEFT-wrapped model,
# save_pretrained writes the adapter weights, not the base model.
model.save_pretrained(adapter_dir)

# The model used for training was loaded with 4-bit quantization (Step 5).
# Merging into it and saving would produce a quantized checkpoint rather than
# the full-precision weights needed for the later Ollama/GGUF conversion.
# Reload the base model in float16 and merge the adapter into that instead.
print("Merging LoRA weights into base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
model = PeftModel.from_pretrained(base_model, adapter_dir).merge_and_unload()

# Save to Google Drive (full merged model)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"âœ… Model saved to Google Drive: {output_dir}")
print("\nðŸ“¥ Next: download folder to your Mac, then run from project root:")
print("   python ent_ai_triage/modelling/code/export_to_ollama.py --model-dir <path-to-downloaded-folder> --ollama-model-name ent-triage-qwen2")

## Step 10: Test Inference

In [None]:
from transformers import pipeline

# Create pipeline.
# No max_length is set here: the call below limits generation with
# max_new_tokens, and passing both would set two conflicting length limits.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

# Test prompt (must match the instruction format from training_data.jsonl)
test_input = """Instruction: You are an ENT triage expert. Classify the urgency of this patient as routine, semi-urgent, or urgent based on their symptoms.

Input: Patient reports severe throat pain and difficulty swallowing for 3 days.

Output:"""

# Greedy decoding (do_sample=False) of at most 10 new tokens after the prompt.
result = pipe(test_input, max_new_tokens=10, do_sample=False)
print("ðŸ§ª Test inference:")
print(result[0]["generated_text"])
print("\n(Expected: Output: urgent or semi-urgent)")

## Step 11: After Colab — Push to Ollama (on your Mac)

1. **Download** the `finetuned-ent-llm` folder from Google Drive to your Mac.
2. **Place** it at `ent_ai_triage/modelling/model/finetuned-ent-llm/` (or any path you prefer).
3. **Convert to GGUF** (Ollama uses GGUF). From your project root:
   - Install [llama.cpp](https://github.com/ggerganov/llama.cpp) for the conversion, and run `ollama run qwen2:0.5b` to confirm Ollama is installed.
   - Convert HuggingFace → GGUF (e.g. `python convert-hf-to-gguf.py <model_dir>` from llama.cpp, or use [ollama/import](https://github.com/ollama/ollama/blob/main/docs/import.md) if supported).
4. **Create Ollama model**: `ollama create ent-triage-qwen2 -f Modelfile` (from the folder that has the Modelfile; generate it with `export_to_ollama.py` first).
5. **Update** `ent_ai_triage/app/config.py`: set `OLLAMA_MODEL_NAME = "ent-triage-qwen2"`, then restart the API.

For detailed steps see `ent_ai_triage/modelling/FINETUNING_GUIDE.md`.