# Fine-tuned model demo

This notebook loads the base causal LM, applies a LoRA adapter (PEFT) saved from an experiment, and runs a short question through the model to generate an answer.


## Import dependencies

In [None]:

import torch, traceback
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from pathlib import Path


In [None]:
# --- Configuration: values a reader is most likely to tune ---

# Identifier of the base model/tokenizer handed to `from_pretrained`.
MODEL_NAME = "Open-Orca/Mistral-7B-OpenOrca"

# Location of the saved LoRA adapter that is applied on top of the base model.
ADAPTER_DIR = "/home/jovyan/data/adapters/adapter_final"

# Passed to `generate` as `max_new_tokens` (cap on newly generated tokens).
MAX_NEW_TOKENS = 128


In [None]:
# Tokenizer: prefer the fast implementation; if constructing it raises,
# report the error and retry with the slow one (errors from the slow
# load are not caught and will surface to the reader).
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    print("Loaded fast tokenizer")
except Exception as fast_err:
    print("Fast tokenizer failed, falling back to slow. Error:", fast_err)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
    print("Loaded slow tokenizer")

# Guarantee a pad token: reuse EOS when the tokenizer has one, otherwise
# register a brand-new "<pad>" special token.
pad_missing = tokenizer.pad_token is None
if pad_missing and getattr(tokenizer, "eos_token", None) is not None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
    print("Set pad_token = eos_token (ok for causal LM).")
elif pad_missing:
    tokenizer.add_special_tokens({"pad_token": "<pad>"})
    print("Added pad_token '<pad>' to tokenizer. Will resize embeddings after model load if needed.")


In [None]:
# Base model loading, in two attempts:
#   1. automatic device placement (device_map="auto") with float16 weights;
#   2. if attempt 1 raises for any reason, a plain CPU load.
# If the CPU load also fails, the traceback is printed and the error re-raised.
base = None
try:
    print("Loading base model with device_map='auto' (may take a while)...")
    base = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        low_cpu_mem_usage=True,
        dtype=torch.float16,
    )
    print("Base model loaded with device_map='auto'.")
except Exception as auto_err:
    print("device_map='auto' load failed:", auto_err)
    try:
        print("Falling back to CPU load (this may be slower but safer)...")
        base = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="cpu")
        print("Base model loaded on CPU.")
    except Exception:
        print("Failed to load base model. Traceback:")
        traceback.print_exc()
        raise

# Grow the model's input embeddings when the tokenizer knows more tokens
# than the embedding matrix has rows.
#
# The comparison uses len(tokenizer), which counts added special tokens
# (for example a freshly added "<pad>"); tokenizer.vocab_size covers only the
# base vocabulary and therefore cannot detect an added token.
#
# Only grow, never shrink: an embedding matrix larger than the tokenizer is
# harmless, while shrinking it would discard rows of the checkpoint.
embedding_rows = base.get_input_embeddings().weight.size(0)
if len(tokenizer) > embedding_rows:
    try:
        base.resize_token_embeddings(len(tokenizer))
        print("Resized model embeddings to tokenizer length:", len(tokenizer))
    except Exception as e:
        # Best effort: keep the notebook running, but make the failure visible.
        print("Warning: resize_token_embeddings failed:", e)


In [None]:
# Apply the LoRA adapter stored in ADAPTER_DIR on top of the base model.
# Fails fast with FileNotFoundError when the adapter path does not exist.
if not Path(ADAPTER_DIR).exists():
    raise FileNotFoundError(f"Adapter directory not found: {ADAPTER_DIR}")

print("Applying LoRA adapter from:", ADAPTER_DIR)
try:
    model = PeftModel.from_pretrained(base, ADAPTER_DIR)
    print("Adapter applied (PeftModel).")
except Exception:
    print("PeftModel.from_pretrained failed; attempting fallback: load adapter on CPU then move to GPU.")
    traceback.print_exc()
    # Reload a fresh copy of the base model on CPU for the second attempt.
    # NOTE(review): the original `base` is not released here, so it may still
    # occupy GPU memory when the fallback model is moved to CUDA below.
    base_cpu = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="cpu")
    # The freshly loaded copy has the checkpoint's embedding size; grow it if
    # the tokenizer holds more tokens (e.g. an added "<pad>") so token ids
    # stay in range. len(tokenizer) includes added tokens.
    if len(tokenizer) > base_cpu.get_input_embeddings().weight.size(0):
        try:
            base_cpu.resize_token_embeddings(len(tokenizer))
            print("Resized model embeddings to tokenizer length:", len(tokenizer))
        except Exception as resize_err:
            print("Warning: resize_token_embeddings failed:", resize_err)
    model = PeftModel.from_pretrained(base_cpu, ADAPTER_DIR)
    # move to GPU if available
    if torch.cuda.is_available():
        model = model.to("cuda")
    print("Fallback adapter load done.")


In [None]:
# Switch to inference mode (eval) before generating.
model.eval()

# Use the device of the first parameter as the target for the inputs;
# a model without any parameters falls back to CPU.
_first_param = next(model.parameters(), None)
model_device = _first_param.device if _first_param is not None else torch.device("cpu")
print("Model parameters device:", model_device)

# Demo prompt -- edit freely.
prompt = "Question: What methods and tools are used to assess the land-use suitability in the Kuala Terengganu coastal zone, and how does this information support urban planning and decision-making?\nAnswer: "

# Tokenize the prompt and place every resulting tensor on the model's device.
inputs = {
    name: tensor.to(model_device)
    for name, tensor in tokenizer(prompt, return_tensors="pt").items()
}

# Greedy decoding (do_sample=False), capped at MAX_NEW_TOKENS new tokens.
generation_kwargs = dict(
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=False,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

print("Generating...")
with torch.no_grad():
    generated = model.generate(**inputs, **generation_kwargs)

# The output sequence starts with the prompt tokens; slice them off so that
# only the continuation is decoded.
prompt_length = inputs["input_ids"].shape[1]
generated_text = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)
print("###### GENERATED ######")
print(generated_text.strip())
