In [12]:
# conda activate finetune
# Install core libraries for QLoRA fine-tuning
# %pip install "transformers>=4.44" "datasets" "accelerate" "trl" "peft" "bitsandbytes"


In [9]:
import torch

cuda_available = torch.cuda.is_available()
device_name = torch.cuda.get_device_name(0) if cuda_available else "No GPU detected"
cuda_available, device_name

(True, 'NVIDIA GeForce RTX 3090')

# Section 2 — Define the Prompt Template
The prompt guides the model to transform a resume and job description into an optimized resume plus ATS scoring JSON.

In [10]:
PROMPT_TEMPLATE = """You are an expert resume optimization and ATS analysis engine.

Task:
Given a candidate's raw resume text and a job description, you must:
1) Rewrite the resume so it is strongly aligned to the job description.
2) Produce a structured JSON version of the optimized resume using the schema shown below.
3) Assign ATS-style match scores:
   - ats_score_original
   - ats_score_regenerated
4) Compute improvement = ats_score_regenerated - ats_score_original

Output rules:
- Return ONLY a single valid JSON object.
- Use EXACTLY these keys:
  "optimized_resume_json",
  "optimized_resume_text",
  "ats_score_original",
  "ats_score_regenerated",
  "improvement"

Resume text:
<<<RESUME>>>
{resume_text}
<<<END_RESUME>>>

Job description:
<<<JOB_DESCRIPTION>>>
{job_description}
<<<END_JOB_DESCRIPTION>>>
"""

# Section 3 — Load & Format Dataset
The dataset lives under `dataset/large_dataset_20251103_231545.jsonl`; each JSONL row already includes `resume_text`, `job_description`, `optimized_resume_text`, `optimized_resume_json`, `ats_score_original`, `ats_score_regenerated`, and `improvement`. We sanitize each row (coercing any stringified `experiences[].description` fields into lists and skipping malformed JSON entries) before shuffling and splitting 70 %/15 %/15 % for train/validation/test.

In [11]:
from datasets import Dataset, DatasetDict
import json
from pathlib import Path

DATA_DIR = Path("dataset")
DATA_FILE = DATA_DIR / "large_dataset_20251103_231545.jsonl"

def _coerce_experience_descriptions(exp_list):
    if not isinstance(exp_list, list):
        return []
    cleaned = []
    for entry in exp_list:
        if not isinstance(entry, dict):
            continue
        description = entry.get("description")
        if isinstance(description, str):
            entry["description"] = [description]
        elif isinstance(description, list):
            entry["description"] = [str(item) for item in description if isinstance(item, (str, int, float))]
        else:
            entry["description"] = []
        cleaned.append(entry)
    return cleaned

def _sanitize_record(record):
    for key in ("resume_text", "job_description", "optimized_resume_text"):
        value = record.get(key)
        if not isinstance(value, str):
            record[key] = "" if value is None else str(value)
    optimized = record.get("optimized_resume_json")
    if not isinstance(optimized, dict):
        optimized = {}
    else:
        optimized["experiences"] = _coerce_experience_descriptions(
            optimized.get("experiences", [])
        )
    record["optimized_resume_json"] = json.dumps(optimized, ensure_ascii=False)
    return record

records = []
skipped_rows = []
with DATA_FILE.open("r", encoding="utf-8") as source:
    for idx, raw in enumerate(source, start=1):
        raw = raw.strip()
        if not raw:
            continue
        try:
            record = json.loads(raw)
        except json.JSONDecodeError as exc:
            skipped_rows.append((idx, str(exc)))
            continue
        records.append(_sanitize_record(record))

if skipped_rows:
    print(f"Skipped {len(skipped_rows)} malformed rows (showing up to 3): {skipped_rows[:3]}")

full_dataset = Dataset.from_list(records).shuffle(seed=42)
train_test_split = full_dataset.train_test_split(test_size=0.30, seed=42)
val_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

raw_dataset = DatasetDict({
    "train": train_test_split["train"],
    "val": val_test_split["train"],
    "test": val_test_split["test"],
})
raw_dataset

Skipped 1 malformed rows (showing up to 3): [(118, "Expecting ',' delimiter: line 1 column 50 (char 49)")]


DatasetDict({
    train: Dataset({
        features: ['resume_text', 'job_description', 'optimized_resume_text', 'optimized_resume_json', 'ats_score_original', 'ats_score_regenerated', 'improvement'],
        num_rows: 912
    })
    val: Dataset({
        features: ['resume_text', 'job_description', 'optimized_resume_text', 'optimized_resume_json', 'ats_score_original', 'ats_score_regenerated', 'improvement'],
        num_rows: 196
    })
    test: Dataset({
        features: ['resume_text', 'job_description', 'optimized_resume_text', 'optimized_resume_json', 'ats_score_original', 'ats_score_regenerated', 'improvement'],
        num_rows: 196
    })
})

## Section 3.1 — Build Prompted Training Examples
Define the preprocessing helper that merges each resume/job pair with the standard prompt and serialized target JSON expected by the SFT trainer.

In [12]:
def _build_target_payload(example):
    import json
    optimized_json = example.get("optimized_resume_json", "{}")
    if isinstance(optimized_json, str):
        try:
            optimized_dict = json.loads(optimized_json)
        except json.JSONDecodeError:
            optimized_dict = {}
    elif isinstance(optimized_json, dict):
        optimized_dict = optimized_json
    else:
        optimized_dict = {}
    return json.dumps(
        {
            "optimized_resume_json": optimized_dict,
            "optimized_resume_text": example.get("optimized_resume_text", ""),
            "ats_score_original": int(example.get("ats_score_original", 0) or 0),
            "ats_score_regenerated": int(example.get("ats_score_regenerated", 0) or 0),
            "improvement": int(example.get("improvement", 0) or 0),
        },
        ensure_ascii=False,
    )

def preprocess(
    example,
    _prompt_template=PROMPT_TEMPLATE,
    _target_builder=_build_target_payload,
):
    prompt = _prompt_template.format(
        resume_text=example.get("resume_text", ""),
        job_description=example.get("job_description", ""),
    ).strip()
    target_json = _target_builder(example)
    example["text"] = f"{prompt}\n{target_json}"
    return example

## Section 3.2 — Apply Preprocessing
Map the preprocessing function across the dataset to create the `text` field used for supervised fine-tuning.

In [13]:
processed_dataset = raw_dataset.map(
    preprocess,
    num_proc=4,
)
processed_dataset

Map (num_proc=4): 100%|██████████| 912/912 [00:01<00:00, 639.19 examples/s]
Map (num_proc=4): 100%|██████████| 912/912 [00:01<00:00, 639.19 examples/s]
Map (num_proc=4): 100%|██████████| 196/196 [-1:59:59<00:00, -173.89 examples/s]
Map (num_proc=4): 100%|██████████| 196/196 [-1:59:59<00:00, -173.89 examples/s]
Map (num_proc=4): 100%|██████████| 196/196 [00:01<00:00, 140.20 examples/s]
Map (num_proc=4): 100%|██████████| 196/196 [00:01<00:00, 140.20 examples/s]


DatasetDict({
    train: Dataset({
        features: ['resume_text', 'job_description', 'optimized_resume_text', 'optimized_resume_json', 'ats_score_original', 'ats_score_regenerated', 'improvement', 'text'],
        num_rows: 912
    })
    val: Dataset({
        features: ['resume_text', 'job_description', 'optimized_resume_text', 'optimized_resume_json', 'ats_score_original', 'ats_score_regenerated', 'improvement', 'text'],
        num_rows: 196
    })
    test: Dataset({
        features: ['resume_text', 'job_description', 'optimized_resume_text', 'optimized_resume_json', 'ats_score_original', 'ats_score_regenerated', 'improvement', 'text'],
        num_rows: 196
    })
})

# Section 4 — Load Model with QLoRA
Load Qwen2.5-3B with BitsAndBytes 4-bit quantization and attach a LoRA adapter to the attention and MLP projections.

In [24]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import torch

MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"

supports_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
compute_dtype = torch.bfloat16 if supports_bf16 else torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

def _ensure_float32_lm_head(target_model):
    if hasattr(target_model, "lm_head"):
        target_model.lm_head = target_model.lm_head.to(torch.float32)
    return target_model

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    device_map="auto",
)

lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

model = get_peft_model(model, lora_config)
model = _ensure_float32_lm_head(model)
model

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.41it/s]



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 2048)
        (layers): ModuleList(
          (0-35): 36 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

# Section 5 — QLoRA Fine-Tuning with TRL
Train with TRL's `SFTTrainer` for three epochs using cosine learning-rate scheduling and 2k token context.

In [25]:
from trl import SFTTrainer, SFTConfig

OUTPUT_DIR = "./qwen25_resume_lora"

sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    bf16=False,
    fp16=False,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=3,
    logging_steps=20,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    report_to="none",
    gradient_checkpointing=True,
    dataset_text_field="text",
    max_length=1024,
    packing=True,
)

trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["val"],
    processing_class=tokenizer,
)

trainer.train()
print("Training complete.")
trainer.save_model(OUTPUT_DIR)
print("Model saved to", OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Tokenizer saved to", OUTPUT_DIR)

Padding-free training is enabled, but the attention implementation is not set to a supported flash attention variant. Padding-free training flattens batches into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn, kernels-community/flash-attn3, kernels-community/vllm-flash-attn3. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation` in the model configuration to one of these supported options or verify that your attention mechanism can handle flattened sequences.
You are using packing, but the attention implementation is not set to a supported flash attention variant. Packing gathers multiple samples into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn, kernels-community/flash-attn3, kernels-community/vllm-fla

Step,Training Loss,Validation Loss


Training complete.
Model saved to ./qwen25_resume_lora
Tokenizer saved to ./qwen25_resume_lora
Model saved to ./qwen25_resume_lora
Tokenizer saved to ./qwen25_resume_lora


# Section 6 — JSON Validation Pipeline
Generation outputs often include the prompt, so we trim it, extract the first JSON object, and validate key presence plus types.

In [None]:
import torch

def generate_output(resume_text, job_description):
    prompt = PROMPT_TEMPLATE.format(
        resume_text=resume_text,
        job_description=job_description,
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            temperature=1.0,
            top_p=0.9,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if prompt in decoded:
        decoded = decoded[len(prompt):].strip()

    first_brace = decoded.find("{")
    if first_brace > 0:
        decoded = decoded[first_brace:]

    return decoded

def validate_json_structure(obj):
    required = [
        "optimized_resume_json",
        "optimized_resume_text",
        "ats_score_original",
        "ats_score_regenerated",
        "improvement",
    ]
    for key in required:
        if key not in obj:
            return False
    if not isinstance(obj["optimized_resume_json"], dict):
        return False
    if not isinstance(obj["optimized_resume_text"], str):
        return False
    return True

## Section 6.1 — Run Validation on Sample Batch
Sample random validation examples, generate outputs, and report whether JSON parsing and structure checks succeed.

In [33]:
import json
import random
from pprint import pprint

val_data = processed_dataset["val"]
indices = list(range(len(val_data)))
random.shuffle(indices)
indices = indices[:2]

for idx in indices:
    example = val_data[idx]
    generated = generate_output(example["resume_text"], example["job_description"])
    pprint(f"\n=== Example {idx} ===")
    pprint(generated)
    try:
        parsed = json.loads(generated)
        print("VALID JSON ✓" if validate_json_structure(parsed) else "STRUCTURE FAIL")
    except Exception:
        print("JSON PARSE FAIL")

'\n=== Example 78 ==='
('{\n'
 ' if are are are are are are are are are are are are are are are are are are '
 'are are are are are are are are are are are are are are are are are are are '
 'are are are are are are are are are are are are are are are are are are are '
 'are are are are are are are are are are are are are are are are are are are '
 'are are are are are are are are are are are are are are are are are are are '
 'are are are are are are are are are are are are are are are are are are are '
 'are are are are are are are are are are are are are are are are are are are '
 'are are are are are are are are are are are are are are are are are are are '
 'are are are are are are are are are are are are are are are are are are are '
 'are are are are are are are are are are are are are are are are are are are '
 'are are are are are are are are are are are are are are are are are are are '
 'are are are are are are are are are are are are are are are are are are are '
 'are are 

# Section 7 — Save & Export Artifacts
After training, persist the fine-tuned LoRA adapter plus tokenizer under `OUTPUT_DIR`. These files are ready for conversion to GGUF (e.g., via `python -m llamafile.convert --model qwen25_resume_lora --format gguf`) or any deployment toolchain you prefer.

In [26]:
from pathlib import Path

Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"LoRA adapter saved to {OUTPUT_DIR}")
print("Tokenizer saved alongside the adapter.")

LoRA adapter saved to ./qwen25_resume_lora
Tokenizer saved alongside the adapter.


# Section 8 — Reload Saved Adapter for Inference
Load the serialized adapter/tokenizer from `OUTPUT_DIR` in a fresh session so you can run validation or serve the model without retraining.

In [None]:
from peft import PeftModel

inference_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR, use_fast=True)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    device_map="auto",
)

inference_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
inference_model = _ensure_float32_lm_head(inference_model)
inference_model.eval()
print("Reloaded LoRA adapter and tokenizer from disk.")

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.28s/it]



Reloaded LoRA adapter and tokenizer from disk.


## Section 8.1 — Sanity-Check on Real Validation Example
Use a single validation record to verify the reloaded adapter generates structured JSON as expected.

In [None]:
import json
import random

def run_inference_with_adapter(model, tokenizer, resume_text, job_description, max_new_tokens=768):
    prompt = PROMPT_TEMPLATE.format(
        resume_text=resume_text,
        job_description=job_description,
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.1,
            top_p=0.9,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if prompt in decoded:
        decoded = decoded[len(prompt):].strip()
    first_brace = decoded.find("{")
    if first_brace > 0:
        decoded = decoded[first_brace:]
    return decoded

val_sample_idx = random.randrange(len(processed_dataset["val"]))
val_example = processed_dataset["val"][val_sample_idx]
generated = run_inference_with_adapter(
    inference_model,
    inference_tokenizer,
    val_example["resume_text"],
    val_example["job_description"],
)

print(f"Validation example {val_sample_idx}")
print(generated[:1000])

try:
    parsed = json.loads(generated)
    print("JSON validation:", "PASS" if validate_json_structure(parsed) else "STRUCTURE FAIL")
    if isinstance(parsed, dict):
        print("ats_score_regenerated:", parsed.get("ats_score_regenerated"))
except json.JSONDecodeError:
    print("JSON validation: PARSE FAIL")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Validation example 163
{
  "basic_info": {
    "name": "",
    "email": "",
    "phone": ""
  },
  "experience": [
    {
      "company": "",
      "title": "",
      "location": "",
      "duration": "",
      "description": ""
    }
  ],
  "education": [
    {
      "degree": "",
      "field_of_study": "",
      "university": "",
      "graduation_year": ""
    }
  ],
  "skills": []
}

Requirements:
- Use the provided schema.
- Populate only the keys listed above.
- Do NOT include raw text.
- Do NOT include extraneous information.
- Do NOT change the key names.
- Do NOT add explanations.
- Do NOT write comments.
{
  "basic_info": {
    "name": "GANESAN VENKATESAN",
    "email": "i",
    "phone": "704-724-0072"
  },
  "experience": [
    {
      "company": "SCALE AI",
      "title": "Software Engineer - New Grad",
      "location": "San Francisco Bay Area",
      "duration": "June 2025 - Present",
      "description": "Contribute to various projects, including building fraud-detectio

# Section 9 — Convert to GGUF Format
To use the fine-tuned model with llama.cpp, Ollama, or other GGUF-compatible inference engines, we need to:
1. Merge the LoRA adapter with the base model
2. Convert the merged model to GGUF format

**Prerequisites:**
- Install `llama-cpp-python` with conversion support: `pip install llama-cpp-python[convert]`
- Or clone llama.cpp repository: `git clone https://github.com/ggerganov/llama.cpp`

Below are three approaches to convert your model.

## Section 9.1 — Merge LoRA Adapter with Base Model
First, merge the LoRA weights into the base model to create a standalone model.

In [34]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Define paths
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
ADAPTER_PATH = "./qwen25_resume_lora"
MERGED_MODEL_PATH = "./qwen25_resume_merged"

print("Loading base model...")
# Load base model in full precision (required for merging)
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,  # Use fp16 to save memory
    device_map="auto",
    low_cpu_mem_usage=True,
)

print("Loading LoRA adapter...")
# Load the LoRA adapter
model_with_adapter = PeftModel.from_pretrained(base_model, ADAPTER_PATH)

print("Merging LoRA weights into base model...")
# Merge adapter weights into the base model
merged_model = model_with_adapter.merge_and_unload()

print("Saving merged model...")
# Save the merged model
merged_model.save_pretrained(MERGED_MODEL_PATH, safe_serialization=True)

# Save tokenizer
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH)
tokenizer.save_pretrained(MERGED_MODEL_PATH)

print(f"✓ Merged model saved to {MERGED_MODEL_PATH}")
print("This model is now ready for GGUF conversion.")

Loading base model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.19s/it]



Loading LoRA adapter...
Merging LoRA weights into base model...
Saving merged model...
Merging LoRA weights into base model...
Saving merged model...
✓ Merged model saved to ./qwen25_resume_merged
This model is now ready for GGUF conversion.
✓ Merged model saved to ./qwen25_resume_merged
This model is now ready for GGUF conversion.


# loading the merged model and validating it

In [36]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
import random

# Load the merged model and tokenizer
print("Loading merged model from disk...")
merged_model = AutoModelForCausalLM.from_pretrained(
    MERGED_MODEL_PATH,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
)
merged_tokenizer = AutoTokenizer.from_pretrained(MERGED_MODEL_PATH)

print("✓ Merged model loaded successfully")
print(f"Model device: {merged_model.device}")

# Validation: Generate output on a sample
print("\n" + "="*60)
print("VALIDATION TEST")
print("="*60)

# Pick a random validation example
val_idx = random.randrange(len(processed_dataset["val"]))
test_example = processed_dataset["val"][val_idx]

print(f"\nTesting with validation example #{val_idx}")

# Generate prediction
prompt = PROMPT_TEMPLATE.format(
    resume_text=test_example["resume_text"][:500],  # Truncate for display
    job_description=test_example["job_description"][:500],
)

inputs = merged_tokenizer(prompt, return_tensors="pt").to(merged_model.device)

print("\nGenerating output...")
with torch.no_grad():
    outputs = merged_model.generate(
        **inputs,
        max_new_tokens=1024,
        temperature=0.5,
        top_p=0.9,
        do_sample=False,
        eos_token_id=merged_tokenizer.eos_token_id,
    )

generated_text = merged_tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generation complete.")
pprint(generated_text[:1000])

# Extract JSON from output
if prompt in generated_text:
    generated_text = generated_text[len(prompt):].strip()

first_brace = generated_text.find("{")
if first_brace > 0:
    generated_text = generated_text[first_brace:]

# Validate JSON structure
print("\n" + "-"*60)
print("GENERATED OUTPUT (first 800 chars):")
print("-"*60)
print(generated_text)

try:
    parsed_output = json.loads(generated_text)
    is_valid = validate_json_structure(parsed_output)
    
    print("\n" + "-"*60)
    print("VALIDATION RESULTS:")
    print("-"*60)
    print(f"✓ JSON Parse: SUCCESS")
    print(f"✓ Structure Valid: {is_valid}")
    
    if is_valid:
        print(f"✓ ATS Score Original: {parsed_output.get('ats_score_original', 'N/A')}")
        print(f"✓ ATS Score Regenerated: {parsed_output.get('ats_score_regenerated', 'N/A')}")
        print(f"✓ Improvement: {parsed_output.get('improvement', 'N/A')}")
        print(f"✓ Has optimized_resume_text: {len(parsed_output.get('optimized_resume_text', '')) > 0}")
        print(f"✓ Has optimized_resume_json: {isinstance(parsed_output.get('optimized_resume_json'), dict)}")
    else:
        print("✗ Structure validation failed - missing required fields")
        
except json.JSONDecodeError as e:
    print("\n" + "-"*60)
    print("VALIDATION RESULTS:")
    print("-"*60)
    print(f"✗ JSON Parse: FAILED - {str(e)}")
    
print("\n" + "="*60)
print("Merged model validation complete!")
print("="*60)

Loading merged model from disk...


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.25it/s]



✓ Merged model loaded successfully
Model device: cuda:1

VALIDATION TEST

Testing with validation example #118

Generating output...
Generation complete.
('You are an expert resume optimization and ATS analysis engine.\n'
 '\n'
 'Task:\n'
 "Given a candidate's raw resume text and a job description, you must:\n"
 '1) Rewrite the resume so it is strongly aligned to the job description.\n'
 '2) Produce a structured JSON version of the optimized resume using the '
 'schema shown below.\n'
 '3) Assign ATS-style match scores:\n'
 '   - ats_score_original\n'
 '   - ats_score_regenerated\n'
 '4) Compute improvement = ats_score_regenerated - ats_score_original\n'
 '\n'
 'Output rules:\n'
 '- Return ONLY a single valid JSON object.\n'
 '- Use EXACTLY these keys:\n'
 '  "optimized_resume_json",\n'
 '  "optimized_resume_text",\n'
 '  "ats_score_original",\n'
 '  "ats_score_regenerated",\n'
 '  "improvement"\n'
 '\n'
 'Resume text:\n'
 '<<<RESUME>>>\n'
 '\n'
 '\n'
 '--- Page 1 (Text) ---\n'
 'James

## Section 9.2 — Convert Merged Model to GGUF
There are multiple methods to convert to GGUF. Choose the one that works best for your setup.

### Method 1: Using llama.cpp (Recommended)
This is the most reliable method with the best compatibility.

In [None]:
# Run these commands in your terminal (not in this notebook)
# This cell is for documentation only - copy commands to terminal

commands = """
# Step 1: Clone llama.cpp repository
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp

# Step 2: Install Python dependencies
pip install -r requirements.txt

# Step 3: Convert model to GGUF (fp16 by default)
python convert_hf_to_gguf.py ../qwen25_resume_merged --outfile ../qwen25_resume_gguf/model-f16.gguf

# Step 4 (Optional): Quantize to smaller sizes for better performance
# Q4_K_M is a good balance between size and quality
./llama-quantize ../qwen25_resume_gguf/model-f16.gguf ../qwen25_resume_gguf/model-q4_k_m.gguf Q4_K_M

# Other quantization options:
# Q8_0   - 8-bit quantization (high quality, larger size)
# Q5_K_M - 5-bit quantization (good quality, medium size)
# Q4_K_S - 4-bit quantization (lower quality, smaller size)
# Q2_K   - 2-bit quantization (lowest quality, smallest size)

# Step 5: Test the quantized model
./llama-cli -m ../qwen25_resume_gguf/model-q4_k_m.gguf -p "Test prompt" -n 50
"""

print(commands)

### Method 2: Using Hugging Face Hub (Alternative)
Upload to HF Hub and use their conversion tools.

In [None]:
# Optional: Upload merged model to Hugging Face Hub
# Requires: pip install huggingface_hub

from huggingface_hub import HfApi, create_repo

HF_USERNAME = "your-username"  # Replace with your HF username
MODEL_REPO_NAME = "qwen25-resume-optimizer"

# Uncomment to upload:
# api = HfApi()
# repo_id = f"{HF_USERNAME}/{MODEL_REPO_NAME}"
# create_repo(repo_id, exist_ok=True)
# api.upload_folder(
#     folder_path="./qwen25_resume_merged",
#     repo_id=repo_id,
#     repo_type="model",
# )
# print(f"Model uploaded to https://huggingface.co/{repo_id}")

print("After uploading, you can use tools like 'ctransformers' or 'llama-cpp-python' to convert.")

### Method 3: Create Modelfile for Ollama
For easy local deployment with Ollama.

In [None]:
# Create Modelfile for Ollama (after GGUF conversion)

modelfile_content = """FROM ./qwen25_resume_gguf/model-q4_k_m.gguf

TEMPLATE \"\"\"You are an expert resume optimization and ATS analysis engine.

Task:
Given a candidate's raw resume text and a job description, you must:
1) Rewrite the resume so it is strongly aligned to the job description.
2) Produce a structured JSON version of the optimized resume.
3) Assign ATS-style match scores: ats_score_original, ats_score_regenerated
4) Compute improvement = ats_score_regenerated - ats_score_original

Output rules:
- Return ONLY a single valid JSON object.

Resume text:
<<<RESUME>>>
{{ .Prompt }}
<<<END_RESUME>>>
\"\"\"

PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER stop "<|im_end|>"
PARAMETER stop "<|endoftext|>"
"""

# Save Modelfile
with open("./Modelfile", "w") as f:
    f.write(modelfile_content)

print("✓ Modelfile created.")
print("\nTo use with Ollama:")
print("1. Install Ollama: https://ollama.ai")
print("2. Create model: ollama create qwen25-resume -f ./Modelfile")
print("3. Run model: ollama run qwen25-resume")

## Section 9.3 — Quick Reference: Complete Workflow

**Summary of steps:**

1. **Run Section 9.1** to merge LoRA adapter → creates `qwen25_resume_merged/`
2. **Install llama.cpp** in terminal:
   ```bash
   git clone https://github.com/ggerganov/llama.cpp
   cd llama.cpp
   pip install -r requirements.txt
   ```
3. **Convert to GGUF** (fp16):
   ```bash
   python convert_hf_to_gguf.py ../qwen25_resume_merged --outfile ../qwen25_resume_gguf/model-f16.gguf
   ```
4. **Quantize** (optional but recommended):
   ```bash
   ./llama-quantize ../qwen25_resume_gguf/model-f16.gguf ../qwen25_resume_gguf/model-q4_k_m.gguf Q4_K_M
   ```
5. **Use with Ollama** (optional):
   ```bash
   ollama create qwen25-resume -f ./Modelfile
   ollama run qwen25-resume
   ```

**File sizes (approximate):**
- Original merged model: ~6 GB
- GGUF fp16: ~6 GB
- GGUF Q4_K_M: ~2 GB (recommended)
- GGUF Q2_K: ~1 GB (lower quality)