In [None]:
# Environment setup: each line shells out to pip. No versions are pinned, so a
# re-run at a later date may resolve different package versions.
# TODO(review): pin a known-good set of versions once one is established.
!pip install -q torch torchvision torchaudio
!pip install -q xformers peft accelerate bitsandbytes trl
# Unsloth is installed straight from the GitHub repository with the
# "colab-new" extra. This is the only install line without -q, so its pip
# output is shown in full.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# For GGUF conversion (essential for Ollama deployment)
!pip install -q llama-cpp-python gguf

In [None]:
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments

# Choose your model: Mistral 7B is recommended
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Load the base model with 4-bit quantized weights (QLoRA-style fine-tuning).
#
# dtype=None lets Unsloth auto-detect the compute dtype for the current GPU
# (bfloat16 where supported, float16 otherwise). The training cell picks
# fp16/bf16 from torch.cuda.is_bf16_supported(); forcing torch.float16 here
# made the two disagree on bf16-capable GPUs.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,  # 4-bit base weights
)

# Attach LoRA adapters to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    max_seq_length=2048,
)

In [None]:
from datasets import load_dataset

# Read the training file with the "json" builder and keep only the "train" split.
dataset_path = "/content/research_dataset.jsonl"
dataset = load_dataset(
    "json",
    split="train",
    data_files=dataset_path,
)

# Quick sanity check on how many rows were read.
print(f"Loaded {len(dataset)} training examples")

# Format dataset for instruction tuning (NOT chat format)
def formatting_function(examples):
    """
    Render a batch of instruction/output pairs as single-turn training text.

    `examples` is a batched mapping holding parallel "instruction" and
    "output" sequences. Every pair becomes one Mistral Instruct style string
    (instruction wrapped in [INST] ... [/INST], followed by the output), and
    the strings are returned, in input order, under the "text" key.
    """
    # NOTE(review): the literal <s> may duplicate the BOS token if the
    # tokenizer also prepends one during training - confirm.
    pairs = zip(examples["instruction"], examples["output"])
    return {
        "text": [
            f"<s>[INST] {instruction} [/INST] {output}</s>"
            for instruction, output in pairs
        ]
    }

# Add the "text" column by running the formatter over the dataset in batches.
dataset = dataset.map(
    formatting_function,
    batched=True,
)

# Print the first formatted row so the prompt template can be checked by eye.
first_formatted_text = dataset[0]["text"]
print("\nSample formatted example:", first_formatted_text, sep="\n")

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
import torch

# Use bf16 mixed precision when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="mistral_profile_scorer_v1",
    # Batching: 2 sequences per device, accumulated over 4 steps.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Schedule
    num_train_epochs=3,  # For 40 samples, consider 5-10 epochs
    warmup_steps=5,
    learning_rate=2e-4,
    optim="adamw_8bit",
    # Precision: exactly one of the two flags is enabled.
    bf16=use_bf16,
    fp16=not use_bf16,
    # Logging / checkpointing
    logging_steps=1,
    save_strategy="epoch",  # checkpoint at the end of every epoch
    save_total_limit=2,     # keep only the 2 most recent checkpoints
    seed=42,
)

# Supervised fine-tuning on the pre-formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=training_args,
)

# Start training
trainer.train()

In [None]:
# Fold the trained LoRA adapter into the base model and write the result,
# using the "merged_16bit" save method, to ./final_merged_model.
print("Merging LoRA adapter with base model...")
merged_output_dir = "final_merged_model"
model.save_pretrained_merged(merged_output_dir, tokenizer, save_method="merged_16bit")
print("✓ Merged model saved to: final_merged_model/")

In [None]:
import os

# --- Define Paths ---
# Source directory where your merged model is saved
HF_MODEL_DIR = '/content/final_merged_model'
# Intermediate output file name (will be full size, e.g., 7GB)
GGUF_OUTPUT_FP16 = '/content/profile_scorer.gguf'
# Path to the conversion script
# FIXED: Using the correct, modern script name from the llama.cpp repository
CONVERT_SCRIPT = 'llama.cpp/convert_hf_to_gguf.py'

print(f"Starting conversion of {HF_MODEL_DIR} to unquantized GGUF...")

# Run the conversion script from the shell
# We use the corrected script name here: convert_hf_to_gguf.py
!python {CONVERT_SCRIPT} \
    --outfile {GGUF_OUTPUT_FP16} \
    --outtype f16 \
    {HF_MODEL_DIR}

print(f"\n✓ Intermediate FP16 GGUF saved to: {GGUF_OUTPUT_FP16}")

In [None]:
# Enable inference mode
FastLanguageModel.for_inference(model)

# Test with a sample profile
test_prompt = """Score this Twitter profile. Given the following profile attributes, return a JSON object with: 'handle', 'score' (0.00-1.00 likelihood of being a real person who is an academic researcher), 'reason'. PROFILE ATTRIBUTES: handle: DrJaneSmith, displayName: Dr. Jane Smith, bio: Assistant Professor of Computer Science at MIT. PhD from Stanford. Research interests: ML, NLP., created_at: Mon Jan 15 10:30:00 +0000 2018, followerCount: 1523, location: Cambridge, MA"""

# Wrap the prompt in the same [INST] ... [/INST] template used for training.
# NOTE(review): the literal <s> may duplicate the BOS token if the tokenizer
# also prepends one - confirm.
inputs = tokenizer(
    f"<s>[INST] {test_prompt} [/INST]",
    return_tensors="pt"
).to("cuda")

# Greedy decoding gives deterministic scores. No temperature is passed:
# it is only used when sampling, and combining it with do_sample=False
# just triggers a warning.
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=False,  # Deterministic output
)

# generate() returns prompt + completion; decode only the newly generated
# tokens instead of splitting the text on "[/INST]", which would cut the
# answer short if the model emitted that marker itself.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print("\nModel Response:")
print(response.strip())