In [None]:
import os

from huggingface_hub import login

# Never paste a real access token into the notebook: it would be saved (and shared)
# with the file. Read it from the HF_TOKEN environment variable instead; when the
# variable is unset, `token=None` makes `login` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))


In [None]:
# Mount Google Drive; the model, data and checkpoint paths used below live under it.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
# 📦 Install required libraries
!pip install -U bitsandbytes transformers datasets peft accelerate

In [None]:
!pip install -U bitsandbytes
!pip install -U transformers accelerate datasets


In [None]:
!apt-get install -y git-lfs
!git lfs install
!git clone https://huggingface.co/unsloth/Llama-3.2-1B-Instruct /content/drive/MyDrive/llama_models/Llama-3.2-1B-Instruct

In [None]:
# ✅ Imports
import os
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    TrainerCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# ✅ VRAM monitoring callback
class VRAMCallback(TrainerCallback):
    """Trainer callback that prints CUDA memory statistics on every log event."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print allocated, reserved and peak-allocated CUDA memory in GB (1e9 bytes).

        Does nothing when CUDA is not available.
        """
        if not torch.cuda.is_available():
            return
        allocated_gb, reserved_gb, peak_gb = (
            round(num_bytes / 1e9, 2)
            for num_bytes in (
                torch.cuda.memory_allocated(),
                torch.cuda.memory_reserved(),
                torch.cuda.max_memory_allocated(),
            )
        )
        print(f"[Memory] Allocated: {allocated_gb} GB | Reserved: {reserved_gb} GB | Max Allocated: {peak_gb} GB")

# ✅ Paths (all located on the mounted Google Drive)
model_name = "/content/drive/MyDrive/llama_models/Llama-3.2-1B-Instruct"
train_csv = "/content/drive/MyDrive/llm-classification-finetuning/train.csv"
output_dir = "/content/drive/MyDrive/llm-classification-finetuning/llama-lora-finetuned"

# ✅ Load tokenizer
# Llama is natively supported by transformers, so there is no need to allow code
# shipped inside the model repository to run (`trust_remote_code`).
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use EOS as the padding token for fixed-length padding.
tokenizer.pad_token = tokenizer.eos_token
# Each training text ends with its label, and texts are truncated to a fixed token
# budget. Truncate from the left so that over-long examples lose the beginning of the
# prompt rather than the label the model is supposed to learn.
tokenizer.truncation_side = "left"

# ✅ Load 4-bit quantized model
# NF4 weights with double quantization; matrix multiplications are computed in fp16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
# Llama is natively supported by transformers, so there is no need to allow code
# shipped inside the model repository to run (`trust_remote_code`).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# ✅ Enable model for LoRA fine-tuning
model = prepare_model_for_kbit_training(model)

# ✅ Add LoRA adapters
# Adapters go on the attention projections, the MLP projections and the
# input/output vocabulary layers.
lora_settings = dict(
    r=128,
    lora_alpha=128,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "embed_tokens", "lm_head",
    ],
    lora_dropout=0.1,
    bias="all",
    task_type="CAUSAL_LM",
)
peft_config = LoraConfig(**lora_settings)
model = get_peft_model(model, peft_config)

# Report how many parameters will actually be trained.
model.print_trainable_parameters()

# ✅ Load and format dataset
df = pd.read_csv(train_csv)
# Train on a reproducible random subset of at most 500 rows. `min` keeps
# `DataFrame.sample` from raising when the CSV holds fewer rows than requested.
df = df.sample(n=min(500, len(df)), random_state=42)

def format_prompt(batch):
    """Build one training text (comparison prompt followed by its label) per row.

    Args:
        batch: Mapping of column name to a list of values with the keys "prompt",
            "response_a", "response_b", "winner_model_a" and "winner_model_b".

    Returns:
        A dict with a single key "text" holding one string per row. Each string ends
        with the label: "A" when winner_model_a == 1, otherwise "B" when
        winner_model_b == 1, otherwise "C".
    """

    def _render(question, first_reply, second_reply, a_won, b_won):
        # Instruction text shown to the model; the label is appended right after it.
        body = f"""A user asked the following question:
"{question}"

Two different AI assistants replied:

Response A:
"{first_reply}"

Response B:
"{second_reply}"

Which response is more helpful, human-like, and aligned with user expectations? Reply with only "A" or "B".
"""
        if a_won == 1:
            verdict = "A"
        elif b_won == 1:
            verdict = "B"
        else:
            verdict = "C"
        return body + verdict

    columns = ("prompt", "response_a", "response_b", "winner_model_a", "winner_model_b")
    rows = zip(*(batch[name] for name in columns))
    return {"text": [_render(*row) for row in rows]}

# Wrap the sampled frame in a `Dataset` and add the formatted "text" column.
dataset = Dataset.from_pandas(df).map(format_prompt, batched=True)

def tokenize(example):
    """Tokenize a batch of formatted texts to a fixed length of 512 tokens.

    Args:
        example: Batch mapping with a "text" entry, as produced by `format_prompt`
            and passed in by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the batch, truncated and padded to 512 tokens.
    """
    # No `return_tensors` here: `Dataset.map` stores the result as Arrow lists, so
    # building torch tensors first would only add a conversion round-trip.
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

# Replace every original column with the tokenizer outputs.
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset.column_names,
)

# ✅ Data collator (mlm=False: causal language-modelling objective)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ✅ Training setup: save all checkpoints
training_args = TrainingArguments(
    output_dir=output_dir,
    # Optimisation: 4 sequences per device x 8 accumulation steps per optimizer update
    num_train_epochs=15,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    fp16=True,
    # Logging and checkpointing
    logging_steps=20,
    save_strategy="epoch",   # one checkpoint per epoch; no save_total_limit, so all are kept
    report_to="none",
    push_to_hub=False,
)

# ✅ Trainer
# The VRAM callback prints CUDA memory usage alongside each training log entry.
vram_monitor = VRAMCallback()
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    callbacks=[vram_monitor],
)

# ✅ Check for last checkpoint
import re

# Pick the checkpoint with the highest step number. File timestamps are not a
# reliable ordering (for example after the folder has been copied or re-synced),
# whereas checkpoint folders are named "checkpoint-<step>". Only real directories
# whose name matches that pattern are considered.
checkpoint_pattern = re.compile(r"^checkpoint-(\d+)$")

last_checkpoint = None
if os.path.isdir(output_dir):
    checkpoint_by_step = {}
    for entry in os.listdir(output_dir):
        match = checkpoint_pattern.match(entry)
        entry_path = os.path.join(output_dir, entry)
        if match and os.path.isdir(entry_path):
            checkpoint_by_step[int(match.group(1))] = entry_path
    if checkpoint_by_step:
        last_checkpoint = checkpoint_by_step[max(checkpoint_by_step)]
        print(f"✅ Resuming from checkpoint: {last_checkpoint}")
    else:
        print("ℹ️ No checkpoint found, training from scratch.")
else:
    print("ℹ️ No checkpoint folder found, training from scratch.")

# ✅ Train (resume if checkpoint found; `None` starts from scratch)
trainer.train(resume_from_checkpoint=last_checkpoint)

# ✅ Save final model & tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


In [None]:
#submission code
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from tqdm import tqdm
import torch
import random

# ✅ Load local model
model_path = "/content/drive/MyDrive/llm-classification-finetuning/llama-lora-finetuned/checkpoint-208"

# NOTE(review): the training cell saves the tokenizer explicitly only to the parent
# output folder and does not hand it to the Trainer — confirm that this checkpoint
# folder actually contains tokenizer files.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Half precision when a GPU is available, full precision otherwise.
inference_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=inference_dtype,
    device_map="auto",
)

# ✅ Enable sampling to estimate probabilities
# Sampling (instead of greedy decoding) lets repeated generations for one prompt differ.
generation_settings = {
    "max_new_tokens": 20,
    "temperature": 0.7,
    "do_sample": True,
}
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

# ✅ Load dataset (the test rows to score)
test_csv = "/content/drive/MyDrive/llm-classification-finetuning/test.csv"
df = pd.read_csv(test_csv)

# ✅ Prompt template
# Must match the text the model was fine-tuned on character for character: it starts
# directly with "A user asked" (no leading blank line) and ends with a newline, right
# where the label is expected to be generated.
PROMPT_TEMPLATE = """A user asked the following question:
"{prompt}"

Two different AI assistants replied:

Response A:
"{response_a}"

Response B:
"{response_b}"

Which response is more helpful, human-like, and aligned with user expectations? Reply with only "A" or "B".
"""

# ✅ Generate probabilistic predictions
predictions = []

N_SAMPLES = 5  # number of samples per input
MAX_LABEL_TOKENS = 4  # the answer is a single letter; leave slack for leading whitespace

# Seed both RNGs so the A/B flips and the sampled generations are reproducible.
random.seed(42)
torch.manual_seed(42)

# Swapping the two responses swaps the meaning of "A" and "B"; a tie stays a tie.
UNFLIP = {"A": "B", "B": "A", "C": "C"}


def _parse_label(completion):
    """Return "A", "B" or "C" (tie) from the start of a completion, or None.

    Only the first character after stripping whitespace and quotes is inspected, so
    letters occurring later in the text cannot be mistaken for the answer.
    """
    answer = completion.strip().strip("\"'").strip().upper()
    if answer and answer[0] in ("A", "B", "C"):
        return answer[0]
    return None


for _, row in tqdm(df.iterrows(), total=len(df)):
    prompt = row['prompt']
    response_a = row['response_a']
    response_b = row['response_b']
    uid = row['id']

    # Randomly flip A/B to reduce position bias
    flipped = random.random() >= 0.5
    if flipped:
        actual_a, actual_b = response_b, response_a
    else:
        actual_a, actual_b = response_a, response_b

    prompt_text = PROMPT_TEMPLATE.format(
        prompt=prompt,
        response_a=actual_a.strip(),
        response_b=actual_b.strip()
    )

    # `return_full_text=False` returns only the newly generated text. Parsing the
    # prompt as well would match the 'or "B"' of the instruction line whenever the
    # model generates nothing useful.
    results = generator(
        prompt_text,
        num_return_sequences=N_SAMPLES,
        max_new_tokens=MAX_LABEL_TOKENS,
        return_full_text=False,
    )

    # "C" is the label the training cell uses for ties.
    votes = {"A": 0, "B": 0, "C": 0}
    for res in results:
        pred = _parse_label(res['generated_text'])
        if pred is None:
            continue  # unparseable sample: do not count it as a vote for anything
        if flipped:
            pred = UNFLIP[pred]
        votes[pred] += 1

    # Normalize to probabilities; fall back to a uniform guess when no sample was usable.
    # NOTE(review): with only N_SAMPLES votes the shares are coarse and can be exactly
    # 0 — consider smoothing if the evaluation metric penalises zero probabilities.
    total = sum(votes.values())
    if total:
        prob_a = votes["A"] / total
        prob_b = votes["B"] / total
        prob_tie = votes["C"] / total
    else:
        prob_a = prob_b = prob_tie = 1.0 / 3.0

    predictions.append({
        "id": uid,
        "winner_model_a": prob_a,
        "winner_model_b": prob_b,
        "winner_tie": prob_tie
    })

# ✅ Save to CSV in correct format (one row per test id, without the frame index)
submission_path = "/content/drive/MyDrive/llm-classification-finetuning/submission.csv"
out_df = pd.DataFrame(predictions)
out_df.to_csv(submission_path, index=False)
print("✅ Saved final submission with probabilities as submission.csv")
