# Converting Data for Fine-Tuning

In [None]:
import json


with open("/kaggle/input/trainingdata/Final_Selection_Train_v2.json", "r", encoding="utf-8") as f:
    data = json.load(f)


question_parsing_entries = []
cot_parsing_entries = []


for example in data:
    question = example["question"]
    cot = example["cot"]
    qparse = example["question_parsing"]
    cotparse = example["cot_parsing"]

  
    q_output = "Question Parsing:\n" + "\n".join(f"{i+1}. {line}" for i, line in enumerate(qparse))
    question_parsing_entries.append({
        "input": f"Question:\n{question}",
        "output": q_output
    })

   
    cot_output_lines = []
    for entry in cotparse:
        statement = entry["statement"]
        evidence = entry["evidence"]
        verification = entry["Verification"]
        cot_output_lines.append(
            f"Statement: {statement}\nEvidence: {evidence}\nVerification: {verification}"
        )
    cot_output = "CoT Parsing:\n" + "\n\n".join(cot_output_lines)
    cot_parsing_entries.append({
        "input": f"Question:\n{question}\n\nCoT:\n{cot}",
        "output": cot_output
    })


with open("/kaggle/working/train_question_parsing.jsonl", "w", encoding="utf-8") as f:
    for item in question_parsing_entries:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

with open("/kaggle/working/train_cot_parsing.jsonl", "w", encoding="utf-8") as f:
    for item in cot_parsing_entries:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print("✅ Files generated: train_question_parsing.jsonl and train_cot_parsing.jsonl")


In [None]:
!pip install evaluate

In [None]:
!pip uninstall -y nltk

In [None]:
!pip install --upgrade nltk

# PART 1: Fine-tune LLaMA-3-8B-Instruct (Question Parsing) using Unsloth


In [None]:

!pip uninstall -y transformers unsloth unsloth-zoo


!pip install transformers==4.51.1  # Required by Unsloth 2025.3.19
!pip install unsloth

In [None]:
!pip install --force-reinstall numpy==1.26.4

In [None]:
import unsloth 
from unsloth import FastLanguageModel

import torch
from transformers import pipeline

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = 2048,
    dtype = torch.float16,
    load_in_4bit = True,
)


model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=False,
    random_state=42,
    max_seq_length=2048,
)

In [None]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="/kaggle/working/train_question_parsing.jsonl", split="train")

print(dataset[0])

In [None]:
def format(example_batch):
    inputs = [
        f"{inp}\n\n{out}" for inp, out in zip(example_batch["input"], example_batch["output"])
    ]
    tokenized = tokenizer(
        inputs,
        truncation=True,
        padding="max_length",
        max_length=2048,
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


tokenized_dataset = dataset.map(format, batched=True)

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/kaggle/working/llama3-question-parser",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    num_train_epochs=5,
    logging_steps=5,
    save_strategy="epoch",
    save_total_limit=1,
    report_to="none",
    bf16=False,
    fp16=True,  
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=training_args,
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
model.save_pretrained("/kaggle/working/finetuned_llama3_question_parsing")
tokenizer.save_pretrained("/kaggle/working/finetuned_llama3_question_parsing")

In [None]:
import shutil


output_dir = "/kaggle/working/finetuned_llama3_question_parsing"


shutil.make_archive(output_dir, 'zip', output_dir)

print(f"✅ Question Parsing Model saved and zipped at {output_dir}.zip")

# PART 2: Fine-tune LLaMA-3-8B-Instruct for cot_parsing

In [None]:
from datasets import load_dataset

cot_dataset = load_dataset("json", data_files="/kaggle/working/train_cot_parsing.jsonl", split="train")

print(cot_dataset[0])

In [None]:
from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = 2048, 
    dtype = torch.float16,
    load_in_4bit = True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=False,
    random_state=42,
    max_seq_length=2048,
)

In [None]:
def format_cot(example_batch):
    inputs = [
        f"{question}\n\n{cot}" 
        for question, cot in zip(example_batch["input"], example_batch["output"])
    ]

    model_inputs = tokenizer(
        inputs,
        truncation=True,
        padding="max_length",
        max_length=2048,
        return_tensors="pt",  
    )

    model_inputs["labels"] = model_inputs["input_ids"].clone()

   
    for key in model_inputs:
        model_inputs[key] = model_inputs[key].to("cuda")

    return model_inputs


tokenized_cot_dataset = cot_dataset.map(format_cot, batched=True)
tokenized_cot_dataset.set_format(type="torch")

In [None]:
from transformers import TrainingArguments

cot_training_args = TrainingArguments(
    output_dir="/kaggle/working/llama3-cot-parser",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    gradient_checkpointing=False,
    learning_rate=2e-5,
    num_train_epochs=5,
    logging_steps=5,
    save_strategy="epoch",
    save_total_limit=1,
    report_to="none",
    bf16=False,
    fp16=True,
)

In [None]:
from transformers import Trainer

cot_trainer = Trainer(
    model=model,
    args=cot_training_args,
    train_dataset=tokenized_cot_dataset,
    tokenizer=tokenizer,  
)
cot_trainer.train()

In [None]:
model.save_pretrained("/kaggle/working/finetuned_llama3_cot_parsing")
tokenizer.save_pretrained("/kaggle/working/finetuned_llama3_cot_parsing")

In [None]:
import shutil


cot_output_dir = "/kaggle/working/finetuned_llama3_cot_parsing"


shutil.make_archive(cot_output_dir, 'zip', cot_output_dir)

print(f"✅ CoT parsing model zipped at {cot_output_dir}.zip")

# Inference Pipeline for results.json

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import json
import re

# Load Question Parsing model
question_model_path = "/kaggle/working/finetuned_llama3_question_parsing"
question_tokenizer = AutoTokenizer.from_pretrained(question_model_path)
question_model = AutoModelForCausalLM.from_pretrained(question_model_path)
question_pipe = pipeline("text-generation", model=question_model, tokenizer=question_tokenizer)

# Load CoT Parsing model
cot_model_path = "/kaggle/working/finetuned_llama3_cot_parsing"
cot_tokenizer = AutoTokenizer.from_pretrained(cot_model_path)
cot_model = AutoModelForCausalLM.from_pretrained(cot_model_path)
cot_pipe = pipeline("text-generation", model=cot_model, tokenizer=cot_tokenizer)

# Inference helpers
def generate_question_parsing(question_text):
    prompt = f"Question:\n{question_text}\n\nQuestion Parsing:\n"
    response = question_pipe(prompt, max_new_tokens=512, temperature=0.7)[0]['generated_text']
    parsed = response.split("Question Parsing:\n")[-1].strip()
    return [line.strip("1234567890. -") for line in parsed.split("\n") if line.strip()]

def generate_cot_parsing(question_text, cot_text):
    prompt = f"""You are a reasoning parser. Given a multiple-choice question and a chain-of-thought explanation (CoT), extract the reasoning steps into a structured JSON list. 

Each step must include:
- "statement": the reasoning claim
- "evidence": what justifies the claim
- "Verification": "true" or "false"

Example:

Question:
If the lamp is on, the room is bright. The room is not bright.
Is the lamp on?

CoT:
If the lamp is on, the room is bright. The room is not bright. Therefore, the lamp is not on.

CoT Parsing:
[
  {{
    "statement": "If the lamp is on, the room is bright.",
    "evidence": "Provided in the question.",
    "Verification": "true"
  }},
  {{
    "statement": "The room is not bright.",
    "evidence": "Given directly in the question.",
    "Verification": "true"
  }},
  {{
    "statement": "Therefore, the lamp is not on.",
    "evidence": "Contrapositive of the conditional statement.",
    "Verification": "true"
  }}
]

Now do the same for:

Question:
{question_text}

CoT:
{cot_text}

CoT Parsing:
Please format it as a JSON list of dictionaries with keys "statement", "evidence", and "Verification". Only output the JSON list.
"""

    response = cot_pipe(prompt, max_new_tokens=1024, temperature=0.7)[0]["generated_text"]
    parsed = response.split("CoT Parsing:")[-1].strip()

    print("\n========== RAW CoT PARSING ==========")
    print(parsed)

    # Try parsing JSON first
    try:
        cot_entries = json.loads(parsed)
        cot_entries = [e for e in cot_entries if all(k in e for k in ["statement", "evidence", "Verification"]) and e["Verification"] in ("true", "false")]
    except:
        # Fallback: try to salvage valid JSON chunks manually
        cot_entries = []
        matches = re.findall(r'{[^}]+}', parsed)
        for m in matches:
            try:
                m = m.strip()
                if not m.endswith("}"):
                    m += "}"
                entry = json.loads(m)
                if all(k in entry for k in ["statement", "evidence", "Verification"]) and entry["Verification"] in ("true", "false"):
                    cot_entries.append(entry)
            except:
                continue

    return cot_entries

# Load data
with open("/kaggle/input/syntheticprediction/synthetic5examples copy.json", "r", encoding="utf-8") as f:
    val_data = json.load(f)

results = []

for item in val_data:
    q_id = item["id"]
    question_text = item["question"]
    cot_text = item["cot"]
    answer = item.get("answer", None)

    question_parsing = generate_question_parsing(question_text)
    cot_parsing = generate_cot_parsing(question_text, cot_text)

    results.append({
        "id": q_id,
        "question": question_text,
        "answer": answer,
        "cot": cot_text,
        "question_parsing": question_parsing,
        "cot_parsing": cot_parsing
    })

# Save output
with open("/kaggle/working/synthetic_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print("✅ synthetic_results.json saved at /kaggle/working/synthetic_results.json")

Device set to use cuda:0
Device set to use cuda:0



Please format it as a JSON list of dictionaries with keys "statement", "evidence", and "Verification". Only output the JSON list.
[
  {
    "statement": "Ethan must be on Project Y.",
    "evidence": "Given condition (3).",
    "Verification": "true"
  },
  {
    "statement": "Dana is not on Project Y.",
    "evidence": "Given condition (5).",
    "Verification": "true"
  },
  {
    "statement": "Dana must be on Project X.",
    "evidence": "Contrapositive of Dana is not on Project Y.",
    "Verification": "true"
  },
  {
    "statement": "Brenda and Dana are not on the same project.",
    "evidence": "Given condition (2).",
    "Verification": "true"
  },
  {
    "statement": "Brenda must be on Project Y.",
    "evidence": "From the previous statements.",
    "Verification": "true"
  },
  {
    "statement": "Carl cannot be on Project X.",
    "evidence": "From the previous statements.",
    "Verification": "true"
  },
  {
    "statement": "Carl must be on Project Y.",
    "evidence":

# Evaluation using 5 synthetic examples in the validationset

In [5]:
import nltk
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
!python /kaggle/input/evalscript/eval.py \
  --prediction /kaggle/working/synthetic_results.json \
  --reference /kaggle/input/validationset/synthetic5examples.json \
  --question_threshold 0.95 \
  --statement_threshold 0.9 \
  --relation_threshold 0.9

2025-04-13 17:40:18.534825: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-13 17:40:18.558437: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-13 17:40:18.565346: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[?25lTotal number of predictions: [1;36m5[0m
[2K[36m Evaluating ...[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [35m  0%[0m [36m-:--:--[0mAsking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
[2K[36m Evaluating ...[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Seeing if the evaluation script works:

In [7]:
!python /kaggle/input/evalscript/eval.py \
  --prediction /kaggle/input/validationset/synthetic5examples.json \
  --reference /kaggle/input/validationset/synthetic5examples.json \
  --question_threshold 0.95 \
  --statement_threshold 0.9 \
  --relation_threshold 0.9

2025-04-13 17:52:44.174957: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-13 17:52:44.196765: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-13 17:52:44.203769: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[?25lTotal number of predictions: [1;36m5[0m
[2K[36m Evaluating ...[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [35m  0%[0m [36m-:--:--[0mAsking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
[2K[36m Evaluating ...[0m [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━