# 9. Training Two Separate DeBERTa Verifiers

In this notebook we train two specialized verifiers for our structured reasoning pipeline:

1. **Question Parsing (QP) Verifier**  
   - Input: raw question → serialized `question_parsing`  
   - Negatives: drop or shuffle constraints  
   - Model: `microsoft/deberta-v3-base`, 5 epochs, class-balanced  

2. **Chain-of-Thought (CoT) Verifier**  
   - Input: question + conditions + CoT → serialized `cot_parsing`  
   - Negatives: flip verification, swap/drop evidence  
   - Model: `microsoft/deberta-v3-base`, 5 epochs, class-balanced

## Imports and Configuration

In [None]:
import json, random, copy
from sklearn.model_selection import train_test_split
import torch
from collections import Counter
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, f1_score

## Data Preparation

### Generate QP Verifier Dataset

In [None]:
import json
import random
from sklearn.model_selection import train_test_split

# CONFIGURATION
# Gold dataset: loaded with json.load and iterated as a list of examples that
# each provide "question" and "question_parsing".
INPUT = "/content/drive/MyDrive/llm-sr-project/700dataset.json"
# Output files, written as JSONL: one {"premise", "hypothesis", "label"} record per line.
OUT_TRAIN = "/content/drive/MyDrive/llm-sr-project/verifier_qp_train.jsonl"
OUT_DEV = "/content/drive/MyDrive/llm-sr-project/verifier_qp_dev.jsonl"
# Corruption attempts per gold example; an attempt that leaves the parse
# unchanged is discarded, so fewer negatives than this may be produced.
NEG_PER_POS = 1  # Number of negative samples to generate per positive
DEV_SIZE = 0.1   # Fraction of total data to allocate to dev set


def corrupt_question_parsing(qp):
    """
    Build a corrupted copy of a question_parsing (QP) list.

    Exactly one perturbation is applied to a shallow copy of ``qp``:
    - remove one randomly chosen constraint (only when the coin flip picks
      this branch and the list holds more than one item), or
    - shuffle the constraints otherwise.

    The input list is never modified. The result can still equal the input
    (zero or one constraint, or a shuffle that reproduces the original
    order), so callers should compare before using it as a negative.
    """
    corrupted = qp.copy()
    coin = random.random()
    droppable = len(corrupted) > 1

    if coin < 0.5 and droppable:
        # Remove a single constraint at a random position.
        victim = random.randrange(len(corrupted))
        del corrupted[victim]
        return corrupted

    # Otherwise reorder the constraints in place.
    random.shuffle(corrupted)
    return corrupted

def make_record(question, qp, label):
    """
    Turn a (question, question_parsing) pair into one classification record.

    - "premise": the raw question text, unchanged
    - "hypothesis": the QP serialized as JSON (non-ASCII characters kept
      as-is), prefixed with "QuestionParsing: "
    - "label": passed through; callers use 1 for valid and 0 for corrupted
    """
    serialized = json.dumps(qp, ensure_ascii=False)
    return {
        "premise": question,
        "hypothesis": "QuestionParsing: " + serialized,
        "label": label,
    }

def main():
    """
    Build the QP verifier train/dev JSONL files from the gold dataset.

    Gold examples are split into train and dev *first*; the valid (label=1)
    record and its corrupted (label=0) variants are then generated inside
    each split. Every question therefore lives on exactly one side of the
    split, so dev metrics are measured on questions unseen during training.

    Reads INPUT; writes OUT_TRAIN and OUT_DEV with one JSON record per line.
    """
    # The split below uses random_state=42; seed the corruption RNG as well so
    # a rerun regenerates the same negatives.
    random.seed(42)

    # 1) Load positive (gold) examples from dataset
    with open(INPUT, "r", encoding="utf-8") as f:
        positives = json.load(f)

    # 2) Split at the example level, before any negatives exist. Each example
    #    yields one positive and at most NEG_PER_POS negatives, so both splits
    #    keep roughly the same label ratio without explicit stratification.
    train_gold, dev_gold = train_test_split(
        positives,
        test_size=DEV_SIZE,
        random_state=42,
    )

    def build_records(examples):
        """Return the shuffled valid + corrupted records for one split."""
        records = []
        for ex in examples:
            q = ex["question"]
            qp = ex["question_parsing"]

            # Add the original, valid (label=1)
            records.append(make_record(q, qp, 1))

            # Add corrupted examples (label=0)
            for _ in range(NEG_PER_POS):
                qp_bad = corrupt_question_parsing(qp)

                # A corruption can be a no-op (e.g. a shuffle that restores
                # the original order); keep only real negatives.
                if qp_bad != qp:
                    records.append(make_record(q, qp_bad, 0))

        # Avoid writing positives and negatives in strictly alternating order.
        random.shuffle(records)
        return records

    # 3) Generate records separately for each split
    train = build_records(train_gold)
    dev = build_records(dev_gold)

    # 4) Write train and dev splits as JSONL
    for path, split in [(OUT_TRAIN, train), (OUT_DEV, dev)]:
        with open(path, "w", encoding="utf-8") as f:
            for rec in split:
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    print(f"✔︎ wrote {len(train)} train + {len(dev)} dev QP examples")


if __name__ == "__main__":
    main()

### Generate CoT Verifier Dataset

In [None]:
import json
import random
import copy
from sklearn.model_selection import train_test_split

# CONFIGURATION
# Gold dataset: loaded with json.load and iterated as a list of examples that
# each provide "question", "cot", "question_parsing" and "cot_parsing".
INPUT        = "/content/drive/MyDrive/llm-sr-project/700dataset.json"
# Output files, written as JSONL: one {"premise", "hypothesis", "label"} record per line.
OUT_TRAIN    = "/content/drive/MyDrive/llm-sr-project/verifier_cp_train.jsonl"
OUT_DEV      = "/content/drive/MyDrive/llm-sr-project/verifier_cp_dev.jsonl"
NEG_PER_POS  = 1    # Number of negative samples to generate per positive
DEV_SIZE     = 0.1  # Fraction of total data to allocate to dev set


def corrupt_cot_parsing(cp):
    """
    Corrupt a valid CoT parsing (list of step dicts) with one random edit.

    One of three edits is applied to a deep copy of ``cp``:
    - "flip": invert the "Verification" field of one step. The current value
      is compared case-insensitively, so "False" / "false" become "true" and
      anything else becomes "false". A missing field is treated as "false".
    - "swap": exchange the "evidence" of two distinct steps (needs >= 2 steps).
    - "drop_field": remove "evidence" from one step. This is also the
      fallback when "swap" is chosen for a single-step parsing.

    The input is never modified. An empty input is returned as an empty copy.
    The result can still equal the input (swapping identical evidence, or
    dropping evidence from a step that has none), so callers should compare
    before using it as a negative.
    """
    cp2 = copy.deepcopy(cp)
    if not cp2:
        return cp2
    choice = random.choice(["flip", "swap", "drop_field"])
    if choice == "flip":
        idx = random.randrange(len(cp2))
        # Normalise before comparing: without this, a gold value such as
        # "False" would be "flipped" to "false", producing a label-0 example
        # whose meaning is identical to the gold parse.
        cur = str(cp2[idx].get("Verification", "false")).strip().lower()
        cp2[idx]["Verification"] = "true" if cur == "false" else "false"
    elif choice == "swap" and len(cp2) >= 2:
        i, j = random.sample(range(len(cp2)), 2)
        cp2[i]["evidence"], cp2[j]["evidence"] = cp2[j].get("evidence"), cp2[i].get("evidence")
    else:
        idx = random.randrange(len(cp2))
        cp2[idx].pop("evidence", None)
    return cp2

def make_record(question, cot, conditions, cp, label):
    """
    Turn a (question, CoT, conditions, CoT parsing) tuple into one
    classification record.

    - "premise": three sections separated by blank lines -- "Question:",
      "Conditions:" (one condition per line) and "CoT:"
    - "hypothesis": the CoT parsing serialized as JSON (non-ASCII characters
      kept as-is), prefixed with "CoTParsing: "
    - "label": passed through; callers use 1 for valid and 0 for corrupted
    """
    sections = [
        f"Question:\n{question}",
        "Conditions:\n" + "\n".join(conditions),
        f"CoT:\n{cot}",
    ]
    serialized = json.dumps(cp, ensure_ascii=False)
    return {
        "premise": "\n\n".join(sections),
        "hypothesis": "CoTParsing: " + serialized,
        "label": label,
    }

def main():
    """
    Build the CoT verifier train/dev JSONL files from the gold dataset.

    Gold examples are split into train and dev *first*; the valid (label=1)
    record and its corrupted (label=0) variants are then generated inside
    each split. Every question therefore lives on exactly one side of the
    split, so dev metrics are measured on questions unseen during training.

    Reads INPUT; writes OUT_TRAIN and OUT_DEV with one JSON record per line.
    """
    # The split below uses random_state=42; seed the corruption RNG as well so
    # a rerun regenerates the same negatives.
    random.seed(42)

    # 1) Load gold-labeled examples from dataset
    with open(INPUT, "r", encoding="utf-8") as f:
        positives = json.load(f)

    # 2) Split at the example level, before any negatives exist. Each example
    #    yields one positive and at most NEG_PER_POS negatives, so both splits
    #    keep roughly the same label ratio without explicit stratification.
    train_gold, dev_gold = train_test_split(
        positives,
        test_size=DEV_SIZE,
        random_state=42,
    )

    def build_records(examples):
        """Return the shuffled valid + corrupted records for one split."""
        records = []
        for ex in examples:
            q          = ex["question"]
            cot        = ex["cot"]
            qp         = ex["question_parsing"]
            # Every QP item except the last is used as a condition; a
            # single-item QP is used whole.
            conditions = qp[:-1] if len(qp) > 1 else qp
            cp         = ex["cot_parsing"]

            # Add the original, valid (label=1)
            records.append(make_record(q, cot, conditions, cp, 1))

            # Add corrupted examples (label=0); a corruption can be a no-op,
            # so keep only parses that really differ from the gold one.
            for _ in range(NEG_PER_POS):
                cp_bad = corrupt_cot_parsing(cp)
                if cp_bad != cp:
                    records.append(make_record(q, cot, conditions, cp_bad, 0))

        # Avoid writing positives and negatives in strictly alternating order.
        random.shuffle(records)
        return records

    # 3) Generate records separately for each split
    train = build_records(train_gold)
    dev = build_records(dev_gold)

    # 4) Write train and dev splits as JSONL
    for path, split in [(OUT_TRAIN, train), (OUT_DEV, dev)]:
        with open(path, "w", encoding="utf-8") as f:
            for rec in split:
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    print(f"✔︎ wrote {len(train)} train + {len(dev)} dev CP examples")


if __name__ == "__main__":
    main()

## Train QP Verifier

In [None]:
import torch
import random
from collections import Counter
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, f1_score

# CONFIGURATION
# Checkpoint name used for both the tokenizer and the classifier.
MODEL_NAME       = "microsoft/deberta-v3-base"
# JSONL files with "premise", "hypothesis" and "label" fields per record.
TRAIN_FILE_QP    = "/content/drive/MyDrive/llm-sr-project/verifier_qp_train.jsonl"
DEV_FILE_QP      = "/content/drive/MyDrive/llm-sr-project/verifier_qp_dev.jsonl"
# Directory that receives the checkpoints, the final model and the tokenizer.
OUTPUT_DIR       = "/content/drive/MyDrive/deberta-qparse-verifier"
BATCH_SIZE       = 4
NUM_EPOCHS       = 5
LEARNING_RATE    = 1e-5
# Use the GPU when one is visible, otherwise fall back to CPU.
DEVICE           = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# 1) Load Dataset
# Load both JSONL files into one dataset object with "train" and "validation" splits.
ds = load_dataset("json",
                  data_files={"train": TRAIN_FILE_QP, "validation": DEV_FILE_QP})

# 2) Balance a split function
def balance_split(split_ds):
    """
    Return ``split_ds`` down-sampled so labels 0 and 1 are equally frequent.

    The larger class is randomly reduced to the size of the smaller one and
    the surviving row indices are shuffled before selection. If either class
    is absent the result is empty.

    ``split_ds`` must expose ``split_ds["label"]`` (a sequence of 0/1 labels)
    and ``split_ds.select(indices)``.

    Sampling uses a private generator seeded with 42: the selected rows are
    the same as with ``random.seed(42)``, but the process-wide ``random``
    state is left untouched.
    """
    labels   = split_ds["label"]
    idxs_neg = [i for i, l in enumerate(labels) if l == 0]
    idxs_pos = [i for i, l in enumerate(labels) if l == 1]

    rng = random.Random(42)
    # down-sample the larger class
    if len(idxs_neg) > len(idxs_pos):
        idxs_neg = rng.sample(idxs_neg, len(idxs_pos))
    else:
        idxs_pos = rng.sample(idxs_pos, len(idxs_neg))

    # combine & shuffle
    balanced_idxs = idxs_neg + idxs_pos
    rng.shuffle(balanced_idxs)
    return split_ds.select(balanced_idxs)

# 3) Apply to both splits
# Note that the validation split is down-sampled too, so evaluation metrics
# are computed on a class-balanced subset.
ds["train"]      = balance_split(ds["train"])
ds["validation"] = balance_split(ds["validation"])

# Sanity check: both classes should now have the same count in each split.
print("Balanced train counts:     ", Counter(ds["train"]["label"]))
print("Balanced validation counts:", Counter(ds["validation"]["label"]))


# 4) Tokenizer & Model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Binary classification head: label 0 vs label 1.
model     = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(DEVICE)

# Collator that pads the tokenized examples when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer)

def preprocess(examples):
    """
    Tokenize a batch of premise/hypothesis pairs and attach the labels.

    ``examples`` is a batch with "premise", "hypothesis" and "label" columns.
    Returns the tokenizer output with an added "labels" entry.
    """
    # NOTE(review): no max_length is passed, so the truncation limit is
    # whatever the tokenizer itself defines -- confirm that cap is intended.
    features = tokenizer(
        examples["premise"], examples["hypothesis"], truncation=True
    )
    features["labels"] = examples["label"]
    return features


# Tokenize every split in batches and drop the original text/label columns.
tok_ds = ds.map(
    preprocess,
    batched=True,
    remove_columns=ds["train"].column_names,
)

# 5) Metrics
def compute_metrics(p):
    """
    Compute accuracy and F1 for one evaluation pass.

    ``p`` provides ``predictions`` (per-class scores; the arg-max over the
    last axis is taken as the predicted label) and ``label_ids``.
    """
    predicted = p.predictions.argmax(-1)
    gold = p.label_ids

    scores = {}
    scores["accuracy"] = accuracy_score(gold, predicted)
    scores["f1"] = f1_score(gold, predicted)
    return scores

# 6) TrainingArguments & Trainer
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    # After training, reload the checkpoint with the best validation F1.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=50,
    # fp16 mixed precision needs a GPU. Enable it only when CUDA is present so
    # the CPU fallback chosen for DEVICE does not fail at argument validation.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_ds["train"],
    eval_dataset=tok_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 7) Train & Save
trainer.train()
# Persist the (best) model together with its tokenizer so the directory can
# be loaded on its own later.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("✅ QP verifier trained and saved to", OUTPUT_DIR)

## Train CoT Verifier

In [None]:
import torch
import random
from collections import Counter
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, f1_score

# CONFIGURATION
# Checkpoint name used for both the tokenizer and the classifier.
MODEL_NAME       = "microsoft/deberta-v3-base"
# JSONL files with "premise", "hypothesis" and "label" fields per record.
TRAIN_FILE_CP    = "/content/drive/MyDrive/llm-sr-project/verifier_cp_train.jsonl"
DEV_FILE_CP      = "/content/drive/MyDrive/llm-sr-project/verifier_cp_dev.jsonl"
# Directory that receives the checkpoints, the final model and the tokenizer.
OUTPUT_DIR       = "/content/drive/MyDrive/deberta-cotparse-verifier"
BATCH_SIZE       = 4
NUM_EPOCHS       = 5
LEARNING_RATE    = 1e-5
# Use the GPU when one is visible, otherwise fall back to CPU.
DEVICE           = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1) Load Dataset
# Load both JSONL files into one dataset object with "train" and "validation" splits.
ds = load_dataset("json",
                  data_files={"train": TRAIN_FILE_CP, "validation": DEV_FILE_CP})

# 2) Balance a split function
def balance_split(split_ds):
    """
    Return ``split_ds`` down-sampled so labels 0 and 1 are equally frequent.

    The larger class is randomly reduced to the size of the smaller one and
    the surviving row indices are shuffled before selection. If either class
    is absent the result is empty.

    ``split_ds`` must expose ``split_ds["label"]`` (a sequence of 0/1 labels)
    and ``split_ds.select(indices)``.

    Sampling uses a private generator seeded with 42: the selected rows are
    the same as with ``random.seed(42)``, but the process-wide ``random``
    state is left untouched.
    """
    labels   = split_ds["label"]
    idxs_neg = [i for i, l in enumerate(labels) if l == 0]
    idxs_pos = [i for i, l in enumerate(labels) if l == 1]

    rng = random.Random(42)
    # down-sample the larger class
    if len(idxs_neg) > len(idxs_pos):
        idxs_neg = rng.sample(idxs_neg, len(idxs_pos))
    else:
        idxs_pos = rng.sample(idxs_pos, len(idxs_neg))

    # combine & shuffle
    balanced_idxs = idxs_neg + idxs_pos
    rng.shuffle(balanced_idxs)
    return split_ds.select(balanced_idxs)

# 3) Apply to both splits
# Note that the validation split is down-sampled too, so evaluation metrics
# are computed on a class-balanced subset.
ds["train"]      = balance_split(ds["train"])
ds["validation"] = balance_split(ds["validation"])

# Sanity check: both classes should now have the same count in each split.
print("Balanced train counts:     ", Counter(ds["train"]["label"]))
print("Balanced validation counts:", Counter(ds["validation"]["label"]))

# 4) Tokenizer & Model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Binary classification head: label 0 vs label 1.
model     = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(DEVICE)

# Collator that pads the tokenized examples when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer)

def preprocess(examples):
    """
    Tokenize a batch of premise/hypothesis pairs and attach the labels.

    ``examples`` is a batch with "premise", "hypothesis" and "label" columns.
    Returns the tokenizer output with an added "labels" entry.
    """
    # NOTE(review): no max_length is passed, so the truncation limit is
    # whatever the tokenizer itself defines -- confirm that cap is intended,
    # since the CoT premise plus its parsing can be long.
    features = tokenizer(
        examples["premise"], examples["hypothesis"], truncation=True
    )
    features["labels"] = examples["label"]
    return features


# Tokenize every split in batches and drop the original text/label columns.
tok_ds = ds.map(
    preprocess,
    batched=True,
    remove_columns=ds["train"].column_names,
)

# 5) Metrics
def compute_metrics(p):
    """
    Compute accuracy and F1 for one evaluation pass.

    ``p`` provides ``predictions`` (per-class scores; the arg-max over the
    last axis is taken as the predicted label) and ``label_ids``.
    """
    predicted = p.predictions.argmax(-1)
    gold = p.label_ids

    scores = {}
    scores["accuracy"] = accuracy_score(gold, predicted)
    scores["f1"] = f1_score(gold, predicted)
    return scores

# 6) TrainingArguments & Trainer
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    # After training, reload the checkpoint with the best validation F1.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=50,
    # fp16 mixed precision needs a GPU. Enable it only when CUDA is present so
    # the CPU fallback chosen for DEVICE does not fail at argument validation.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_ds["train"],
    eval_dataset=tok_ds["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 7) Train & Save
trainer.train()
# Persist the (best) model together with its tokenizer so the directory can
# be loaded on its own later.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("✅ CP verifier trained and saved to", OUTPUT_DIR)

## Next Steps

- Both verifiers have been saved under `/content/drive/MyDrive/`:
  - `deberta-qparse-verifier/`
  - `deberta-cotparse-verifier/`

- In the next notebook, we will integrate these two models into our hybrid inference pipeline to rerank and validate Chain-of-Thought parses.
