In [None]:
# ============================================
 #DistilBERT Training on Balanced Q/A Dataset
# ============================================
!pip install -q transformers datasets scikit-learn accelerate

import json
import numpy as np
from typing import List, Dict, Any
from pathlib import Path

# If you need to (re)mount Drive:
# from google.colab import drive
# drive.mount('/content/drive')

# -------- Paths: adjust these if your Drive layout differs ----------
BASE = Path("/content/drive/MyDrive/Colab Notebooks/merged_data")
# The train / validation / test JSON files all sit directly under BASE.
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    BASE.joinpath(filename)
    for filename in ("train_70.json", "val_15.json", "test_15.json")
)

# Output folder for the fine-tuned model; created up front (parents included) if missing.
SAVE_DIR = Path("/content/drive/MyDrive/Colab Notebooks/New/hallucination_model")
SAVE_DIR.mkdir(exist_ok=True, parents=True)

# -------------------------
# Load JSON -> HF Dataset
# -------------------------
from datasets import Dataset

def load_json_dataset(path: str | Path) -> Dataset:
    """Load a JSON array of Q/A records into a Hugging Face ``Dataset``.

    Each record is read with ``.get``, so missing text fields are tolerated.
    The resulting rows have exactly four columns:

    * ``question``, ``evidence``, ``answer`` -- coerced to ``str``; a missing
      or ``null`` value becomes the empty string.
    * ``labels`` -- an ``int``. The value is taken from the record's
      ``"labels"`` key, falling back to ``"label"`` when ``"labels"`` is
      absent *or* null. String labels are matched case-insensitively (after
      trimming whitespace) against ``{"1", "true", "factual", "fact", "yes"}``
      -> 1; every other string maps to 0. Non-string labels go through
      ``int()``.

    Args:
        path: Location of a UTF-8 JSON file whose top-level value is an array.

    Returns:
        A ``Dataset`` built from the cleaned rows via ``Dataset.from_list``.

    Raises:
        ValueError: If the top-level JSON value is not an array, or a record
            has neither a ``"labels"`` nor a ``"label"`` value.
    """
    with open(path, "r", encoding="utf-8") as f:
        data: List[Dict[str, Any]] = json.load(f)

    # Iterating a JSON object would yield its keys (strings) and fail later
    # with an unrelated AttributeError, so reject non-arrays explicitly.
    if not isinstance(data, list):
        raise ValueError(
            f"{path}: expected a JSON array of records, got {type(data).__name__}"
        )

    positive_strings = {"1", "true", "factual", "fact", "yes"}

    def as_text(value: Any) -> str:
        # Missing / null text fields become "" instead of the string "None".
        return "" if value is None else str(value)

    cleaned = []
    for row_idx, item in enumerate(data):
        # dict.get(key, default) only falls back when the key is absent, so an
        # explicit `"labels": null` would otherwise hide a valid "label" value.
        lbl = item.get("labels")
        if lbl is None:
            lbl = item.get("label")
        if lbl is None:
            raise ValueError(
                f"{path}: record {row_idx} has no 'labels'/'label' value"
            )

        # Normalize labels to a 0/1 int; stray whitespace must not flip a
        # positive string label to 0.
        if isinstance(lbl, str):
            lbl = 1 if lbl.strip().lower() in positive_strings else 0

        cleaned.append(
            {
                "question": as_text(item.get("question")),
                "evidence": as_text(item.get("evidence")),
                "answer": as_text(item.get("answer")),
                "labels": int(lbl),
            }
        )

    return Dataset.from_list(cleaned)

# Read the three splits in train -> validation -> test order.
train_ds, val_ds, test_ds = (
    load_json_dataset(split_path)
    for split_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

# Quick look at the first cleaned training record.
print("Sample row:", train_ds[0])

# =========================
# Tokenization
# =========================
from transformers import DistilBertTokenizerFast

# Tokenizer loaded from the "distilbert-base-uncased" checkpoint -- the same
# checkpoint name the classification model below is initialised from.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# --- Option A (recommended): pair = (question, answer)
def tokenize(batch):
    """Encode a batch as (question, answer) sentence pairs.

    Args:
        batch: Mapping with ``"question"`` and ``"answer"`` entries, as passed
            by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the pairs, truncated to at most 256 tokens.
        No padding is requested here.
    """
    questions, answers = batch["question"], batch["answer"]
    encode_options = {"truncation": True, "max_length": 256}
    return tokenizer(questions, answers, **encode_options)

# --- Option B (include evidence):
# def tokenize(batch):
#     left = [f"Q: {q}  EVIDENCE: {e}" if e else f"Q: {q}"
#             for q, e in zip(batch["question"], batch["evidence"])]
#     return tokenizer(left, batch["answer"], truncation=True, max_length=256)

# Tokenize every split the same way and drop the raw text columns, leaving the
# tokenizer outputs plus the "labels" column.
train_ds_tok, val_ds_tok, test_ds_tok = (
    split.map(tokenize, batched=True, remove_columns=["question", "evidence", "answer"])
    for split in (train_ds, val_ds, test_ds)
)

# =========================
# Model & Training
# =========================
from transformers import (
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    set_seed,
)

# Seed the RNGs before the classification head is randomly initialised.
set_seed(42)

# Class index <-> readable name; label2id is simply the inverse of id2label.
id2label = {0: "hallucinated", 1: "factual"}
label2id = {name: index for index, name in id2label.items()}

# Pretrained DistilBERT encoder with a two-way sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=2,
)

# Padding is deferred to batch time (tokenize() only truncates).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into accuracy / precision / recall / F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with plain-float ``accuracy``, ``precision``, ``recall`` and
        ``f1`` entries. Precision, recall and F1 are computed with
        ``zero_division=0``, so an undefined score is reported as 0.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    metrics = {"accuracy": float(accuracy_score(labels, predicted))}
    # These three scorers share a signature, so evaluate them uniformly.
    for metric_name, scorer in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        metrics[metric_name] = float(scorer(labels, predicted, zero_division=0))
    return metrics

training_args = TrainingArguments(
    # --- Output, logging, reproducibility ---
    output_dir=str(SAVE_DIR / "checkpoints"),
    report_to="none",          # no W&B/etc. reporting unless you opt in
    logging_steps=50,
    seed=42,
    # --- Optimisation schedule ---
    num_train_epochs=10,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # --- Evaluate and checkpoint once per epoch; keep at most two checkpoints
    #     and reload the one with the lowest validation loss when training ends ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,   # lower eval loss is better
)

# Build the Trainer: train on the tokenized training split, evaluate on the
# validation split, and stop early once the monitored metric
# (metric_for_best_model in training_args) has failed to improve for
# 2 consecutive evaluations.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds_tok,
    eval_dataset=val_ds_tok,
    # `tokenizer=` is deprecated on Trainer (it triggered the warning emitted
    # at this call); `processing_class=` is its replacement.
    # Requires transformers >= 4.46.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# =========================
# Train
# =========================
# Run fine-tuning. The statements below are order-dependent: evaluation and
# saving both rely on the model state left behind by train().
train_result = trainer.train()

# Evaluate best checkpoint on val and test
# (load_best_model_at_end=True in training_args means the trainer holds the
# lowest-eval-loss checkpoint at this point, not the last epoch's weights).
print("\nEval on validation set:")
val_metrics = trainer.evaluate(eval_dataset=val_ds_tok)
print(val_metrics)

# The test split is only touched here, after training has finished.
# Its metrics are also reported under the "eval_" key prefix.
print("\nEval on test set:")
test_metrics = trainer.evaluate(eval_dataset=test_ds_tok)
print(test_metrics)

# =========================
# Save
# =========================
# Model and tokenizer are written to the same directory so the inference
# pipeline can load both from SAVE_DIR.
trainer.save_model(str(SAVE_DIR))  # saves the best model (due to load_best_model_at_end)
tokenizer.save_pretrained(str(SAVE_DIR))
print(f"\nSaved model & tokenizer to: {SAVE_DIR}")

# =========================
# Inference (PAIR input)
# =========================
from transformers import pipeline

# Reload the saved model and tokenizer from SAVE_DIR as a text-classification
# pipeline, letting device_map="auto" choose the device.
clf = pipeline(
    "text-classification",
    model=str(SAVE_DIR),
    tokenizer=str(SAVE_DIR),
    device_map="auto",
)

question = "Was David Thewlis born in 1983?"
answer   = "David Thewlis was born in 1983."

# Pass the example as a {"text", "text_pair"} dict so it is encoded as a
# (question, answer) pair, matching how the training data was tokenized.
pred = clf({"text": question, "text_pair": answer})
print("\nSample prediction:", pred)

# A single dict input comes back as a single result dict, not a one-element
# list, so indexing it with pred[0] raised `KeyError: 0`. Accept both shapes.
top_pred = pred[0] if isinstance(pred, list) else pred
print(f"Label: {top_pred['label']}  Score: {top_pred['score']:.3f}")

Sample row: {'question': 'Is it true that Is it illegal to kill a praying mantis in the U.S.?', 'evidence': '', 'answer': 'Answer: No, it is legal to kill a praying mantis', 'labels': 1}


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.154,0.173213,0.943333,0.902056,0.994667,0.9461
2,0.0928,0.049146,0.986,0.976471,0.996,0.986139
3,0.0235,0.046338,0.987333,0.977778,0.997333,0.987459
4,0.0124,0.026883,0.995333,0.990753,1.0,0.995355
5,0.0216,0.038543,0.992667,0.986825,0.998667,0.99271
6,0.0172,0.03402,0.992667,0.986825,0.998667,0.99271



Eval on validation set:


{'eval_loss': 0.02688344568014145, 'eval_accuracy': 0.9953333333333333, 'eval_precision': 0.9907529722589168, 'eval_recall': 1.0, 'eval_f1': 0.9953550099535501, 'eval_runtime': 2.388, 'eval_samples_per_second': 628.138, 'eval_steps_per_second': 19.682, 'epoch': 6.0}

Eval on test set:
{'eval_loss': 0.04099784418940544, 'eval_accuracy': 0.9913333333333333, 'eval_precision': 0.9842312746386334, 'eval_recall': 0.9986666666666667, 'eval_f1': 0.9913964262078094, 'eval_runtime': 2.3655, 'eval_samples_per_second': 634.117, 'eval_steps_per_second': 19.869, 'epoch': 6.0}

Saved model & tokenizer to: /content/drive/MyDrive/Colab Notebooks/New/hallucination_model


Device set to use cuda:0



Sample prediction: {'label': 'factual', 'score': 0.9991311430931091}


KeyError: 0

In [None]:
# NOTE(review): `train_dataset` is not defined in the cells visible here (the
# training cell names its split `train_ds`) -- confirm where it is created so
# this cell still works after Restart Kernel -> Run All.
print(train_dataset.column_names)

['question', 'answer', 'evidence', 'label', 'source']


In [None]:
# NOTE(review): relies on a `train_dataset` variable that the visible cells
# never assign -- presumably left over from another cell or notebook; verify.
print(train_dataset.features)

{'question': Value('string'), 'answer': Value('string'), 'evidence': Value('string'), 'label': Value('int64'), 'source': Value('string')}
