<a href="https://colab.research.google.com/github/Dur-e-yashfeen/veritas-ai/blob/main/run_training_%26_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# choose GPU: Runtime -> Change runtime type -> GPU
!pip install -q transformers datasets accelerate "torch>=2.0" scikit-learn pandas streamlit pdfminer.six google-cloud-storage google-cloud-bigquery google-cloud-aiplatform

## Prepare small demo dataset

In [7]:
# This will create a small balanced demo dataset (200 AI, 200 human if available)
from datasets import load_dataset
import pandas as pd
from pathlib import Path
out = Path("data/raw")
out.mkdir(parents=True, exist_ok=True)

# Try to load GPT-2 outputs; fallback to wikitext for demo
try:
    ds = load_dataset("openai/gpt2-output-dataset", split="train")
    ai_texts = [d["text"] for d in ds.shuffle(seed=42).select(range(200))]
except Exception as e:
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    ai_texts = [t for t in ds.shuffle(seed=42).select(range(200))]

# For human texts use part of wikitext or sample from wiki + synthetic simple essays for demo
human_ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
human_texts = [t for t in human_ds.shuffle(seed=1).select(range(200))]

df = pd.DataFrame({
    "text": ai_texts[:150] + human_texts[:150],
    "label": [1]*150 + [0]*150
})
df.to_csv(out/"demo_dataset.csv", index=False)
print("Wrote demo dataset to", out/"demo_dataset.csv")
df.head()

Wrote demo dataset to data/raw/demo_dataset.csv


Unnamed: 0,text,label
0,"{'text': ' Continuous , short @-@ arc , high p...",1
1,{'text': ' Field Marshal Antonio José de Sucre...,1
2,{'text': ' Norman Gary Finkelstein ( born Dece...,1
3,{'text': ' Galveston has several state @-@ fun...,1
4,{'text': ' Walpole 's works have not been comp...,1


## Train a small DistilBERT model

In [8]:
!pip install -U transformers accelerate
# then Runtime → Restart runtime



In [9]:
from datasets import Dataset
import pandas as pd
import numpy as np
import os
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Disable Weights & Biases logging (no API key needed)
os.environ["WANDB_DISABLED"] = "true"

# Load dataset
df = pd.read_csv("data/raw/demo_dataset.csv")

# Make sure labels are integers (0/1). Map if needed:
# label_map = {"negative": 0, "positive": 1}
# df["label"] = df["label"].map(label_map)

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df["label"])
train = Dataset.from_pandas(train_df.reset_index(drop=True))
val = Dataset.from_pandas(val_df.reset_index(drop=True))

# Tokenizer
MODEL_NAME = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME, use_auth_token=False)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)

train = train.map(tokenize, batched=True)
val = val.map(tokenize, batched=True)

train.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Model
model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    use_auth_token=False
)

# Training arguments (old-compatible, no eval_strategy)
args = TrainingArguments(
    output_dir="outputs",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    do_eval=True,
    eval_steps=50,        # evaluate every 50 steps
    save_total_limit=1,
    logging_steps=10,
    logging_dir="logs"    # for TensorBoard (optional)
)

# Metrics
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "f1": f1_score(p.label_ids, preds, average="weighted")
    }

# Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train,
    eval_dataset=val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train
trainer.train()

# Save model
trainer.save_model("outputs/veritas_distilbert")
print("✅ Model saved to outputs/veritas_distilbert")



Map:   0%|          | 0/270 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
10,0.7024
20,0.7
30,0.691


✅ Model saved to outputs/veritas_distilbert


In [12]:
!tar -czf veritas_model.tar.gz -C outputs veritas_distilbert
print("Saved veritas_model.tar.gz")

Saved veritas_model.tar.gz
