### Load the dataset

In [2]:
# class labels: 0 -> vanilla ; 1 -> watermarked

import json
from pathlib import Path

def load_jsonl(path, label):
    """Yield ``{"text": ..., "label": label}`` records from a JSON Lines file.

    Every non-blank physical line is parsed as JSON and its ``"response"``
    field becomes the example text. Lines that are not valid JSON, or that do
    not hold an object with a ``"response"`` key, are reported and skipped so
    that one bad record does not abort the whole load.

    Args:
        path: Location of the ``.jsonl`` file (string or ``Path``).
        label: Class label attached to every yielded record.

    Yields:
        dict: ``{"text": <response>, "label": label}`` for each usable line.

    Raises:
        FileNotFoundError: If ``path`` does not exist. Because this is a
            generator, the error surfaces when iteration starts.
    """
    path = Path(path).resolve()
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")
    # Iterate over the file object rather than `read_text().splitlines()`:
    # splitlines() also breaks on U+2028/U+2029/\x0b/\x0c/\x85, which may occur
    # unescaped inside JSON strings and would cut a valid record in two.
    # Streaming also avoids holding the whole file in memory.
    with path.open(encoding="utf-8") as fh:
        for i, line in enumerate(fh, 1):
            line = line.strip()
            if not line:
                continue  # Skip empty lines
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping line {i} in {path}: JSON decode error: {e}")
                continue
            if not isinstance(obj, dict) or "response" not in obj:
                print(f"Skipping line {i} in {path}: no \"response\" field")
                continue
            # Yield outside the try block so exceptions raised by the consumer
            # are never mistaken for decode errors.
            yield {"text": obj["response"], "label": label}
            
TOKEN_WATERMARKED_PATH = "./LLM-Watermarking/prompt-token-watermarking/watermarked_responses/token_watermarked_responses.jsonl"
VANILLA_PATH = "./LLM-Watermarking/prompt-token-watermarking/vanilla_responses_gpt3.5turbo/vanilla_responses.jsonl"

# Combine both sources into one list: watermarked responses are labelled 1,
# vanilla responses are labelled 0 (watermarked records come first).
data = [
    *load_jsonl(TOKEN_WATERMARKED_PATH, 1),
    *load_jsonl(VANILLA_PATH, 0),
]

# Preview the first 2 examples
print(data[:2])

[{'text': "In a quantum computer, computations are performed using quantum bits (qubits) instead of classical bits. Qubits can exist in a superposition of states (both 0 and 1) simultaneously, allowing for parallel processing and potentially faster computation.\n\nTo perform a computation on a quantum computer with 3 qubits to find the exponent of 7 on all 8 superpositions, you would need to create a quantum circuit that implements the necessary quantum gates to achieve this.\n\nIn this specific example, you would need to create a circuit that applies the appropriate quantum gates to each qubit to represent the exponentiation of 7 in binary form (111). This can be achieved using quantum gates such as X (bit-flip), H (Hadamard), and CNOT (controlled-NOT) gates.\n\nOnce the quantum circuit is constructed and the qubits are prepared in the initial state (usually the |0> state), you can run the circuit on a quantum computer to perform the computation. The quantum computer will then process

### Preprocessing

In [3]:
# Shuffle a *copy* of the data so this cell is idempotent: shuffling `data`
# in place would reshuffle an already-shuffled list on every re-run and
# silently change the splits. On a fresh run the permutation is the same one
# an in-place shuffle with this seed would produce.
import random
SEED = 42
random.seed(SEED)
shuffled_data = list(data)
random.shuffle(shuffled_data)

# Split into train (70%), val (20%), test (10%); the test split takes the
# remainder, so it absorbs any rounding from the two int() truncations.
n_total = len(shuffled_data)
n_train = int(0.7 * n_total)
n_val = int(0.2 * n_total)

train_data = shuffled_data[:n_train]
val_data = shuffled_data[n_train:n_train + n_val]
test_data = shuffled_data[n_train + n_val:]

# The three splits must partition the dataset exactly.
assert len(train_data) + len(val_data) + len(test_data) == n_total

# Print sizes to verify
print(f"Total: {n_total}, Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

Total: 3999, Train: 2799, Val: 799, Test: 401


1380


#### Baseline Model 1: Logistic regression with TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF (the vectorizer's default analyzer).
#   max_features : vocabulary cap, adjust as needed
#   ngram_range  : unigrams through trigrams
#   max_df       : drop terms appearing in more than 99% of documents
tfidf = TfidfVectorizer(max_features=50_000, ngram_range=(1, 3), max_df=0.99)

# Labels for each split
y_train, y_val, y_test = (
    [example["label"] for example in split]
    for split in (train_data, val_data, test_data)
)

# Fit the vocabulary/IDF weights on the training texts only ...
X_train = tfidf.fit_transform([example["text"] for example in train_data])

# ... then reuse the fitted vectorizer for validation and test
X_val, X_test = (
    tfidf.transform([example["text"] for example in split])
    for split in (val_data, test_data)
)


In [16]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier on the TF-IDF features; seeded for reproducibility.
logreg_options = {"solver": "lbfgs", "max_iter": 1000, "random_state": SEED}
model = LogisticRegression(**logreg_options)
model.fit(X_train, y_train)

In [17]:
from sklearn.metrics import accuracy_score

# Score the fitted baseline on each split, in train / val / test order.
# NOTE(review): if validation/test accuracy lands far below 0.5 while train
# accuracy is high, suspect leakage between splits (e.g. responses to the same
# prompt ending up in different splits with opposite labels) -- TODO confirm
# against the dataset before trusting these numbers.
train_acc, val_acc, test_acc = (
    accuracy_score(targets, model.predict(features))
    for features, targets in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

print(f"Train accuracy:      {train_acc:.4f}")
print(f"Validation accuracy: {val_acc:.4f}")
print(f"Test accuracy:       {test_acc:.4f}")

Train accuracy:      0.9060
Validation accuracy: 0.2628
Test accuracy:       0.2893


### Fine-tuning a RoBERTa Model

In [35]:
# Fine-tune RoBERTa-base as a two-class sequence classifier.
# Expects `train_data`, `val_data`, `test_data`: lists of {"text":…, "label":0/1}

import torch
from datasets import Dataset
from evaluate import load as load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# 1) Wrap each split in a Hugging Face Dataset
train_ds, val_ds, test_ds = (
    Dataset.from_list(split) for split in (train_data, val_data, test_data)
)

# 2) Tokenize
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess(batch):
    """Tokenize the "text" column, truncating and padding to 256 tokens."""
    encoding_options = dict(truncation=True, padding="max_length", max_length=256)
    return tokenizer(batch["text"], **encoding_options)


# Tokenize each split, then rename "label" -> "labels" (the column name the
# rest of this cell selects when handing tensors to the Trainer).
train_ds = train_ds.map(preprocess, batched=True).rename_column("label", "labels")
val_ds = val_ds.map(preprocess, batched=True).rename_column("label", "labels")
test_ds = test_ds.map(preprocess, batched=True).rename_column("label", "labels")

# Expose only the model inputs and labels as torch tensors
for ds in (train_ds, val_ds, test_ds):
    ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 3) Load the pretrained encoder with a 2-way classification head
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 4) Training arguments
training_args = TrainingArguments(
    output_dir="roberta_watermark_detector",
    logging_dir="logs",
    logging_steps=50,
    # optimisation
    learning_rate=0.5e-5,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best validation accuracy
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# 5) Metrics
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy from the (logits, labels) pair supplied by the Trainer."""
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    return metric.compute(predictions=predicted, references=gold)


# 6) Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

# 7) Train, then evaluate on the validation split
trainer.train()
val_results = trainer.evaluate()
print("Validation results:", val_results)

# 8) Test set evaluation
test_results = trainer.evaluate(test_ds)
print("Test set results:", test_results)

# 9) Save the model and tokenizer side by side
output_dir = training_args.output_dir
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Saved model + tokenizer to `{training_args.output_dir}`")


loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/cmugishawayo25/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file https://huggingface.co/roberta

Map:   0%|          | 0/2799 [00:00<?, ? examples/s]

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Map:   0%|          | 0/401 [00:00<?, ? examples/s]

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/cmugishawayo25/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file https://huggingface.co

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6951,0.693135,0.500626
2,0.6963,0.694141,0.479349
3,0.693,0.691645,0.549437
4,0.6895,0.687806,0.564456
5,0.6512,0.641099,0.638298
6,0.6246,0.638313,0.629537
7,0.606,0.649737,0.652065
8,0.5699,0.659634,0.638298
9,0.5543,0.67123,0.639549
10,0.5587,0.677153,0.637046


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 799
  Batch size = 32
Saving model checkpoint to roberta_watermark_detector/checkpoint-175
Configuration saved in roberta_watermark_detector/checkpoint-175/config.json
Model weights saved in roberta_watermark_detector/checkpoint-175/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 799
  Batch size = 32
Saving model checkpoint to roberta_watermark_detector/checkpoint-350
Configuration saved in roberta_wat

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 401
  Batch size = 32


Validation results: {'eval_loss': 0.6497372388839722, 'eval_accuracy': 0.6520650813516896, 'eval_runtime': 25.2046, 'eval_samples_per_second': 31.701, 'eval_steps_per_second': 0.992, 'epoch': 10.0}


Saving model checkpoint to roberta_watermark_detector
Configuration saved in roberta_watermark_detector/config.json


Test set results: {'eval_loss': 0.6097788214683533, 'eval_accuracy': 0.655860349127182, 'eval_runtime': 12.5012, 'eval_samples_per_second': 32.077, 'eval_steps_per_second': 1.04, 'epoch': 10.0}


Model weights saved in roberta_watermark_detector/pytorch_model.bin
tokenizer config file saved in roberta_watermark_detector/tokenizer_config.json
Special tokens file saved in roberta_watermark_detector/special_tokens_map.json


Saved model + tokenizer to `roberta_watermark_detector`
