### Loading the dataset ###

In [23]:
# class labels: 0 -> vanilla ; 1 -> watermarked

import json
from pathlib import Path

def load_jsonl(path, label):
    path = Path(path).resolve()
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")
    for i, line in enumerate(path.read_text(encoding="utf-8").splitlines(), 1):
        line = line.strip()
        if not line:
            continue  # Skip empty lines
        try:
            obj = json.loads(line)
            yield {"text": obj["response"], "label": label}
        except json.JSONDecodeError as e:
            print(f"Skipping line {i} in {path}: JSON decode error: {e}")
            
PROMPT_WATERMARKED_PATH = "./LLM-Watermarking/prompt-token-watermarking/watermarked_responses/prompt_watermarked_responses.jsonl"
VANILLA_PATH = "./LLM-Watermarking/prompt-token-watermarking/vanilla_responses_gpt3.5turbo/vanilla_responses.jsonl"

# Load both datasets
data = list(load_jsonl(PROMPT_WATERMARKED_PATH, 1)) \
     + list(load_jsonl(VANILLA_PATH, 0))

# Preview the first 10 examples
print(data[:2])

[{'text': "Quantum computations on a quantum computer work differently from classical computers due to the principles of superposition and entanglement. In a quantum computer with 3 qubits, you can perform a computation like finding the exponent of 7 on all 8 superpositions using quantum gates and algorithms such as the quantum phase estimation algorithm or quantum Fourier transform.\n\nHere is a simplified overview of how you can approach this computation on a quantum computer with 3 qubits:\n\n1. Initialize the qubits in the superposition of all possible states.\n2. Apply quantum gates to create the desired quantum circuit that represents the exponentiation operation.\n3. Apply the quantum algorithm to find the exponent of 7 on all 8 superpositions by manipulating the qubits using quantum gates such as controlled-NOT gates, Hadamard gates, phase gates, and others.\n4. Measure the final state of the qubits to obtain the result in the classical form.\n\nIt's important to note that quan

### Preprocessing the dataset ###

In [34]:
# Shuffle the data
import random
SEED = 42
random.seed(SEED)
random.shuffle(data)

# Split into train (70%), val (20%), test (10%)
n_total = len(data)
n_train = int(0.7 * n_total)
n_val = int(0.2 * n_total)

train_data = data[:n_train]
val_data = data[n_train:n_train + n_val]
test_data = data[n_train + n_val:]

# Print sizes to verify
print(f"Total: {n_total}, Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

Total: 3912, Train: 2738, Val: 782, Test: 392


### Baseline Model 1: Logistic Regression with TF-IDF

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

#word-level by default
tfidf = TfidfVectorizer(
    max_features=50_000,   # you can adjust
    ngram_range=(1,2),     # unigrams + bigrams
)

# fit on train texts
X_train = tfidf.fit_transform([d["text"] for d in train_data])
y_train = [d["label"] for d in train_data]

# transform val & test
X_val   = tfidf.transform([d["text"] for d in val_data])
y_val   = [d["label"] for d in val_data]

X_test  = tfidf.transform([d["text"] for d in test_data])
y_test  = [d["label"] for d in test_data]


#### At word level

In [35]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    solver="lbfgs",
    max_iter=1000,
    random_state=SEED,
)
model.fit(X_train, y_train)

### Evaluation ###

In [37]:
from sklearn.metrics import accuracy_score

train_acc = accuracy_score(y_train, model.predict(X_train))
val_acc   = accuracy_score(y_val,   model.predict(X_val))
test_acc  = accuracy_score(y_test,  model.predict(X_test))

print(f"Train accuracy:      {train_acc:.4f}")
print(f"Validation accuracy: {val_acc:.4f}")
print(f"Test accuracy:       {test_acc:.4f}")

Train accuracy:      0.9335
Validation accuracy: 0.5780
Test accuracy:       0.5944


#### At Character-level 

In [40]:
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=50_000,
)

In [41]:
X_train_char = char_vectorizer.fit_transform([ex["text"] for ex in train_data])
y_train_char = [ex["label"] for ex in train_data]

X_val_char   = char_vectorizer.transform([ex["text"] for ex in val_data])
y_val_char   = [ex["label"] for ex in val_data]

X_test_char  = char_vectorizer.transform([ex["text"] for ex in test_data])
y_test_char  = [ex["label"] for ex in test_data]

In [42]:
char_lr_model = LogisticRegression(
    solver="lbfgs",
    max_iter=1000,
    random_state=42,
)
char_lr_model.fit(X_train_char, y_train_char)

In [43]:
train_acc_char = accuracy_score(y_train_char, char_lr_model.predict(X_train_char))
val_acc_char   = accuracy_score(y_val_char,   char_lr_model.predict(X_val_char))
test_acc_char  = accuracy_score(y_test_char,  char_lr_model.predict(X_test_char))

print(f"Train (char-TFIDF) accuracy:      {train_acc_char:.4f}")
print(f"Validation (char-TFIDF) accuracy: {val_acc_char:.4f}")
print(f"Test (char-TFIDF) accuracy:       {test_acc_char:.4f}")

Train (char-TFIDF) accuracy:      0.9036
Validation (char-TFIDF) accuracy: 0.6675
Test (char-TFIDF) accuracy:       0.6454


### At Byte-pair level

In [45]:
# ─── 1) Train a BPE tokenizer on the train split ────────────────────────────

# Initialize & train
from tokenizers import ByteLevelBPETokenizer
bpe_tokenizer = ByteLevelBPETokenizer()
bpe_tokenizer.train_from_iterator(
    (ex["text"] for ex in train_data),
    vocab_size=30_000,
    min_frequency=2,
    special_tokens=["<unk>", "<pad>"]
)






In [46]:

def encode_to_strs(batch):
    # tokenize each text -> list of BPE tokens -> join back to space-separated string
    return [" ".join(bpe_tokenizer.encode(text).tokens) for text in batch]

In [47]:
train_corpus_bpe = encode_to_strs([ex["text"] for ex in train_data])
val_corpus_bpe   = encode_to_strs([ex["text"] for ex in val_data])
test_corpus_bpe  = encode_to_strs([ex["text"] for ex in test_data])

In [48]:
y_train_bpe = [ex["label"] for ex in train_data]
y_val_bpe   = [ex["label"] for ex in val_data]
y_test_bpe  = [ex["label"] for ex in test_data]

In [49]:
bpe_vectorizer = TfidfVectorizer(
    analyzer='word',       # tokens are already BPE subwords
    max_features=50_000,
)

In [50]:
X_train_bpe = bpe_vectorizer.fit_transform(train_corpus_bpe)
X_val_bpe   = bpe_vectorizer.transform(val_corpus_bpe)
X_test_bpe  = bpe_vectorizer.transform(test_corpus_bpe)


In [51]:
bpe_lr_model = LogisticRegression(
    solver="lbfgs",
    max_iter=1000,
    random_state=42,
)
bpe_lr_model.fit(X_train_bpe, y_train_bpe)

In [52]:
train_acc_bpe = accuracy_score(y_train_bpe, bpe_lr_model.predict(X_train_bpe))
val_acc_bpe   = accuracy_score(y_val_bpe,   bpe_lr_model.predict(X_val_bpe))
test_acc_bpe  = accuracy_score(y_test_bpe,  bpe_lr_model.predict(X_test_bpe))

print(f"Train (BPE) accuracy:      {train_acc_bpe:.4f}")
print(f"Validation (BPE) accuracy: {val_acc_bpe:.4f}")
print(f"Test (BPE) accuracy:       {test_acc_bpe:.4f}")

Train (BPE) accuracy:      0.8860
Validation (BPE) accuracy: 0.6675
Test (BPE) accuracy:       0.6480


## Leveraging Transformers: Finetuning RoBERTa

In [63]:
# 1) Install required libraries (run once)
# !pip install transformers datasets evaluate

# 2) Imports
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
from evaluate import load as load_metric
import numpy as np

# 3) Convert your Python lists into HuggingFace Datasets
# Assuming train_data, val_data, test_data are lists of {"text":…, "label":…}
train_ds = Dataset.from_list(train_data)
val_ds   = Dataset.from_list(val_data)
test_ds  = Dataset.from_list(test_data)

# 4) Load the Roberta tokenizer & model
model_name = "roberta-base"
roberta_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
roberta_model     = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# 5) Tokenization function
def tokenize_batch(batch):
    return roberta_tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256,   # adjust based on your text lengths
    )

# 6) Apply tokenization
train_tok = train_ds.map(tokenize_batch, batched=True)
val_tok   = val_ds.map(tokenize_batch, batched=True)
test_tok  = test_ds.map(tokenize_batch, batched=True)

# 7) Set format for PyTorch
columns = ["input_ids", "attention_mask", "label"]
train_tok.set_format("torch", columns=columns)
val_tok.set_format("torch", columns=columns)
test_tok.set_format("torch", columns=columns)

# 8) Define a simple compute_metrics function
metric_acc = load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric_acc.compute(predictions=preds, references=labels)

# 9) Training arguments
training_args = TrainingArguments(
    output_dir="./roberta_watermark",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# 10) Initialize Trainer
trainer = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=roberta_tokenizer,
    compute_metrics=compute_metrics,
)

# 11) Train!
trainer.train()

# 12) Evaluate on train, val, and test
train_metrics = trainer.evaluate(train_tok)
val_metrics   = trainer.evaluate(val_tok)
test_metrics  = trainer.evaluate(test_tok)

print(f"Train Accuracy: {train_metrics['eval_accuracy']:.4f}")
print(f"Val   Accuracy: {val_metrics['eval_accuracy']:.4f}")
print(f"Test  Accuracy: {test_metrics['eval_accuracy']:.4f}")


loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /home/cmugishawayo25/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading file https://huggingface.co/roberta

Map:   0%|          | 0/2738 [00:00<?, ? examples/s]

Map:   0%|          | 0/782 [00:00<?, ? examples/s]

Map:   0%|          | 0/392 [00:00<?, ? examples/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2738
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 860


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4157,0.393712,0.813299
2,0.3831,0.377885,0.829923
3,0.28,0.370106,0.838875
4,0.2357,0.487257,0.836317
5,0.1459,0.541942,0.846547


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 782
  Batch size = 32
Saving model checkpoint to ./roberta_watermark/checkpoint-172
Configuration saved in ./roberta_watermark/checkpoint-172/config.json
Model weights saved in ./roberta_watermark/checkpoint-172/pytorch_model.bin
tokenizer config file saved in ./roberta_watermark/checkpoint-172/tokenizer_config.json
Special tokens file saved in ./roberta_watermark/checkpoint-172/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Eval

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 782
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 392
  Batch size = 32


Train Accuracy: 0.9500
Val   Accuracy: 0.8465
Test  Accuracy: 0.8648
