In [21]:
from datasets import load_dataset, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import numpy as np
import pandas as pd
import json
import evaluate
from sklearn.model_selection import train_test_split

In [22]:
# 1. Load preprocessed CSV
df = pd.read_csv("Data/processed_HC3.csv")  

df = df[df['text'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)]

df["text"] = df["text"].astype(str)
df["label"] = df["label"].astype(str)

# Ensure required columns exist
assert "text" in df.columns and "label" in df.columns, "CSV must have 'text' and 'label' columns"

df = df.sample(n=10000, random_state=42).reset_index(drop=True)


# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [23]:
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [24]:
filenames = ['Data/arxiv_chatGPT.jsonl', 'Data/arxiv_cohere.jsonl','Data/reddit_chatGPT.jsonl','Data/reddit_cohere.jsonl']
rows = []

for file in filenames: 
     with open(file, "r", encoding="utf-8") as file:
        for line in file:
            entry = json.loads(line)
            if "human_text" in entry and "machine_text" in entry:
                rows.append({"text": entry["human_text"].strip(), "label": 0})
                rows.append({"text": entry["machine_text"].strip(), "label": 1})

# Create DataFrame
json_df = pd.DataFrame(rows)

json_df = json_df[json_df['text'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)]

json_df["text"] = json_df["text"].astype(str)
json_df["label"] = json_df["label"].astype(str)

In [57]:
print(json_df["label"].apply(type).value_counts())


label
<class 'str'>    22220
Name: count, dtype: int64


In [25]:
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)
train_ds = Dataset.from_pandas(train_df)
eval_ds = Dataset.from_pandas(json_df)

In [26]:
def tokenize_function(example):
    model_input = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    labels = tokenizer(
        example["label"],
        truncation=True,
        padding="max_length",
        max_length=128
    )["input_ids"]

    model_input["labels"] = labels
    return model_input

tokenized_train = train_ds.map(tokenize_function, batched=True)
tokenized_eval = eval_ds.map(tokenize_function, batched=True)

tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_eval.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map: 100%|██████████| 8000/8000 [00:02<00:00, 2894.21 examples/s]
Map: 100%|██████████| 22220/22220 [00:20<00:00, 1068.42 examples/s]


In [37]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # If predictions is a tuple, take the first element
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Replace -100 with pad_token_id in labels
    if isinstance(labels, np.ndarray):
        labels = np.where(labels == -100, tokenizer.pad_token_id, labels)
    else:
        labels = [
            [token if token != -100 else tokenizer.pad_token_id for token in label]
            for label in labels
        ]

    # Same for predictions, if needed
    if isinstance(predictions, np.ndarray):
        predictions = np.where(predictions == -100, tokenizer.pad_token_id, predictions)
    else:
        predictions = [
            [token if token != -100 else tokenizer.pad_token_id for token in pred]
            for pred in predictions
        ]

    # Decode
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Normalize
    decoded_preds = [pred.strip().lower() for pred in decoded_preds]
    decoded_labels = [label.strip().lower() for label in decoded_labels]

    # Compute metrics
    accuracy = accuracy_score(decoded_labels, decoded_preds)
    f1 = f1_score(decoded_labels, decoded_preds, average='weighted')
    precision = precision_score(decoded_labels, decoded_preds, average='weighted', zero_division=0)
    recall = recall_score(decoded_labels, decoded_preds, average='weighted', zero_division=0)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall
    }



In [38]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results/t5-hc3",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=False  
)


In [39]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [40]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0196,0.114761,0.19541,0.251745,0.4108,0.19541
2,0.0069,0.054153,0.325968,0.320406,0.435411,0.325968




TrainOutput(global_step=2000, training_loss=0.4661511253118515, metrics={'train_runtime': 34318.1128, 'train_samples_per_second': 0.466, 'train_steps_per_second': 0.058, 'total_flos': 2165468823552000.0, 'train_loss': 0.4661511253118515, 'epoch': 2.0})

In [41]:
trainer.evaluate()



{'eval_loss': 0.05415298044681549,
 'eval_accuracy': 0.32596759675967596,
 'eval_f1': 0.3204059006445758,
 'eval_precision': 0.43541117908849886,
 'eval_recall': 0.32596759675967596,
 'eval_runtime': 14056.4768,
 'eval_samples_per_second': 1.581,
 'eval_steps_per_second': 0.198,
 'epoch': 2.0}