In [None]:
from datasets import load_dataset, Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

import torch
import evaluate

In [None]:
# Load the corpus and work on its 'train' split as a pandas DataFrame.
ds = load_dataset("artem9k/ai-text-detection-pile")
df = ds['train'].to_pandas()

# Encode the string labels as integers: 'human' -> 0, 'ai' -> 1.
# `map` turns any value missing from the mapping into NaN, so an unexpected
# label is caught here instead of flowing into training as a raw string.
label_ids = df['source'].map({'human': 0, 'ai': 1})
if label_ids.isna().any():
    unexpected = df.loc[label_ids.isna(), 'source'].unique().tolist()
    raise ValueError(f"Unexpected values in 'source' column: {unexpected}")
df['source'] = label_ids.astype('int64')

# Hold out 20% of the rows; stratifying on the label keeps the human/ai
# ratio the same in both partitions.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['source'],
)

In [None]:
# Initialize the tokenizer
model_id = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(model_id)

# Define the tokenization function
def tokenize(batch):
    """Tokenize the ``text`` field of a batch into fixed-length inputs.

    Args:
        batch: Mapping with a ``'text'`` entry holding the texts to encode
            (as supplied by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        and padded to exactly 512 tokens.
    """
    # Pad to a fixed length rather than to the longest text in the batch:
    # `padding=True` would give each map batch its own width, and rows of
    # different widths cannot be stacked into one tensor when the Trainer
    # later draws a training batch across them without a padding collator.
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)


In [None]:
# Columns that are kept after tokenization; everything else is dropped.
model_columns = ['input_ids', 'attention_mask', 'labels']


def _encode_split(split):
    """Tokenize one split, rename ``source`` to ``labels`` and drop all other columns."""
    encoded = split.map(tokenize, batched=True)
    encoded = encoded.rename_column("source", "labels")
    extra_columns = [name for name in encoded.column_names if name not in model_columns]
    return encoded.remove_columns(extra_columns)


# Wrap the pandas frames as Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Run the same tokenize / rename / prune pipeline on both splits
train_encodings = _encode_split(train_dataset)
test_encodings = _encode_split(test_dataset)

In [None]:
# Load the four evaluation metrics in one pass (same order as the names)
accuracy, f1, precision, recall = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)

Model Training

In [None]:
# Build the two-class classifier on top of the pretrained checkpoint
model = BertForSequenceClassification.from_pretrained(model_id, num_labels=2)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)
model.to(device)

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    # Evaluate and checkpoint once per epoch, and reload the checkpoint with
    # the best accuracy once training finishes (see the settings below).
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    seed=42,
    # Mixed precision is only enabled when CUDA is available, so these
    # arguments remain valid on the CPU fallback used when picking `device`.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=1,
    eval_accumulation_steps=1,
)

# Metrics function
def compute_metrics(p):
    """Score one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the argmax over
            the last axis is taken as the predicted class) and ``label_ids``
            (the reference labels).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three are computed with ``average='binary'``.
    """
    predicted = p.predictions.argmax(-1)
    gold = p.label_ids

    scores = {
        'accuracy': accuracy.compute(predictions=predicted, references=gold)['accuracy'],
    }
    # The remaining metrics share the same call signature and averaging mode.
    for key, metric in (('f1', f1), ('precision', precision), ('recall', recall)):
        outcome = metric.compute(predictions=predicted, references=gold, average='binary')
        scores[key] = outcome[key]

    return scores

# Gather everything the Trainer needs: model, arguments, both tokenized
# splits and the metric callback
trainer_config = {
    'model': model,
    'args': training_args,
    'train_dataset': train_encodings,
    'eval_dataset': test_encodings,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_config)

# Run fine-tuning
trainer.train()

In [None]:
# Write the model weights and the tokenizer files into the same directory
export_dir = 'bert-ai-detection'
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)