In [1]:
from datasets import load_dataset, Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

import torch
import evaluate

  from .autonotebook import tqdm as notebook_tqdm
2024-12-29 16:25:40.985330: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735464341.048976 1569153 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735464341.067629 1569153 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-29 16:25:41.276411: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
ds = load_dataset("artem9k/ai-text-detection-pile")
df = ds['train'].to_pandas()

# Replace 'human' with 0 and 'ai' with 1
df['source'] = df['source'].replace({'human': 0, 'ai': 1})

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42
)

  df['source'] = df['source'].replace({'human': 0, 'ai': 1})


In [4]:
# Initialize the tokenizer
model_id = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(model_id)

# Define the tokenization function
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=512)


In [5]:
# Convert DataFrame to Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize datasets
train_encodings = train_dataset.map(tokenize, batched=True)
test_encodings = test_dataset.map(tokenize, batched=True)

# Rename and remove unused columns
train_encodings = train_encodings.rename_column("source", "labels")
test_encodings = test_encodings.rename_column("source", "labels")

train_encodings = train_encodings.remove_columns([col for col in train_encodings.column_names if col not in ['input_ids', 'attention_mask', 'labels']])
test_encodings = test_encodings.remove_columns([col for col in test_encodings.column_names if col not in ['input_ids', 'attention_mask', 'labels']])

Map: 100%|███████████████████| 1114017/1114017 [04:35<00:00, 4043.28 examples/s]
Map: 100%|█████████████████████| 278505/278505 [01:08<00:00, 4042.93 examples/s]


In [6]:
# Load evaluation metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

Model Training

In [7]:
# Define model and device
model = BertForSequenceClassification.from_pretrained(model_id, num_labels=2)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    seed=42,
    fp16=True,
    gradient_accumulation_steps=1,
    eval_accumulation_steps=1,
)

# Metrics function
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    labels = p.label_ids

    accuracy_result = accuracy.compute(predictions=preds, references=labels)
    f1_result = f1.compute(predictions=preds, references=labels, average='binary')
    precision_result = precision.compute(predictions=preds, references=labels, average='binary')
    recall_result = recall.compute(predictions=preds, references=labels, average='binary')

    return {
        'accuracy': accuracy_result['accuracy'],
        'f1': f1_result['f1'],
        'precision': precision_result['precision'],
        'recall': recall_result['recall'],
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
    eval_dataset=test_encodings,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0712,0.072309,0.982869,0.967946,0.949678,0.98693
2,0.0223,0.098785,0.98316,0.968706,0.944186,0.994533
3,0.0129,0.091467,0.982309,0.967223,0.940064,0.995999


TrainOutput(global_step=208881, training_loss=0.0497146910746712, metrics={'train_runtime': 17420.1378, 'train_samples_per_second': 191.85, 'train_steps_per_second': 11.991, 'total_flos': 8.793305656769434e+17, 'train_loss': 0.0497146910746712, 'epoch': 3.0})

In [9]:
# Save the model and tokenizer
model.save_pretrained('bert-ai-detection')
tokenizer.save_pretrained('bert-ai-detection')

('bert-ai-detection/tokenizer_config.json',
 'bert-ai-detection/special_tokens_map.json',
 'bert-ai-detection/vocab.txt',
 'bert-ai-detection/added_tokens.json',
 'bert-ai-detection/tokenizer.json')