In [3]:
import pandas as pd
import numpy as np
import torch
import logging
from sklearn.model_selection import train_test_split

from datasets import Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from sklearn.metrics import accuracy_score

In [5]:
# Load CSV
df = pd.read_csv('Data/processed_ai_vs_human.csv')
df.head()

Unnamed: 0,text,generated
0,Cars Cars around since became famous 1900s Hen...,0
1,Transportation large necessity countries world...,0
2,Americas love affair vehicles seems cooling sa...,0
3,often ride car drive one motor vehicle work st...,0
4,Cars wonderful thing perhaps one worlds greate...,0


In [None]:
df = df[df['text'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)]


train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [7]:
# Load tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

Map:   0%|          | 0/389788 [00:00<?, ? examples/s]

ValueError: Schema and number of arrays unequal

In [None]:
# Training args
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True
)

In [None]:
# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return {"accuracy": accuracy_score(labels, predictions)}

In [None]:
# Trainer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Train
trainer.train()

In [None]:
# Evaluate
trainer.evaluate()

In [None]:
# Save model
model.save_pretrained("./results/final_model")
tokenizer.save_pretrained("./results/final_model")