In [None]:
!pip install transformers datasets evaluate accelerate

In [None]:
from datasets import load_dataset
# Load the "Feldt/Combined" dataset; no split is requested, so the result is
# indexed by split name below.
training_data = load_dataset(path="Feldt/Combined")

# Show the first training record to inspect the available columns.
training_data["train"][0]

In [None]:
from transformers import AutoTokenizer
# Tokenizer matching the DistilBERT checkpoint that is fine-tuned later on.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert/distilbert-base-uncased",
)

def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` key holding the raw input
            (a batch of strings when called through ``map(batched=True)``).

    Returns:
        The output of the module-level ``tokenizer`` called with
        ``truncation=True``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Run the tokenizer over the dataset in batches rather than one row at a time.
tokenized_training_data = training_data.map(
    preprocess_function,
    batched=True,
)

from transformers import DataCollatorWithPadding

# Collator that pads the examples of a batch using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate
import numpy as np

# Accuracy metric object; compute_metrics calls its .compute() method.
accuracy = evaluate.load(path="accuracy")

def compute_metrics(eval_pred):
    """Score a set of model outputs against their reference labels.

    Args:
        eval_pred: A pair that unpacks into ``(predictions, labels)``.
            ``predictions`` holds per-class scores; the arg-max is taken
            along axis 1.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max class ids
        compared with ``labels``.
    """
    scores, references = eval_pred
    # Collapse per-class scores to a single predicted class id per example.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Class-id -> human-readable label, and its inverse; both are stored in the
# model config so predictions can be reported by name.
id2label = {0: "FAKE", 1: "TRUE"}
label2id = {name: idx for idx, name in id2label.items()}

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# DistilBERT checkpoint loaded for two-class sequence classification.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Hyperparameters and checkpointing policy for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="fake_new_detection",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule (once per epoch) so a
    # saved checkpoint exists for every evaluation result.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Restore the best checkpoint when training finishes; no
    # metric_for_best_model is set, so the library default decides "best".
    load_best_model_at_end=True,
)

# Assemble the Trainer from the model, the tokenized splits, and the metric
# callback, then run fine-tuning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_data["train"],
    eval_dataset=tokenized_training_data["test"],
    processing_class=tokenizer,
    # Hand over the padding collator created earlier in the notebook instead
    # of relying on the Trainer's implicit default; previously that object
    # was constructed but never used.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Runs training with per-epoch evaluation/checkpointing as configured in
# training_args.
trainer.train()