In [None]:
# Install the third-party packages that the import cell below relies on
# (transformers, datasets, scikit-learn, pandas, torch, numpy).
!pip install transformers datasets scikit-learn pandas torch numpy

In [None]:
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from google.colab import files
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer, EvalPrediction
from datasets import Dataset

#Upload datasets (opens the Colab upload dialog; the CSVs are read back by file name below)
datasets = files.upload()

#Load datasets as dataframes
gc_real = pd.read_csv('gossipcop_real.csv')
gc_fake = pd.read_csv('gossipcop_fake.csv')
pf_real = pd.read_csv('politifact_real.csv')
pf_fake = pd.read_csv('politifact_fake.csv')

#Binary standardized labels (map truth to 0 and false to 1)
#Real info
gc_real['label'] = 0
pf_real['label'] = 0
#Fake info
pf_fake['label'] = 1
gc_fake['label'] = 1

#Keep only the columns required for training (title, label)
gc_real = gc_real[['title', 'label']]
gc_fake = gc_fake[['title', 'label']]
pf_real = pf_real[['title', 'label']]
pf_fake = pf_fake[['title', 'label']]

#Merge all data to one training set
full_training = pd.concat([gc_real, gc_fake, pf_real, pf_fake], ignore_index=True)
full_training = full_training.dropna(subset=['title']) #Drop missing titles just in case so that the model is not trained on bad information
full_training = full_training.sample(frac=1, random_state=42).reset_index(drop=True) #Shuffle up the order of the data

#Preview the merged frame. A bare expression in the middle of a cell is not
#displayed (only the last expression of a cell is), so print it explicitly.
print(full_training.head())

#Split data into training and testing sets
x = full_training['title']
y = full_training['label']

#Stratify on the label so the train and test splits keep the same real/fake
#ratio as the merged data; an unstratified split can skew the evaluation when
#the classes are imbalanced.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    x.tolist(),
    y.tolist(),
    test_size=0.2,
    random_state=0, #Remove random state later
    shuffle=True,
    stratify=y.tolist(),
)

#Tokenize the data
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

#padding=True pads each split to its own longest title; truncation=True caps
#over-long titles at the tokenizer's maximum length.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

#Wrap in PyTorch dataset
class MisinfoDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (anything exposing ``.items()`` whose values can be indexed by example);
    ``labels`` is an indexable, sized collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label collection.
        return len(self.labels)

    def __getitem__(self, idx):
        # Collect every encoding field for this example as a tensor, then
        # attach the label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

#Wrap the tokenized splits so the Trainer can index them
train_dataset = MisinfoDataset(encodings=train_encodings, labels=train_labels)
test_dataset = MisinfoDataset(encodings=test_encodings, labels=test_labels)

#Load the pretrained checkpoint with a two-class classification head (real vs fake)
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=2,
)

#Define training arguments
training_args = TrainingArguments(
    #Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    #Schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    #Optimizer settings
    warmup_steps=100,
    weight_decay=0.01,
    #Logging and checkpoint cadence
    logging_steps=10,
    save_strategy="epoch",
    #NOTE(review): metric_for_best_model presumably has no effect while
    #load_best_model_at_end is False - confirm whether best-model loading is wanted
    load_best_model_at_end=False,
    metric_for_best_model="accuracy",
)

#Define metrics
def compute_metrics(p: EvalPrediction):
    """Return accuracy and F1 for one evaluation pass.

    The predicted class of each example is the index of the largest value
    along axis 1 of ``p.predictions``; it is compared against ``p.label_ids``.
    ``f1_score`` is called with its default settings.

    Returns a dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    # assumes p.predictions is a 2-D (examples, classes) score array - TODO confirm
    predicted_classes = np.argmax(p.predictions, axis=1)
    true_classes = p.label_ids

    scores = {}
    scores['accuracy'] = accuracy_score(true_classes, predicted_classes)
    scores['f1'] = f1_score(true_classes, predicted_classes)
    return scores

#Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

#Evaluate on the held-out split. The metrics dict is captured and printed
#because a bare call in the middle of a cell would discard its return value,
#leaving the final accuracy/F1 unseen.
eval_metrics = trainer.evaluate()
print(eval_metrics)

#Save new model (weights and tokenizer go to the same directory so it can be reloaded as one unit)
model.save_pretrained('./bullbot_model')
tokenizer.save_pretrained('./bullbot_model')

In [None]:
from google.colab import files

#Zip and download trained model files
#The bullbot_model/ directory is the one written by the save_pretrained calls
#in the training cell; -r archives it recursively into bullbot_model.zip.
!zip -r bullbot_model.zip bullbot_model/
#Hand the archive to Colab's download helper.
files.download('bullbot_model.zip')