### Import required libraries

In [None]:
# This is used to import required libraries
import pandas as pd

import numpy as np

In [None]:
# Mount Google Drive inside the Colab VM. Colab is only a virtual IDE, so once
# access is granted, datasets and model files can be loaded directly from Drive
# instead of being downloaded and installed again every time Colab is used.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
import os
import zipfile

# Unpack the uploaded BERT archive into a sibling folder on Drive so that its
# individual files can be explored and loaded from Colab.
archive_path = '/content/drive/My Drive/Colab Notebooks/colab_libs/bert-base-uncased.zip'
extract_dir = '/content/drive/My Drive/Colab Notebooks/colab_libs/bert-base-uncased'

with zipfile.ZipFile(archive_path, mode='r') as bert_archive:
    bert_archive.extractall(path=extract_dir)

In [None]:
# Load the cleaned dataset from Drive and preview its first few rows.
dataset_csv = '/content/drive/My Drive/Colab Notebooks/datasets/df_clean.csv'
df = pd.read_csv(dataset_csv)
df.head()

Unnamed: 0,date,title,verdict,body,label,cleaned_body
0,31-Mar-25,Kenyan president has not asked Nigerian govern...,Posts on Facebook claim that Kenyan president ...,Kenyan president William Ruto has asked the Ni...,False,Kenyan president William Ruto has asked the Ni...
1,24-Mar-25,"No, chairperson of Nigeria’s main opposition P...",Nigeria’s Peoples Democratic Party is experien...,"The Peoples Democratic Party (PDP), Nigeria’...",False,"The Peoples Democratic Party (PDP), Nigerias m..."
2,24-Mar-25,No evidence Nigeria’s Enugu state plans to ban...,Some Facebook posts in Nigeria claim that the ...,Several posts on Facebook in Nigeria claim tha...,False,Several posts on Facebook in Nigeria claim tha...
3,27-Mar-25,Nigeria’s former president Obasanjo did not ad...,Several Facebook posts claim that former Niger...,Nigeria’s former president Olusegun Obasanjo ...,False,Nigerias former president Olusegun Obasanjo vi...
4,14-Mar-25,"No, HIV-positive patients in Nigeria won’t hav...",US president Donald Trump’s decision to tempor...,Donald Trump was sworn in as president of t...,False,Donald Trump was sworn in as president of the ...


In [None]:
# Normalise the label column: cast every value to text, trim surrounding
# whitespace, then lower-case it so later comparisons see 'true' / 'false'.
label_as_text = df['label'].astype(str)
df['label'] = label_as_text.str.strip().str.lower()

Feature engineering

In [None]:
# Convert the categorical label to a numeric target column:
# 'false' -> 0 and 'true' -> 1 (integers, not strings).
#
# An explicit mapping is used instead of "everything that is not 'true' is 0",
# so that a missing or unexpected label (for example the text 'nan' that
# astype(str) produces for an empty cell) is reported rather than silently
# counted as class 0.
label_to_id = {'false': 0, 'true': 1}
encoded_labels = df['label'].map(label_to_id)

# Values absent from the mapping come back as NaN; collect them for the error.
unknown_labels = df.loc[encoded_labels.isna(), 'label'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unexpected values in 'label' column: {list(unknown_labels)}")

df['verdict_label'] = encoded_labels.astype(int)
df.sample(6)

Unnamed: 0,date,title,verdict,body,label,cleaned_body,verdict_label
1085,12-Apr-21,Is there truly a new HIV vaccine with 97% anti...,While it is true an HIV candidate vaccine (IAV...,Claim: The screenshot of a report making the r...,True,Claim: The screenshot of a report making the r...,1
994,10-Mar-25,Verifying several claims on Pineapple leaf as ...,Verdict: Misleading! Two out of the three clai...,"Claim: A Facebook page , Herbal Magic, claimed...",False,"Claim: A Facebook page , Herbal Magic, claimed...",0
442,16-Dec-22,False claim on Tinubu attributed to the Nigeri...,The masterminds of this false information were...,The attention of the Nigerian Fact-Checkers Co...,False,The attention of the Nigerian Fact-Checkers Co...,0
1222,21-Nov-21,Did Nigerian court order MTN to pay user N5.5 ...,"The claim is True. In 2020, an FCT High Court ...",Claim: A WhatsApp message claims that a Nigeri...,True,Claim: A WhatsApp message claims that a Nigeri...,1
1911,30-Oct-24,Is video showing foreign student protesters ca...,"Verdict: True. In August, students protested o...",Claim: A viral clip on Facebook and WhatsApp s...,True,Claim: A viral clip on Facebook and WhatsApp s...,1
706,30-Apr-24,Peter Obi’s Water Project: Video of woman scoo...,Verdict: FALSE. The video is not related to Ni...,Claim: An X user posted a video showing a woma...,False,Claim: An X user posted a video showing a woma...,0


In [None]:
# Keep only the model input text and its numeric label; every other column is
# irrelevant from here on.
unused_columns = ['date', 'title', 'verdict', 'body', 'label']
df = df.drop(columns=unused_columns)

In [None]:
# Sanity check: (rows, columns) remaining after the column drop.
df.shape

(2048, 2)

In [None]:
# Class distribution of the encoded labels, before duplicates are removed.
df['verdict_label'].value_counts()

Unnamed: 0_level_0,count
verdict_label,Unnamed: 1_level_1
0,1024
1,1024


In [None]:
# List the distinct class ids present in the encoded label column.
df['verdict_label'].unique()

array([0, 1])

In [None]:
# Check whether any row is an exact repeat of an earlier row
# (duplicated() compares all columns by default).
df.duplicated().any()

np.True_

In [None]:
# Remove repeated rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [None]:
# Re-count the rows per class now that duplicate rows have been removed.
df['verdict_label'].value_counts()

Unnamed: 0_level_0,count
verdict_label,Unnamed: 1_level_1
0,1024
1,1023


In [None]:
# Balance the dataset by trimming the surplus rows from whichever class is
# larger. The surplus is computed from the current counts rather than being
# hard-coded as "one row of class 0", so:
#   * on the data shown above it drops exactly the first class-0 row, and
#   * re-running this cell is a no-op once the classes are already balanced
#     (the old version removed another class-0 row on every run).
class_counts = df['verdict_label'].value_counts()
surplus = class_counts.max() - class_counts.min()

if surplus > 0:
    majority_label = class_counts.idxmax()
    # Row labels of the first `surplus` rows of the majority class.
    index_to_drop = df[df['verdict_label'] == majority_label].index[:int(surplus)]
    df = df.drop(index_to_drop)

In [None]:
# Verify that both classes now contain the same number of rows.
df['verdict_label'].value_counts()

Unnamed: 0_level_0,count
verdict_label,Unnamed: 1_level_1
0,1023
1,1023


Loading pretrained tokenizer and model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, set_seed

# The checkpoint ships without a classification head, so from_pretrained()
# creates classifier.weight / classifier.bias with random values. Seeding first
# makes that initialisation repeatable; on a fresh top-to-bottom run it also
# fixes the later batch shuffling and dropout. set_seed() seeds Python's
# `random`, NumPy and PyTorch. Some GPU kernels may still be nondeterministic.
SEED = 42  # same value as the random_state used for the train/test split
set_seed(SEED)

# Directory containing 'config.json', 'pytorch_model.bin', 'vocab.txt' and
# 'tokenizer_config.json', so the local files can be loaded from Colab.
model_dir = '/content/drive/My Drive/Colab Notebooks/colab_libs/bert-base-uncased/MYBERTmodel'

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/My Drive/Colab Notebooks/colab_libs/bert-base-uncased/MYBERTmodel and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Show which columns are left to feed the train/test split.
df.columns

Index(['cleaned_body', 'verdict_label'], dtype='object')

In [None]:
# Split the data 80% / 20% into training and testing sets with scikit-learn.
# The split is stratified on the label and uses a fixed random_state.
features = df['cleaned_body']
targets = df['verdict_label']

x_train, x_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

# Report the size of each part.
split_shapes = [
    ('shape of x_train: ', x_train),
    ('shape of x_test: ', x_test),
    ('shape of y_train: ', y_train),
    ('shape of y_test: ', y_test),
]
for caption, part in split_shapes:
    print(caption, part.shape)

shape of x_train:  (1636,)
shape of x_test:  (410,)
shape of y_train:  (1636,)
shape of y_test:  (410,)


In [None]:
# Number of training examples.
len(x_train)

1636

In [None]:
# Tokenize both splits into PyTorch tensors using identical settings:
# padding and truncation are enabled and return_tensors='pt' requests
# torch tensors as output.
tokenizer_options = {'padding': True, 'truncation': True, 'return_tensors': 'pt'}

x_train_tokenized = tokenizer(list(x_train), **tokenizer_options)
x_test_tokenized = tokenizer(list(x_test), **tokenizer_options)

In [None]:
# Inspect which fields the tokenizer returned.
x_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
# Peek at the token-id tensor produced for the training texts.
x_train_tokenized['input_ids']

tensor([[  101,  4366,  1024,  ...,  2093,  3962,   102],
        [  101,  4366,  1024,  ..., 12318, 24471,   102],
        [  101,  4366,  1024,  ...,  3463,  2013,   102],
        ...,
        [  101,  4366,  1024,  ..., 13523, 20026,   102],
        [  101,  4366,  1024,  ...,  2003,  1996,   102],
        [  101,  4366,  1024,  ...,  9556,  2265,   102]])

In [None]:
import torch

# Convert the label Series of each split into a PyTorch tensor.
y_train_tensor, y_test_tensor = (
    torch.tensor(label_split.to_numpy()) for label_split in (y_train, y_test)
)

In [None]:
# Peek at the training labels in tensor form.
y_train_tensor

tensor([0, 0, 0,  ..., 0, 0, 1])

In [None]:
# pytorch's standard function for loading train and test dataset
from torch.utils.data import Dataset, DataLoader

class TextClassificationDataset(Dataset):
    """Map-style dataset that pairs tokenized text with its class label.

    Only the 'input_ids' and 'attention_mask' entries of ``tokenized_inputs``
    are read; any other entries it may contain are ignored.

    Args:
        tokenized_inputs: Mapping providing 'input_ids' and 'attention_mask',
            each supporting ``len()`` and integer indexing along its first
            dimension.
        labels: Labels supporting ``len()`` and integer indexing, one per
            tokenized example and in the same order.

    Raises:
        ValueError: If 'input_ids', 'attention_mask' and ``labels`` do not all
            contain the same number of examples.
    """

    def __init__(self, tokenized_inputs, labels):
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_mask = tokenized_inputs['attention_mask']
        self.labels = labels

        # Fail fast on misaligned data: __len__ is driven by the labels, so a
        # mismatch would otherwise silently drop examples or raise an
        # IndexError in the middle of an epoch.
        n_inputs = len(self.input_ids)
        n_masks = len(self.attention_mask)
        n_labels = len(self.labels)
        if not (n_inputs == n_masks == n_labels):
            raise ValueError(
                "Mismatched lengths: "
                f"input_ids={n_inputs}, attention_mask={n_masks}, labels={n_labels}"
            )

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict with the keys
        'input_ids', 'attention_mask' and 'labels'."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# Wrap the tokenized splits and their label tensors in the
# "TextClassificationDataset" class so PyTorch can index them.
train_dataset = TextClassificationDataset(tokenized_inputs=x_train_tokenized, labels=y_train_tensor)
test_dataset = TextClassificationDataset(tokenized_inputs=x_test_tokenized, labels=y_test_tensor)

# Batch the datasets: the training batches are reshuffled on every pass,
# the test batches keep their order.
batch_size = 16  # adjustable
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [39]:
import torch
from torch.optim import AdamW # Common optimizer for transformers
from torch.nn import CrossEntropyLoss # Common loss for classification

# Fine-tune the classifier and, after every epoch, report loss and accuracy on
# the held-out split.
# NOTE: the figures printed as "Validation" are computed on test_dataloader,
# i.e. on the test split created earlier; there is no separate validation set.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device) # Move model to GPU if available

optimizer = AdamW(model.parameters(), lr=5e-5) # the learning rate can be tuned.

# Not used by the loop below: the model is called with `labels`, so the loss is
# read from outputs.loss. The name is kept only so that any later code
# referring to `loss_fn` keeps working.
loss_fn = CrossEntropyLoss()

num_epochs = 3 # the number of epochs can be adjusted

# Training loop
for epoch in range(num_epochs):
    model.train() # Set model to training mode
    total_loss = 0.0

    for batch in train_dataloader:
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass; passing `labels` makes the model return the loss too.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # outputs.loss is the mean loss over this batch (Hugging Face
        # classification heads use mean reduction). Weight it by the batch size
        # so the shorter final batch does not count as much as a full one.
        loss = outputs.loss
        total_loss += loss.item() * labels.size(0)

        # Backward pass
        loss.backward()

        # Optimizer step
        optimizer.step()

        # Clear the gradients before the next batch
        optimizer.zero_grad()

    # Per-sample average over the whole training set.
    avg_train_loss = total_loss / len(train_dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}")

    # Evaluation loop
    model.eval() # Sets model to evaluation mode
    total_eval_accuracy = 0  # running count of correct predictions
    total_eval_loss = 0.0

    with torch.no_grad(): # Disable gradient calculations for evaluation
        for batch in test_dataloader:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            # Get loss and logits
            loss = outputs.loss
            logits = outputs.logits

            # Same per-sample weighting as in the training loop.
            total_eval_loss += loss.item() * labels.size(0)

            # Count the correct predictions in this batch
            predictions = torch.argmax(logits, dim=-1)
            accuracy = (predictions == labels).sum().item()
            total_eval_accuracy += accuracy

    avg_eval_loss = total_eval_loss / len(test_dataset)
    avg_eval_accuracy = total_eval_accuracy / len(test_dataset)

    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_eval_loss:.4f}, Validation Accuracy: {avg_eval_accuracy:.4f}")

print("Training complete!")

Epoch 1/3, Training Loss: 0.4028
Epoch 1/3, Validation Loss: 0.4453, Validation Accuracy: 0.7951
Epoch 2/3, Training Loss: 0.2381
Epoch 2/3, Validation Loss: 0.5671, Validation Accuracy: 0.7707
Epoch 3/3, Training Loss: 0.1492
Epoch 3/3, Validation Loss: 0.6459, Validation Accuracy: 0.7732
Training complete!


In [42]:
# Try the fine-tuned model on a single hand-written sample text.
model.eval()

# Text to classify
text_to_predict = 'Nnamdi kanu the leader of IPOB according to recent rumored news is claimed to be dead'

# Encode with the same tokenizer that was used for training, then place the
# tensors on the device the model lives on.
text_tokenized = tokenizer(text_to_predict, padding=True, truncation=True, return_tensors='pt')
input_ids = text_tokenized['input_ids'].to(device)
attention_mask = text_tokenized['attention_mask'].to(device)

# No gradients are needed for a prediction.
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

# Turn the raw logits into class probabilities and take the most likely class.
logits = outputs.logits
probabilities = logits.softmax(dim=1)
predicted_class_index = probabilities.argmax(dim=1).item()

# Map the class index back to the label text: index 1 is 'true' and anything
# else is 'false', matching the 0/1 encoding created during data preparation.
if predicted_class_index == 1:
    predicted_label = 'true'
else:
    predicted_label = 'false'

print(f"The text is predicted as: {predicted_label}")
print(f"Probabilities: {probabilities.tolist()}")

The text is predicted as: true
Probabilities: [[0.07983937114477158, 0.9201606512069702]]


In [43]:
import os

import torch

# Destination file for the trained weights
save_path = '/content/drive/My Drive/Colab Notebooks/trained_model/news_classifier.pth'

# Create the destination folder if it does not exist yet.
target_dir = os.path.dirname(save_path)
os.makedirs(target_dir, exist_ok=True)

# Only the state dictionary (the weights) is written; the tokenizer and model
# configuration are not part of this file.
trained_weights = model.state_dict()
torch.save(trained_weights, save_path)

print(f"Model saved to {save_path}")

Model saved to /content/drive/My Drive/Colab Notebooks/trained_model/news_classifier.pth
