In [1]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch.nn as nn
import os
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [29]:
# Load test data
test_data = pd.read_csv('test_data.csv')

In [30]:
test_data.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [31]:
test_data.shape

(5200, 4)

In [32]:
test_data.dropna(inplace=True)

In [33]:
test_data.shape

(4575, 4)

In case only the model state dictionary is saved

In [34]:
# Define the same model architecture as the one used for training
class FakeNewsClassifier(nn.Module):
    def __init__(self, num_labels=2):
        super(FakeNewsClassifier, self).__init__()
        self.bert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits  # Extract logits


In [35]:
# Check if GPU is available, otherwise fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [36]:
import torch

if torch.cuda.is_available():
    print("GPU is available and PyTorch can use it.")
else:
    print("GPU is not available or PyTorch cannot use it.")

GPU is available and PyTorch can use it.


In [37]:
# Initialize and load the trained model
model = FakeNewsClassifier(num_labels=2)
model.load_state_dict(torch.load("model/distilbert-fake-news.pth", map_location=device))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("distilbert-fake-news-3.pth", map_location=device))


<All keys matched successfully>

In [38]:
# Move model to GPU (if available)
model.to(device)

FakeNewsClassifier(
  (bert): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): DistilBertSdpaAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
         

In [39]:
# Set model to evaluation mode (for inference)
model.eval()

FakeNewsClassifier(
  (bert): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): DistilBertSdpaAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
         

In [40]:
''' This class takes in the data, tokenizes it and returns the input IDs, attention masks.'''

class FakeNewsDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=128):
        self.data = data
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['text']
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # .squeeze(0), Removes batch dimension
        return inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0)

In [41]:
# Create PyTorch data loaders for the test sets:

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
#prepare the dataset
test_dataset = FakeNewsDataset(test_data, tokenizer)
#create test data loader
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [42]:
# define the method for the test data prediction or inference
def predict_fake_news(test_loader):
    predictions = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

# to speed up the model inference we use no_grad() to avoid the step for updating the model weights
    with torch.no_grad():
        for batch in test_loader:
          # Accessing the elements of the batch as a tuple
            input_ids, attention_mask = batch # Unpack the tuple
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            predicted_classes = torch.argmax(logits, dim=1).tolist()
            predictions.extend(["True" if pred == 1 else "Fake" for pred in predicted_classes])
    return predictions

In [43]:
predictions = predict_fake_news(test_loader)

# In case the full model (i.e the model architecture and parameters are saved


In [None]:
model = torch.load("distilbert_fake_news", map_location=device)
model.eval()

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [None]:
# Predictions
y_pred = model.predict(X_test)

Calculate the model accuracy and plot confusion matrix

In [None]:
# calculate the model accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix
plt.figure(figsize=(5, 4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

In [44]:
# Save predictions to CSV
output_csv_path = os.path.join("results.csv")
pd.DataFrame({"Prediction": predictions}).to_csv(output_csv_path, index=False)

print(f"results are saved to {output_csv_path}")

results are saved to results.csv
