In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch.nn as nn
import os

In [None]:
# Define the model architecture
class FakeNewsClassifier(nn.Module):
    """Thin wrapper around a pretrained DistilBERT sequence-classification model.

    The wrapped model is loaded from the ``distilbert-base-uncased`` checkpoint
    with ``num_labels`` output classes, and ``forward`` exposes only the
    ``logits`` field of its output.
    """

    def __init__(self, num_labels=2):
        """Create the wrapper and load the pretrained backbone.

        Args:
            num_labels: Number of output classes passed to ``from_pretrained``.
        """
        super().__init__()
        # The attribute name `bert` becomes the prefix of this module's
        # state_dict keys, so it must stay stable for saved weights to load.
        self.bert = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=num_labels,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the ``logits`` of its output.

        Args:
            input_ids: Forwarded unchanged as the ``input_ids`` keyword.
            attention_mask: Forwarded unchanged as the ``attention_mask`` keyword.
        """
        return self.bert(input_ids=input_ids, attention_mask=attention_mask).logits

# Run on CUDA when PyTorch can see a GPU; otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the classifier and restore the saved weights, mapping the stored
# tensors onto `device` while loading.
# NOTE(review): depending on the PyTorch version, torch.load may unpickle
# arbitrary objects - confirm the checkpoint comes from a trusted source.
model = FakeNewsClassifier(num_labels=2)
model.load_state_dict(
    torch.load("model_state/distilbert-fake-news-3.pth", map_location=device)
)

# Place the model on the selected device and switch it to evaluation mode
model.to(device).eval()

# Tokenizer for the same base checkpoint name the classifier wraps
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Define a dataset class that reads from a CSV file
class FakeNewsDataset(Dataset):
    """Dataset of news articles read from a CSV file and tokenized on access.

    The CSV must contain ``ID``, ``title`` and ``text`` columns. Each sample's
    text is ``title + " " + text``; missing cells are treated as empty strings.
    """

    def __init__(self, csv_file, tokenizer, max_length=512):
        """Read the CSV and prepare the per-row IDs and texts.

        Args:
            csv_file: Path (or file-like object) accepted by ``pd.read_csv``.
            tokenizer: Callable invoked in ``__getitem__`` with the text and the
                ``return_tensors``, ``truncation``, ``padding`` and
                ``max_length`` keywords; must return a mapping with
                ``"input_ids"`` and ``"attention_mask"`` tensors.
            max_length: Length every sample is padded/truncated to.

        Raises:
            KeyError: If the ``ID``, ``title`` or ``text`` column is missing.
        """
        # Parse the text columns as strings explicitly. With inferred dtypes a
        # numeric-looking title/text (or a mixed-type column produced by
        # chunked parsing) makes the string concatenation below raise TypeError.
        frame = pd.read_csv(csv_file, dtype={"title": str, "text": str})
        # Missing cells become empty strings; non-inplace so nothing is mutated
        # behind the reader's back.
        self.data = frame.fillna("")
        self.ids = self.data["ID"].tolist()  # Kept so predictions can be matched to rows
        self.texts = (self.data["title"] + " " + self.data["text"]).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its ID, input IDs and attention mask."""
        text = self.texts[idx]
        encoding = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_length
        )
        return {
            "ID": self.ids[idx],  # Include ID for reference
            # squeeze(0) drops the leading dimension of the tokenizer output
            # (assumed to be a size-1 batch axis for a single text).
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0)
        }

# Function to make predictions using DataLoader
def predict_fake_news(dataloader, classifier=None, target_device=None):
    """Classify every batch from ``dataloader`` and return IDs with their labels.

    Args:
        dataloader: Iterable of batches. Each batch is a mapping with the keys
            ``"ID"``, ``"input_ids"`` and ``"attention_mask"``.
        classifier: Callable accepting ``input_ids`` and ``attention_mask``
            keywords and returning logits. Defaults to the module-level
            ``model``.
        target_device: Device the input tensors are moved to before the
            forward pass. Defaults to the device of the classifier's first
            parameter (CPU if it has none).

    Returns:
        A tuple ``(ids, predictions)`` of two lists in iteration order.
        ``predictions`` holds ``"Real"`` for class 1 and ``"Fake"`` otherwise.
    """
    net = model if classifier is None else classifier
    if target_device is None:
        first_param = next(iter(net.parameters()), None)
        target_device = first_param.device if first_param is not None else torch.device("cpu")

    predictions = []
    ids = []
    with torch.no_grad():
        for batch in dataloader:
            # Inputs must live on the same device as the model; without this
            # the forward pass fails as soon as the model is on a GPU.
            input_ids = batch["input_ids"].to(target_device)
            attention_mask = batch["attention_mask"].to(target_device)
            logits = net(input_ids=input_ids, attention_mask=attention_mask)
            predicted_classes = torch.argmax(logits, dim=1).tolist()
            predictions.extend("Real" if pred == 1 else "Fake" for pred in predicted_classes)

            # Numeric IDs are collated into a tensor; convert them to plain
            # Python values so they are not stored (and later written to CSV)
            # as 0-d tensors. Non-tensor IDs (e.g. strings) pass through as-is.
            batch_ids = batch["ID"]
            if torch.is_tensor(batch_ids):
                batch_ids = batch_ids.tolist()
            ids.extend(batch_ids)
    return ids, predictions

# Input CSV and output CSV both live in the 'dataset' folder
file_path = os.path.join("dataset", "test.csv")
output_file_path = os.path.join("dataset", "predictions.csv")

# Wrap the test set in a DataLoader; shuffle stays off so the original row
# order is kept
dataset = FakeNewsDataset(file_path, tokenizer)
dataloader = DataLoader(dataset, shuffle=False, batch_size=32)

# Classify every article in the test set
ids, predictions = predict_fake_news(dataloader)

# Write one (ID, Prediction) row per article, without the DataFrame index
output_df = pd.DataFrame({"ID": ids, "Prediction": predictions})
output_df.to_csv(output_file_path, index=False)

print(f"Predictions saved to {output_file_path}")


In [None]:
# Single-sentence demo: classify one headline on whichever device the model uses
text = "Breaking news: Scientists have discovered a new planet!"

# Tokenize the sentence and move every resulting tensor onto `device` in one step
inputs = {
    key: val.to(device)
    for key, val in tokenizer(
        text, return_tensors="pt", truncation=True, padding=True, max_length=512
    ).items()
}

# Gradients are not needed for inference
with torch.no_grad():
    logits = model(inputs["input_ids"], inputs["attention_mask"])

# Index of the largest logit, brought back to the CPU as a plain Python int
predicted_class = logits.argmax(dim=1).cpu().item()

if predicted_class == 1:
    print("Prediction:", "Real News")
else:
    print("Prediction:", "Fake News")


In [None]:
#compute and plot confusion matrix
import torch
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import numpy as np

# Hard-coded sample labels (0 = Fake, 1 = Real, matching the tick labels below).
# TODO: replace with the real ground truth and model predictions.
true_labels = [0, 1, 1, 0, 1, 0, 1, 1, 0, 1]  # Ground truth
predicted_labels = [0, 1, 0, 0, 1, 0, 1, 1, 1, 1]  # Model predictions

# Fix the class order explicitly so the matrix is always 2x2 and lines up with
# the tick labels, even if one class never appears in the data.
class_ids = [0, 1]
class_names = ["Fake News", "Real News"]
cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)

# Plot heatmap (rows = true label, columns = predicted label)
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", square=True, cbar=False,
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()