# Loading the data and pre process

In [3]:
import os
import re
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import BertTokenizer, BertModel
from torchvision.models import resnet50, ResNet50_Weights

# Load dataset
file_path = '/content/drive/MyDrive/All_Data/Cleaned_news_final1.csv'
df = pd.read_csv(file_path)

# Manual stopwords list
manual_stopwords = set([
    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is",
    "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there",
    "these", "they", "this", "to", "was", "will", "with", "we", "you", "your"
])

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = ' '.join([word for word in text.split() if word not in manual_stopwords])
    return text

# Apply text preprocessing
df['cleaned_title'] = df['title'].apply(preprocess_text)
df['cleaned_description'] = df['description'].apply(preprocess_text)

# Tokenize text using BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df['tokenized_text'] = df['cleaned_description'].apply(lambda x: tokenizer(x, padding='max_length', truncation=True, max_length=512, return_tensors='pt'))

# Define image transformation pipeline
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Define image directory path
IMAGE_DIR = '/content/drive/MyDrive/All_Data'

# Function to load and preprocess images
def load_and_preprocess_image(image_filename):
    if isinstance(image_filename, str):
        image_filename = image_filename.replace("\\", "/")  # Convert Windows-style paths
        image_path = os.path.join(IMAGE_DIR, image_filename)

        if os.path.exists(image_path):
            image = Image.open(image_path).convert("RGB")  # Ensure all images have 3 channels (RGB)
            return image_transform(image)

    return torch.zeros((3, 224, 224))  # Return a blank image tensor if missing

# Apply image preprocessing
df['processed_image'] = df['image_location'].apply(load_and_preprocess_image)

# Convert labels to numerical format
df['label_fake_news'] = df['fake_news_label'].apply(lambda x: 1 if x.lower() == 'fake' else 0)
df['label_image_relation'] = df['image_relation'].apply(lambda x: 1 if x.lower() == 'yes' else 0)

# Handle class imbalance with oversampling
ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
train_df_resampled, _ = ros.fit_resample(df, df['label_fake_news'])

# Split data into train and test sets
train_df, test_df = train_test_split(train_df_resampled, test_size=0.2, random_state=42)

# Compute class weights for imbalanced dataset
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=train_df['label_fake_news'].values)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Define Multimodal Model with Dual Output
class MultiModalFakeNewsModel(nn.Module):
    def __init__(self):
        super(MultiModalFakeNewsModel, self).__init__()

        # Text Model (BERT)
        self.text_model = BertModel.from_pretrained("bert-base-uncased")
        self.text_fc = nn.Linear(768, 256)

        # Image Model (ResNet-50)
        self.image_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        self.image_model.fc = nn.Linear(self.image_model.fc.in_features, 256)

        # Fusion Layer
        self.fusion_fc = nn.Linear(512, 256)

        # Output Layers
        self.fc_fake_news = nn.Linear(256, 1)  # Fake News Classification
        self.fc_image_relation = nn.Linear(256, 1)  # Image-Text Relationship Classification

        self.sigmoid = nn.Sigmoid()

    def forward(self, text_input, attention_mask, image_input):
        text_features = self.text_model(text_input, attention_mask=attention_mask).pooler_output
        text_features = self.text_fc(text_features)

        image_features = self.image_model(image_input)

        combined = torch.cat((text_features, image_features), dim=1)
        fused_features = self.fusion_fc(combined)

        output_fake_news = self.fc_fake_news(fused_features)
        output_image_relation = self.fc_image_relation(fused_features)

        return self.sigmoid(output_fake_news), self.sigmoid(output_image_relation)
# Define Model Training Function
def train_model(model, train_loader, criterion_fake_news, criterion_image_relation, optimizer, device, num_epochs=10):
    model.to(device)
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0
        for text_data, image_data, label_fake_news, label_image_relation in train_loader:
            text_input_ids = text_data["input_ids"].squeeze(1).to(device)
            attention_mask = text_data["attention_mask"].squeeze(1).to(device)
            image_data = image_data.to(device)
            label_fake_news = label_fake_news.to(device)
            label_image_relation = label_image_relation.to(device)

            optimizer.zero_grad()

            output_fake_news, output_image_relation = model(text_input_ids, attention_mask, image_data)

            loss_fake_news = criterion_fake_news(output_fake_news.squeeze(), label_fake_news)
            loss_image_relation = criterion_image_relation(output_image_relation.squeeze(), label_image_relation)

            loss = loss_fake_news + loss_image_relation  # Total loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")

# Define PyTorch Dataset class
class NewsDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text_data = self.data.iloc[idx]['tokenized_text']
        image_data = self.data.iloc[idx]['processed_image']
        label_fake_news = torch.tensor(self.data.iloc[idx]['label_fake_news'], dtype=torch.float)
        label_image_relation = torch.tensor(self.data.iloc[idx]['label_image_relation'], dtype=torch.float)
        return text_data, image_data, label_fake_news, label_image_relation

# Create DataLoaders
train_dataset = NewsDataset(train_df)
test_dataset = NewsDataset(test_df)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Initialize Model, Loss, Optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiModalFakeNewsModel().to(device)
criterion_fake_news = nn.BCEWithLogitsLoss(pos_weight=class_weights[1])
criterion_image_relation = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Train the Model
train_model(model, train_loader, criterion_fake_news, criterion_image_relation, optimizer, device, num_epochs=10)





Epoch 1/10, Loss: 0.9316
Epoch 2/10, Loss: 0.6653
Epoch 3/10, Loss: 0.5679
Epoch 4/10, Loss: 0.5411
Epoch 5/10, Loss: 0.5255
Epoch 6/10, Loss: 0.5244
Epoch 7/10, Loss: 0.5107
Epoch 8/10, Loss: 0.5186
Epoch 9/10, Loss: 0.5154
Epoch 10/10, Loss: 0.5147


In [4]:
# Evaluate Model
# Define Model Evaluation Function
def evaluate_model(model, dataloader, device):
    model.eval()
    all_preds_fake_news, all_labels_fake_news = [], []
    all_preds_image_relation, all_labels_image_relation = [], []

    with torch.no_grad():
        for text_data, image_data, label_fake_news, label_image_relation in dataloader:
            text_input_ids = text_data["input_ids"].squeeze(1).to(device)
            attention_mask = text_data["attention_mask"].squeeze(1).to(device)
            image_data = image_data.to(device)
            label_fake_news = label_fake_news.to(device)
            label_image_relation = label_image_relation.to(device)

            output_fake_news, output_image_relation = model(text_input_ids, attention_mask, image_data)

            predicted_fake_news = (output_fake_news.squeeze() > 0.5).float()
            predicted_image_relation = (output_image_relation.squeeze() > 0.5).float()

            all_preds_fake_news.extend(predicted_fake_news.cpu().numpy())
            all_labels_fake_news.extend(label_fake_news.cpu().numpy())

            all_preds_image_relation.extend(predicted_image_relation.cpu().numpy())
            all_labels_image_relation.extend(label_image_relation.cpu().numpy())

    # Calculate Metrics
    metrics = {
        "Fake News Accuracy": accuracy_score(all_labels_fake_news, all_preds_fake_news),
        "Fake News Precision": precision_score(all_labels_fake_news, all_preds_fake_news, zero_division=1),
        "Fake News Recall": recall_score(all_labels_fake_news, all_preds_fake_news, zero_division=1),
        "Fake News F1-score": f1_score(all_labels_fake_news, all_preds_fake_news, zero_division=1),
        "Fake News AUC-ROC": roc_auc_score(all_labels_fake_news, all_preds_fake_news),
        "Image Relation Accuracy": accuracy_score(all_labels_image_relation, all_preds_image_relation),
        "Image Relation Precision": precision_score(all_labels_image_relation, all_preds_image_relation, zero_division=1),
        "Image Relation Recall": recall_score(all_labels_image_relation, all_preds_image_relation, zero_division=1),
        "Image Relation F1-score": f1_score(all_labels_image_relation, all_preds_image_relation, zero_division=1),
        "Image Relation AUC-ROC": roc_auc_score(all_labels_image_relation, all_preds_image_relation)
    }

    return metrics

evaluation_results = evaluate_model(model, test_loader, device)
for metric, value in evaluation_results.items():
    print(f"{metric}: {value:.4f}")


Fake News Accuracy: 1.0000
Fake News Precision: 1.0000
Fake News Recall: 1.0000
Fake News F1-score: 1.0000
Fake News AUC-ROC: 1.0000
Image Relation Accuracy: 0.9273
Image Relation Precision: 0.9286
Image Relation Recall: 0.9750
Image Relation F1-score: 0.9512
Image Relation AUC-ROC: 0.8875
