# Loading the data and process

In [1]:
import pandas as pd

# Load the dataset (Ensure the correct path)
df = pd.read_csv("Raw_labeled_with_gemini.csv")

# Display basic info
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 779 entries, 0 to 778
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            779 non-null    object
 1   image            506 non-null    object
 2   published        779 non-null    object
 3   description      775 non-null    object
 4   category         779 non-null    object
 5   author           762 non-null    object
 6   image_relation   779 non-null    object
 7   fake_news_label  779 non-null    object
dtypes: object(8)
memory usage: 48.8+ KB
None
                                               title  \
0  Social Security head replaced after clash with...   
1  Richard Linklater and Ethan Hawke Tease New Mo...   
2  Northpoint Commercial Finance Partners with Ya...   
3  Institutions Can Earn Bitcoin Yield With lstBT...   
4  Your 10-Step Guide to Buying a Home, From Star...   

                                               image  \
0  http

In [None]:

# Standardize column names
df.columns = df.columns.str.strip().str.lower()

# Fill missing values
df["image"].fillna("Unknown", inplace=True)
df["author"].fillna("Unknown", inplace=True)
df["description"].fillna("No description available", inplace=True)

# Fix category format (Convert ['category'] → category)
df["category"] = df["category"].str.replace(r"[\[\]']", "", regex=True)

# Convert 'published' column to datetime format
df["published"] = pd.to_datetime(df["published"], errors="coerce")

# Remove duplicates
df.drop_duplicates(inplace=True)

# Save the cleaned dataset
df.to_csv("cleaned_news_data.csv", index=False)

print(" Preprocessing complete! Cleaned data saved as 'cleaned_news_data.csv'")


✅ Preprocessing complete! Cleaned data saved as 'cleaned_news_data.csv'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["image"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["author"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alwa

# Data Processing and Definig the model


In [None]:
import os
import re
import pandas as pd
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import BertTokenizer, BertModel
from torchvision.models import resnet50, ResNet50_Weights

# Load dataset
file_path = r'C:\Users\rajar\OneDrive\Desktop\New folder\Fake-News\All_Data\Cleaned_news_final1.csv'
df = pd.read_csv(file_path)

# Manual stopwords list
manual_stopwords = set([
    "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", 
    "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", 
    "these", "they", "this", "to", "was", "will", "with", "we", "you", "your"
])

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = ' '.join([word for word in text.split() if word not in manual_stopwords])
    return text

# Apply text preprocessing
df['cleaned_title'] = df['title'].apply(preprocess_text)
df['cleaned_description'] = df['description'].apply(preprocess_text)

# Tokenize text using BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df['tokenized_text'] = df['cleaned_description'].apply(lambda x: tokenizer(x, padding='max_length', truncation=True, max_length=512, return_tensors='pt'))

# Define image transformation pipeline
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Define image directory path
IMAGE_DIR = r"C:\\Users\\rajar\\OneDrive\\Desktop\\New folder\\Fake-News\\All_Data"  

# Function to load and preprocess images
def load_and_preprocess_image(image_filename):
    if isinstance(image_filename, str):
        image_filename = image_filename.replace("\\", "/")  # Convert Windows-style paths
        image_path = os.path.join(IMAGE_DIR, image_filename)
        
        if os.path.exists(image_path):
            image = Image.open(image_path).convert("RGB")  # Ensure all images have 3 channels (RGB)
            return image_transform(image)

    return torch.zeros((3, 224, 224))  # Return a blank image tensor if missing

# Apply image preprocessing
df['processed_image'] = df['image_location'].apply(load_and_preprocess_image)

# Convert labels to numerical format
df['label_fake_news'] = df['fake_news_label'].apply(lambda x: 1 if x.lower() == 'fake' else 0)
df['label_image_relation'] = df['image_relation'].apply(lambda x: 1 if x.lower() == 'yes' else 0)

# Split data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Define Model Training Function
def train_model(model, train_loader, criterion_fake_news, criterion_image_relation, optimizer, device, num_epochs=5):
    model.to(device)
    model.train()
    
    for epoch in range(num_epochs):
        total_loss = 0
        for text_data, image_data, label_fake_news, label_image_relation in train_loader:
            text_input_ids = text_data["input_ids"].squeeze(1).to(device)
            attention_mask = text_data["attention_mask"].squeeze(1).to(device)
            image_data = image_data.to(device)
            label_fake_news = label_fake_news.to(device)
            label_image_relation = label_image_relation.to(device)
            
            optimizer.zero_grad()
            
            output_fake_news, output_image_relation = model(text_input_ids, attention_mask, image_data)
            
            loss_fake_news = criterion_fake_news(output_fake_news.squeeze(), label_fake_news)
            loss_image_relation = criterion_image_relation(output_image_relation.squeeze(), label_image_relation)
            
            loss = loss_fake_news + loss_image_relation  # Total loss
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}")

# Initialize Model, Loss, Optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiModalFakeNewsModel().to(device)
criterion_fake_news = nn.BCELoss()
criterion_image_relation = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Train the Model
train_model(model, train_loader, criterion_fake_news, criterion_image_relation, optimizer, device, num_epochs=5)




Epoch 1/5, Loss: 0.6187
Epoch 2/5, Loss: 0.2868
Epoch 3/5, Loss: 0.1754


In [None]:
# Define Evaluation Function
def evaluate_model(model, dataloader, device):
    model.eval()
    all_preds_fake_news, all_labels_fake_news = [], []
    all_preds_image_relation, all_labels_image_relation = [], []
    
    with torch.no_grad():
        for text_data, image_data, label_fake_news, label_image_relation in dataloader:
            text_input_ids = text_data["input_ids"].squeeze(1).to(device)
            attention_mask = text_data["attention_mask"].squeeze(1).to(device)
            image_data = image_data.to(device)
            label_fake_news = label_fake_news.to(device)
            label_image_relation = label_image_relation.to(device)
            
            output_fake_news, output_image_relation = model(text_input_ids, attention_mask, image_data)
            
            predicted_fake_news = (output_fake_news.squeeze() > 0.5).float()
            predicted_image_relation = (output_image_relation.squeeze() > 0.5).float()
            
            all_preds_fake_news.extend(predicted_fake_news.cpu().numpy())
            all_labels_fake_news.extend(label_fake_news.cpu().numpy())
            
            all_preds_image_relation.extend(predicted_image_relation.cpu().numpy())
            all_labels_image_relation.extend(label_image_relation.cpu().numpy())
    
    # Calculate Metrics
    metrics = {
        "Fake News Accuracy": accuracy_score(all_labels_fake_news, all_preds_fake_news),
        "Fake News Precision": precision_score(all_labels_fake_news, all_preds_fake_news),
        "Fake News Recall": recall_score(all_labels_fake_news, all_preds_fake_news),
        "Fake News F1-score": f1_score(all_labels_fake_news, all_preds_fake_news),
        "Fake News AUC-ROC": roc_auc_score(all_labels_fake_news, all_preds_fake_news),
        "Image Relation Accuracy": accuracy_score(all_labels_image_relation, all_preds_image_relation),
        "Image Relation Precision": precision_score(all_labels_image_relation, all_preds_image_relation),
        "Image Relation Recall": recall_score(all_labels_image_relation, all_preds_image_relation),
        "Image Relation F1-score": f1_score(all_labels_image_relation, all_preds_image_relation),
        "Image Relation AUC-ROC": roc_auc_score(all_labels_image_relation, all_preds_image_relation)
    }
    
    return metrics


Unnamed: 0,title,image,published,description,category,author,image_relation,fake_news_label,image_loc,image_location,cleaned_title,cleaned_description,tokenized_text,processed_image,label_fake_news,label_image_relation
0,Social Security head replaced after clash with...,https://s.yimg.com/ny/api/res/1.2/NpP.f1PQfst9...,2025-02-18 17:47:30+00:00,The acting Social Security Administration comm...,general,Yahoo Finance,Yes,Real,image_0.jpg,Images\image_0.jpg,social security head replaced after clash elon...,acting social security administration commissi...,"[input_ids, token_type_ids, attention_mask]","[[[tensor(1.2728), tensor(1.1358), tensor(1.30...",0,1
1,Richard Linklater and Ethan Hawke Tease New Mo...,https://variety.com/wp-content/uploads/2025/02...,2025-02-18 17:19:25+00:00,Before premiering 'Blue Moon' at Berlin Film F...,entertainment,Ellise Shafer,Yes,Real,image_1.jpg,Images\image_1.jpg,richard linklater ethan hawke tease new movie ...,before premiering blue moon berlin film festiv...,"[input_ids, token_type_ids, attention_mask]","[[[tensor(-2.1179), tensor(-2.1179), tensor(-2...",0,1
2,Institutions Can Earn Bitcoin Yield With lstBT...,https://cdn.benzinga.com/files/images/story/20...,2025-02-18 17:02:27+00:00,"Maple Finance, BitGo, Copper, and Hex Trust te...",finance,Khyathi Dalal,No,Real,image_2.jpeg,Images\image_2.jpeg,institutions can earn bitcoin yield lstbtc whi...,maple finance bitgo copper hex trust team up c...,"[input_ids, token_type_ids, attention_mask]","[[[tensor(-1.8953), tensor(-1.8610), tensor(-1...",0,0
3,"Your 10-Step Guide to Buying a Home, From Star...",https://www.cnet.com/a/img/resize/769d3b87e85c...,2025-02-18 17:00:00+00:00,Buying a home is a multistep process. Before a...,technology,Katherine Watt,Yes,Real,image_3.jpg,Images\image_3.jpg,10 step guide buying home from start finish,buying home multistep process before applying ...,"[input_ids, token_type_ids, attention_mask]","[[[tensor(1.2043), tensor(1.1872), tensor(1.18...",0,1
4,OppFi Stock Deserves A Strong Buy On Its Dip (...,https://static.seekingalpha.com/cdn/s3/uploads...,2025-02-18 15:35:17+00:00,OppFi is a specialty finance firm focusing on ...,"business, finance",Danil Sereda,Yes,Real,image_4.jpg,Images\image_4.jpg,oppfi stock deserves strong buy its dip opfi,oppfi specialty finance firm focusing underser...,"[input_ids, token_type_ids, attention_mask]","[[[tensor(-1.0048), tensor(-0.9705), tensor(-0...",0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,The Assault on Science and Health: Sometimes I...,https://static01.nyt.com/images/2025/02/12/mul...,2025-02-20 20:05:02+00:00,To the Editor:\r\n\r\nRe “Deep Cuts to Medical...,general,nytimes,No,Real,image_494.jpg,Images\image_494.jpg,assault science health sometimes s personal,editor re deep cuts medical research funds cou...,"[input_ids, token_type_ids, attention_mask]","[[[tensor(-0.2171), tensor(-0.2513), tensor(-0...",0,0
276,Exoplanet with iron rain has violent winds 'li...,https://s.yimg.com/ny/api/res/1.2/bYhSJWMJVIaU...,2025-02-18 16:00:00+00:00,Astronomers have discovered a powerful jet str...,general,Robert Lea,Yes,Real,image_496.jpg,Images\image_496.jpg,exoplanet iron rain has violent winds like som...,astronomers have discovered powerful jet strea...,"[input_ids, token_type_ids, attention_mask]","[[[tensor(-1.3130), tensor(-1.3130), tensor(-1...",0,1
277,Censored Science Can't Save Lives,https://static01.nyt.com/images/2025/02/10/opi...,2025-02-18 10:00:53+00:00,Censoring research on how to deliver treatment...,general,Jehan Alladina,Yes,Real,image_497.jpg,Images\image_497.jpg,censored science can t save lives,censoring research how deliver treatments thos...,"[input_ids, token_type_ids, attention_mask]","[[[tensor(-2.1179), tensor(-2.1179), tensor(-2...",0,1
278,An Iowa college gets a $10 million gift from t...,https://media.zenfs.com/en/the-des-moines-regi...,2025-02-17 22:02:16+00:00,"Among other things, a $10 million gift from a ...",general,Des Moines Register,Yes,Real,image_498.jpg,Images\image_498.jpg,iowa college gets 10 million gift from estate ...,among other things 10 million gift from tech i...,"[input_ids, token_type_ids, attention_mask]","[[[tensor(0.5707), tensor(0.8276), tensor(0.57...",0,1
