In [21]:
import os
import torch
import pandas as pd
import open_clip
from PIL import Image
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# Step 1: Load the cleaned dataset
dataset_path = "C:/Users/rajar/OneDrive/Desktop/New folder/Fake-News/politifact_cleaned_filtered.csv"

if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset file not found: {dataset_path}")

df = pd.read_csv(dataset_path)
print(f"✅ Loaded dataset with {len(df)} records")

# Step 2: Ensure dataset filenames are correctly formatted
# Drop missing filenames BEFORE casting to str: astype(str) turns NaN into the
# literal string "nan", after which dropna() can no longer detect the gap.
# .copy() gives an independent frame so the column assignment below does not
# trigger a chained-assignment warning.
df = df.dropna(subset=["Image Filename"]).copy()
df["Image Filename"] = df["Image Filename"].astype(str)

# Step 3: Check if the image directory exists
image_dir = "C:/Users/rajar/OneDrive/Desktop/New folder/Fake-News/politifact_images"

# isdir (not exists) so that a regular file at this path is rejected here
# with a clear message instead of failing later inside os.listdir().
if not os.path.isdir(image_dir):
    raise FileNotFoundError(f"Image directory not found: {image_dir}")

# Step 4: Verify image filenames match actual files
# Keep only rows whose filename is present in the image directory listing;
# the index is reset so rows are numbered 0..len(df)-1 afterwards.
existing_images = set(os.listdir(image_dir))
df = df[df["Image Filename"].isin(existing_images)].reset_index(drop=True)

if len(df) == 0:
    raise ValueError("No valid image filenames match the dataset")

print(f"✅ Verified {len(df)} valid image records with matching images")

# Step 5: Load CLIP Model for Image Embeddings
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = open_clip.create_model("ViT-B-32-quickgelu", pretrained="openai").to(device)
# The model is used for inference only; switch it to eval mode explicitly so
# any train-time-only layers (dropout, batch norm, stochastic depth) are inactive.
clip_model.eval()
# Evaluation-time (is_train=False) preprocessing sized to the model's visual input.
preprocess = open_clip.image_transform(clip_model.visual.image_size, is_train=False)
print("✅ CLIP model loaded successfully")

# Step 6: Load BERT Model for Text Embeddings
# (the checkpoint loaded here is "distilbert-base-uncased")
bert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
bert_model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)
# Explicit eval mode for inference, mirroring the CLIP model above.
bert_model.eval()
print("✅ BERT model loaded successfully")

# Step 7: Define Image Processing Function
def get_image_embedding(image_path):
    """Compute an image embedding for the file at ``image_path``.

    The image is opened, converted to RGB, run through the module-level
    ``preprocess`` transform and encoded with ``clip_model.encode_image``
    under ``torch.no_grad()``.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        The encoder output as a flattened NumPy array, or ``None`` when the
        file does not exist or any exception is raised while processing it
        (a message is printed in both cases).
    """
    if not os.path.exists(image_path):
        print(f"❌ Image file missing: {image_path}")
        return None
    try:
        # Use a context manager so the underlying file handle is always
        # released; convert() returns a new, fully loaded image, so it stays
        # usable after the source file is closed.
        with Image.open(image_path) as source_image:
            rgb_image = source_image.convert("RGB")
        # unsqueeze(0) adds the leading batch dimension before encoding.
        image_tensor = preprocess(rgb_image).unsqueeze(0).to(device)
        with torch.no_grad():
            image_embedding = clip_model.encode_image(image_tensor).cpu().numpy().flatten()
        return image_embedding
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this image.
        print(f"❌ Error processing image {image_path}: {e}")
        return None

# Step 8: Define Text Processing Function
def get_text_embedding(text):
    """Compute a text embedding for ``text``.

    The text is tokenized with the module-level ``bert_tokenizer``
    (truncated to at most 512 tokens), passed through ``bert_model`` under
    ``torch.no_grad()``, and the hidden state at sequence position 0 is used
    as the embedding.

    Args:
        text: The text to embed.

    Returns:
        The position-0 hidden state as a flattened NumPy array, or ``None``
        if any exception is raised (the error is printed).
    """
    try:
        encoded = bert_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )
        encoded = encoded.to(device)
        with torch.no_grad():
            hidden_states = bert_model(**encoded).last_hidden_state
        # Keep only the first token position of every sequence in the batch.
        first_token_state = hidden_states[:, 0, :]
        return first_token_state.cpu().numpy().flatten()
    except Exception as e:
        print(f"❌ Error processing text: {e}")
        return None

# Step 9: Extract Features
image_features = []
text_features = []
# Positional row numbers (within df) for which BOTH embeddings succeeded.
# Needed so the feature rows can be re-attached to exactly the rows they
# were computed from, even when some rows are skipped.
kept_positions = []
failed_images = []
failed_texts = []

for position, (_index, row) in enumerate(
    tqdm(df.iterrows(), total=len(df), desc="Extracting Features")
):
    image_path = os.path.join(image_dir, row["Image Filename"])

    # Debugging: Check if the image file exists before processing
    if not os.path.exists(image_path):
        print(f"❌ Image file not found: {image_path}")
        failed_images.append(image_path)
        continue

    # Extract image features
    image_feat = get_image_embedding(image_path)
    if image_feat is None:
        print(f"⚠️ Skipping image {image_path} due to extraction error")
        failed_images.append(image_path)
        continue

    # Extract text features
    text_feat = get_text_embedding(row["Claim"])
    if text_feat is None:
        print(f"⚠️ Skipping text embedding for Claim: {row['Claim']}")
        failed_texts.append(row["Claim"])
        continue

    # Append extracted features (only when both embeddings are available, so
    # the two lists and kept_positions always stay the same length)
    image_features.append(image_feat)
    text_features.append(text_feat)
    kept_positions.append(position)

print(f"✅ Extracted {len(image_features)} image features and {len(text_features)} text features")
print(f"❌ {len(failed_images)} images failed, {len(failed_texts)} text embeddings failed")

# Step 10: Convert extracted features to DataFrame
if image_features and text_features:
    image_features_df = pd.DataFrame(image_features, columns=[f"img_feat_{i}" for i in range(len(image_features[0]))])
    text_features_df = pd.DataFrame(text_features, columns=[f"text_feat_{i}" for i in range(len(text_features[0]))])

    # Step 11: Merge with Original Dataset
    # Restrict df to the rows that actually produced features before the
    # column-wise concat. Concatenating against the full df would shift every
    # embedding after a skipped row onto the wrong record and leave trailing
    # rows filled with NaN.
    df_kept = df.iloc[kept_positions].reset_index(drop=True)
    assert len(df_kept) == len(image_features_df) == len(text_features_df)
    df_features = pd.concat([df_kept, image_features_df, text_features_df], axis=1)

    # Step 12: Save Final Features Dataset
    feature_dataset_path = "C:/Users/rajar/OneDrive/Desktop/New folder/Fake-News/politifact_features.csv"
    df_features.to_csv(feature_dataset_path, index=False)

    print("✅ Feature Extraction Complete")
    print(f"📂 Dataset saved as: {feature_dataset_path}")
else:
    print("❌ No valid embeddings extracted. Please check logs above for errors.")


✅ Loaded dataset with 254 records
✅ Verified 254 valid image records with matching images
✅ CLIP model loaded successfully
✅ BERT model loaded successfully


Extracting Features: 100%|██████████| 254/254 [00:18<00:00, 13.69it/s]


✅ Extracted 254 image features and 254 text features
❌ 0 images failed, 0 text embeddings failed
✅ Feature Extraction Complete
📂 Dataset saved as: C:/Users/rajar/OneDrive/Desktop/New folder/Fake-News/politifact_features.csv


In [22]:
# Display the merged table built in the previous cell: the dataset columns
# followed by the img_feat_* and text_feat_* embedding columns.
# Note: df_features only exists if that cell extracted at least one embedding.
df_features

Unnamed: 0,Claimant,Claim,Date,URL,Source,Full Article,Rating,Image Filename,Image URL,CLIP_Label,...,text_feat_758,text_feat_759,text_feat_760,text_feat_761,text_feat_762,text_feat_763,text_feat_764,text_feat_765,text_feat_766,text_feat_767
0,"Social Media\nstated on March 5, 2025 in in so...","""Apple quietly removed International Women’s D...","stated on March 5, 2025 in in social media posts:",https://www.politifact.com/factchecks/2025/mar...,,Social media users complained that during a mo...,False,image_0.jpg,https://static.politifact.com/politifact/photo...,Related,...,0.012678,-0.100333,0.147155,-0.177870,0.268346,-0.162729,-0.029706,-0.269903,0.249885,0.268251
1,"Laura Loomer\nstated on February 24, 2025 in a...",U.S. Agency for International Development subs...,"stated on February 24, 2025 in an X post:",https://www.politifact.com/factchecks/2025/mar...,,Soon after MSNBC announced it would cancel Joy...,False,image_1.jpg,https://static.politifact.com/CACHE/images/pol...,Unrelated,...,0.073323,-0.357742,0.076304,-0.078982,0.220347,-0.331839,-0.061583,-0.093297,0.477254,0.341540
2,"Social Media\nstated on February 25, 2025 in s...",U.S. Rep. Nancy Pelosi’s vineyard received $14...,"stated on February 25, 2025 in social media po...",https://www.politifact.com/factchecks/2025/mar...,,"Rep. Nancy Pelosi, D-Calif., isn’t being paid ...",False,image_2.jpg,https://archive.ph/Tnx6A/dd104dec18290bdfc7b62...,Related,...,0.148269,-0.123658,0.098619,-0.197032,0.137140,-0.094740,0.088639,-0.131566,0.166093,0.191880
3,"Larry Kudlow\nstated on March 4, 2025 in on-ai...",Elon Musk’s Department of Government Efficienc...,"stated on March 4, 2025 in on-air remarks:",https://www.politifact.com/factchecks/2025/mar...,,As Fox News viewers waited to hear President D...,False,image_3.jpg,https://static.politifact.com/CACHE/images/pol...,Related,...,-0.083076,-0.077016,0.044324,-0.113718,0.310322,-0.039914,0.055144,-0.070009,0.305845,0.284415
4,"Donald Trump\nstated on March 4, 2025 in an ad...",The Trump administration identified $1.9 billi...,"stated on March 4, 2025 in an address to a joi...",https://www.politifact.com/factchecks/2025/mar...,,"During his March 4 address to Congress, Presid...",False,image_4.jpg,https://static.politifact.com/CACHE/images/pol...,Unrelated,...,0.216485,-0.049063,-0.023716,-0.126519,0.345962,-0.146998,0.013425,-0.094037,0.232764,0.289571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,"Threads posts\nstated on November 1, 2024 in a...","Says Liz Cheney said, “Donald Trump is now lit...","stated on November 1, 2024 in a Threads post:",https://www.politifact.com/factchecks/2024/nov...,,Former President Donald Trump recently said fo...,False,image_271.jpg,https://static.politifact.com/CACHE/images/pol...,Related,...,-0.003106,0.126125,0.112931,-0.010949,0.217905,-0.086458,-0.075832,-0.136058,0.288699,0.366983
250,"Threads posts\nstated on November 3, 2024 in a...",Former President Donald Trump posted on Truth ...,"stated on November 3, 2024 in a Threads post:",https://www.politifact.com/factchecks/2024/nov...,,After a viral moment with his microphone stand...,False,image_272.jpg,https://static01.nyt.com/images/2024/11/02/mul...,Related,...,0.227678,-0.037211,0.096696,-0.100100,0.273724,-0.158404,0.016957,0.071021,0.244668,0.272161
251,"Instagram posts\nstated on November 3, 2024 in...",Video shows Kamala Harris has a “drinking prob...,"stated on November 3, 2024 in an Instagram post:",https://www.politifact.com/factchecks/2024/nov...,,Social media users shared an altered video cli...,False,image_273.jpg,https://static.politifact.com/CACHE/images/pol...,Related,...,0.146457,-0.367928,-0.039199,-0.065122,-0.025221,0.022258,-0.170861,-0.274011,0.136101,0.089287
252,"Instagram posts\nstated on October 31, 2024 in...",The FBI seized a video of former Secretary of ...,"stated on October 31, 2024 in a post:",https://www.politifact.com/factchecks/2024/nov...,,A federal indictment alleging music magnate Se...,False,image_274.jpg,https://mediaproxy.snopes.com/width/1200/https...,Related,...,0.101182,-0.219191,0.109563,-0.204170,-0.020743,-0.185645,0.013800,-0.123612,0.341433,0.327575
