In [6]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import requests
from PIL import Image
from io import BytesIO
from transformers import CLIPProcessor, CLIPModel
import matplotlib.pyplot as plt
from tqdm import tqdm
import torch.nn as nn

# ==============================
# Load and Preprocess Dataset
# ==============================

# Load dataset from CSV
data_str = "/content/politifact_articles_with_images (1).csv"

def load_dataset(file_path):
    """Load dataset from CSV or Excel."""
    if file_path.endswith('.csv'):
        df = pd.read_csv(file_path)
    elif file_path.endswith('.xlsx'):
        df = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format. Use CSV or Excel.")

    return df

# Preprocess the dataset
def preprocess_dataset(df):
    """Clean and map ratings to labels."""
    if 'Claim' in df.columns:
        df['Claim'] = df['Claim'].str.replace('â€œ', '"').str.replace('â€', '"')

    rating_map = {
        'TRUE': 1,
        'MOSTLY TRUE': 0.75,
        'HALF TRUE': 0.5,
        'MOSTLY FALSE': 0.25,
        'FALSE': 0,
        'PANTS ON FIRE': 0
    }

    if 'Rating' in df.columns:
        df['Rating_numeric'] = df['Rating'].map(rating_map)
        df['Label'] = df['Rating'].apply(lambda x: 1 if x in ['TRUE', 'MOSTLY TRUE'] else 0)

    return df

# Download images
def download_images(df, image_dir='./images'):
    """Download images from URLs and save them locally."""
    os.makedirs(image_dir, exist_ok=True)
    image_paths = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Downloading images"):
        image_url = row.get('Image URL', None)

        if pd.isna(image_url) or image_url is None or image_url == 'N/A' or image_url == 'nan':
            image_paths.append(None)
            continue

        try:
            image_filename = f"{idx}.jpg"
            image_path = os.path.join(image_dir, image_filename)

            if not os.path.exists(image_path):
                response = requests.get(image_url)
                img = Image.open(BytesIO(response.content))
                img.save(image_path)

            image_paths.append(image_path)
        except Exception as e:
            print(f"Error downloading image {image_url}: {e}")
            image_paths.append(None)

    df['local_image_path'] = image_paths
    return df


# ==============================
# CLIP Classifier
# ==============================

class CLIPClassifier(nn.Module):
    def __init__(self, clip_model_name="openai/clip-vit-base-patch32", num_classes=2):
        super(CLIPClassifier, self).__init__()
        self.clip = CLIPModel.from_pretrained(clip_model_name)

        # Freeze CLIP parameters
        for param in self.clip.parameters():
            param.requires_grad = False

        hidden_size = self.clip.config.projection_dim

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, num_classes)
        )

    def forward(self, input_ids=None, attention_mask=None, pixel_values=None, return_loss=False, labels=None):
        outputs = self.clip(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)

        text_embeds = outputs.text_embeds
        logits = self.classifier(text_embeds)

        loss = None
        if return_loss and labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)

        return logits, loss


# ==============================
# PyTorch Dataset
# ==============================

class FakeNewsDataset(Dataset):
    def __init__(self, dataframe, processor, text_column='Claim', label_column='Label', image_column='local_image_path'):
        self.dataframe = dataframe
        self.processor = processor
        self.text_column = text_column
        self.label_column = label_column
        self.image_column = image_column

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        text = row[self.text_column]
        label = row[self.label_column]
        image_path = row[self.image_column]

        if pd.isna(image_path) or image_path is None:
            image = Image.new('RGB', (224, 224), color='white')
        else:
            try:
                image = Image.open(image_path).convert('RGB')
            except:
                image = Image.new('RGB', (224, 224), color='white')

        inputs = self.processor(
            text=text,
            images=image,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=77
        )

        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs['labels'] = torch.tensor(label, dtype=torch.long)

        return inputs


# ==============================
# Model Training
# ==============================

def train_vlm(model, train_loader, val_loader, device, num_epochs=3, learning_rate=5e-5):
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    training_stats = {'train_loss': [], 'val_loss': [], 'val_acc': []}

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch.pop('labels')

            logits, _ = model(**batch, return_loss=False)
            loss = criterion(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / len(train_loader)
        training_stats['train_loss'].append(avg_train_loss)

        model.eval()
        total_val_loss = 0
        all_preds, all_labels = [], []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
                batch = {k: v.to(device) for k, v in batch.items()}
                labels = batch.pop('labels')

                logits, _ = model(**batch, return_loss=False)
                loss = criterion(logits, labels)

                total_val_loss += loss.item()
                preds = torch.argmax(logits, dim=1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = total_val_loss / len(val_loader)
        val_acc = accuracy_score(all_labels, all_preds)

        training_stats['val_loss'].append(avg_val_loss)
        training_stats['val_acc'].append(val_acc)

        print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.4f}")

    return model, training_stats


# ==============================
# Execute Workflow
# ==============================

# Load and preprocess the dataset
df = load_dataset(data_str)
df = preprocess_dataset(df)
df = download_images(df)

# Split dataset
train_df, val_df = train_test_split(df, test_size=0.3, random_state=42)

# Initialize the model and processor
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPClassifier()

# Create datasets and loaders
train_dataset = FakeNewsDataset(train_df, processor)
val_dataset = FakeNewsDataset(val_df, processor)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train the model
model, training_stats = train_vlm(model, train_loader, val_loader, device)


Downloading images: 100%|██████████| 3600/3600 [13:55<00:00,  4.31it/s]
Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Epoch 1/3 - Training:   0%|          | 0/630 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Epoch 1/3 - Training: 100%|██████████| 630/630 [00:40<00:00, 15.57it/s]
Epoch 1/3 - Validation: 100%|██████████| 270/270 [00:16<00:00, 16.76it/s]


Epoch 1 - Train Loss: 0.3040, Val Loss: 0.0499, Val Acc: 1.0000


Epoch 2/3 - Training: 100%|██████████| 630/630 [00:37<00:00, 16.78it/s]
Epoch 2/3 - Validation: 100%|██████████| 270/270 [00:16<00:00, 16.73it/s]


Epoch 2 - Train Loss: 0.0201, Val Loss: 0.0084, Val Acc: 1.0000


Epoch 3/3 - Training: 100%|██████████| 630/630 [00:37<00:00, 16.59it/s]
Epoch 3/3 - Validation: 100%|██████████| 270/270 [00:16<00:00, 16.61it/s]

Epoch 3 - Train Loss: 0.0047, Val Loss: 0.0029, Val Acc: 1.0000





In [7]:
def predict_fake_news(model, processor, text, image_path=None, device="cuda"):
    """
    Predict whether a news item is fake (0) or real (1).

    Args:
        model: Trained CLIPClassifier
        processor: CLIPProcessor
        text: News claim (str)
        image_path: Path to image (optional)
        device: "cuda" or "cpu"
    Returns:
        dict: {"prediction": 0/1, "confidence": float, "class": "Fake"/"Real"}
    """
    model.eval()
    model.to(device)

    # Handle missing image
    if image_path is None or not os.path.exists(image_path):
        image = Image.new('RGB', (224, 224), color='white')
    else:
        image = Image.open(image_path).convert('RGB')

    # Preprocess inputs
    inputs = processor(
        text=text,
        images=image,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=77
    ).to(device)

    # Predict
    with torch.no_grad():
        logits, _ = model(**inputs, return_loss=False)
        probs = torch.softmax(logits, dim=1)
        pred_class = torch.argmax(probs).item()
        confidence = probs[0][pred_class].item()

    return {
        "prediction": pred_class,
        "confidence": confidence,
        "class": "Real" if pred_class == 1 else "Fake"
    }

In [8]:
# Example 1: Text + Image
result = predict_fake_news(
    model=model,
    processor=processor,
    text="Audio shows a BBC reporter making a vulgar comment about an Oval Office meeting between President Donald Trump and Ukraine President Volodymyr Zelenskyy.",
    image_path="https://static.politifact.com/img/meter-pants-fire.jpg",  # Replace with actual path
    device=device
)
print(result)

result = predict_fake_news(
    model=model,
    processor=processor,
    text= "Wisconsin does not require judges to automatically recuse just because they have done some kind of legal work in the past as a lawyer" ,
    image_path="https://static.politifact.com/img/meter-true.jpg",  # Replace with actual path
    device=device
)
print(result)


{'prediction': 0, 'confidence': 0.9944514632225037, 'class': 'Fake'}
{'prediction': 0, 'confidence': 0.9979324340820312, 'class': 'Fake'}
