In [None]:
pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [None]:
from IPython import get_ipython
from IPython.display import display
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torchvision import transforms
from PIL import Image
import io # Import the io module
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import timm
import pandas as pd
from datasets import load_dataset # Import load_dataset

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)


In [None]:
# Later cells read parquet files straight from the hub cache under
# .../snapshots/b5a7e734850b4ec623ddee018a1d9e097fe248ef/..., so pin the download to
# that exact dataset commit; otherwise a new upstream commit changes the snapshot
# directory name and those hard-coded paths stop existing.
MIRAGE_REVISION = "b5a7e734850b4ec623ddee018a1d9e097fe248ef"

ds = load_dataset(
    "anson-huang/mirage-news",
    revision=MIRAGE_REVISION,
    # NOTE(review): "force_redownload" re-fetches every shard (~2 GB per the progress
    # output) on each run; drop it once the local cache is known to be healthy.
    download_mode="force_redownload",
)

README.md:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/655M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/143M [00:00<?, ?B/s]

test1_nyt_mj-00000-of-00001.parquet:   0%|          | 0.00/20.2M [00:00<?, ?B/s]

test2_bbc_dalle-00000-of-00002.parquet:   0%|          | 0.00/560M [00:00<?, ?B/s]

test2_bbc_dalle-00001-of-00002.parquet:   0%|          | 0.00/19.0M [00:00<?, ?B/s]

test3_cnn_dalle-00000-of-00002.parquet:   0%|          | 0.00/559M [00:00<?, ?B/s]

test3_cnn_dalle-00001-of-00002.parquet:   0%|          | 0.00/25.8M [00:00<?, ?B/s]

test4_bbc_sdxl-00000-of-00001.parquet:   0%|          | 0.00/46.0M [00:00<?, ?B/s]

test5_cnn_sdxl-00000-of-00001.parquet:   0%|          | 0.00/54.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2500 [00:00<?, ? examples/s]

Generating test1_nyt_mj split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test2_bbc_dalle split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test3_cnn_dalle split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test4_bbc_sdxl split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test5_cnn_sdxl split:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
import pandas as pd

# Class-balance check on the cached *train* shard. The frame keeps its original name
# `test_df` even though the file it reads is the training split.
_train_parquet_path = '/root/.cache/huggingface/hub/datasets--anson-huang--mirage-news/snapshots/b5a7e734850b4ec623ddee018a1d9e097fe248ef/data/train-00000-of-00001.parquet'
test_df = pd.read_parquet(_train_parquet_path)
_label_counts = test_df['label'].value_counts()
print(_label_counts)

label
1    5000
0    5000
Name: count, dtype: int64


In [None]:
class MIRAGE_Ensemble_Dataset(Dataset):
    """Paired text/image dataset backed by a single parquet file.

    The parquet file must provide ``text``, ``image`` and ``label`` columns; rows
    with a missing value in any of them are dropped at construction time.

    Args:
        parquet_path: Path of the parquet file to read.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that accepts
            the keyword arguments used in ``__getitem__`` and returns a mapping
            with ``input_ids`` and ``attention_mask`` tensors of shape (1, L).
        max_token_len: Length every text is truncated / padded to.
        image_transform: Optional callable applied to the decoded PIL image.

    Each item is a dict with keys ``input_ids``, ``attention_mask``, ``image``
    and ``labels`` (a 0-dim ``torch.long`` tensor). When an image cannot be
    decoded, a black 224x224 placeholder is substituted so batching never fails.
    If an image decodes and no ``image_transform`` is set, the PIL image is
    returned unchanged (such items cannot be batched by the default collate).
    """

    def __init__(self, parquet_path, tokenizer, max_token_len=128, image_transform=None):
        self.data = pd.read_parquet(parquet_path)
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.image_transform = image_transform
        # Filter out rows with missing text, image, or label. Rows are addressed by
        # position (iloc) afterwards, so the index does not need resetting.
        self.data = self.data.dropna(subset=['text', 'image', 'label'])

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _open_rgb(source):
        """Open ``source`` (a path or file-like object) and return an RGB copy.

        The context manager closes the underlying file handle once the pixel
        data has been converted, so long-running loaders do not leak handles.
        """
        with Image.open(source) as img:
            return img.convert('RGB')

    def _load_image(self, image_data, idx):
        """Decode one ``image`` cell; return a PIL image, or None after a warning."""
        if isinstance(image_data, dict):
            try:
                image_bytes = image_data['bytes']
                if image_bytes is not None:
                    return self._open_rgb(io.BytesIO(image_bytes))
                # Presumably a {'bytes': None, 'path': ...} record (the layout
                # Hugging Face image columns use) - fall back to the file path.
                return self._open_rgb(image_data['path'])
            except Exception as e:
                print(f"[Warning] Failed to load image from bytes at index {idx}: {e}")
        elif isinstance(image_data, str):
            try:
                return self._open_rgb(image_data)
            except FileNotFoundError:
                print(f"[Warning] Image file not found at path {image_data} (index {idx}).")
            except Exception as e:
                # Corrupt or unreadable files must not kill the whole DataLoader
                # worker; degrade to the placeholder like the bytes branch does.
                print(f"[Warning] Failed to load image from path {image_data} (index {idx}): {e}")
        else:
            print(f"[Warning] Unexpected image data type at index {idx}: {type(image_data)}")
        return None

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = str(row['text'])
        label = torch.tensor(row['label'], dtype=torch.long)

        # Tokenize text. Calling the tokenizer directly is the documented entry
        # point; `encode_plus` is deprecated in favour of it.
        tokens = self.tokenizer(
            text,
            max_length=self.max_token_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
            return_attention_mask=True,
            add_special_tokens=True,
        )
        # return_tensors='pt' yields a leading batch axis of size 1; drop it.
        input_ids = tokens['input_ids'].squeeze(0)
        attention_mask = tokens['attention_mask'].squeeze(0)

        image = self._load_image(row['image'], idx)

        if image is None:
            # Fallback: push a black image through the same transform so the
            # placeholder matches real samples; without a transform, emit zeros.
            if self.image_transform:
                image = self.image_transform(Image.new('RGB', (224, 224)))
            else:
                image = torch.zeros((3, 224, 224), dtype=torch.float32)
        elif self.image_transform:
            image = self.image_transform(image)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'image': image,
            'labels': label,
        }


In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel

class TextModelWrapper(nn.Module):
    """Binary text classifier: a Hugging Face encoder plus a small MLP head.

    The encoder is loaded with ``AutoModel.from_pretrained``; the ``hidden`` and
    ``classifier`` layers are newly created ``nn.Linear`` modules, so they start
    from random initialisation regardless of the checkpoint.

    Args:
        hf_model_name: Model id or path passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, hf_model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(hf_model_name, return_dict=True)
        hidden_size = self.model.config.hidden_size
        self.hidden = nn.Linear(hidden_size, hidden_size)
        self.classifier = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout()
        self.loss_func = nn.BCEWithLogitsLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and head.

        Args:
            input_ids: Token ids, forwarded to the encoder.
            attention_mask: Mask forwarded to the encoder (1 = real token); also
                used to exclude padding when mean pooling is needed.
            labels: Optional targets; when given, a BCE-with-logits loss is computed.

        Returns:
            ``(loss, logits)`` where ``logits`` has a trailing dimension of 1 and
            ``loss`` is None when ``labels`` is None.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Prefer the encoder's own pooled representation when it provides one.
        pooled_output = getattr(outputs, 'pooler_output', None)
        if pooled_output is None:
            hidden_states = outputs.last_hidden_state
            if attention_mask is None:
                pooled_output = hidden_states.mean(dim=1)
            else:
                # Average only over real tokens: a plain mean would mix padding
                # positions into the sentence vector, making it depend on how
                # much padding the batch happens to need.
                mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
                token_count = mask.sum(dim=1).clamp(min=1.0)
                pooled_output = (hidden_states * mask).sum(dim=1) / token_count

        pooled_output = self.dropout(pooled_output)
        pooled_output = torch.relu(self.hidden(pooled_output))
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # BCEWithLogitsLoss expects float targets with the same shape as the logits.
            loss = self.loss_func(logits.squeeze(1), labels.float())

        return loss, logits


In [None]:
class ImageModelWrapper(nn.Module):
    """ViT backbone (timm ``vit_base_patch16_224``) with a one-logit linear head.

    Construction tries to download ``pytorch_model.bin`` for ``hf_model_name``
    from huggingface.co and load it; any failure is reported on stdout and the
    module keeps its default-initialised weights.

    Args:
        hf_model_name: Hugging Face repo id used to build the checkpoint URL.
        device: ``map_location`` used when loading the downloaded checkpoint.
    """

    def __init__(self, hf_model_name, device='cpu'):
        super().__init__()
        # num_classes=0 asks timm for the bare backbone, without a classification head.
        self.vit = timm.create_model('vit_base_patch16_224', pretrained=False, num_classes=0)
        self.classifier = nn.Linear(self.vit.num_features, 1)
        self.dropout = nn.Dropout()
        self.loss_func = nn.BCEWithLogitsLoss()

        # Best-effort restore: every error is reported, never raised.
        try:
            self._restore_from_hub(hf_model_name, device)
        except Exception as e:
            print(f"[Error] Failed to load checkpoint from {hf_model_name}: {e}")
            print("[Info] Using default-initialized weights.")

    def _restore_from_hub(self, hf_model_name, device):
        """Download the checkpoint and copy matching weights into backbone and head."""
        checkpoint = torch.hub.load_state_dict_from_url(
            f"https://huggingface.co/{hf_model_name}/resolve/main/pytorch_model.bin",
            map_location=device,
        )

        # Backbone: every entry whose key does not mention the classifier.
        backbone_weights = {}
        for key, tensor in checkpoint.items():
            if 'classifier' not in key:
                backbone_weights[key] = tensor
        missing_keys, unexpected_keys = self.vit.load_state_dict(backbone_weights, strict=False)

        if missing_keys:
            print(f"[Warning] Missing keys when loading ViT: {missing_keys}")
        if unexpected_keys:
            print(f"[Warning] Unexpected keys in ViT: {unexpected_keys}")

        # Head: entries under the 'classifier.' prefix, with that prefix stripped.
        head_weights = {}
        for key, tensor in checkpoint.items():
            if key.startswith('classifier.'):
                head_weights[key.replace('classifier.', '')] = tensor
        if not head_weights:
            return

        try:
            self.classifier.load_state_dict(head_weights, strict=True)
            print("[Info] Successfully loaded classifier weights.")
        except Exception as e:
            print(f"[Warning] Failed to load classifier weights: {e}")

    def forward(self, images, labels=None):
        """Return ``(loss, logits)``; ``loss`` is None when ``labels`` is not given."""
        logits = self.classifier(self.dropout(self.vit(images)))
        if labels is None:
            return None, logits
        # Drop the trailing singleton so the logits line up with the label vector.
        return self.loss_func(logits.squeeze(1), labels.float()), logits


In [None]:
# Checkpoint ids. Only `text_model_name` is loaded in this cell; `image_model_name`
# is just recorded here.
text_model_name = "darkam/fakenews-finetuned-distilroberta-base"
image_model_name = "darkam/vit-mirage-news"

# Text branch: tokenizer plus wrapped encoder, moved to `device` and switched to
# inference mode (`eval()` returns the module itself, so the chain is safe).
# NOTE(review): TextModelWrapper puts newly created `hidden`/`classifier` layers on top
# of the encoder and no training step is visible in these cells - confirm that is
# intended before trusting the ensemble metrics.
tokenizer = AutoTokenizer.from_pretrained(text_model_name)
text_model = TextModelWrapper(text_model_name).to(device).eval()

# Image preprocessing: resize to 224x224, convert to a tensor, then normalise each
# channel with mean 0.5 / std 0.5.
_image_preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
]
image_transform = transforms.Compose(_image_preprocessing_steps)


tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

In [None]:
import torchvision

# Image branch: torchvision's ViT-B/16 with its default pretrained weights.
# NOTE(review): `image_transform` above normalises with mean/std 0.5; check that this
# matches the preprocessing these weights expect (`pretrained_vit_weights.transforms()`).
pretrained_vit_weights = torchvision.models.ViT_B_16_Weights.DEFAULT
image_model = torchvision.models.vit_b_16(weights=pretrained_vit_weights)
image_model = image_model.to(device)

# Replace the classification head with a pass-through so the model emits features
# rather than class logits.
image_model.heads = nn.Identity()

# Inference mode; as the last expression of the cell this also displays the module.
image_model.eval()

VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

In [None]:
from torch.utils.data import DataLoader

# Parquet shard to evaluate on. Despite the variable name, this file is the
# *validation* split of the dataset.
test_parquet_path = '/root/.cache/huggingface/hub/datasets--anson-huang--mirage-news/snapshots/b5a7e734850b4ec623ddee018a1d9e097fe248ef/data/validation-00000-of-00001.parquet'

# Paired text/image dataset sharing the tokenizer and image preprocessing built above.
dataset = MIRAGE_Ensemble_Dataset(
    test_parquet_path,
    tokenizer,
    max_token_len=128,
    image_transform=image_transform,
)

# Deterministic order for evaluation; pinned host memory only helps when a GPU is present.
test_loader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=False,
    num_workers=4,  # Adjust based on your CPU capability
    pin_memory=torch.cuda.is_available(),
)


In [None]:
import numpy as np

In [None]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calibrate_probs(probs):
    """Linearly rescale ``probs`` so its minimum maps to 0 and its maximum to ~1.

    A 1e-8 term is added to the value range, so a constant input produces zeros
    instead of a division by zero.
    """
    lowest = probs.min()
    value_range = probs.max() - lowest + 1e-8
    return (probs - lowest) / value_range

def soft_voting_ensemble_predict(text_model, image_model, dataloader, device,
                                text_weight=0.5, image_weight=0.5, threshold=0.5,
                                voting_type='soft'):
    """Predict binary labels by combining a text model and an image model.

    Args:
        text_model: Module called as ``text_model(input_ids=..., attention_mask=...)``.
            It may return a ``(loss, logits)`` tuple, an object with ``.logits``,
            or the logits tensor itself.
        image_model: Module called as ``image_model(images)`` returning logits.
        dataloader: Iterable of dict batches with keys ``input_ids``,
            ``attention_mask``, ``image`` and ``labels``.
        device: Device the batch tensors are moved to before the forward passes.
        text_weight: Weight of the text probability in soft voting.
        image_weight: Weight of the image probability in soft voting.
        threshold: Decision threshold applied to probabilities.
        voting_type: 'soft' for weighted average probs,
                     'hard' to predict positive when either model votes positive.

    Returns:
        ``(all_labels, all_preds)``: two flat Python lists of ints.

    Raises:
        ValueError: If ``voting_type`` is neither 'soft' nor 'hard'.
    """
    import warnings

    # Validate up front instead of after the first (expensive) forward pass.
    if voting_type not in ('soft', 'hard'):
        raise ValueError("voting_type must be 'soft' or 'hard'")

    warned_sources = set()

    def _positive_probs(logits, source):
        """Map logits to a 1-D vector of positive-class probabilities."""
        # Single-logit heads (shape (B,) or (B, 1)) are binary: use a sigmoid.
        if logits.dim() <= 1 or logits.shape[-1] == 1:
            return torch.sigmoid(logits).reshape(-1)
        if logits.shape[-1] > 2 and source not in warned_sources:
            warned_sources.add(source)
            warnings.warn(
                f"{source} model returned {logits.shape[-1]} outputs per sample; "
                "column 1 of the softmax is used as the positive-class probability. "
                "Check that the model ends in a classification head."
            )
        # Multi-logit heads: probability of class index 1.
        return torch.softmax(logits, dim=-1)[..., 1].reshape(-1)

    all_preds = []
    all_labels = []

    text_model.eval()
    image_model.eval()

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            labels = batch['labels']

            # Text model output handling
            text_output = text_model(input_ids=input_ids, attention_mask=attention_mask)
            if isinstance(text_output, tuple):
                text_logits = text_output[1]
            elif hasattr(text_output, 'logits'):
                text_logits = text_output.logits
            else:
                text_logits = text_output

            # NOTE(review): calibrate_probs is applied to each batch separately, so a
            # sample's score depends on which other samples share its batch.
            # reshape(-1) (rather than squeeze()) keeps a one-sample batch 1-D.
            text_probs = calibrate_probs(_positive_probs(text_logits, 'text'))
            image_probs = calibrate_probs(_positive_probs(image_model(images), 'image'))

            if voting_type == 'soft':
                # Weighted average probabilities
                combined_probs = (text_weight * text_probs) + (image_weight * image_probs)
                preds = (combined_probs > threshold).long()
            else:
                # Individual binary predictions, combined with a logical OR:
                # positive as soon as one of the two models votes positive.
                text_pred = (text_probs > threshold).long()
                image_pred = (image_probs > threshold).long()
                preds = ((text_pred + image_pred) >= 1).long()

            all_preds.extend(preds.reshape(-1).cpu().tolist())
            all_labels.extend(labels.reshape(-1).cpu().tolist())

    return all_labels, all_preds



In [None]:

# === Example Usage ===
# Ensemble settings for this run.
text_weight, image_weight = 0.7, 0.3
threshold = 0.5
voting_type = 'soft'  # or 'hard'

all_labels, all_preds = soft_voting_ensemble_predict(
    text_model,
    image_model,
    test_loader,
    device,
    text_weight=text_weight,
    image_weight=image_weight,
    threshold=threshold,
    voting_type=voting_type,
)

# Calculate metrics
all_labels_np, all_preds_np = np.array(all_labels), np.array(all_preds)

acc = accuracy_score(all_labels_np, all_preds_np)
# The three remaining scores share one call shape; zero_division=0 reports 0
# instead of warning when a score is undefined.
prec, rec, f1 = (
    score_fn(all_labels_np, all_preds_np, zero_division=0)
    for score_fn in (precision_score, recall_score, f1_score)
)

print(f"Ensemble ({voting_type} voting) | Text weight: {text_weight}, Image weight: {image_weight}, Threshold: {threshold}")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1 Score: {f1:.4f}")




ValueError: too many values to unpack (expected 2)

In [None]:
def hard_voting_ensemble_predict(text_model, image_model, dataloader, device, threshold=0.3):
    """Predict positive whenever the text model or the image model votes positive.

    Args:
        text_model: Module called as ``text_model(input_ids=..., attention_mask=...)``;
            may return a ``(loss, logits)`` tuple or the logits tensor.
        image_model: Module called as ``image_model(images)`` returning logits.
        dataloader: Iterable of dict batches with keys ``input_ids``,
            ``attention_mask``, ``image`` and ``labels``.
        device: Device the batch tensors are moved to before the forward passes.
        threshold: Probability above which a single model votes positive.

    Returns:
        ``(all_labels, all_preds)``: two flat Python lists of ints.
    """
    def _image_positive_probs(logits):
        # A single-logit head is binary (sigmoid); otherwise take class index 1.
        if logits.dim() <= 1 or logits.shape[-1] == 1:
            return torch.sigmoid(logits).reshape(-1)
        return torch.softmax(logits, dim=-1)[..., 1].reshape(-1)

    all_preds = []
    all_labels = []

    text_model.eval()
    image_model.eval()

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            labels = batch['labels']

            # Text model probabilities. reshape(-1) instead of squeeze(): squeeze()
            # turns a one-sample batch into a 0-d tensor, and extending a list with
            # a 0-d array raises TypeError.
            text_output = text_model(input_ids=input_ids, attention_mask=attention_mask)
            text_logits = text_output[1] if isinstance(text_output, tuple) else text_output
            text_probs = torch.sigmoid(text_logits).reshape(-1)
            text_pred = (text_probs > threshold).long()

            # Image model probabilities
            image_probs = _image_positive_probs(image_model(images))
            image_pred = (image_probs > threshold).long()

            # Two voters: positive if any model predicts positive (logical OR).
            ensemble_pred = (text_pred + image_pred >= 1).long()

            all_preds.extend(ensemble_pred.cpu().tolist())
            all_labels.extend(labels.reshape(-1).cpu().tolist())

    return all_labels, all_preds


In [None]:
# Text-dominated soft vote: 90% text probability, 10% image probability.
text_weight, image_weight = 0.9, 0.1

all_labels, all_preds = soft_voting_ensemble_predict(
    text_model,
    image_model,
    test_loader,
    device,
    text_weight=text_weight,
    image_weight=image_weight,
)

# Evaluation
all_labels_np, all_preds_np = np.array(all_labels), np.array(all_preds)

acc = accuracy_score(all_labels_np, all_preds_np)
# zero_division=0 reports 0 instead of warning when a score is undefined.
prec, rec, f1 = (
    score_fn(all_labels_np, all_preds_np, zero_division=0)
    for score_fn in (precision_score, recall_score, f1_score)
)

print(f"Soft Voting Ensemble (Text Weight: {text_weight}, Image Weight: {image_weight}) Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f} | Recall: {rec:.4f} | F1 Score: {f1:.4f}")




Soft Voting Ensemble (Text Weight: 0.9, Image Weight: 0.1) Accuracy: 0.5585
Precision: 0.5691 | Recall: 0.4818 | F1 Score: 0.5218


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiModalFusionClassifier(nn.Module):
    """Late-fusion binary classifier over concatenated text and image features.

    Args:
        text_model: Text encoder, or a wrapper that keeps its encoder in a
            ``model`` attribute (as ``TextModelWrapper`` in this notebook does).
        image_model: Module mapping images to features, either ``(B, D)`` or
            ``(B, tokens, D)`` with the class token first.
        hidden_dim: Width of the hidden layer in the fusion head.
        text_dim: Width of the text feature vector.
        image_dim: Width of the image feature vector.
    """

    def __init__(self, text_model, image_model, hidden_dim=512, text_dim=768, image_dim=768):
        super().__init__()
        self.text_model = text_model
        self.image_model = image_model

        # Freeze pretrained weights if desired
        # for param in self.text_model.parameters():
        #     param.requires_grad = False
        # for param in self.image_model.parameters():
        #     param.requires_grad = False

        self.classifier = nn.Sequential(
            nn.Linear(text_dim + image_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 1)  # Binary classification
        )

    def _encode_text(self, input_ids, attention_mask):
        """Return one feature vector per sample from the text branch."""
        # A classification wrapper's forward returns (loss, logits); its logits are
        # (B, 1) and useless as features, so go through the wrapped encoder instead.
        encoder = getattr(self.text_model, 'model', self.text_model)
        outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)

        if isinstance(outputs, tuple):
            # Presumably a tuple-returning encoder whose second element is the
            # pooled output (BERT-style) - verify for other architectures.
            return outputs[1]

        pooled = getattr(outputs, 'pooler_output', None)
        if pooled is not None:
            return pooled

        # No pooler: average the token states, ignoring padding positions.
        hidden_states = outputs.last_hidden_state
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        return (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

    def _encode_image(self, images):
        """Return one feature vector per sample from the image branch."""
        features = self.image_model(images)
        if features.dim() == 3:
            # Token sequence (B, tokens, D): keep the token at index 0.
            features = features[:, 0, :]
        # Already-pooled (B, D) output, e.g. a ViT whose head was replaced by
        # nn.Identity, is used as-is.
        return features

    def forward(self, input_ids, attention_mask, images):
        """Return one logit per sample, shape ``(B,)``."""
        text_feat = self._encode_text(input_ids, attention_mask)
        image_feat = self._encode_image(images)

        # Concatenate and classify
        combined = torch.cat((text_feat, image_feat), dim=1)
        logits = self.classifier(combined)
        # squeeze(-1) rather than squeeze(): keeps shape (1,) for a one-sample batch.
        return logits.squeeze(-1)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, dataloader, device, threshold=0.5):
    """Evaluate a fusion model on ``dataloader`` and print binary metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask, images)``
            returning one logit per sample.
        dataloader: Iterable of dict batches with keys ``input_ids``,
            ``attention_mask``, ``image`` and ``labels``.
        device: Device the model inputs are moved to.
        threshold: Probability above which a sample is predicted positive.

    Returns:
        ``(all_labels, all_preds)``: two flat Python lists of ints.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)

            logits = model(input_ids, attention_mask, images)
            # reshape(-1) tolerates models that squeeze a one-sample batch to 0-d.
            probs = torch.sigmoid(logits).reshape(-1)
            preds = (probs > threshold).long()

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(batch['labels'].reshape(-1).long().tolist())

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, zero_division=0)
    # Recall compares the labels with the predictions; scoring the labels against
    # themselves would always report 1.0.
    rec = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)

    print(f"Multimodal Fusion Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f} | Recall: {rec:.4f} | F1 Score: {f1:.4f}")

    return all_labels, all_preds

In [None]:
# Fusion head on top of the `text_model` and `image_model` objects created in earlier
# cells, moved to the evaluation device.
model = MultiModalFusionClassifier(text_model, image_model)
model = model.to(device)
# Train the model using your optimizer and loss function (e.g., BCEWithLogitsLoss)

# Smoke-test the loader: pull one batch and show which keys it carries.
try:
    _first_batch_iter = iter(test_loader)
    sample_batch = next(_first_batch_iter)
    print("Keys in test_loader batch:", sample_batch.keys())
except Exception as e:
    print("Error getting batch from test_loader:", e)


# Then evaluate. The trailing semicolon keeps the cell from echoing a return value.
evaluate_model(model, test_loader, device);



Keys in test_loader batch: dict_keys(['input_ids', 'attention_mask', 'image', 'labels'])




Input IDs shape: torch.Size([64, 128])
Attention Mask shape: torch.Size([64, 128])
Images shape: torch.Size([64, 3, 224, 224])
Shape of text_feat: torch.Size([64, 1])


IndexError: too many indices for tensor of dimension 2

In [None]:
# Debugging step: build a fresh fusion model right before evaluating it.
model = MultiModalFusionClassifier(text_model, image_model)
model = model.to(device)

# Evaluate the model and keep the labels / predictions it reports.
all_labels, all_preds = evaluate_model(model, test_loader, device)

# Display some predictions (optional): first ten predictions, then their true labels.
for _heading, _values in (
    ("\nSample Predictions:", all_preds),
    ("Corresponding True Labels:", all_labels),
):
    print(_heading)
    print(_values[:10])



Input IDs shape: torch.Size([64, 128])
Attention Mask shape: torch.Size([64, 128])
Images shape: torch.Size([64, 3, 224, 224])


IndexError: too many indices for tensor of dimension 2