# Multimodal Fusion: Text + Image
This notebook builds a **late fusion classifier** that integrates four complementary signals:
- **RoBERTa text classifier** -> probability that the user is a bot, based on their text
- **CLIP image classifier** -> probability that the image looks like a bot profile
- **image_exists_flag** -> binary feature indicating whether the user even has a profile image
- **CLIP image-text alignment score** -> cosine similarity between a user's profile image and their textual identity

**What do we do here:**

1. **Load all four features per user**
   - `text_prob` from RoBERTa
   - `img_prob` from CLIP
   - `has_image` as a structural / social cue
   - `clip_img_text_sim` as a semantic alignment score  
     Combine them into a single feature matrix for each user.

<br> 

2. **Split users into train / validation / test**
   - Same split as the RoBERTa notebook to keep evaluation consistent.

<br> 

3. **Train fusion models**
   - Train an **XGBoost Classifier**, which handles nonlinear interactions well.
   - Hyperparameter tuning via an exhaustive grid search (nested loops over the parameter grid), keeping the configuration with the best validation AUROC.

<br> 

4. **Evaluate fusion performance**
   - Accuracy, precision, recall, F1-score, AUROC.
   - Compare against text-only and image-only models.
   - Plot a confusion matrix.

## Section 1: Imports & Configurations

In [None]:
#---------------------------------------------------------------------------------#
# HuggingFace Cache Location                                                      #
#---------------------------------------------------------------------------------#
# By default, HuggingFace downloads pretrained models into the user directory     #
# (e.g., ~/.cache/huggingface/). To make the project fully reproducible and       #
# avoid polluting the user's  global cache, we redirect HF_HOME to a local        #
# folder inside the project.                                                      #
#                                                                                 #
# If you prefer a different cache directory, simply modify HF_CACHE below.        #
# If the folder does not exist yet, HuggingFace will create it automatically.     #
#---------------------------------------------------------------------------------#
import os
from pathlib import Path

# Project root is the parent of the current working directory
# (assumes the notebook is launched from a sub-folder of the project -- TODO confirm).
ROOT = Path.cwd().parent.resolve()
HF_CACHE = ROOT / "hf_cache"
# HF_HOME is set here, before `transformers` is imported further down, so the
# redirect is already in the environment when the library is loaded.
os.environ["HF_HOME"] = str(HF_CACHE)

import itertools
import random
import time

import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import CLIPProcessor, CLIPModel, AutoTokenizer, AutoModelForSequenceClassification

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import seaborn as sns

# --- Paths and Configurations --- #
# Everything is resolved relative to ROOT so the notebook carries no absolute paths.
TWIBOT_DIR = ROOT / "data/twibot22/processed"
USERS_PATH = TWIBOT_DIR / "users_with_clip_sim.csv"
POSTS_PATH = TWIBOT_DIR / "posts.csv"
TEXT_MODEL_DIR = ROOT / "outputs/roberta_twibot22/roberta_twibot22_model"
OUTPUT_DIR = ROOT / "outputs/multimodal"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

SEED = 42
BATCH_SIZE_IMG = 32
MAX_SEQ_LENGTH = 256

# --- Reproducibility: seed every RNG used in this notebook --- #
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
cuda_available = torch.cuda.is_available()
if cuda_available:
    torch.cuda.manual_seed_all(SEED)

# --- Device selection --- #
device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(f"Using cuda on {torch.cuda.get_device_name()}" if device.type == "cuda" else "Using cpu")

## Section 2: Loading users with both text + image

In [None]:
users_mm = pd.read_csv(USERS_PATH)

# --- Balancing human/bot distribution --- #
# label_num == 0 -> human, label_num == 1 -> bot.
humans, bots = (users_mm[users_mm["label_num"] == cls] for cls in (0, 1))

# Downsample both classes to the size of the smaller one.
n = min(len(humans), len(bots))
humans_bal = humans.sample(n, random_state=SEED)
bots_bal = bots.sample(n, random_state=SEED)

# Concatenate, shuffle, and renumber the rows 0..N-1.
data_bal = (
    pd.concat([humans_bal, bots_bal])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

users_mm = data_bal.copy()

print("\nUsers in multimodal set:", users_mm.shape)
print("Label distribution (After balancing):")
print(users_mm["label_num"].value_counts())

print("\nImage availability by class (After Balancing):")
print(pd.crosstab(users_mm["label_num"], users_mm["image_exists"]))

## Section 3: Train / Val / Test split (shared for text and image)

In [None]:
# Hold out 20% of users for testing, stratified on the label.
train_val_df, test_df = train_test_split(
    users_mm,
    test_size=0.2,
    stratify=users_mm["label_num"],
    random_state=SEED
)

# Carve a validation set (20% of the remainder) out of the train+val pool.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.2,
    stratify=train_val_df["label_num"],
    random_state=SEED
)

# Later cells add columns to these frames. Take explicit copies so those writes
# never target a slice of `users_mm` (avoids pandas' SettingWithCopyWarning).
train_df, val_df, test_df = train_df.copy(), val_df.copy(), test_df.copy()

# Sanity check: the three splits partition the balanced user table.
assert len(train_df) + len(val_df) + len(test_df) == len(users_mm), "split sizes do not add up"

print("Train users:", len(train_df))
print("Val users:",   len(val_df))
print("Test users:",  len(test_df))

for name, df in [("Train", train_df), ("Val", val_df), ("Test", test_df)]:
    print(f"\n{name} label distribution:")
    print(df["label_num"].value_counts())

## Section 4: Loading RoBERTa model + tokenizer

In [None]:
# NOTE(review): the tokenizer is loaded from the hub id "roberta-base" while the
# classifier weights come from TEXT_MODEL_DIR -- confirm the two are compatible.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Load the sequence classifier, move it to the selected device, and switch to eval mode.
text_model = AutoModelForSequenceClassification.from_pretrained(TEXT_MODEL_DIR).to(device).eval()

print("Loaded text model from:", TEXT_MODEL_DIR)

## Section 5: Text dataset + helper to compute probabilities

In [None]:
class UserTextDataset(Dataset):
    """Dataset that tokenizes one user's ``full_text`` per item.

    Parameters
    ----------
    df : DataFrame
        Must provide a ``full_text`` column (cast to ``str``) and a
        ``label_num`` column (cast to ``int``).
    tokenizer : callable
        Called as ``tokenizer(text, truncation=True, max_length=...,
        padding="max_length", return_tensors="pt")`` and expected to return a
        mapping of tensors.
    max_len : int
        Forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = df["full_text"].astype(str).tolist()
        self.labels = df["label_num"].astype(int).tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` plus a ``labels`` tensor."""
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt",
        )

        # Drop the leading axis of every returned tensor so the DataLoader can
        # stack items into batches.
        sample = {}
        for key, tensor in encoded.items():
            sample[key] = tensor.squeeze(0)

        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

def get_text_probs(df, batch_size=16):
    """Score every row of ``df`` with the text classifier.

    Uses the notebook-level ``tokenizer``, ``text_model``, ``device`` and
    ``MAX_SEQ_LENGTH``.

    Parameters
    ----------
    df : DataFrame
        Needs the ``full_text`` and ``label_num`` columns consumed by
        ``UserTextDataset``.
    batch_size : int, default 16
        Inference batch size (previously hard-coded to 16).

    Returns
    -------
    (probs, labels) : tuple of 1-D numpy arrays
        ``probs[i]`` is the softmax probability of class index 1 for row ``i``
        and ``labels[i]`` its ``label_num``; both follow the row order of ``df``.
        Two empty arrays are returned when ``df`` has no rows.
    """
    dataset = UserTextDataset(df, tokenizer=tokenizer, max_len=MAX_SEQ_LENGTH)

    # np.concatenate raises on an empty list, so short-circuit empty input.
    if len(dataset) == 0:
        return np.empty(0, dtype=np.float32), np.empty(0, dtype=np.int64)

    # shuffle=False keeps the outputs aligned with the rows of `df`.
    loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)

    all_probs = []
    all_labels = []

    # Make sure dropout etc. is disabled even if the model mode was changed elsewhere.
    text_model.eval()
    with torch.no_grad():
        for batch in loader:
            labels = batch["labels"].numpy()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = text_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # [B, 2]
            probs = torch.softmax(logits, dim=-1)[:, 1]  # prob of class 1 (bot)
            all_probs.append(probs.cpu().numpy())
            all_labels.append(labels)

    return np.concatenate(all_probs), np.concatenate(all_labels)

In [None]:
# --- Computing text-based bot probabilities --- #

train_text_prob, train_labels = get_text_probs(train_df)
val_text_prob, val_labels = get_text_probs(val_df)
test_text_prob, test_labels = get_text_probs(test_df)

# Report the shapes only; printing the arrays themselves floods the cell output.
print("Text probs shapes:", train_text_prob.shape, val_text_prob.shape, test_text_prob.shape)

## Section 6: CLIP image embeddings and image-based probabilities

In [None]:
# One checkpoint id shared by the model and its processor.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

# Load CLIP, move it to the selected device, and put it in eval mode.
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT).to(device).eval()
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

print("CLIP loaded on:", device)

In [None]:
class TwibotImageDataset(Dataset):
    """Dataset yielding ``(PIL image, label, user id)`` triples.

    Parameters
    ----------
    df : DataFrame
        Must provide ``profile_image_path``, ``label_num`` (cast to ``int``)
        and ``id`` columns. Every path is opened on access, so callers should
        pass only rows whose image file exists.
    """

    def __init__(self, df):
        self.paths  = df["profile_image_path"].tolist()
        self.labels = df["label_num"].astype(int).tolist()
        self.ids    = df["id"].tolist()

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return it with its label and user id."""
        path  = self.paths[idx]
        label = self.labels[idx]
        # Image.open is lazy and holds the file open; the context manager
        # releases the handle deterministically once convert() has produced
        # an independent in-memory image.
        with Image.open(path) as raw:
            image = raw.convert("RGB")
        uid   = self.ids[idx]
        return image, label, uid

def collate_pil(batch):
    """Collate ``(image, label, id)`` triples without stacking the images.

    Returns ``(list of images, LongTensor of labels, list of ids)``; images stay
    a plain list so they can be handed to an image processor afterwards.
    """
    pil_images, raw_labels, user_ids = zip(*batch)
    label_tensor = torch.tensor(raw_labels, dtype=torch.long)
    return list(pil_images), label_tensor, list(user_ids)

def extract_clip_features_for_split(df_split):
    """Compute L2-normalised CLIP image features for the rows that have an image.

    Rows are selected with the ``image_exists`` column. Uses the notebook-level
    ``clip_model``, ``clip_processor``, ``device`` and ``BATCH_SIZE_IMG``.

    Returns
    -------
    (feats, labels, ids)
        ``feats`` is a 2-D array with one row per selected user, ``labels`` the
        matching ``label_num`` values and ``ids`` the matching ``id`` values,
        all in the row order of ``df_split``. ``(None, None, None)`` is returned
        when no row has an image.
    """
    with_image = df_split[df_split["image_exists"]].copy().reset_index(drop=True)
    if with_image.empty:
        return None, None, None

    loader = DataLoader(
        TwibotImageDataset(with_image),
        batch_size=BATCH_SIZE_IMG,
        shuffle=False,  # keep outputs aligned with the row order
        collate_fn=collate_pil,
    )

    feat_chunks, label_chunks, id_list = [], [], []

    with torch.no_grad():
        for pil_batch, label_batch, id_batch in tqdm(loader, desc="CLIP features"):
            pixel_inputs = clip_processor(images=pil_batch, return_tensors="pt")
            pixel_inputs = {key: tensor.to(device) for key, tensor in pixel_inputs.items()}

            embeddings = clip_model.get_image_features(**pixel_inputs).cpu().numpy()
            # Scale each row to unit L2 norm.
            row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)

            feat_chunks.append(embeddings / row_norms)
            label_chunks.append(label_batch.numpy())
            id_list.extend(id_batch)

    return np.vstack(feat_chunks), np.concatenate(label_chunks), np.array(id_list)

In [None]:
train_img_feats, train_img_labels, train_img_ids = extract_clip_features_for_split(train_df)
val_img_feats, val_img_labels, val_img_ids = extract_clip_features_for_split(val_df)
test_img_feats, test_img_labels, test_img_ids = extract_clip_features_for_split(test_df)

# Print shapes instead of the raw embedding matrices; a split without any image
# comes back as None, which is shown as-is.
print(
    "Image feats:",
    *(None if feats is None else feats.shape
      for feats in (train_img_feats, val_img_feats, test_img_feats)),
)

In [None]:
# --- Training a simple image-only classifier, mainly to get image-based bot probabilities

# Fit the scaler on the training embeddings only (with_mean=False: scale, no centring),
# then apply the same transform to every split.
scaler_img = StandardScaler(with_mean=False).fit(train_img_feats)
train_img_scaled, val_img_scaled, test_img_scaled = (
    scaler_img.transform(split_feats)
    for split_feats in (train_img_feats, val_img_feats, test_img_feats)
)

img_clf = LogisticRegression(max_iter=1000, class_weight="balanced", n_jobs=-1)
img_clf.fit(train_img_scaled, train_img_labels)

In [None]:
def _img_prob_column(df, img_ids, img_probs):
    """Return a per-row ``img_prob`` Series for ``df``.

    Each row's ``id`` is looked up in ``img_ids``; rows whose id has no image
    prediction get NaN. Matching by id (rather than assigning positionally
    through a boolean mask) does not depend on row order or on ids being unique.
    """
    return df["id"].map(dict(zip(img_ids, img_probs)))


# Probability of class 1 (bot) for every user that has an image.
# NOTE(review): the train scores are in-sample (img_clf was fitted on
# train_img_scaled), so they are likely optimistic relative to val/test;
# consider out-of-fold predictions for the fusion training features.
train_img_prob_has = img_clf.predict_proba(train_img_scaled)[:, 1]
val_img_prob_has   = img_clf.predict_proba(val_img_scaled)[:, 1]
test_img_prob_has  = img_clf.predict_proba(test_img_scaled)[:, 1]

# `assign` builds a new frame, so nothing is written into a possible slice and
# re-running the cell simply overwrites the column.
train_df = train_df.assign(img_prob=_img_prob_column(train_df, train_img_ids, train_img_prob_has))
val_df   = val_df.assign(img_prob=_img_prob_column(val_df, val_img_ids, val_img_prob_has))
test_df  = test_df.assign(img_prob=_img_prob_column(test_df, test_img_ids, test_img_prob_has))

# Users without an image get the neutral probability 0.5.
train_img_prob_filled = train_df["img_prob"].fillna(0.5).to_numpy()
val_img_prob_filled   = val_df["img_prob"].fillna(0.5).to_numpy()
test_img_prob_filled  = test_df["img_prob"].fillna(0.5).to_numpy()

# Binary "has a profile image" indicator.
train_has_image = train_df["image_exists"].astype(int).to_numpy()
val_has_image   = val_df["image_exists"].astype(int).to_numpy()
test_has_image  = test_df["image_exists"].astype(int).to_numpy()

# Image-text similarity; missing values become 0.0.
train_sim = train_df["clip_img_text_sim"].fillna(0.0).to_numpy()
val_sim   = val_df["clip_img_text_sim"].fillna(0.0).to_numpy()
test_sim  = test_df["clip_img_text_sim"].fillna(0.0).to_numpy()

## Section 7: Building fusion features

In [None]:
# One row per user, columns in the order: text_prob, img_prob, has_image, clip_img_text_sim.
X_train, X_val, X_test = (
    np.stack(columns, axis=1)
    for columns in (
        (train_text_prob, train_img_prob_filled, train_has_image, train_sim),
        (val_text_prob,   val_img_prob_filled,   val_has_image,   val_sim),
        (test_text_prob,  test_img_prob_filled,  test_has_image,  test_sim),
    )
)

# Targets taken from the same frames, in the same row order as the features.
y_train, y_val, y_test = (
    split["label_num"].to_numpy() for split in (train_df, val_df, test_df)
)

print("Fusion feature shapes:", X_train.shape, X_val.shape, X_test.shape)

## Section 8: Training fusion classifier

In [None]:
# Class counts in the training labels (1 = bot, 0 = human).
pos, neg = ((y_train == cls).sum() for cls in (1, 0))

# Negative-to-positive ratio; fall back to 1.0 when there are no positives.
if pos > 0:
    scale_pos_weight = neg / pos
else:
    scale_pos_weight = 1.0

print("Train pos/neg:", pos, neg, "scale_pos_weight:", scale_pos_weight)

In [None]:
%%time
param_grid = {
    "n_estimators"    : [50, 100, 200, 400],
    "max_depth"       : [2, 3, 4, 5],
    "learning_rate"   : [0.005, 0.01, 0.03],
    "subsample"       : [0.005, 0.01, 0.05, 0.2, 0.5, 1.0],
    "colsample_bytree": [0.005, 0.01, 0.05, 0.2, 0.5, 1.0]
}

best_auc    = -1.0
best_params = None
results     = []

for n_estimators in param_grid["n_estimators"]:
    for max_depth in param_grid["max_depth"]:
        for learning_rate in param_grid["learning_rate"]:
            for subsample in param_grid["subsample"]:
                for colsample_bytree in param_grid["colsample_bytree"]:
                    xgb = XGBClassifier(
                        n_estimators=n_estimators,
                        max_depth=max_depth,
                        learning_rate=learning_rate,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                        reg_lambda=1.0,
                        objective="binary:logistic",
                        eval_metric="logloss",
                        scale_pos_weight=scale_pos_weight,
                        random_state=42,
                        n_jobs=-1,
                        tree_method="hist",   # usually fastest
                    )

                    xgb.fit(X_train, y_train)

                    val_proba = xgb.predict_proba(X_val)[:, 1]
                    auc = roc_auc_score(y_val, val_proba)

                    results.append((auc, {
                        "n_estimators": n_estimators,
                        "max_depth": max_depth,
                        "learning_rate": learning_rate,
                        "subsample": subsample,
                        "colsample_bytree": colsample_bytree,
                    }))

                    if auc > best_auc:
                        best_auc = auc
                        best_params = results[-1][1]

print("Best val AUROC:", best_auc)
print("Best params:", best_params)

## Section 9: Final test evaluation (fusion) + Confusion matrix

In [None]:
# Tuned hyperparameters plus the fixed settings for the final model.
fusion_kwargs = dict(
    best_params,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric="logloss",
    scale_pos_weight=scale_pos_weight,
    random_state=SEED,
    n_jobs=-1,
    tree_method="hist",
)

xgb_fusion = XGBClassifier(**fusion_kwargs)
xgb_fusion.fit(X_train, y_train)

# Probability of class 1 (bot), thresholded at 0.5 for hard predictions.
test_proba = xgb_fusion.predict_proba(X_test)[:, 1]
test_pred = (test_proba >= 0.5).astype(int)

print("XGBoost Fusion -- test performance:")
print(classification_report(y_test, test_pred, digits=3, zero_division=0))
print("XGBoost Fusion -- test AUROC:", roc_auc_score(y_test, test_proba))

In [None]:
# Rows are true classes, columns are predicted classes (0 = human, 1 = bot).
class_names = ["human", "bot"]
cm = confusion_matrix(y_test, test_pred)

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Multimodal Fusion - Confusion Matrix (Test)")
fig.tight_layout()
fig.savefig(str(OUTPUT_DIR / "fusion_confusion_matrix_test.png"))
plt.show()

In [None]:
# Names follow the column order used when the fusion matrix was stacked.
feature_names = ["text_prob", "img_prob", "has_image", "clip_img_text_sim"]
importances = xgb_fusion.feature_importances_

# Rank features from most to least important.
ranked = sorted(zip(feature_names, importances), key=lambda pair: -pair[1])
for feat, score in ranked:
    print(f"{feat}: {score:.3f}")
