In [None]:
!pip install transformers==4.44.2 accelerate==0.33.0


In [None]:
# Download the Kaggle data sources with kagglehub.
# Keep this cell: the two *_path variables defined here are read by the later
# `ls` and `cp` shell cells, so deleting it breaks a fresh "Run All".
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
wentengh_twitter2015_path = kagglehub.dataset_download('wentengh/twitter2015')
wentengh_twitter2017_path = kagglehub.dataset_download('wentengh/twitter2017')

print('Data source import complete.')


In [None]:
# Recursively list both downloaded dataset trees to inspect their folder layout.
!ls -R $wentengh_twitter2015_path
!ls -R $wentengh_twitter2017_path


In [None]:
import os

In [None]:
# Clone the UMT repository; TXT_PATHS below reads the twitter2015 annotation
# files from /content/UMT/data (assumes the working directory is /content — TODO confirm).
# NOTE(review): the clone is not pinned to a commit, and re-running this cell in the
# same session will presumably fail because the target directory already exists.
!git clone https://github.com/jefferyYu/UMT.git

In [None]:
# Copy the downloaded datasets to fixed locations under /content so that IMG_DIRS
# can use hard-coded paths.
# NOTE(review): if the destination directory already exists (e.g. on a re-run),
# `cp -r` nests the source directory inside it instead of overwriting — confirm
# the resulting layout still matches IMG_DIRS.
!cp -r $wentengh_twitter2015_path /content/twitter2015
!cp -r $wentengh_twitter2017_path /content/twitter2017


# Twitter2015

## Define dataset paths and parsing utilities

In [None]:
import os, ast, math, json, random
from dataclasses import dataclass
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModel, get_cosine_schedule_with_warmup

# -------------- Repro --------------
# Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN behaviour.
SEED = 1337
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True; torch.backends.cudnn.benchmark = False

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------------- Paths (EDIT if needed) --------------
# Annotation files for each split (paths point into the cloned UMT repository).
TXT_PATHS = {
    'train': '/content/UMT/data/twitter2015/train.txt',
    'val':   '/content/UMT/data/twitter2015/valid.txt',   # validation split; no fallback path is tried if this file is missing
    'test':  '/content/UMT/data/twitter2015/test.txt'
}
# Image directory per split; all three splits currently point at the same folder.
IMG_DIRS = {
    'train': '/content/twitter2015/twitter2015/twitter2015_images',
    'val':   '/content/twitter2015/twitter2015/twitter2015_images',
    'test':  '/content/twitter2015/twitter2015/twitter2015_images'
}


In [None]:
# ================== Utility Functions ==================
def parse_twitter_conll(path: str):
    """Read a Twitter2015-style annotation file into a list of samples.

    The file consists of blocks separated by blank lines. Inside a block, a
    line starting with ``IMGID:`` gives the image id and every other line is
    split on whitespace into ``token label [...]``; lines with fewer than two
    fields are ignored. A block is kept only if it has an image id and at
    least one token.

    Args:
        path: path of the UTF-8 encoded annotation file.

    Returns:
        A list of dicts with keys ``'img_id'``, ``'tokens'`` and ``'labels'``.
    """
    records = []
    current_id, words, bio_tags = None, [], []

    with open(path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            line = raw.rstrip('\n')

            if line.strip():
                if line.startswith('IMGID:'):
                    current_id = line.split(':', 1)[1].strip()
                else:
                    fields = line.split()
                    if len(fields) >= 2:
                        words.append(fields[0])
                        bio_tags.append(fields[1])
                continue

            # Blank line: close the current block (if it is complete) and start afresh.
            if current_id is not None and words:
                records.append({'img_id': current_id, 'tokens': words, 'labels': bio_tags})
            current_id, words, bio_tags = None, [], []

    # The last block may not be followed by a blank line.
    if current_id is not None and words:
        records.append({'img_id': current_id, 'tokens': words, 'labels': bio_tags})
    return records

def build_label_vocab(*lists_of_samples):
    """Build label <-> id mappings from every label seen in the given splits.

    Args:
        *lists_of_samples: any number of sample lists; each sample is a dict
            whose ``'labels'`` entry is an iterable of label strings.

    Returns:
        ``(label2id, id2label)`` where ids are assigned in sorted label order.
    """
    ordered_tags = sorted({
        tag
        for split in lists_of_samples
        for sample in split
        for tag in sample['labels']
    })
    label2id = {tag: idx for idx, tag in enumerate(ordered_tags)}
    id2label = dict(enumerate(ordered_tags))
    return label2id, id2label

def find_image_path(img_dir: str, img_id: str):
    """Locate the image file for ``img_id`` inside ``img_dir``.

    The extensions ``.jpg``, ``.jpeg``, ``.png`` and ``.bmp`` are tried in
    that order.

    Returns:
        The first existing path, or ``None`` when no candidate exists.
    """
    candidates = (
        os.path.join(img_dir, img_id + suffix)
        for suffix in ('.jpg', '.jpeg', '.png', '.bmp')
    )
    return next((path for path in candidates if os.path.exists(path)), None)


## CLIP models

In [None]:
# ================== Imports ==================


import os, random, numpy as np, torch, torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from torchvision import transforms
from transformers import (
    CLIPModel, CLIPProcessor,
    BertTokenizerFast, BertModel,
    get_cosine_schedule_with_warmup
)
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# Use the GPU when one is visible; the model and every batch in this cell are moved to this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

# ================== Dataset (same as yours, just BERT tokenizer) ==================
class Twitter2015MNER(Dataset):
    """Twitter2015 multimodal NER dataset: token tensors plus a normalised image tensor.

    Args:
        samples: list of dicts with keys 'img_id', 'tokens' and 'labels'.
        img_dir: directory searched for '<img_id>.<ext>' image files.
        tokenizer: tokenizer called with ``is_split_into_words=True``; its output
            must provide ``word_ids()`` (i.e. a fast tokenizer).
        label2id: mapping from label string to integer class id.
        max_len: fixed token sequence length (padding and truncation target).
        aug: when True, apply a light random crop / colour jitter / horizontal flip.

    Each item is a dict with 'input_ids', 'attention_mask', 'pixel_values'
    and 'labels'. Only the first sub-token of each word carries a label;
    special tokens, padding and continuation sub-tokens are set to -100.
    """

    # Per-channel mean/std used to normalise images in both transform pipelines.
    _NORM_MEAN = [0.48145466, 0.4578275, 0.40821073]
    _NORM_STD = [0.26862954, 0.26130258, 0.27577711]

    def __init__(self, samples, img_dir, tokenizer, label2id, max_len=128, aug=False):
        self.samples = samples
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

        # Shared tail of both pipelines, so the statistics are defined in one place.
        to_normalised_tensor = [
            transforms.ToTensor(),
            transforms.Normalize(mean=self._NORM_MEAN, std=self._NORM_STD),
        ]
        if aug:
            self.img_tfm = transforms.Compose([
                transforms.Resize((256,256)),
                transforms.RandomResizedCrop((224,224), scale=(0.9,1.0)),
                transforms.ColorJitter(0.15,0.15,0.15,0.05),
                transforms.RandomHorizontalFlip(0.5),
                *to_normalised_tensor,
            ])
        else:
            self.img_tfm = transforms.Compose([
                transforms.Resize((224,224)),
                *to_normalised_tensor,
            ])

    def __len__(self): return len(self.samples)

    def _load_image(self, img_id):
        """Return the RGB image for ``img_id``, or a black 224x224 placeholder.

        The placeholder is used when no file is found or the file cannot be
        decoded (same best-effort policy as the ViLT/BLIP datasets in this
        notebook), so one bad image does not abort a whole epoch.
        """
        img_path = find_image_path(self.img_dir, img_id)
        if img_path is not None:
            try:
                # The context manager closes the file handle as soon as the
                # pixels have been copied by convert().
                with Image.open(img_path) as im:
                    return im.convert('RGB')
            except Exception:
                # Deliberate best-effort: fall through to the placeholder.
                pass
        return Image.new('RGB', (224,224), (0,0,0))

    def __getitem__(self, idx):
        ex = self.samples[idx]
        tokens, labels = ex['tokens'], ex['labels']

        encodings = self.tokenizer(
            tokens, is_split_into_words=True,
            padding='max_length', truncation=True,
            max_length=self.max_len, return_tensors='pt'
        )

        word_ids = encodings.word_ids(batch_index=0)
        enc = {k: v.squeeze(0) for k, v in encodings.items()}

        # Label only the first sub-token of each word; everything else is ignored (-100).
        label_ids = []
        prev_word = None
        for w_id in word_ids:
            if w_id is None or w_id == prev_word:
                label_ids.append(-100)
            else:
                label_ids.append(self.label2id[labels[w_id]])
            prev_word = w_id
        label_ids = torch.tensor(label_ids, dtype=torch.long)

        img = self.img_tfm(self._load_image(ex['img_id']))

        return {
            'input_ids': enc['input_ids'],
            'attention_mask': enc['attention_mask'],
            'pixel_values': img,
            'labels': label_ids
        }

# ================== CLIP + BERT Model ==================
class CLIPBertMNER(nn.Module):
    """
    Multimodal NER model:
    - Vision encoder: CLIP (ViT-B/32 by default)
    - Text encoder: BERT (bert-base-uncased by default)
    - Fusion: FiLM (γ, β modulation of the text sequence by the image feature)

    Args:
        num_labels: number of token classes.
        text_model_name: checkpoint passed to ``BertModel.from_pretrained``.
        clip_model_name: checkpoint passed to ``CLIPModel.from_pretrained``.
        dropout: dropout probability applied before the classifier.
    """
    def __init__(self, num_labels, text_model_name="bert-base-uncased",
                 clip_model_name="openai/clip-vit-base-patch32", dropout=0.1):
        super().__init__()
        self.text_encoder = BertModel.from_pretrained(text_model_name)
        self.clip_model = CLIPModel.from_pretrained(clip_model_name)

        hidden = self.text_encoder.config.hidden_size
        # get_image_features() returns projected embeddings, so the input width
        # is the CLIP projection dimension (512 for the default checkpoint)
        # rather than a hard-coded constant.
        clip_dim = self.clip_model.config.projection_dim

        self.img_proj = nn.Linear(clip_dim, hidden)

        self.gamma = nn.Linear(hidden, hidden)
        self.beta = nn.Linear(hidden, hidden)

        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden, num_labels)

    def forward(self, input_ids, attention_mask, pixel_values, labels=None):
        """Return ``(logits, loss)``; ``loss`` is None when ``labels`` is None.

        ``logits`` has one row of class scores per text token. Label positions
        equal to -100 are ignored by the cross-entropy loss.
        """
        # Text
        text_out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        seq = text_out.last_hidden_state  # [B, L, hidden]

        # Image
        img_feat = self.clip_model.get_image_features(pixel_values=pixel_values)  # [B, clip_dim]
        img_feat = self.img_proj(img_feat)  # [B, hidden]

        # FiLM fusion: one scale/shift vector per image, broadcast over all tokens.
        g = self.gamma(img_feat).unsqueeze(1)
        b = self.beta(img_feat).unsqueeze(1)
        seq = (1 + g) * seq + b

        seq = self.dropout(seq)
        logits = self.classifier(seq)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return logits, loss

# ================== Build Data ==================
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Parse the three annotation splits and derive one shared label vocabulary.
train_samples = parse_twitter_conll(TXT_PATHS['train'])
val_samples   = parse_twitter_conll(TXT_PATHS['val'])
test_samples  = parse_twitter_conll(TXT_PATHS['test'])

label2id, id2label = build_label_vocab(train_samples, val_samples, test_samples)

# Only the training split uses image augmentation.
train_ds = Twitter2015MNER(train_samples, IMG_DIRS['train'], tokenizer, label2id, max_len=128, aug=True)
val_ds   = Twitter2015MNER(val_samples,   IMG_DIRS['val'],   tokenizer, label2id, max_len=128)
test_ds  = Twitter2015MNER(test_samples,  IMG_DIRS['test'],  tokenizer, label2id, max_len=128)

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True,  num_workers=2)
val_loader   = DataLoader(val_ds,   batch_size=12, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_ds,  batch_size=12, shuffle=False, num_workers=2)

# ================== Train Setup ==================
model = CLIPBertMNER(num_labels=len(label2id)).to(device)
optim = torch.optim.AdamW(model.parameters(), lr=3e-5)
EPOCHS = 5

# Cosine schedule over all optimizer steps, with the first 10% used for warm-up.
total_steps = len(train_loader)*EPOCHS
sched = get_cosine_schedule_with_warmup(optim, int(0.1*total_steps), total_steps)
# torch.amp.GradScaler replaces the deprecated torch.cuda.amp.GradScaler and
# matches the ViLT/BLIP cells of this notebook; it is a no-op on CPU.
scaler = torch.amp.GradScaler("cuda", enabled=(device.type=='cuda'))

def run_epoch(loader, train=True):
    """Run one pass over ``loader`` and return ``(mean_loss, acc, prec, rec, f1)``.

    Uses the notebook-level ``model``, ``optim``, ``sched``, ``scaler`` and
    ``device``. With ``train=True`` the model is updated after every batch;
    with ``train=False`` the model is put in eval mode and no autograd graph
    is built.

    NOTE(review): precision/recall/F1 are micro-averaged over every labelled
    token (including any non-entity class), which makes them numerically equal
    to token accuracy rather than an entity-level NER score.
    """
    model.train() if train else model.eval()
    all_preds, all_labels, losses = [], [], []
    it = tqdm(loader, desc="Train" if train else "Eval", leave=False)
    use_amp = (device.type == 'cuda')

    for batch in it:
        ids = batch['input_ids'].to(device)
        attn = batch['attention_mask'].to(device)
        pixels = batch['pixel_values'].to(device)
        labels = batch['labels'].to(device)

        # Disable autograd during evaluation so validation does not keep
        # activations alive for a backward pass that never happens.
        with torch.set_grad_enabled(train):
            with torch.amp.autocast("cuda", enabled=use_amp):
                logits, loss = model(ids, attn, pixels, labels)

        if train:
            optim.zero_grad()
            scaler.scale(loss).backward()
            # Gradients are still multiplied by the loss scale here; unscale
            # them first so the 1.0 clipping threshold applies to true norms.
            scaler.unscale_(optim)
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optim)
            scaler.update()
            sched.step()

        losses.append(loss.item())
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        golds = labels.cpu().numpy()
        # Keep only positions that carry a real label (-100 marks ignored tokens).
        keep = golds != -100
        all_preds.extend(preds[keep].tolist())
        all_labels.extend(golds[keep].tolist())

    acc = accuracy_score(all_labels, all_preds)
    prec, rec, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='micro')
    return np.mean(losses), acc, prec, rec, f1

# ================== Training Loop ==================
# Train for EPOCHS epochs; after each epoch score the validation split and save a
# checkpoint (weights plus label maps) whenever the validation F1 improves.
best_f1, best_state = -1, None
for ep in range(1, EPOCHS+1):
    tr_loss, tr_acc, tr_p, tr_r, tr_f1 = run_epoch(train_loader, train=True)
    vl_loss, vl_acc, vl_p, vl_r, vl_f1 = run_epoch(val_loader, train=False)
    print(f"Epoch {ep:02d} | Train F1 {tr_f1:.4f} | Val F1 {vl_f1:.4f}")
    if vl_f1 > best_f1:
        best_f1 = vl_f1
        best_state = {'model': model.state_dict(), 'label2id': label2id, 'id2label': id2label}
        torch.save(best_state, 'clip_bert_mner_best.pth')

# ================== Test Evaluation ==================
# Restore the best-validation weights before scoring the test split.
model.load_state_dict(torch.load('clip_bert_mner_best.pth', map_location=device)['model'])

def evaluate(loader):
    """Score the notebook-level ``model`` on ``loader`` without gradient tracking.

    Returns:
        ``(mean_loss, accuracy, micro_f1)`` computed over every token whose
        gold label is not -100.
    """
    model.eval()
    kept_preds, kept_golds, batch_losses = [], [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Test", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention = batch['attention_mask'].to(device)
            images = batch['pixel_values'].to(device)
            gold = batch['labels'].to(device)

            token_logits, batch_loss = model(input_ids, attention, images, gold)
            batch_losses.append(batch_loss.item())

            pred_rows = torch.argmax(token_logits, dim=-1).cpu().numpy()
            gold_rows = gold.cpu().numpy()
            for pred_row, gold_row in zip(pred_rows, gold_rows):
                # Drop ignored positions (special tokens, padding, sub-word pieces).
                labelled = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
                kept_preds.extend(p for p, _ in labelled)
                kept_golds.extend(g for _, g in labelled)

    accuracy = accuracy_score(kept_golds, kept_preds)
    _, _, micro_f1, _ = precision_recall_fscore_support(kept_golds, kept_preds, average='micro')
    return np.mean(batch_losses), accuracy, micro_f1

# Score the test split with the restored checkpoint and print the summary metrics.
te_loss, te_acc, te_f1 = evaluate(test_loader)
print("\n===== TEST RESULTS =====")
print(f"Loss: {te_loss:.4f}")
print(f"Accuracy: {te_acc:.4f}")
print(f"F1 Score: {te_f1:.4f}")


In [None]:
from sklearn.metrics import classification_report

def classification_report_per_label(loader):
    """Print a per-label precision/recall/F1 report for ``loader``.

    Uses the notebook-level ``model``, ``device`` and ``id2label``. Only token
    positions whose gold label is not -100 are scored.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Per-Label Report", leave=False):
            ids = batch['input_ids'].to(device)
            attn = batch['attention_mask'].to(device)
            pixels = batch['pixel_values'].to(device)
            labels = batch['labels'].to(device)

            logits, _ = model(ids, attn, pixels, labels)
            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            golds = labels.cpu().numpy()

            # collect only valid tokens (skip subwords/pads)
            keep = golds != -100
            all_preds.extend(preds[keep].tolist())
            all_labels.extend(golds[keep].tolist())

    # Convert IDs back to label names. Passing `labels=` explicitly keeps the
    # report rows aligned with `target_names` even when some class never occurs
    # in this split's gold or predicted labels (sklearn raises otherwise).
    label_ids = sorted(id2label.keys())
    target_names = [id2label[i] for i in label_ids]

    print("\n===== CLASSIFICATION REPORT =====")
    print(classification_report(
        all_labels, all_preds,
        labels=label_ids,
        target_names=target_names,
        digits=4,
        zero_division=0
    ))

# Print the per-label report for the test split using the weights currently loaded in `model`.
classification_report_per_label(test_loader)


## ViLT models

In [None]:
# ==========================================================
# 🚀 ViLT (Vision-and-Language Transformer) for Multimodal NER
# ==========================================================
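# Dependencies are installed by the first cell of this notebook (transformers==4.44.2,
# accelerate==0.33.0); the older pin below is kept for reference only and conflicts with it.
# !pip -q install transformers==4.42.4 accelerate datasets==2.21.0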

import os, torch, torch.nn as nn, numpy as np
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import ViltProcessor, ViltModel, get_cosine_schedule_with_warmup

# Use the GPU when one is visible; the model and every batch in this cell are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ==========================================================
# 🧩 Dataset (safe image + label handling)
# ==========================================================
class Twitter2015ViLTMNER(Dataset):
    """Twitter2015 multimodal NER dataset for ViLT.

    Each item holds the tokenizer output for the tweet's words, the image
    processor output for its image, and one label id per token position.
    Labels are aligned through the tokenizer's word ids: the first sub-token
    of each word carries that word's label, while continuation sub-tokens,
    special tokens and padding are set to -100. (Writing the i-th word label
    to position i+1 is only correct when no word is split into sub-tokens.)

    Args:
        samples: list of dicts with keys 'img_id', 'tokens' and 'labels'.
        img_dir: directory searched for '<img_id>.<ext>' image files.
        processor: object exposing ``tokenizer`` (a fast tokenizer) and
            ``image_processor`` attributes, e.g. a ``ViltProcessor``.
        label2id: mapping from label string to integer class id.
        max_len: requested text length; capped at ``MAX_TEXT_LEN`` so the
            labels always have the same length as ``input_ids``.

    Raises:
        ValueError: if the processor's tokenizer cannot provide word ids.
    """

    # Upper bound on the text length; the previous implementation hard-coded
    # 40 here as ViLT's native text length.
    MAX_TEXT_LEN = 40
    # Every image is resized to this fixed size before processing.
    IMAGE_SIZE = (384, 384)

    def __init__(self, samples, img_dir, processor, label2id, max_len=128):
        if not getattr(processor.tokenizer, "is_fast", False):
            raise ValueError(
                "Twitter2015ViLTMNER needs a fast tokenizer: word_ids() is required "
                "to align word-level labels with sub-tokens."
            )
        self.samples = samples
        self.img_dir = img_dir
        self.processor = processor
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.samples)

    def _load_image(self, img_id):
        """Return the resized RGB image for ``img_id``, or a black placeholder.

        The placeholder is used when the file is missing or cannot be decoded;
        this is a deliberate best-effort so one bad image does not stop training.
        """
        img_path = find_image_path(self.img_dir, img_id)
        img = None
        if img_path is not None:
            try:
                # The context manager closes the file once the pixels are copied.
                with Image.open(img_path) as im:
                    img = im.convert("RGB")
            except Exception:
                img = None
        if img is None:
            img = Image.new("RGB", self.IMAGE_SIZE, (0, 0, 0))
        return img.resize(self.IMAGE_SIZE)

    def __getitem__(self, idx):
        ex = self.samples[idx]
        tokens, labels = ex["tokens"], ex["labels"]

        img = self._load_image(ex["img_id"])

        # Tokenize the pre-split words so that word_ids() maps every sub-token
        # back to the word (and therefore the label) it came from.
        text_enc = self.processor.tokenizer(
            tokens,
            is_split_into_words=True,
            padding="max_length",
            truncation=True,
            max_length=min(self.max_len, self.MAX_TEXT_LEN),
            return_tensors="pt",
        )
        image_enc = self.processor.image_processor(img, return_tensors="pt")

        input_ids = text_enc["input_ids"].squeeze(0)
        attn_mask = text_enc["attention_mask"].squeeze(0)
        pixel_values = image_enc["pixel_values"].squeeze(0)

        # One label per token position; only the first sub-token of a word is labelled.
        label_ids = []
        prev_word = None
        for w_id in text_enc.word_ids(batch_index=0):
            if w_id is None or w_id == prev_word:
                label_ids.append(-100)
            else:
                label_ids.append(self.label2id[labels[w_id]])
            prev_word = w_id
        label_ids = torch.tensor(label_ids, dtype=torch.long)

        return {
            "input_ids": input_ids,
            "attention_mask": attn_mask,
            "pixel_values": pixel_values,
            "labels": label_ids,
        }

# ==========================================================
# 🧩 Safe Collate Function (handles variable shapes)
# ==========================================================
from torch.nn.utils.rnn import pad_sequence

def vilt_collate_fn(batch):
    """Merge a list of per-sample dicts into one batch dict.

    The 1-D 'input_ids' and 'attention_mask' tensors are right-padded with 0
    and 'labels' with -100 up to the longest sequence in the batch;
    'pixel_values' tensors are stacked along a new leading dimension.
    """
    def column(key):
        return [sample[key] for sample in batch]

    padded_ids = pad_sequence(column("input_ids"), batch_first=True, padding_value=0)
    padded_mask = pad_sequence(column("attention_mask"), batch_first=True, padding_value=0)
    padded_labels = pad_sequence(column("labels"), batch_first=True, padding_value=-100)
    stacked_pixels = torch.stack(column("pixel_values"))

    return dict(
        input_ids=padded_ids,
        attention_mask=padded_mask,
        pixel_values=stacked_pixels,
        labels=padded_labels,
    )

# ==========================================================
# ⚙️ Model Definition
# ==========================================================
class ViltMNER(nn.Module):
    """Token classifier on top of a pretrained ViLT encoder.

    Args:
        num_labels: number of token classes.
        model_name: checkpoint passed to ``ViltModel.from_pretrained``.
    """

    def __init__(self, num_labels, model_name="dandelin/vilt-b32-mlm"):
        super().__init__()
        self.vilt = ViltModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.vilt.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, pixel_values, labels=None):
        """Return ``(text_logits, loss)``; ``loss`` is None when no labels are given.

        The encoder output is classified at every position and then cut down
        to the first ``input_ids.size(1)`` positions, which the original code
        treats as the text tokens of the joint text+image sequence.
        """
        encoded = self.vilt(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            return_dict=True,
        )
        all_logits = self.classifier(self.dropout(encoded.last_hidden_state))

        # Keep only the leading text positions.
        n_text = input_ids.size(1)
        text_logits = all_logits[:, :n_text, :]

        if labels is None:
            return text_logits, None

        criterion = nn.CrossEntropyLoss(ignore_index=-100)
        flat_logits = text_logits.reshape(-1, text_logits.size(-1))
        return text_logits, criterion(flat_logits, labels.reshape(-1))



# ==========================================================
# 🔧 Build Data
# ==========================================================
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")

# Re-parse the annotation splits and rebuild the label vocabulary for this cell.
train_samples = parse_twitter_conll(TXT_PATHS["train"])
val_samples   = parse_twitter_conll(TXT_PATHS["val"])
test_samples  = parse_twitter_conll(TXT_PATHS["test"])

label2id, id2label = build_label_vocab(train_samples, val_samples, test_samples)
print("Labels:", label2id)

# max_len=40 keeps the label length equal to the text length used by the dataset.
train_ds = Twitter2015ViLTMNER(train_samples, IMG_DIRS["train"], processor, label2id,max_len=40)
val_ds   = Twitter2015ViLTMNER(val_samples,   IMG_DIRS["val"],   processor, label2id,max_len=40)
test_ds  = Twitter2015ViLTMNER(test_samples,  IMG_DIRS["test"],  processor, label2id,max_len=40)

train_loader = DataLoader(train_ds, batch_size=4, shuffle=True,  num_workers=2, collate_fn=vilt_collate_fn)
val_loader   = DataLoader(val_ds,   batch_size=8, shuffle=False, num_workers=2, collate_fn=vilt_collate_fn)
test_loader  = DataLoader(test_ds,  batch_size=8, shuffle=False, num_workers=2, collate_fn=vilt_collate_fn)

# ==========================================================
# Training Setup
# ==========================================================
# NOTE: this rebinds the notebook-level names `model`, `optim`, `sched` and `scaler`
# that the CLIP cell also defines; run the cells in order.
EPOCHS = 5
model = ViltMNER(num_labels=len(label2id)).to(device)
optim = torch.optim.AdamW(model.parameters(), lr=3e-5)
# Cosine schedule over all optimizer steps, with the first 10% used for warm-up.
total_steps = len(train_loader) * EPOCHS
sched = get_cosine_schedule_with_warmup(optim, int(0.1 * total_steps), total_steps)
scaler = torch.amp.GradScaler("cuda", enabled=(device.type == "cuda"))

def run_epoch(loader, train=True):
    """Run one pass over ``loader`` and return ``(avg_loss, acc, prec, rec, f1)``.

    Uses the notebook-level ``model``, ``optim``, ``sched``, ``scaler`` and
    ``device``. With ``train=True`` the model is updated after every batch;
    with ``train=False`` the model is put in eval mode and no autograd graph
    is built.

    NOTE(review): precision/recall/F1 are micro-averaged over every labelled
    token (including any non-entity class), which makes them numerically equal
    to token accuracy rather than an entity-level NER score.
    """
    model.train() if train else model.eval()
    all_preds, all_labels, losses = [], [], []
    it = tqdm(loader, desc="Train" if train else "Eval", leave=False)
    use_amp = (device.type == "cuda")
    for batch in it:
        ids = batch["input_ids"].to(device)
        attn = batch["attention_mask"].to(device)
        pixels = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        # Disable autograd during evaluation so validation does not keep
        # activations alive for a backward pass that never happens.
        with torch.set_grad_enabled(train):
            with torch.amp.autocast("cuda", enabled=use_amp):
                logits, loss = model(ids, attn, pixels, labels)

        if train:
            optim.zero_grad()
            scaler.scale(loss).backward()
            # Gradients are still multiplied by the loss scale here; unscale
            # them first so the 1.0 clipping threshold applies to true norms.
            scaler.unscale_(optim)
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optim)
            scaler.update()
            sched.step()

        losses.append(loss.item())
        preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
        golds = labels.detach().cpu().numpy()

        # Keep only positions that carry a real label (-100 marks ignored tokens).
        keep = golds != -100
        all_preds.extend(preds[keep].tolist())
        all_labels.extend(golds[keep].tolist())

    avg_loss = float(np.mean(losses))
    acc = accuracy_score(all_labels, all_preds)
    prec, rec, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="micro", zero_division=0)
    return avg_loss, acc, prec, rec, f1

# ==========================================================
# 🚀 Train & Validate
# ==========================================================
# Train for EPOCHS epochs; after each epoch score the validation split and
# save the weights whenever the validation F1 improves.
best_val_f1 = -1
for ep in range(1, EPOCHS + 1):
    tr_loss, tr_acc, tr_p, tr_r, tr_f1 = run_epoch(train_loader, train=True)
    vl_loss, vl_acc, vl_p, vl_r, vl_f1 = run_epoch(val_loader, train=False)
    print(f"Epoch {ep:02d} | Train F1 {tr_f1:.4f} | Val F1 {vl_f1:.4f}")
    if vl_f1 > best_val_f1:
        best_val_f1 = vl_f1
        torch.save(model.state_dict(), "vilt_mner_best.pth")

# ==========================================================
# Evaluate on Test
# ==========================================================
# Restore the best-validation weights before scoring the test split.
model.load_state_dict(torch.load("vilt_mner_best.pth", map_location=device))

def evaluate(loader):
    """Score the notebook-level ``model`` on ``loader`` without gradient tracking.

    Returns:
        ``(avg_loss, accuracy, precision, recall, f1)`` with micro-averaged
        precision/recall/F1 over every token whose gold label is not -100.
    """
    model.eval()
    kept_preds, kept_golds, batch_losses = [], [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Test", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention = batch["attention_mask"].to(device)
            images = batch["pixel_values"].to(device)
            gold = batch["labels"].to(device)

            token_logits, batch_loss = model(input_ids, attention, images, gold)
            batch_losses.append(batch_loss.item())

            pred_rows = torch.argmax(token_logits, dim=-1).cpu().numpy()
            gold_rows = gold.cpu().numpy()
            for pred_row, gold_row in zip(pred_rows, gold_rows):
                # Drop ignored positions (gold label -100).
                labelled = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
                kept_preds.extend(p for p, _ in labelled)
                kept_golds.extend(g for _, g in labelled)

    mean_loss = float(np.mean(batch_losses))
    accuracy = accuracy_score(kept_golds, kept_preds)
    precision, recall, micro_f1, _ = precision_recall_fscore_support(
        kept_golds, kept_preds, average="micro", zero_division=0
    )
    return mean_loss, accuracy, precision, recall, micro_f1

# Score the test split with the restored checkpoint and print the summary metrics.
te_loss, te_acc, te_p, te_r, te_f1 = evaluate(test_loader)
print("\n===== TEST RESULTS =====")
print(f"Loss: {te_loss:.4f}")
print(f"Accuracy: {te_acc:.4f}")
print(f"F1 Score: {te_f1:.4f}")

# ==========================================================
# 📊 Classification Report
# ==========================================================
def classification_report_per_label(loader):
    """Print a per-label precision/recall/F1 report for ``loader``.

    Uses the notebook-level ``model``, ``device`` and ``id2label``. Only token
    positions whose gold label is not -100 are scored.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Per-Label Report", leave=False):
            ids = batch["input_ids"].to(device)
            attn = batch["attention_mask"].to(device)
            pixels = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            logits, _ = model(ids, attn, pixels, labels)
            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            golds = labels.cpu().numpy()
            # Keep only positions that carry a real label.
            keep = golds != -100
            all_preds.extend(preds[keep].tolist())
            all_labels.extend(golds[keep].tolist())

    # Passing `labels=` explicitly keeps the report rows aligned with
    # `target_names` even when some class never occurs in this split's gold or
    # predicted labels (sklearn raises otherwise).
    label_ids = sorted(id2label.keys())
    target_names = [id2label[i] for i in label_ids]
    print("\n===== CLASSIFICATION REPORT =====")
    print(classification_report(all_labels, all_preds, labels=label_ids, target_names=target_names, digits=4, zero_division=0))

# Print the per-label report for the test split using the weights currently loaded in `model`.
classification_report_per_label(test_loader)


## BLIP models

In [None]:
# ==========================================================
# 🚀 BLIP (Salesforce/blip-itm-base-coco) for Multimodal NER — Final Stable Version
# ==========================================================
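# Dependencies are installed by the first cell of this notebook (transformers==4.44.2,
# accelerate==0.33.0); the older pin below is kept for reference only and conflicts with it.
# !pip -q install transformers==4.42.4 accelerate datasets==2.21.0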

import os, torch, torch.nn as nn, numpy as np
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import BlipProcessor, BlipModel, get_cosine_schedule_with_warmup

# Use the GPU when one is visible; the model and every batch in this cell are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ==========================================================
# 🧩 Dataset Definition
# ==========================================================
class TwitterBLIPMNER(Dataset):
    """Twitter multimodal NER dataset for BLIP.

    Each item holds the tokenizer output for the tweet's words, the image
    processor output for its image, and one label id per token position.
    Labels are aligned through the tokenizer's word ids: the first sub-token
    of each word carries that word's label, while continuation sub-tokens,
    special tokens and padding are set to -100. (Writing the i-th word label
    to position i+1 is only correct when no word is split into sub-tokens.)

    Args:
        samples: list of dicts with keys 'img_id', 'tokens' and 'labels'.
        img_dir: directory searched for '<img_id>.<ext>' image files.
        processor: object exposing ``tokenizer`` (a fast tokenizer) and
            ``image_processor`` attributes, e.g. a ``BlipProcessor``.
        label2id: mapping from label string to integer class id.
        max_len: fixed token sequence length (padding and truncation target).

    Raises:
        ValueError: if the processor's tokenizer cannot provide word ids.
    """

    # Every image is resized to this fixed size before processing.
    IMAGE_SIZE = (384, 384)

    def __init__(self, samples, img_dir, processor, label2id, max_len=64):
        if not getattr(processor.tokenizer, "is_fast", False):
            raise ValueError(
                "TwitterBLIPMNER needs a fast tokenizer: word_ids() is required "
                "to align word-level labels with sub-tokens."
            )
        self.samples = samples
        self.img_dir = img_dir
        self.processor = processor
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.samples)

    def _load_image(self, img_id):
        """Return the resized RGB image for ``img_id``, or a black placeholder.

        The placeholder is used when the file is missing or cannot be decoded;
        this is a deliberate best-effort so one bad image does not stop training.
        """
        img_path = find_image_path(self.img_dir, img_id)
        img = None
        if img_path is not None:
            try:
                # The context manager closes the file once the pixels are copied.
                with Image.open(img_path) as im:
                    img = im.convert("RGB")
            except Exception:
                img = None
        if img is None:
            img = Image.new("RGB", self.IMAGE_SIZE, (0, 0, 0))
        return img.resize(self.IMAGE_SIZE)

    def __getitem__(self, idx):
        ex = self.samples[idx]
        tokens, labels = ex["tokens"], ex["labels"]

        img = self._load_image(ex["img_id"])

        # Tokenize the pre-split words so that word_ids() maps every sub-token
        # back to the word (and therefore the label) it came from.
        text_enc = self.processor.tokenizer(
            tokens,
            is_split_into_words=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        image_enc = self.processor.image_processor(img, return_tensors="pt")

        input_ids = text_enc["input_ids"].squeeze(0)
        attn_mask = text_enc["attention_mask"].squeeze(0)
        pixel_values = image_enc["pixel_values"].squeeze(0)

        # One label per token position; only the first sub-token of a word is labelled.
        label_ids = []
        prev_word = None
        for w_id in text_enc.word_ids(batch_index=0):
            if w_id is None or w_id == prev_word:
                label_ids.append(-100)
            else:
                label_ids.append(self.label2id[labels[w_id]])
            prev_word = w_id
        label_ids = torch.tensor(label_ids, dtype=torch.long)

        return {
            "input_ids": input_ids,
            "attention_mask": attn_mask,
            "pixel_values": pixel_values,
            "labels": label_ids
        }

# ==========================================================
# ⚙️ Collate Function
# ==========================================================
from torch.nn.utils.rnn import pad_sequence

def blip_collate_fn(batch):
    """Merge a list of per-sample dicts into one batch dict.

    Sequence fields are right-padded to the longest sample in the batch
    ('input_ids' and 'attention_mask' with 0, 'labels' with -100) and the
    'pixel_values' tensors are stacked along a new leading dimension.
    """
    pad_value_for = {"input_ids": 0, "attention_mask": 0, "labels": -100}

    padded = {}
    for field, fill in pad_value_for.items():
        sequences = [item[field] for item in batch]
        padded[field] = pad_sequence(sequences, batch_first=True, padding_value=fill)

    images = torch.stack([item["pixel_values"] for item in batch])

    # Assemble explicitly so the key order of the returned dict stays fixed.
    return {
        "input_ids": padded["input_ids"],
        "attention_mask": padded["attention_mask"],
        "pixel_values": images,
        "labels": padded["labels"]
    }

from transformers import BlipForImageTextRetrieval

# ==========================================================
# 🧠 BLIP-based Multimodal NER Model
# ==========================================================
class BLIPMNER(nn.Module):
    """Token classifier built from the towers of a BLIP image-text retrieval model.

    The vision tower encodes the image; its output sequence is handed to the
    text encoder as ``encoder_hidden_states``, and a linear layer classifies
    every text position.

    Args:
        num_labels: number of token classes.
        model_name: checkpoint passed to ``BlipForImageTextRetrieval.from_pretrained``.
    """

    def __init__(self, num_labels, model_name="Salesforce/blip-itm-base-coco"):
        super().__init__()
        self.blip = BlipForImageTextRetrieval.from_pretrained(model_name)
        # Direct handles to the two towers of the loaded model.
        self.text_encoder = self.blip.text_encoder
        self.vision_model = self.blip.vision_model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.blip.config.text_config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, pixel_values, labels=None):
        """Return ``(logits, loss)``; ``loss`` is None when ``labels`` is None."""
        # Vision branch: sequence of image embeddings.
        image_tokens = self.vision_model(pixel_values).last_hidden_state
        # Every image position is marked as attendable.
        image_mask = torch.ones(
            image_tokens.size()[:-1], dtype=torch.long, device=image_tokens.device
        )

        # Text branch, conditioned on the image embeddings.
        fused = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=image_tokens,
            encoder_attention_mask=image_mask,
            return_dict=True
        ).last_hidden_state  # [B, L, H]

        logits = self.classifier(self.dropout(fused))

        if labels is None:
            return logits, None

        criterion = nn.CrossEntropyLoss(ignore_index=-100)
        flat_logits = logits.reshape(-1, logits.size(-1))
        return logits, criterion(flat_logits, labels.reshape(-1))

# ==========================================================
# 🔧 Build Data
# ==========================================================
processor = BlipProcessor.from_pretrained("Salesforce/blip-itm-base-coco")

# Re-parse the annotation splits and rebuild the label vocabulary for this cell.
train_samples = parse_twitter_conll(TXT_PATHS["train"])
val_samples   = parse_twitter_conll(TXT_PATHS["val"])
test_samples  = parse_twitter_conll(TXT_PATHS["test"])

label2id, id2label = build_label_vocab(train_samples, val_samples, test_samples)
print("Labels:", label2id)

# The datasets use their default max_len (64) for both text and labels.
train_ds = TwitterBLIPMNER(train_samples, IMG_DIRS["train"], processor, label2id)
val_ds   = TwitterBLIPMNER(val_samples,   IMG_DIRS["val"],   processor, label2id)
test_ds  = TwitterBLIPMNER(test_samples,  IMG_DIRS["test"],  processor, label2id)

train_loader = DataLoader(train_ds, batch_size=4, shuffle=True,  num_workers=2, collate_fn=blip_collate_fn)
val_loader   = DataLoader(val_ds,   batch_size=8, shuffle=False, num_workers=2, collate_fn=blip_collate_fn)
test_loader  = DataLoader(test_ds,  batch_size=8, shuffle=False, num_workers=2, collate_fn=blip_collate_fn)

# ==========================================================
# Training Setup
# ==========================================================
# NOTE: this rebinds the notebook-level names `model`, `optim`, `sched` and `scaler`
# that the earlier model cells also define; run the cells in order.
EPOCHS = 5
model = BLIPMNER(num_labels=len(label2id)).to(device)
optim = torch.optim.AdamW(model.parameters(), lr=3e-5)
# Cosine schedule over all optimizer steps, with the first 10% used for warm-up.
total_steps = len(train_loader) * EPOCHS
sched = get_cosine_schedule_with_warmup(optim, int(0.1 * total_steps), total_steps)
scaler = torch.amp.GradScaler("cuda", enabled=(device.type == "cuda"))

def run_epoch(loader, train=True):
    """Run one pass over ``loader`` and return token-level metrics.

    Uses the module-level ``model``, ``optim``, ``sched``, ``scaler`` and
    ``device``.

    Args:
        loader: iterable of batch dicts holding "input_ids",
            "attention_mask", "pixel_values" and "labels" tensors.
        train: if True the model is put in train mode and updated after
            every batch; if False it is put in eval mode and no gradients
            are tracked.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``. Precision,
        recall and F1 are micro-averaged over the positions whose gold
        label is not -100.
    """
    model.train() if train else model.eval()
    all_preds, all_labels, losses = [], [], []
    it = tqdm(loader, desc="Train" if train else "Eval", leave=False)
    for batch in it:
        ids = batch["input_ids"].to(device)
        attn = batch["attention_mask"].to(device)
        pixels = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        # Build the autograd graph only when the weights will be updated;
        # during validation it would just hold activations in memory.
        with torch.set_grad_enabled(bool(train)):
            with torch.amp.autocast("cuda", enabled=(device.type == "cuda")):
                logits, loss = model(ids, attn, pixels, labels)

        if train:
            optim.zero_grad()
            scaler.scale(loss).backward()
            # The gradients still carry the loss-scale factor here; unscale
            # them first so the 1.0 max-norm applies to the real gradients.
            scaler.unscale_(optim)
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optim)
            scaler.update()
            sched.step()

        losses.append(loss.item())
        preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
        golds = labels.detach().cpu().numpy()

        # Score labelled positions only (-100 marks ignored positions).
        keep = golds != -100
        all_preds.extend(preds[keep].tolist())
        all_labels.extend(golds[keep].tolist())

    avg_loss = float(np.mean(losses))
    acc = accuracy_score(all_labels, all_preds)
    prec, rec, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="micro", zero_division=0)
    return avg_loss, acc, prec, rec, f1

# ==========================================================
# 🚀 Train & Validate
# ==========================================================
# Best validation F1 seen so far; starts below any attainable score.
best_val_f1 = -1
for ep in range(1, EPOCHS + 1):
    tr_loss, tr_acc, tr_p, tr_r, tr_f1 = run_epoch(train_loader, train=True)
    vl_loss, vl_acc, vl_p, vl_r, vl_f1 = run_epoch(val_loader, train=False)
    print(f"Epoch {ep:02d} | Train F1 {tr_f1:.4f} | Val F1 {vl_f1:.4f}")
    # Overwrite the checkpoint only when validation F1 improves.
    if vl_f1 > best_val_f1:
        best_val_f1 = vl_f1
        torch.save(model.state_dict(), "blip_mner_best.pth")

# ==========================================================
# 🧪 Evaluate on Test
# ==========================================================
# Restore the best-validation weights before any test-set scoring.
model.load_state_dict(torch.load("blip_mner_best.pth", map_location=device))

def evaluate(loader):
    """Score the module-level ``model`` on ``loader`` with gradients disabled.

    Returns ``(avg_loss, accuracy, precision, recall, f1)``. Precision,
    recall and F1 are micro-averaged; positions labelled -100 are left out.
    """
    model.eval()
    batch_losses, pred_ids, gold_ids = [], [], []
    field_order = ("input_ids", "attention_mask", "pixel_values", "labels")
    with torch.no_grad():
        for batch in tqdm(loader, desc="Test", leave=False):
            # Same positional order as the model's forward signature.
            inputs = [batch[name].to(device) for name in field_order]
            logits, loss = model(*inputs)
            batch_losses.append(loss.item())

            pred_rows = torch.argmax(logits, dim=-1).cpu().numpy()
            gold_rows = inputs[-1].cpu().numpy()
            for pred_row, gold_row in zip(pred_rows, gold_rows):
                for pred, gold in zip(pred_row, gold_row):
                    if gold != -100:
                        pred_ids.append(pred)
                        gold_ids.append(gold)

    mean_loss = float(np.mean(batch_losses))
    accuracy = accuracy_score(gold_ids, pred_ids)
    precision, recall, f1, _ = precision_recall_fscore_support(
        gold_ids, pred_ids, average="micro", zero_division=0
    )
    return mean_loss, accuracy, precision, recall, f1

# Score the test split; precision and recall are returned but not printed.
te_loss, te_acc, te_p, te_r, te_f1 = evaluate(test_loader)
print("\n===== TEST RESULTS =====")
print(f"Loss: {te_loss:.4f}")
print(f"Accuracy: {te_acc:.4f}")
print(f"F1 Score: {te_f1:.4f}")


In [None]:
def classification_report_per_label(loader):
    """Print a per-label precision/recall/F1 table for ``loader``.

    Uses the module-level ``model``, ``device`` and ``id2label``. Positions
    whose gold label is -100 are excluded from the report.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Per-Label Report", leave=False):
            ids = batch["input_ids"].to(device)
            attn = batch["attention_mask"].to(device)
            pixels = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            logits, _ = model(ids, attn, pixels, labels)
            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            golds = labels.cpu().numpy()
            # Keep labelled positions only.
            keep = golds != -100
            all_preds.extend(preds[keep].tolist())
            all_labels.extend(golds[keep].tolist())

    # Pass the label ids explicitly: without `labels=`, scikit-learn derives
    # the classes from the data and raises when that count differs from
    # len(target_names), e.g. when a vocabulary label never occurs here.
    label_ids = sorted(id2label.keys())
    target_names = [id2label[i] for i in label_ids]
    print("\n===== CLASSIFICATION REPORT =====")
    print(classification_report(all_labels, all_preds, labels=label_ids,
                                target_names=target_names, digits=4, zero_division=0))

# Per-label breakdown on the test split.
classification_report_per_label(test_loader)

# Twitter2017

## CLIP models (Twitter2017)

In [None]:
import os, ast, math, json, random
from dataclasses import dataclass
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModel, get_cosine_schedule_with_warmup

# -------------- Repro --------------
# Seed Python, NumPy and PyTorch, and ask cuDNN for deterministic kernels.
SEED = 1337
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True; torch.backends.cudnn.benchmark = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------------- Paths (EDIT if needed) --------------
# Annotation files for the Twitter2017 splits.
TXT_PATHS = {
    'train': '/content/UMT/data/twitter2017/train.txt',
    'val':   '/content/UMT/data/twitter2017/valid.txt',   # NOTE(review): no automatic path fix-up is visible in this cell; confirm the file exists
    'test':  '/content/UMT/data/twitter2017/test.txt'
}
# All three splits read images from the same directory.
IMG_DIRS = {
    'train': '/content/twitter2017/twitter2017/twitter2017_images',
    'val':   '/content/twitter2017/twitter2017/twitter2017_images',
    'test':  '/content/twitter2017/twitter2017/twitter2017_images'
}


In [None]:
# ================== Imports ==================
# !pip -q install transformers==4.42.4 accelerate datasets==2.21.0

import os, random, numpy as np, torch, torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from torchvision import transforms
from transformers import (
    CLIPModel, CLIPProcessor,
    BertTokenizerFast, BertModel,
    get_cosine_schedule_with_warmup
)
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

# ================== Dataset (same as yours, just BERT tokenizer) ==================
from torch.utils.data import Dataset
from PIL import Image
import torch
from torchvision import transforms
import os

class Twitter2015MNER(Dataset):
    """Dataset yielding one tokenized tweet and its preprocessed image per item."""

    def __init__(self, samples, img_dir, tokenizer, label2id, max_len=128, aug=False):
        """
        samples: list of dicts [{'img_id':..., 'tokens':..., 'labels':...}]
        img_dir: path to image folder
        tokenizer: HuggingFace tokenizer (word_ids() is used for alignment)
        label2id: label -> id mapping
        max_len: length every text sequence is padded/truncated to
        aug: apply random image augmentation (for training)
        """
        self.samples, self.img_dir = samples, img_dir
        self.tokenizer, self.label2id = tokenizer, label2id
        self.max_len = max_len

        # Both pipelines finish with the same tensor conversion + normalization.
        shared_tail = [
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                                 std=[0.26862954, 0.26130258, 0.27577711]),
        ]
        if aug:
            # Training: resize larger, then random crop / colour jitter / flip.
            head = [
                transforms.Resize((256,256)),
                transforms.RandomResizedCrop((224,224), scale=(0.9,1.0)),
                transforms.ColorJitter(0.15,0.15,0.15,0.05),
                transforms.RandomHorizontalFlip(0.5),
            ]
        else:
            # Evaluation: plain deterministic resize.
            head = [transforms.Resize((224,224))]
        self.img_tfm = transforms.Compose(head + shared_tail)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        words, tags = sample['tokens'], sample['labels']

        # Tokenize the pre-split words to a fixed length.
        batch_enc = self.tokenizer(
            words,
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        word_ids = list(batch_enc.word_ids(batch_index=0))

        # Only the first piece of each word carries its label; special tokens,
        # padding and continuation pieces get -100.
        preceding = [None] + word_ids[:-1]
        label_ids = torch.tensor(
            [
                self.label2id[tags[cur]] if (cur is not None and cur != prev) else -100
                for prev, cur in zip(preceding, word_ids)
            ],
            dtype=torch.long,
        )

        # Missing or unreadable images are replaced by a black 224x224 canvas.
        img = None
        img_path = find_image_path(self.img_dir, sample['img_id'])
        if img_path and os.path.exists(img_path):
            try:
                img = Image.open(img_path).convert('RGB')
            except Exception:
                img = None
        if img is None:
            img = Image.new('RGB', (224, 224), (0, 0, 0))

        return {
            'input_ids': batch_enc['input_ids'].squeeze(0),
            'attention_mask': batch_enc['attention_mask'].squeeze(0),
            'pixel_values': self.img_tfm(img),
            'labels': label_ids
        }

# ================== CLIP + BERT Model ==================
class CLIPBertMNER(nn.Module):
    """
    Multimodal NER model:
    - Vision encoder: CLIP (ViT-B/32)
    - Text encoder: BERT-base-uncased
    - Fusion: FiLM (gamma/beta modulation of the text states by the image)
    """
    def __init__(self, num_labels):
        """
        num_labels: size of the tag vocabulary (output width of the classifier).
        """
        super().__init__()
        self.text_encoder = BertModel.from_pretrained("bert-base-uncased")
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

        hidden = self.text_encoder.config.hidden_size  # 768

        # get_image_features() returns vectors of the CLIP projection width;
        # read it from the config instead of hard-coding 512.
        clip_dim = self.clip_model.config.projection_dim
        self.img_proj = nn.Linear(clip_dim, hidden)

        self.gamma = nn.Linear(hidden, hidden)
        self.beta = nn.Linear(hidden, hidden)

        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden, num_labels)

    def forward(self, input_ids, attention_mask, pixel_values, labels=None):
        """
        Returns (logits, loss). `loss` is None when `labels` is None; label
        positions equal to -100 are ignored by the loss.
        """
        # Text
        text_out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        seq = text_out.last_hidden_state  # [B, L, hidden]

        # Image: one projected vector per sample
        img_feat = self.clip_model.get_image_features(pixel_values=pixel_values)
        img_feat = self.img_proj(img_feat)  # [B, hidden]

        # FiLM fusion: the same scale/shift is broadcast over every token.
        g = self.gamma(img_feat).unsqueeze(1)
        b = self.beta(img_feat).unsqueeze(1)
        seq = (1 + g) * seq + b

        seq = self.dropout(seq)
        logits = self.classifier(seq)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            # reshape() also copes with non-contiguous tensors, unlike view().
            loss = loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        return logits, loss

# ================== Build Data ==================
# A fast tokenizer is needed: the dataset aligns labels through word_ids().
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_samples = parse_twitter_conll(TXT_PATHS['train'])
val_samples   = parse_twitter_conll(TXT_PATHS['val'])
test_samples  = parse_twitter_conll(TXT_PATHS['test'])

# All three splits are handed to the vocabulary builder.
label2id, id2label = build_label_vocab(train_samples, val_samples, test_samples)

# Image augmentation is switched on for the training split only.
train_ds = Twitter2015MNER(train_samples, IMG_DIRS['train'], tokenizer, label2id, max_len=128, aug=True)
val_ds   = Twitter2015MNER(val_samples,   IMG_DIRS['val'],   tokenizer, label2id, max_len=128)
test_ds  = Twitter2015MNER(test_samples,  IMG_DIRS['test'],  tokenizer, label2id, max_len=128)

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True,  num_workers=2)
val_loader   = DataLoader(val_ds,   batch_size=12, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_ds,  batch_size=12, shuffle=False, num_workers=2)

# ================== Train Setup ==================
model = CLIPBertMNER(num_labels=len(label2id)).to(device)
optim = torch.optim.AdamW(model.parameters(), lr=3e-5)
EPOCHS = 5

# The scheduler is stepped once per batch; the first 10% of steps are warmup.
total_steps = len(train_loader)*EPOCHS
sched = get_cosine_schedule_with_warmup(optim, int(0.1*total_steps), total_steps)
# Device-generic AMP API (torch.cuda.amp.GradScaler is deprecated), matching
# the other training cells of this notebook. Scaling is active on CUDA only.
scaler = torch.amp.GradScaler("cuda", enabled=(device.type=='cuda'))

def run_epoch(loader, train=True):
    """Run one pass over ``loader`` and return token-level metrics.

    Uses the module-level ``model``, ``optim``, ``sched``, ``scaler`` and
    ``device``.

    Args:
        loader: iterable of batch dicts holding 'input_ids',
            'attention_mask', 'pixel_values' and 'labels' tensors.
        train: if True the model is put in train mode and updated after
            every batch; if False it is put in eval mode and no gradients
            are tracked.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``. Precision,
        recall and F1 are micro-averaged over the positions whose gold
        label is not -100.
    """
    model.train() if train else model.eval()
    all_preds, all_labels, losses = [], [], []
    it = tqdm(loader, desc="Train" if train else "Eval", leave=False)

    for batch in it:
        ids = batch['input_ids'].to(device)
        attn = batch['attention_mask'].to(device)
        pixels = batch['pixel_values'].to(device)
        labels = batch['labels'].to(device)

        # Build the autograd graph only when the weights will be updated;
        # during validation it would just hold activations in memory.
        with torch.set_grad_enabled(bool(train)):
            with torch.amp.autocast("cuda", enabled=(device.type == 'cuda')):
                logits, loss = model(ids, attn, pixels, labels)

        if train:
            optim.zero_grad()
            scaler.scale(loss).backward()
            # The gradients still carry the loss-scale factor here; unscale
            # them first so the 1.0 max-norm applies to the real gradients.
            scaler.unscale_(optim)
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optim)
            scaler.update()
            sched.step()

        losses.append(loss.item())
        preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
        golds = labels.detach().cpu().numpy()

        # Score labelled positions only (-100 marks ignored positions).
        keep = golds != -100
        all_preds.extend(preds[keep].tolist())
        all_labels.extend(golds[keep].tolist())

    acc = accuracy_score(all_labels, all_preds)
    prec, rec, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='micro', zero_division=0)
    return float(np.mean(losses)), acc, prec, rec, f1

# ================== Training Loop ==================
# Best validation F1 seen so far (starts below any attainable score) and the
# checkpoint payload that achieved it.
best_f1, best_state = -1, None
for ep in range(1, EPOCHS+1):
    tr_loss, tr_acc, tr_p, tr_r, tr_f1 = run_epoch(train_loader, train=True)
    vl_loss, vl_acc, vl_p, vl_r, vl_f1 = run_epoch(val_loader, train=False)
    print(f"Epoch {ep:02d} | Train F1 {tr_f1:.4f} | Val F1 {vl_f1:.4f}")
    if vl_f1 > best_f1:
        best_f1 = vl_f1
        # The checkpoint carries the label maps alongside the weights.
        best_state = {'model': model.state_dict(), 'label2id': label2id, 'id2label': id2label}
        torch.save(best_state, 'clip_bert_mner_best.pth')

# ================== Test Evaluation ==================
# Restore the best-validation weights from disk before testing.
model.load_state_dict(torch.load('clip_bert_mner_best.pth', map_location=device)['model'])

def evaluate(loader):
    """Score the module-level ``model`` on ``loader`` with gradients disabled.

    Returns ``(mean_loss, accuracy, f1)``; F1 is micro-averaged and positions
    labelled -100 are left out.
    """
    model.eval()
    batch_losses, pred_ids, gold_ids = [], [], []
    field_order = ('input_ids', 'attention_mask', 'pixel_values', 'labels')
    with torch.no_grad():
        for batch in tqdm(loader, desc="Test", leave=False):
            # Same positional order as the model's forward signature.
            inputs = [batch[name].to(device) for name in field_order]
            logits, loss = model(*inputs)
            batch_losses.append(loss.item())

            pred_rows = torch.argmax(logits, dim=-1).cpu().numpy()
            gold_rows = inputs[-1].cpu().numpy()
            for pred_row, gold_row in zip(pred_rows, gold_rows):
                for pred, gold in zip(pred_row, gold_row):
                    if gold != -100:
                        pred_ids.append(pred)
                        gold_ids.append(gold)

    accuracy = accuracy_score(gold_ids, pred_ids)
    _, _, f1, _ = precision_recall_fscore_support(gold_ids, pred_ids, average='micro')
    return np.mean(batch_losses), accuracy, f1

# Score the test split with the weights restored above.
te_loss, te_acc, te_f1 = evaluate(test_loader)
print("\n===== TEST RESULTS =====")
print(f"Loss: {te_loss:.4f}")
print(f"Accuracy: {te_acc:.4f}")
print(f"F1 Score: {te_f1:.4f}")


In [None]:
from sklearn.metrics import classification_report

def classification_report_per_label(loader):
    """Print a per-label precision/recall/F1 table for ``loader``.

    Uses the module-level ``model``, ``device`` and ``id2label``. Positions
    whose gold label is -100 (sub-word pieces, padding) are excluded.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Per-Label Report", leave=False):
            ids = batch['input_ids'].to(device)
            attn = batch['attention_mask'].to(device)
            pixels = batch['pixel_values'].to(device)
            labels = batch['labels'].to(device)

            logits, _ = model(ids, attn, pixels, labels)
            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            golds = labels.cpu().numpy()

            # collect only valid tokens (skip subwords/pads)
            keep = golds != -100
            all_preds.extend(preds[keep].tolist())
            all_labels.extend(golds[keep].tolist())

    # Pass the label ids explicitly: without `labels=`, scikit-learn derives
    # the classes from the data and raises when that count differs from
    # len(target_names), e.g. when a vocabulary label never occurs here.
    label_ids = sorted(id2label.keys())
    target_names = [id2label[i] for i in label_ids]

    print("\n===== CLASSIFICATION REPORT =====")
    print(classification_report(
        all_labels, all_preds,
        labels=label_ids,
        target_names=target_names,
        digits=4,
        zero_division=0
    ))

# Per-label breakdown on the test split.
classification_report_per_label(test_loader)


## ViLT models

In [None]:
class Twitter2015ViLTMNER(Dataset):
    """Twitter MNER samples encoded by a ViLT processor with word-aligned labels.

    Each item is a dict with 'input_ids', 'attention_mask', 'pixel_values'
    and 'labels'. A label is placed on the first word-piece of every word;
    special tokens, padding and continuation pieces get -100.
    """

    def __init__(self, samples, img_dir, processor, label2id, max_len=128, aug=False):
        """
        samples: list of dicts with 'img_id', 'tokens' and 'labels'
        img_dir: path to the image folder
        processor: ViLT processor (its tokenizer must be a fast tokenizer,
            because word_ids() is used for label alignment)
        label2id: label -> id mapping
        max_len: length every text sequence is padded/truncated to
        aug: stored for API compatibility; not used by __getitem__
        """
        self.samples = samples
        self.img_dir = img_dir
        self.processor = processor
        self.label2id = label2id
        self.max_len = max_len
        self.aug = aug

    def __len__(self): return len(self.samples)

    def __getitem__(self, idx):
        ex = self.samples[idx]
        tokens, labels = ex['tokens'], ex['labels']

        # Load the image; fall back to a black canvas when missing/unreadable.
        img_path = find_image_path(self.img_dir, ex['img_id'])
        if img_path and os.path.exists(img_path):
            try:
                img = Image.open(img_path).convert('RGB')
            except Exception:
                img = Image.new('RGB', (224,224), (0,0,0))
        else:
            img = Image.new('RGB', (224,224), (0,0,0))

        # Feed the pre-split words (not a joined string) so the encoding can
        # map every word-piece back to the word it came from.
        enc = self.processor(
            images=img,
            text=tokens,
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        input_ids = enc['input_ids'].squeeze(0)
        attn_mask = enc['attention_mask'].squeeze(0)
        pixel_values = enc['pixel_values'].squeeze(0)

        # Indexing labels by token position would ignore the leading special
        # token and shift every label after a word that splits into several
        # pieces; align through word ids instead.
        label_ids = []
        prev_word = None
        for w_id in enc.word_ids(batch_index=0):
            if w_id is None or w_id == prev_word:
                label_ids.append(-100)
            else:
                label_ids.append(self.label2id[labels[w_id]])
            prev_word = w_id
        label_ids = torch.tensor(label_ids, dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attn_mask,
            'pixel_values': pixel_values,
            'labels': label_ids
        }


In [None]:
from transformers import ViltProcessor

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")

# Reuses the samples, image directories and label map built in earlier cells.
# NOTE(review): a later cell tokenizes ViLT text at length 40 — confirm that
# max_len=128 is intended here.
train_ds = Twitter2015ViLTMNER(train_samples, IMG_DIRS['train'], processor, label2id, max_len=128)
val_ds   = Twitter2015ViLTMNER(val_samples,   IMG_DIRS['val'],   processor, label2id, max_len=128)
test_ds  = Twitter2015ViLTMNER(test_samples,  IMG_DIRS['test'],  processor, label2id, max_len=128)


In [None]:
from transformers import ViltProcessor, ViltModel

class ViltMNER(nn.Module):
    """ViLT encoder followed by a per-token linear tagging head."""

    def __init__(self, num_labels, vilt_model_name="dandelin/vilt-b32-mlm"):
        """
        num_labels: size of the tag vocabulary
        vilt_model_name: checkpoint passed to ViltModel.from_pretrained
        """
        super().__init__()
        self.vilt = ViltModel.from_pretrained(vilt_model_name)
        hidden_size = self.vilt.config.hidden_size  # typically 768

        # classifier over token-level hidden states
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, pixel_values, labels=None):
        """
        input_ids, attention_mask: from text tokenizer, shape [B, L]
        pixel_values: images preprocessed, shape [B, 3, H, W]
        labels: token-level labels (with -100 for ignored positions)

        Returns (logits, loss) where logits cover the L text positions only
        and loss is None when labels is None.
        """
        outputs = self.vilt(input_ids=input_ids,
                            attention_mask=attention_mask,
                            pixel_values=pixel_values,
                            return_dict=True)
        # ViLT's last_hidden_state holds the text positions first, followed by
        # the image positions, so it is longer than the text sequence.
        seq = outputs.last_hidden_state

        seq = self.dropout(seq)
        logits = self.classifier(seq)

        # Keep the text positions only; the labels have no entries for the
        # image positions, so the full sequence cannot be paired with them.
        text_len = input_ids.size(1)
        logits = logits[:, :text_len, :]  # [B, L, num_labels]

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            # flatten (reshape: the slice above is not contiguous)
            loss = loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        return logits, loss


In [None]:
# ==========================================================
# 🚀 ViLT (Vision-and-Language Transformer) for Multimodal NER
# ==========================================================

import os, torch, torch.nn as nn, numpy as np
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import ViltProcessor, ViltModel, get_cosine_schedule_with_warmup

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ==========================================================
# 🧩 Dataset (safe image + label handling)
# ==========================================================
class Twitter2015ViLTMNER(Dataset):
    """Twitter MNER samples encoded by a ViLT processor with word-aligned labels.

    Each item is a dict with "input_ids", "attention_mask", "pixel_values"
    and "labels", where the labels have exactly the length of "input_ids".
    A label is placed on the first word-piece of every word; special tokens,
    padding and continuation pieces get -100.
    """

    # Longest text sequence handed to ViLT: the dandelin/vilt-b32-mlm
    # checkpoint provides 40 text position embeddings.
    MAX_TEXT_LEN = 40

    def __init__(self, samples, img_dir, processor, label2id, max_len=128):
        """
        samples: list of dicts with "img_id", "tokens" and "labels"
        img_dir: path to the image folder
        processor: ViLT processor (its tokenizer must be a fast tokenizer,
            because word_ids() is used for label alignment)
        label2id: label -> id mapping
        max_len: requested text length; values above MAX_TEXT_LEN are capped
        """
        self.samples = samples
        self.img_dir = img_dir
        self.processor = processor
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        ex = self.samples[idx]
        tokens, labels = ex["tokens"], ex["labels"]

        # ---- Safe image loading: black canvas when missing/unreadable ----
        img_path = find_image_path(self.img_dir, ex["img_id"])
        if img_path and os.path.exists(img_path):
            try:
                img = Image.open(img_path).convert("RGB")
            except Exception:
                img = Image.new("RGB", (384, 384), (0, 0, 0))
        else:
            img = Image.new("RGB", (384, 384), (0, 0, 0))

        # Force every image to the same 384x384 size.
        img = img.resize((384, 384))

        # ---- Process text + image together ----
        # One length drives both the tokens and the labels, so the two can
        # never disagree. The pre-split words are passed (not a joined string)
        # so each word-piece can be traced back to its word.
        text_len = min(self.max_len, self.MAX_TEXT_LEN)
        enc = self.processor(
            images=img,
            text=tokens,
            is_split_into_words=True,
            padding="max_length",
            truncation=True,
            max_length=text_len,
            return_tensors="pt",
        )

        input_ids = enc["input_ids"].squeeze(0)
        attn_mask = enc["attention_mask"].squeeze(0)
        pixel_values = enc["pixel_values"].squeeze(0)

        # ---- Align labels with tokens ----
        # A fixed "+1" offset only works while every word is a single piece;
        # any split word would shift all later labels. Align via word ids.
        label_ids = []
        prev_word = None
        for w_id in enc.word_ids(batch_index=0):
            if w_id is None or w_id == prev_word:
                label_ids.append(-100)
            else:
                label_ids.append(self.label2id[labels[w_id]])
            prev_word = w_id
        label_ids = torch.tensor(label_ids, dtype=torch.long)

        return {
            "input_ids": input_ids,
            "attention_mask": attn_mask,
            "pixel_values": pixel_values,
            "labels": label_ids,
        }

# ==========================================================
# 🧩 Safe Collate Function (handles variable shapes)
# ==========================================================
from torch.nn.utils.rnn import pad_sequence

def vilt_collate_fn(batch):
    """Merge per-sample dicts into one batch dict.

    The 1-D text tensors are right-padded to the longest sample in the batch
    (0 for ids and mask, -100 for labels); the image tensors are stacked and
    therefore must already share one shape.
    """
    def _pad(field, fill):
        # Right-pad the chosen field of every sample to a common length.
        return pad_sequence([sample[field] for sample in batch],
                            batch_first=True, padding_value=fill)

    return {
        "input_ids": _pad("input_ids", 0),
        "attention_mask": _pad("attention_mask", 0),
        "pixel_values": torch.stack([sample["pixel_values"] for sample in batch]),
        "labels": _pad("labels", -100),
    }

# ==========================================================
# ⚙️ Model Definition
# ==========================================================
class ViltMNER(nn.Module):
    """ViLT encoder with a dropout + linear tagging head over text positions."""

    def __init__(self, num_labels, model_name="dandelin/vilt-b32-mlm"):
        super().__init__()
        self.vilt = ViltModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        # Head width follows the encoder's hidden size (768 for this checkpoint).
        self.classifier = nn.Linear(self.vilt.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, pixel_values, labels=None):
        """Return ``(text_logits, loss)``; ``loss`` is None without ``labels``."""
        hidden_states = self.vilt(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            return_dict=True,
        ).last_hidden_state
        all_logits = self.classifier(self.dropout(hidden_states))

        # The encoder output also covers image positions; keep the leading
        # slice that corresponds to the text tokens.
        n_text = input_ids.size(1)
        logits_text = all_logits[:, :n_text, :]

        if labels is None:
            return logits_text, None

        criterion = nn.CrossEntropyLoss(ignore_index=-100)
        flat_logits = logits_text.reshape(-1, logits_text.size(-1))
        return logits_text, criterion(flat_logits, labels.reshape(-1))



# ==========================================================
# 🔧 Build Data
# ==========================================================
# The processor encodes both the text and the image of every sample.
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")

# Parse the annotation file of each split listed in TXT_PATHS.
train_samples = parse_twitter_conll(TXT_PATHS["train"])
val_samples   = parse_twitter_conll(TXT_PATHS["val"])
test_samples  = parse_twitter_conll(TXT_PATHS["test"])

# All three splits are handed to the vocabulary builder.
label2id, id2label = build_label_vocab(train_samples, val_samples, test_samples)
print("Labels:", label2id)

# max_len=40 matches the text length the dataset tokenizes to.
train_ds = Twitter2015ViLTMNER(train_samples, IMG_DIRS["train"], processor, label2id,max_len=40)
val_ds   = Twitter2015ViLTMNER(val_samples,   IMG_DIRS["val"],   processor, label2id,max_len=40)
test_ds  = Twitter2015ViLTMNER(test_samples,  IMG_DIRS["test"],  processor, label2id,max_len=40)

# Only the training loader shuffles; evaluation loaders use a larger batch.
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True,  num_workers=2, collate_fn=vilt_collate_fn)
val_loader   = DataLoader(val_ds,   batch_size=8, shuffle=False, num_workers=2, collate_fn=vilt_collate_fn)
test_loader  = DataLoader(test_ds,  batch_size=8, shuffle=False, num_workers=2, collate_fn=vilt_collate_fn)

# ==========================================================
# 🧮 Training Setup
# ==========================================================
EPOCHS = 5
model = ViltMNER(num_labels=len(label2id)).to(device)
optim = torch.optim.AdamW(model.parameters(), lr=3e-5)
# The scheduler is stepped once per batch, so its horizon is batches * epochs;
# the first 10% of those steps are warmup.
total_steps = len(train_loader) * EPOCHS
sched = get_cosine_schedule_with_warmup(optim, int(0.1 * total_steps), total_steps)
# Loss scaling is active only when running on CUDA.
scaler = torch.amp.GradScaler("cuda", enabled=(device.type == "cuda"))

def run_epoch(loader, train=True):
    """Run one pass over ``loader`` and return token-level metrics.

    Uses the module-level ``model``, ``optim``, ``sched``, ``scaler`` and
    ``device``.

    Args:
        loader: iterable of batch dicts holding "input_ids",
            "attention_mask", "pixel_values" and "labels" tensors.
        train: if True the model is put in train mode and updated after
            every batch; if False it is put in eval mode and no gradients
            are tracked.

    Returns:
        Tuple ``(avg_loss, accuracy, precision, recall, f1)``. Precision,
        recall and F1 are micro-averaged over the positions whose gold
        label is not -100.
    """
    model.train() if train else model.eval()
    all_preds, all_labels, losses = [], [], []
    it = tqdm(loader, desc="Train" if train else "Eval", leave=False)
    for batch in it:
        ids = batch["input_ids"].to(device)
        attn = batch["attention_mask"].to(device)
        pixels = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        # Build the autograd graph only when the weights will be updated;
        # during validation it would just hold activations in memory.
        with torch.set_grad_enabled(bool(train)):
            with torch.amp.autocast("cuda", enabled=(device.type == "cuda")):
                logits, loss = model(ids, attn, pixels, labels)

        if train:
            optim.zero_grad()
            scaler.scale(loss).backward()
            # The gradients still carry the loss-scale factor here; unscale
            # them first so the 1.0 max-norm applies to the real gradients.
            scaler.unscale_(optim)
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optim)
            scaler.update()
            sched.step()

        losses.append(loss.item())
        preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
        golds = labels.detach().cpu().numpy()

        # Score labelled positions only (-100 marks ignored positions).
        keep = golds != -100
        all_preds.extend(preds[keep].tolist())
        all_labels.extend(golds[keep].tolist())

    avg_loss = float(np.mean(losses))
    acc = accuracy_score(all_labels, all_preds)
    prec, rec, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="micro", zero_division=0)
    return avg_loss, acc, prec, rec, f1

# ==========================================================
# 🚀 Train & Validate
# ==========================================================
# Best validation F1 seen so far; starts below any attainable score.
best_val_f1 = -1
for ep in range(1, EPOCHS + 1):
    tr_loss, tr_acc, tr_p, tr_r, tr_f1 = run_epoch(train_loader, train=True)
    vl_loss, vl_acc, vl_p, vl_r, vl_f1 = run_epoch(val_loader, train=False)
    print(f"Epoch {ep:02d} | Train F1 {tr_f1:.4f} | Val F1 {vl_f1:.4f}")
    # Overwrite the checkpoint only when validation F1 improves.
    if vl_f1 > best_val_f1:
        best_val_f1 = vl_f1
        torch.save(model.state_dict(), "vilt_mner_best.pth")

# ==========================================================
# 🧪 Evaluate on Test
# ==========================================================
# Restore the best-validation weights before any test-set scoring.
model.load_state_dict(torch.load("vilt_mner_best.pth", map_location=device))

def evaluate(loader):
    """Score the module-level ``model`` on ``loader`` with gradients disabled.

    Returns ``(avg_loss, accuracy, precision, recall, f1)``. Precision,
    recall and F1 are micro-averaged; positions labelled -100 are left out.
    """
    model.eval()
    batch_losses, pred_ids, gold_ids = [], [], []
    field_order = ("input_ids", "attention_mask", "pixel_values", "labels")
    with torch.no_grad():
        for batch in tqdm(loader, desc="Test", leave=False):
            # Same positional order as the model's forward signature.
            inputs = [batch[name].to(device) for name in field_order]
            logits, loss = model(*inputs)
            batch_losses.append(loss.item())

            pred_rows = torch.argmax(logits, dim=-1).cpu().numpy()
            gold_rows = inputs[-1].cpu().numpy()
            for pred_row, gold_row in zip(pred_rows, gold_rows):
                for pred, gold in zip(pred_row, gold_row):
                    if gold != -100:
                        pred_ids.append(pred)
                        gold_ids.append(gold)

    mean_loss = float(np.mean(batch_losses))
    accuracy = accuracy_score(gold_ids, pred_ids)
    precision, recall, f1, _ = precision_recall_fscore_support(
        gold_ids, pred_ids, average="micro", zero_division=0
    )
    return mean_loss, accuracy, precision, recall, f1

# Score the test split; precision and recall are returned but not printed.
te_loss, te_acc, te_p, te_r, te_f1 = evaluate(test_loader)
print("\n===== TEST RESULTS =====")
print(f"Loss: {te_loss:.4f}")
print(f"Accuracy: {te_acc:.4f}")
print(f"F1 Score: {te_f1:.4f}")

# ==========================================================
# 📊 Classification Report
# ==========================================================
def classification_report_per_label(loader):
    """Print a per-label precision/recall/F1 table for ``loader``.

    Uses the module-level ``model``, ``device`` and ``id2label``. Positions
    whose gold label is -100 are excluded from the report.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Per-Label Report", leave=False):
            ids = batch["input_ids"].to(device)
            attn = batch["attention_mask"].to(device)
            pixels = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            logits, _ = model(ids, attn, pixels, labels)
            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            golds = labels.cpu().numpy()
            # Keep labelled positions only.
            keep = golds != -100
            all_preds.extend(preds[keep].tolist())
            all_labels.extend(golds[keep].tolist())

    # Pass the label ids explicitly: without `labels=`, scikit-learn derives
    # the classes from the data and raises when that count differs from
    # len(target_names), e.g. when a vocabulary label never occurs here.
    label_ids = sorted(id2label.keys())
    target_names = [id2label[i] for i in label_ids]
    print("\n===== CLASSIFICATION REPORT =====")
    print(classification_report(all_labels, all_preds, labels=label_ids,
                                target_names=target_names, digits=4, zero_division=0))

# Per-label breakdown on the test split.
classification_report_per_label(test_loader)


## BLIP models

In [None]:
# ==========================================================
# 🚀 BLIP (Salesforce/blip-itm-base-coco) for Multimodal NER — Final Stable Version
# ==========================================================
# !pip -q install transformers==4.42.4 accelerate datasets==2.21.0

import os, torch, torch.nn as nn, numpy as np
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import BlipProcessor, BlipModel, get_cosine_schedule_with_warmup

# Pick the compute device once: tensors and the model are moved to it later,
# and mixed precision is only switched on when its type is "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ==========================================================
# 🧩 Dataset Definition
# ==========================================================
class TwitterBLIPMNER(Dataset):
    """Turns word-tagged tweets with an attached image into BLIP inputs for token classification.

    Args:
        samples: sequence of dicts providing "tokens" (list of words), "labels"
            (one tag string per word) and "img_id".
        img_dir: directory passed to `find_image_path` together with "img_id".
        processor: BLIP processor. Its `tokenizer` must be a *fast* tokenizer,
            because the word-to-sub-word mapping is read through `word_ids()`;
            its `image_processor` supplies "pixel_values".
        label2id: mapping from tag string to integer class id.
        max_len: fixed token length every example is padded / truncated to.
    """

    IGNORE_INDEX = -100        # positions carrying this id are excluded from the loss
    IMAGE_SIZE = (384, 384)    # size of both real images and the black placeholder

    def __init__(self, samples, img_dir, processor, label2id, max_len=64):
        self.samples = samples
        self.img_dir = img_dir
        self.processor = processor
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.samples)

    @staticmethod
    def _align_labels(word_ids, labels, label2id, ignore_index=-100):
        """Project word-level tags onto sub-word positions.

        Only the first sub-word of each word receives that word's class id.
        Special tokens, padding (`None` in `word_ids`), continuation pieces and
        words without a matching tag receive `ignore_index`.

        Args:
            word_ids: per-position word index as returned by a fast tokenizer,
                with `None` for special / padding positions.
            labels: tag string for each word.
            label2id: mapping from tag string to class id.
            ignore_index: id written to unsupervised positions.

        Returns:
            A list of ints with the same length as `word_ids`.
        """
        aligned = []
        previous = None
        for word_idx in word_ids:
            is_first_piece = word_idx is not None and word_idx != previous
            if is_first_piece and word_idx < len(labels):
                aligned.append(label2id[labels[word_idx]])
            else:
                aligned.append(ignore_index)
            previous = word_idx
        return aligned

    def _load_image(self, img_id):
        """Return the RGB image for `img_id`, or a black placeholder if it is missing or unreadable."""
        img_path = find_image_path(self.img_dir, img_id)
        if img_path and os.path.exists(img_path):
            try:
                # The context manager closes the underlying file handle;
                # convert() returns an independent in-memory copy.
                with Image.open(img_path) as raw:
                    return raw.convert("RGB").resize(self.IMAGE_SIZE)
            except Exception:
                # Deliberate best-effort: a corrupt file must not abort an
                # epoch, so fall through to the placeholder below.
                pass
        return Image.new("RGB", self.IMAGE_SIZE, (0, 0, 0))

    def __getitem__(self, idx):
        """Build one example.

        Returns:
            dict with "input_ids", "attention_mask" and "labels" (each of
            length `max_len`) and "pixel_values" for the image.
        """
        ex = self.samples[idx]
        tokens, labels = ex["tokens"], ex["labels"]

        img = self._load_image(ex["img_id"])
        pixel_values = self.processor.image_processor(
            images=img, return_tensors="pt"
        )["pixel_values"].squeeze(0)

        # Tokenize the pre-split words directly so the encoding remembers which
        # word every sub-word came from. Joining the words into one string and
        # assuming one token per word misplaces every tag that follows a word
        # split into several pieces.
        text_enc = self.processor.tokenizer(
            list(tokens),
            is_split_into_words=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        input_ids = text_enc["input_ids"].squeeze(0)
        attn_mask = text_enc["attention_mask"].squeeze(0)

        label_ids = torch.tensor(
            self._align_labels(
                text_enc.word_ids(0), labels, self.label2id, self.IGNORE_INDEX
            ),
            dtype=torch.long,
        )

        return {
            "input_ids": input_ids,
            "attention_mask": attn_mask,
            "pixel_values": pixel_values,
            "labels": label_ids
        }

# ==========================================================
# ⚙️ Collate Function
# ==========================================================
from torch.nn.utils.rnn import pad_sequence

def blip_collate_fn(batch):
    """Merge per-example dicts into one batch dict.

    The 1-D text tensors are right-padded to the longest example in the batch
    (0 for "input_ids" / "attention_mask", -100 for "labels"); the image
    tensors are stacked along a new leading dimension.

    Args:
        batch: list of dicts with "input_ids", "attention_mask",
            "pixel_values" and "labels" tensors.

    Returns:
        dict with the same four keys holding batched tensors.
    """
    def column(key):
        # One field gathered across all examples, in batch order.
        return [example[key] for example in batch]

    return {
        "input_ids": pad_sequence(column("input_ids"), batch_first=True, padding_value=0),
        "attention_mask": pad_sequence(column("attention_mask"), batch_first=True, padding_value=0),
        "pixel_values": torch.stack(column("pixel_values")),
        "labels": pad_sequence(column("labels"), batch_first=True, padding_value=-100),
    }

from transformers import BlipForImageTextRetrieval

# ==========================================================
# 🧠 BLIP-based Multimodal NER Model
# ==========================================================
class BLIPMNER(nn.Module):
    """Per-token classifier on top of BLIP's text encoder with the image as cross-attention context.

    Args:
        num_labels: number of output classes of the linear head.
        model_name: checkpoint id passed to
            `BlipForImageTextRetrieval.from_pretrained`.
    """

    def __init__(self, num_labels, model_name="Salesforce/blip-itm-base-coco"):
        super().__init__()
        backbone = BlipForImageTextRetrieval.from_pretrained(model_name)
        # Keep the full retrieval model plus direct handles on the two towers used in forward().
        self.blip = backbone
        self.text_encoder = backbone.text_encoder
        self.vision_model = backbone.vision_model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(backbone.config.text_config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, pixel_values, labels=None):
        """Encode the image, run the text encoder over it, and classify every token.

        Args:
            input_ids: token ids for the text encoder.
            attention_mask: mask matching `input_ids`.
            pixel_values: image batch for the vision tower.
            labels: optional class ids per token; -100 entries are ignored by the loss.

        Returns:
            (logits, loss): logits carry one score per class for each token;
            loss is the cross-entropy over non-ignored positions, or None
            when `labels` is not supplied.
        """
        image_states = self.vision_model(pixel_values).last_hidden_state

        # Every image position is visible to the text encoder's cross-attention.
        image_mask = torch.ones(
            image_states.shape[:-1], dtype=torch.long, device=image_states.device
        )
        encoded = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=image_states,
            encoder_attention_mask=image_mask,
            return_dict=True,
        )

        # [B, L, H] -> [B, L, num_labels]
        logits = self.classifier(self.dropout(encoded.last_hidden_state))

        if labels is None:
            return logits, None

        criterion = nn.CrossEntropyLoss(ignore_index=-100)
        num_classes = logits.size(-1)
        return logits, criterion(logits.reshape(-1, num_classes), labels.reshape(-1))

# ==========================================================
# 🔧 Build Data
# ==========================================================
# The processor encodes both text and image inside the dataset; it is loaded
# from the same checkpoint id that BLIPMNER uses by default.
processor = BlipProcessor.from_pretrained("Salesforce/blip-itm-base-coco")

# Parse the three splits in train / val / test order.
train_samples, val_samples, test_samples = (
    parse_twitter_conll(TXT_PATHS[split]) for split in ("train", "val", "test")
)

label2id, id2label = build_label_vocab(train_samples, val_samples, test_samples)
print("Labels:", label2id)

# One dataset per split, each reading images from that split's directory.
train_ds, val_ds, test_ds = (
    TwitterBLIPMNER(split_samples, IMG_DIRS[split], processor, label2id)
    for split, split_samples in (
        ("train", train_samples),
        ("val", val_samples),
        ("test", test_samples),
    )
)

# Only the training loader shuffles; the evaluation loaders use the larger batch size.
train_loader = DataLoader(
    train_ds,
    batch_size=4,
    shuffle=True,
    num_workers=2,
    collate_fn=blip_collate_fn,
)
val_loader = DataLoader(
    val_ds,
    batch_size=8,
    shuffle=False,
    num_workers=2,
    collate_fn=blip_collate_fn,
)
test_loader = DataLoader(
    test_ds,
    batch_size=8,
    shuffle=False,
    num_workers=2,
    collate_fn=blip_collate_fn,
)

# ==========================================================
# ⚙️ Training Setup
# ==========================================================
EPOCHS = 5
# The scheduler advances once per batch, so its horizon is batches-per-epoch times epochs.
total_steps = EPOCHS * len(train_loader)

model = BLIPMNER(num_labels=len(label2id)).to(device)
optim = torch.optim.AdamW(model.parameters(), lr=3e-5)
# Warm up over the first 10% of all steps, then follow the cosine decay.
sched = get_cosine_schedule_with_warmup(
    optim,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps,
)
# Loss scaling is active only on CUDA; elsewhere the scaler is a pass-through.
scaler = torch.amp.GradScaler("cuda", enabled=(device.type == "cuda"))

def run_epoch(loader, train=True):
    """Run one pass over `loader`, optionally updating the module-level `model`.

    Uses the module-level `model`, `optim`, `scaler`, `sched` and `device`.

    Args:
        loader: iterable of batches with "input_ids", "attention_mask",
            "pixel_values" and "labels" tensors.
        train: when True, back-propagate and step optimizer and scheduler
            after every batch; when False, only run the forward pass.

    Returns:
        (avg_loss, accuracy, precision, recall, f1) over all positions whose
        gold id is not -100. The three P/R/F values are micro-averaged over
        every class, so for this single-label setup they coincide with accuracy.
    """
    if train:
        model.train()
    else:
        model.eval()

    use_amp = device.type == "cuda"
    all_preds, all_labels, losses = [], [], []
    it = tqdm(loader, desc="Train" if train else "Eval", leave=False)
    for batch in it:
        ids = batch["input_ids"].to(device)
        attn = batch["attention_mask"].to(device)
        pixels = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        # Build the autograd graph only when it will be used; evaluation
        # passes otherwise keep every activation alive for nothing.
        with torch.set_grad_enabled(train), torch.amp.autocast("cuda", enabled=use_amp):
            logits, loss = model(ids, attn, pixels, labels)

        if train:
            optim.zero_grad()
            scaler.scale(loss).backward()
            # Gradients are still multiplied by the loss scale here; unscale
            # them first so the 1.0 threshold applies to their true norm.
            scaler.unscale_(optim)
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optim)
            scaler.update()
            sched.step()

        losses.append(loss.item())
        preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
        golds = labels.detach().cpu().numpy()

        # Keep only supervised positions (gold id != -100); row-major order is preserved.
        keep = golds != -100
        all_preds.extend(preds[keep].tolist())
        all_labels.extend(golds[keep].tolist())

    avg_loss = float(np.mean(losses))
    acc = accuracy_score(all_labels, all_preds)
    prec, rec, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="micro", zero_division=0)
    return avg_loss, acc, prec, rec, f1

# ==========================================================
# 🚀 Train & Validate
# ==========================================================
# Single source of truth for the checkpoint location used by both save and reload.
BEST_CKPT_PATH = "blip_mner_best.pth"

best_val_f1 = -1
for ep in range(1, EPOCHS + 1):
    tr_loss, tr_acc, tr_p, tr_r, tr_f1 = run_epoch(train_loader, train=True)
    vl_loss, vl_acc, vl_p, vl_r, vl_f1 = run_epoch(val_loader, train=False)
    print(f"Epoch {ep:02d} | Train F1 {tr_f1:.4f} | Val F1 {vl_f1:.4f}")
    # Keep only the weights of the epoch with the strictly highest validation F1.
    if vl_f1 > best_val_f1:
        best_val_f1 = vl_f1
        torch.save(model.state_dict(), BEST_CKPT_PATH)

# ==========================================================
# 🧪 Evaluate on Test
# ==========================================================
# Restore the best-validation weights before scoring. A state dict holds only
# tensors, so restrict unpickling to them instead of allowing arbitrary objects.
model.load_state_dict(torch.load(BEST_CKPT_PATH, map_location=device, weights_only=True))

def evaluate(loader):
    """Score the module-level `model` on `loader` without updating it.

    Args:
        loader: iterable of batches with "input_ids", "attention_mask",
            "pixel_values" and "labels" tensors.

    Returns:
        (avg_loss, accuracy, precision, recall, f1), computed over the
        positions whose gold id is not -100; P/R/F1 are micro-averaged.
    """
    model.eval()
    batch_losses, flat_preds, flat_golds = [], [], []
    fields = ("input_ids", "attention_mask", "pixel_values", "labels")

    with torch.no_grad():
        for batch in tqdm(loader, desc="Test", leave=False):
            ids, attn, pixels, labels = (batch[name].to(device) for name in fields)
            logits, loss = model(ids, attn, pixels, labels)
            batch_losses.append(loss.item())

            pred_ids = logits.argmax(dim=-1).cpu().numpy()
            gold_ids = labels.cpu().numpy()
            # Boolean mask drops ignored positions while keeping row-major order.
            supervised = gold_ids != -100
            flat_preds.extend(pred_ids[supervised])
            flat_golds.extend(gold_ids[supervised])

    prec, rec, f1, _ = precision_recall_fscore_support(
        flat_golds, flat_preds, average="micro", zero_division=0
    )
    return float(np.mean(batch_losses)), accuracy_score(flat_golds, flat_preds), prec, rec, f1

# Score the test loader with the reloaded weights and emit the headline numbers
# as a single report. Precision and recall are unpacked but not printed.
te_loss, te_acc, te_p, te_r, te_f1 = evaluate(test_loader)
print(
    "\n===== TEST RESULTS =====",
    f"Loss: {te_loss:.4f}",
    f"Accuracy: {te_acc:.4f}",
    f"F1 Score: {te_f1:.4f}",
    sep="\n",
)


In [None]:
def classification_report_per_label(loader):
    """Print a per-label precision/recall/F1 table for the supervised tokens in `loader`.

    Runs the module-level `model` in eval mode with gradients disabled, takes
    the argmax class id at every position, and drops positions whose gold id
    is -100 before passing the flat lists to `classification_report`.

    Args:
        loader: iterable of batches exposing "input_ids", "attention_mask",
            "pixel_values" and "labels" tensors.

    Returns:
        None. The report is printed to stdout.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Per-Label Report", leave=False):
            ids = batch["input_ids"].to(device)
            attn = batch["attention_mask"].to(device)
            pixels = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            logits, _ = model(ids, attn, pixels, labels)
            preds = torch.argmax(logits, dim=-1).cpu().numpy()
            golds = labels.cpu().numpy()
            # Boolean mask keeps supervised positions in row-major order,
            # replacing the per-token Python loop.
            keep = golds != -100
            all_preds.extend(preds[keep].tolist())
            all_labels.extend(golds[keep].tolist())

    # Pin the class list explicitly. Without `labels=`, scikit-learn infers the
    # classes from the data and raises ValueError as soon as one vocabulary
    # entry is absent from both gold and predicted ids, because the inferred
    # class count no longer matches len(target_names).
    # Assumes id2label is keyed by the same integer ids stored in the label tensors.
    label_ids = sorted(id2label.keys())
    target_names = [id2label[i] for i in label_ids]
    print("\n===== CLASSIFICATION REPORT =====")
    print(
        classification_report(
            all_labels,
            all_preds,
            labels=label_ids,
            target_names=target_names,
            digits=4,
            zero_division=0,
        )
    )

# Print the per-label breakdown for `test_loader` using the weights currently held by `model`.
classification_report_per_label(test_loader)