In [None]:
import os

# Folder that holds the dataset images
img_dir = "/kaggle/input/twitter2015/twitter2015/twitter2015_images"

# Image id to look up (from your text file)
img_id = "1015799"

# Extensions to try, in order (Twitter2015 images are usually .jpg)
possible_exts = [".jpg", ".jpeg", ".png", ".bmp"]

found = False
for ext in possible_exts:
    img_path = os.path.join(img_dir, img_id + ext)
    if os.path.exists(img_path):
        print(f"✅ Image found: {img_path}")
        found = True
        break

if not found:
    print(f"❌ Image {img_id} not found in {img_dir}")
    if not os.path.isdir(img_dir):
        # os.listdir would raise on a missing folder; report the real cause instead.
        print(f"Image folder does not exist: {img_dir}")
    else:
        # Show files whose name contains the id, in case of a different naming pattern.
        # Sorted so the listing is stable between runs.
        files = sorted(f for f in os.listdir(img_dir) if img_id in f)
        if files:
            print("Possible matches:")
            for f in files:
                print("  -", f)
        else:
            print("No similar filenames found.")


In [None]:
# Fetch the UMT repository. Later cells read the Twitter2015 text splits from
# /kaggle/working/UMT/data/twitter2015 (assumes the notebook's working directory
# is /kaggle/working - TODO confirm).
!git clone https://github.com/jefferyYu/UMT.git

# Twitter2015 dataset

In [None]:
import os, ast, math, json, random
from dataclasses import dataclass
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModel, get_cosine_schedule_with_warmup

# -------------- Reproducibility --------------
SEED = 1337
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# -------------- Paths (EDIT if needed) --------------
# Text file of each split.
TXT_PATHS = dict(
    train='/kaggle/working/UMT/data/twitter2015/train.txt',
    val='/kaggle/working/UMT/data/twitter2015/valid.txt',
    test='/kaggle/working/UMT/data/twitter2015/test.txt',
)
# Image folder of each split (all three splits share one folder).
IMG_DIRS = dict(
    train='/kaggle/input/twitter2015/twitter2015/twitter2015_images',
    val='/kaggle/input/twitter2015/twitter2015/twitter2015_images',
    test='/kaggle/input/twitter2015/twitter2015/twitter2015_images',
)


In [None]:
import os, re, random, matplotlib.pyplot as plt
from PIL import Image
from IPython.display import display, HTML

# Inputs for the visualisation in this cell:
#   TXT_PATH - text file of the training split
#   IMG_DIR  - folder the images are looked up in by image id
TXT_PATH = "/kaggle/working/UMT/data/twitter2015/train.txt"
IMG_DIR = "/kaggle/input/twitter2015/twitter2015/twitter2015_images"

# Function to parse the Twitter2015 CoNLL file
def parse_twitter_file(txt_path):
    """Parse a Twitter2015-style text file into a list of samples.

    The file consists of blocks separated by blank lines. Inside a block a
    line starting with ``IMGID:`` carries the image id and every other line is
    expected to be ``<token> <tag>`` separated by whitespace.

    Args:
        txt_path: Path of the UTF-8 text file to read.

    Returns:
        A list of dicts with keys ``'img_id'`` (str), ``'tokens'`` (list of
        str) and ``'tags'`` (list of str), in file order.

    Notes:
        * Lines that do not split into exactly two fields are skipped.
        * The image id is remembered until the next ``IMGID:`` line replaces it.
        * The last block is returned even if the file does not end with a
          blank line.
    """
    samples = []
    with open(txt_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # Sentinel blank line: flushes the final block when the file has no
    # trailing empty line (otherwise that sample would be silently lost).
    lines.append("")

    current_imgid, tokens, tags = None, [], []
    for line in lines:
        line = line.strip()
        if not line:  # blank line -> end of sample
            if current_imgid and tokens:
                samples.append({'img_id': current_imgid, 'tokens': tokens, 'tags': tags})
                tokens, tags = [], []
            continue
        if line.startswith("IMGID:"):
            current_imgid = line.split(":")[1].strip()
            continue
        fields = line.split()
        if len(fields) != 2:
            # Malformed row: skip it explicitly instead of hiding every
            # possible error behind a bare ``except``.
            continue
        word, label = fields
        tokens.append(word)
        tags.append(label)
    return samples

# Parse the whole training split (every sample in TXT_PATH, not just a few)
samples = parse_twitter_file(TXT_PATH)
print(f"Total samples parsed: {len(samples)}")

# Background colour per entity type (the part of a tag after its last '-')
color_map = {
    'PER': '#FFD700',   # yellow
    'LOC': '#90EE90',   # green
    'ORG': '#87CEFA',   # blue
    'OTHER': '#FFB6C1', # pink
    'O': 'white'
}

# Helper to colorize tokens
def colorize_tokens(tokens, tags):
    """Render tokens as HTML spans coloured by their entity tag.

    Args:
        tokens: Sequence of token strings.
        tags: Sequence of tag strings aligned with ``tokens``. ``'O'`` gets a
            plain white span; for any other tag the text after the last ``'-'``
            is looked up in ``color_map`` (white if unknown).

    Returns:
        One HTML string with a ``<span>`` per (token, tag) pair; pairs are
        formed with ``zip`` so extra items in the longer sequence are ignored.
        Returns ``""`` for empty input.
    """
    # Imported locally because this notebook also uses `html` as a variable name.
    from html import escape

    spans = []
    for tok, tag in zip(tokens, tags):
        # Tweets contain '<', '>' and '&'; escape them so they are shown as
        # text instead of being interpreted as markup.
        safe_tok = escape(tok)
        if tag == 'O':
            spans.append(f"<span style='background-color:white'>{safe_tok} </span>")
        else:
            ent_type = tag.split('-')[-1]
            color = color_map.get(ent_type, 'white')
            spans.append(
                f"<span style='background-color:{color}; padding:2px; border-radius:4px;'>{safe_tok} </span>"
            )
    return "".join(spans)

# Draw three random samples; for each one show the image, then the tagged tweet
for i in range(3):
    sample = random.choice(samples)
    img_id = sample['img_id']

    # Try each supported extension until a file exists; skip the sample if none does
    for ext in ['.jpg', '.jpeg', '.png']:
        img_path = os.path.join(IMG_DIR, img_id + ext)
        if os.path.exists(img_path):
            break
    else:
        print(f"Image not found for ID {img_id}")
        continue

    # Picture on a 6x6 inch canvas without axes, titled with the image id
    img = Image.open(img_path).convert('RGB')
    fig, ax = plt.subplots(figsize=(6,6))
    ax.imshow(img)
    ax.axis('off')
    ax.set_title(f"IMGID: {img_id}")
    plt.show()

    # Tweet text with entity tokens highlighted, followed by a separator line
    html = colorize_tokens(sample['tokens'], sample['tags'])
    display(HTML(html))
    print("-" * 80)


## Model training

In [None]:
# ================== Setup & Imports ==================
# !pip -q install transformers==4.42.4 accelerate datasets==2.21.0

import os, random, math, json, numpy as np
from collections import Counter
from typing import List, Tuple

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from PIL import Image
from torchvision import models, transforms

from transformers import RobertaTokenizerFast, RobertaModel, get_cosine_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ------------------- Reproducibility -------------------
SEED = 1337
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Device:", device)

# ================== Paths (your config) ==================
# Text file of each split.
TXT_PATHS = dict(
    train='/kaggle/working/UMT/data/twitter2015/train.txt',
    val='/kaggle/working/UMT/data/twitter2015/valid.txt',
    test='/kaggle/working/UMT/data/twitter2015/test.txt',
)
# Image folder of each split (all three splits share one folder).
IMG_DIRS = dict(
    train='/kaggle/input/twitter2015/twitter2015/twitter2015_images',
    val='/kaggle/input/twitter2015/twitter2015/twitter2015_images',
    test='/kaggle/input/twitter2015/twitter2015/twitter2015_images',
)

# ================== Utils ==================
def parse_twitter_conll(path: str):
    """Read a Twitter2015-style file and return one dict per block.

    Blocks are separated by blank lines. A line beginning with ``IMGID:``
    sets the block's image id; any other line contributes its first two
    whitespace-separated fields as (token, label), and lines with fewer than
    two fields are ignored. A block is kept only if it has an image id and at
    least one token.

    Returns:
        List of ``{'img_id': str, 'tokens': [str], 'labels': [str]}`` in file order.
    """
    with open(path, 'r', encoding='utf-8') as fh:
        rows = [raw.rstrip('\n') for raw in fh]
    rows.append('')  # trailing sentinel so the final block is emitted as well

    parsed = []
    cur_id, cur_tokens, cur_tags = None, [], []
    for row in rows:
        if row.strip() == '':
            # End of block: keep it when complete, then start from scratch.
            if cur_id is not None and cur_tokens:
                parsed.append(dict(img_id=cur_id, tokens=cur_tokens, labels=cur_tags))
            cur_id, cur_tokens, cur_tags = None, [], []
        elif row.startswith('IMGID:'):
            # Everything after the first ':' is the id.
            cur_id = row.partition(':')[2].strip()
        else:
            fields = row.split()
            if len(fields) >= 2:
                cur_tokens.append(fields[0])
                cur_tags.append(fields[1])
    return parsed

def build_label_vocab(*lists_of_samples):
    """Build label<->id lookup tables from any number of sample lists.

    Every sample must carry a ``'labels'`` list. The distinct labels are
    sorted so that ids are assigned in a stable order.

    Returns:
        ``(label2id, id2label)`` - two dicts that are inverses of each other.
    """
    ordered = sorted({
        tag
        for group in lists_of_samples
        for sample in group
        for tag in sample['labels']
    })
    label2id = dict(zip(ordered, range(len(ordered))))
    id2label = dict(enumerate(ordered))
    return label2id, id2label

def find_image_path(img_dir: str, img_id: str):
    """Return the first existing ``<img_dir>/<img_id><ext>`` path, or None.

    Extensions are tried in the order .jpg, .jpeg, .png, .bmp.
    """
    candidates = (
        os.path.join(img_dir, img_id + suffix)
        for suffix in ('.jpg', '.jpeg', '.png', '.bmp')
    )
    return next((path for path in candidates if os.path.exists(path)), None)

# ================== Dataset ==================
class Twitter2015MNER(Dataset):
    """Dataset that pairs each parsed tweet with its image for token tagging.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (tokenizer
    output padded/truncated to ``max_len``), ``pixel_values`` (the transformed
    image) and ``labels`` (one id per sub-token, ``-100`` where the position
    must be ignored).
    """

    def __init__(self, samples, img_dir, tokenizer: RobertaTokenizerFast, label2id, max_len=128, aug=False):
        """Store the inputs and build the image pipeline.

        Args:
            samples: List of dicts with ``'img_id'``, ``'tokens'`` and ``'labels'``.
            img_dir: Folder searched for ``<img_id>.<ext>``.
            tokenizer: Fast tokenizer (needed for ``word_ids``).
            label2id: Mapping from label string to integer id.
            max_len: Fixed sequence length after padding/truncation.
            aug: When True, apply random crop / colour jitter / horizontal flip.
        """
        self.samples = samples
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

        # Geometric / photometric steps differ between the augmented and plain pipeline...
        if aug:
            spatial_steps = [
                transforms.Resize((256,256)),
                transforms.RandomResizedCrop((224,224), scale=(0.9,1.0)),
                transforms.ColorJitter(0.15,0.15,0.15,0.05),
                transforms.RandomHorizontalFlip(0.5),
            ]
        else:
            spatial_steps = [transforms.Resize((224,224))]

        # ...while tensor conversion and normalisation are shared by both.
        tensor_steps = [
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
        ]
        self.img_tfm = transforms.Compose(spatial_steps + tensor_steps)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the model inputs and aligned labels of sample ``idx``."""
        example = self.samples[idx]
        words = example['tokens']
        word_labels = example['labels']

        # Tokenize the pre-split words, padding/truncating to the fixed length.
        encodings = self.tokenizer(
            words,
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        # word_ids must be read from the batch encoding itself (before the
        # batch dimension is dropped from the tensors below).
        word_ids = encodings.word_ids(batch_index=0)

        # Only the first sub-token of every word receives the word's label;
        # special/padding positions and continuation pieces get -100.
        aligned = []
        last_word = None
        for word_idx in word_ids:
            starts_word = word_idx is not None and word_idx != last_word
            aligned.append(self.label2id[word_labels[word_idx]] if starts_word else -100)
            last_word = word_idx
        label_ids = torch.tensor(aligned, dtype=torch.long)

        # Load the image, substituting a black 224x224 picture when no file exists.
        img_path = find_image_path(self.img_dir, example['img_id'])
        if img_path is not None:
            picture = Image.open(img_path).convert('RGB')
        else:
            picture = Image.new('RGB', (224,224), color=(0,0,0))
        picture = self.img_tfm(picture)

        return {
            'input_ids': encodings['input_ids'].squeeze(0),
            'attention_mask': encodings['attention_mask'].squeeze(0),
            'pixel_values': picture,
            'labels': label_ids
        }

# ================== Model: RoBERTa-large + ResNet50 (FiLM-style fusion) ==================
class RobertaResNet50MNER(nn.Module):
    """
    Token classification with image-conditioned modulation (FiLM-like):
      - Text encoder: roberta-large (hidden=1024)
      - Image encoder: resnet50 -> 2048-d pooled -> Linear -> 1024
      - gamma = Wg(img), beta = Wb(img)
      - h' = (1 + gamma) * h + beta  (applied to every token)
      - Token classifier to BIO tag space
    """
    def __init__(self, num_labels, text_model='roberta-large'):
        """Build the text encoder, image encoder, fusion layers and classifier.

        Args:
            num_labels: Size of the tag space (output dimension of the classifier).
            text_model: Name or path passed to ``RobertaModel.from_pretrained``.
        """
        super().__init__()
        self.text = RobertaModel.from_pretrained(text_model)
        hidden = self.text.config.hidden_size  # 1024

        # ImageNet-pretrained ResNet-50 with its final fc layer replaced by an
        # identity, so the network outputs the features that used to feed fc.
        resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
        resnet.fc = nn.Identity()
        self.visual = resnet

        # Project image features to the text hidden size, then derive the
        # per-channel scale (gamma) and shift (beta) from that projection.
        self.img_proj = nn.Linear(2048, hidden)
        self.gamma = nn.Linear(hidden, hidden)
        self.beta  = nn.Linear(hidden, hidden)

        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden, num_labels)

    def forward(self, input_ids, attention_mask, pixel_values, labels=None):
        """Run text + image through the model.

        Args:
            input_ids: Token ids for the text encoder.
            attention_mask: Attention mask for the text encoder.
            pixel_values: Image batch for the ResNet.
            labels: Optional per-token label ids; positions equal to -100 are
                ignored by the loss.

        Returns:
            ``(logits, loss)`` where ``loss`` is None when ``labels`` is None.
        """
        # Text
        out = self.text(input_ids=input_ids, attention_mask=attention_mask)
        seq = out.last_hidden_state  # [B, L, 1024]

        # Image
        img_feat = self.visual(pixel_values)       # [B, 2048]
        img_feat = self.img_proj(img_feat)         # [B, 1024]

        # FiLM modulation per token
        # unsqueeze(1) adds a length-1 sequence axis so the same gamma/beta
        # broadcast over every token position of the sample.
        g = self.gamma(img_feat).unsqueeze(1)      # [B, 1, 1024]
        b = self.beta(img_feat).unsqueeze(1)       # [B, 1, 1024]
        seq = (1 + g) * seq + b                    # [B, L, 1024]

        seq = self.dropout(seq)
        logits = self.classifier(seq)              # [B, L, C]

        loss = None
        if labels is not None:
            # Flatten batch and sequence axes; -100 targets do not contribute.
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return logits, loss

# ================== Build Data ==================
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large', add_prefix_space=True)

# Parse the three splits in train / val / test order
train_samples, val_samples, test_samples = (
    parse_twitter_conll(TXT_PATHS[split]) for split in ('train', 'val', 'test')
)

# The label vocabulary covers every tag seen in any split
label2id, id2label = build_label_vocab(train_samples, val_samples, test_samples)
print("Labels:", label2id)

# Only the training set uses image augmentation
ds_common = dict(tokenizer=tokenizer, label2id=label2id, max_len=128)
train_ds = Twitter2015MNER(train_samples, IMG_DIRS['train'], aug=True,  **ds_common)
val_ds   = Twitter2015MNER(val_samples,   IMG_DIRS['val'],   aug=False, **ds_common)
test_ds  = Twitter2015MNER(test_samples,  IMG_DIRS['test'],  aug=False, **ds_common)

# Only the training loader shuffles; worker and pinning settings are shared
loader_common = dict(num_workers=2, pin_memory=True)
train_loader = DataLoader(train_ds, batch_size=8,  shuffle=True,  **loader_common)
val_loader   = DataLoader(val_ds,   batch_size=12, shuffle=False, **loader_common)
test_loader  = DataLoader(test_ds,  batch_size=12, shuffle=False, **loader_common)

# ================== Train ==================
model = RobertaResNet50MNER(num_labels=len(label2id))
model = model.to(device)

EPOCHS = 5
optim = torch.optim.AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)

# Cosine decay over all optimizer steps, with the first 10% used for warmup
total_steps = len(train_loader) * EPOCHS
warmup = int(0.1 * total_steps)
sched = get_cosine_schedule_with_warmup(
    optim,
    num_warmup_steps=warmup,
    num_training_steps=total_steps,
)

# Loss scaling is active only when running on CUDA
scaler = torch.cuda.amp.GradScaler(enabled=(device.type=='cuda'))

def run_epoch(loader, train=True):
    """Run one pass over ``loader`` and return loss and token-level metrics.

    Uses the notebook-level ``model``, ``optim``, ``sched``, ``scaler`` and
    ``device``.

    Args:
        loader: DataLoader yielding dicts with ``input_ids``,
            ``attention_mask``, ``pixel_values`` and ``labels``.
        train: When True the model is put in training mode and parameters are
            updated after every batch; when False the model is put in eval
            mode and no gradients are computed.

    Returns:
        ``(avg_loss, accuracy, precision, recall, f1)`` where the last three
        are micro-averaged over all positions whose gold label is not -100.
    """
    if train: model.train()
    else: model.eval()

    # Mixed precision only when a GradScaler exists and is actually enabled
    # (the scaler object itself is always truthy, even when disabled).
    use_amp = scaler is not None and scaler.is_enabled()

    all_preds, all_labels = [], []
    losses = []

    it = tqdm(loader, leave=False, desc="Train" if train else "Eval")
    for batch in it:
        input_ids = batch['input_ids'].to(device)
        attn      = batch['attention_mask'].to(device)
        pixels    = batch['pixel_values'].to(device)
        labels    = batch['labels'].to(device)

        # Skip autograd bookkeeping during evaluation to save memory and time.
        with torch.set_grad_enabled(train):
            with torch.cuda.amp.autocast(enabled=use_amp):
                logits, loss = model(input_ids, attn, pixels, labels)

        if train:
            optim.zero_grad(set_to_none=True)
            if scaler is not None:
                scaler.scale(loss).backward()
                # Gradients are still multiplied by the loss scale here; they
                # must be unscaled before clipping or the 1.0 threshold is
                # applied to the wrong magnitudes.
                scaler.unscale_(optim)
                nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optim)
                scaler.update()
            else:
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optim.step()
            sched.step()

        losses.append(loss.item())

        # collect predictions & labels for metrics (ignore -100 = subwords / pads)
        preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
        golds = labels.detach().cpu().numpy()
        keep = golds != -100
        all_preds.extend(preds[keep].tolist())
        all_labels.extend(golds[keep].tolist())

    avg_loss = float(np.mean(losses))
    acc = accuracy_score(all_labels, all_preds)
    prec, rec, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='micro', zero_division=0)
    return avg_loss, acc, prec, rec, f1

# Track the best validation F1 and stop after `patience` epochs without improvement
best_val_f1, best_state = -1.0, None
patience, bad = 2, 0

for epoch in range(1, EPOCHS+1):
    tr_loss, tr_acc, tr_p, tr_r, tr_f1 = run_epoch(train_loader, train=True)
    vl_loss, vl_acc, vl_p, vl_r, vl_f1 = run_epoch(val_loader,   train=False)

    print(f"Epoch {epoch:02d} | "
          f"Train loss {tr_loss:.4f} acc {tr_acc:.3f} P {tr_p:.3f} R {tr_r:.3f} F1 {tr_f1:.3f} || "
          f"Val loss {vl_loss:.4f} acc {vl_acc:.3f} P {vl_p:.3f} R {vl_r:.3f} F1 {vl_f1:.3f}")

    if vl_f1 > best_val_f1:
        best_val_f1 = vl_f1; bad = 0
        best_state = {
            # state_dict() hands out references to the live parameters, which
            # later epochs keep updating. Snapshot them (on CPU, to spare GPU
            # memory) so that "best" really stays the best epoch.
            'model': {k: v.detach().to('cpu', copy=True) for k, v in model.state_dict().items()},
            'label2id': label2id,
            'id2label': id2label,
            'config': {'text':'roberta-large','img':'resnet50','fusion':'FiLM'}
        }
        torch.save(best_state, 'roberta_resnet50_mner_best.pth')
    else:
        bad += 1
        if bad >= patience:
            print("Early stopping.")
            break

# ================== Evaluate on Test ==================
# Restore the best epoch's weights (from the snapshot, or from disk if this
# run never produced one)
if best_state is None:
    best_state = torch.load('roberta_resnet50_mner_best.pth', map_location=device)
model.load_state_dict(best_state['model'])

@torch.no_grad()
def evaluate(loader):
    """Score the notebook-level ``model`` on ``loader`` without gradients.

    Returns:
        ``(avg_loss, accuracy, precision, recall, f1)``; precision, recall and
        F1 are micro-averaged over positions whose gold label is not -100.
    """
    model.eval()
    batch_losses = []
    all_preds, all_labels = [], []

    for batch in tqdm(loader, leave=False, desc="Test"):
        gold = batch['labels'].to(device)
        logits, loss = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['pixel_values'].to(device),
            gold,
        )
        batch_losses.append(loss.item())

        pred_arr = torch.argmax(logits, dim=-1).detach().cpu().numpy()
        gold_arr = gold.detach().cpu().numpy()
        # Boolean mask drops ignored positions while keeping row-major order.
        scored = gold_arr != -100
        all_preds.extend(pred_arr[scored])
        all_labels.extend(gold_arr[scored])

    avg_loss = float(np.mean(batch_losses))
    acc = accuracy_score(all_labels, all_preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='micro', zero_division=0
    )
    return avg_loss, acc, prec, rec, f1

te_loss, te_acc, te_p, te_r, te_f1 = evaluate(test_loader)
print("\n===== TEST RESULTS (Token-level, ignore subwords) =====")
# Captions carry their own padding so the numbers line up in one column
for caption, value in (
    ("Loss: ", te_loss),
    ("Accuracy:  ", te_acc),
    ("Precision: ", te_p),
    ("Recall:    ", te_r),
    ("F1:        ", te_f1),
):
    print(f"{caption}{value:.4f}")

# (Optional) per-label report
def per_label_report(loader):
    """Print a per-label classification report for ``loader``.

    Runs the notebook-level ``model`` in eval mode without gradients, collects
    predictions for every position whose gold label is not -100, and prints
    sklearn's ``classification_report`` with one row per entry of ``id2label``.
    """
    from sklearn.metrics import classification_report

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attn      = batch['attention_mask'].to(device)
            pixels    = batch['pixel_values'].to(device)
            labels    = batch['labels'].to(device)
            logits, _ = model(input_ids, attn, pixels, labels)
            preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
            golds = labels.detach().cpu().numpy()
            for p, g in zip(preds, golds):
                for pi, gi in zip(p, g):
                    if gi == -100: continue
                    all_preds.append(pi); all_labels.append(gi)

    # Pass the full id list explicitly: without `labels`, sklearn only reports
    # the ids that occur in the data, and a `target_names` list of a different
    # length then makes the call fail.
    label_ids = sorted(id2label)
    target_names = [id2label[i] for i in label_ids]
    print("\nPer-label report:")
    print(classification_report(all_labels, all_preds, labels=label_ids,
                                target_names=target_names, zero_division=0))

# Uncomment to print per-label metrics:
# per_label_report(test_loader)


## Evaluate the trained model

In [None]:
import numpy as np
import torch
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure model is in evaluation mode
model.eval()

all_labels, all_preds = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attn      = batch['attention_mask'].to(device)
        pixels    = batch['pixel_values'].to(device)
        labels    = batch['labels'].to(device)

        logits, loss = model(input_ids, attn, pixels, labels)
        preds = torch.argmax(logits, dim=-1).cpu().numpy()
        golds = labels.cpu().numpy()

        # collect only valid tokens (ignore subwords/pads)
        for p, g in zip(preds, golds):
            for pi, gi in zip(p, g):
                if gi == -100:
                    continue
                all_preds.append(pi)
                all_labels.append(gi)

# ---------------- Overall Metrics ----------------
# zero_division=0 matches the other evaluation code in this notebook and keeps
# labels that are never predicted from producing undefined-metric warnings.
acc  = accuracy_score(all_labels, all_preds)
prec_micro, rec_micro, f1_micro, _ = precision_recall_fscore_support(
    all_labels, all_preds, average='micro', zero_division=0)
prec_macro, rec_macro, f1_macro, _ = precision_recall_fscore_support(
    all_labels, all_preds, average='macro', zero_division=0)
prec_weighted, rec_weighted, f1_weighted, _ = precision_recall_fscore_support(
    all_labels, all_preds, average='weighted', zero_division=0)

print("\n===== Detailed Test Results =====")
print(f"Accuracy:        {acc:.4f}")
print(f"Micro F1:        {f1_micro:.4f}")
print(f"Macro F1:        {f1_macro:.4f}")
print(f"Weighted F1:     {f1_weighted:.4f}")
print(f"Micro Precision: {prec_micro:.4f}")
print(f"Micro Recall:    {rec_micro:.4f}")

# ---------------- Per-label report ----------------
# The full id list is passed explicitly so that target_names always lines up
# with the reported rows, even when some label never occurs in the test data.
label_ids = list(range(len(id2label)))
target_names = [id2label[i] for i in label_ids]
report = classification_report(
    all_labels, all_preds,
    labels=label_ids,
    target_names=target_names,
    digits=4,
    zero_division=0
)
print("\nPer-label classification report:")
print(report)

# ---------------- Confusion Matrix ----------------
# Rows are gold labels, columns are predictions, both in id order.
cm = confusion_matrix(all_labels, all_preds, labels=label_ids)
plt.figure(figsize=(7,6))
sns.heatmap(
    cm, annot=False, cmap='YlGnBu',
    xticklabels=target_names, yticklabels=target_names
)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix – Token Level")
plt.show()


## Predict entities for a sample tweet

In [None]:
import torch
from PIL import Image
from transformers import RobertaTokenizerFast
from torchvision import transforms

# === Load tokenizer and model ===
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-large", add_prefix_space=True)
ckpt = torch.load("roberta_resnet50_mner_best.pth", map_location=device)
# Take the label map that was saved together with the weights, so the decoded
# tags always match the classifier in the checkpoint rather than whatever
# `id2label` happens to be left in the kernel.
id2label = ckpt["id2label"]
model = RobertaResNet50MNER(num_labels=len(id2label))
model.load_state_dict(ckpt["model"])
model.to(device)
model.eval()

# === Image preprocessing (same as the non-augmented val/test pipeline) ===
img_tfm = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
])

# === Example: choose one image ID from Twitter2015 ===
img_id = "74960"   # change this to test others
img_path = f"/kaggle/input/twitter2015/twitter2015/twitter2015_images/{img_id}.jpg"

tweet_text = "RT @JayKenMinaj : Me outside of where George Zimmerman got shot at . You know God is so good ."

# --- Load and preprocess ---
image = Image.open(img_path).convert("RGB")
pixel_values = img_tfm(image).unsqueeze(0).to(device)  # add batch dimension

# --- Tokenize ---
# The text is already whitespace-tokenised, one token per word.
tokens = tweet_text.split()
enc = tokenizer(tokens, is_split_into_words=True, padding='max_length',
                truncation=True, max_length=128, return_tensors='pt')
input_ids = enc["input_ids"].to(device)
attn = enc["attention_mask"].to(device)

# --- Inference ---
with torch.no_grad():
    logits, _ = model(input_ids, attn, pixel_values)
preds = torch.argmax(logits, dim=-1).squeeze(0).cpu().numpy()

# --- Decode predicted tags ---
# Keep the prediction at the first sub-token of each word; skip special/padding
# positions and continuation pieces.
word_ids = enc.word_ids(batch_index=0)
pred_tags = []
prev_word = None
for i, w_id in enumerate(word_ids):
    if w_id is None or w_id == prev_word:
        continue
    tag = id2label[int(preds[i])]
    pred_tags.append((tokens[w_id], tag))
    prev_word = w_id

# === Print results ===
print(f"Predicted entities for IMGID {img_id}:")
for tok, tag in pred_tags:
    if tag != "O":
        print(f"{tok:15s} → {tag}")

# --- visualize ---
import matplotlib.pyplot as plt
plt.imshow(image)
plt.axis("off")
plt.title(f"Predicted entities for IMGID {img_id}")
plt.show()
