# Data processing

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

from PIL import Image, ImageFile
import os
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Image file suffixes probed, in this order, when resolving an image id to a file.
extensions = ['.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff', '.webp']

# Training (devset) inputs: image directory, label table and per-image metadata.
image_folder_train = r'/kaggle/input/dpl-2025/devset_images/devset_images'
df_train_labels = pd.read_csv(
    '/kaggle/input/dpl-2025/devset_images_gt.csv'
)
df_train = pd.read_csv(
    r'/kaggle/input/dpl-2025/devset_images_metadata/devset_images_metadata/devset_images_metadata_cleaned.csv'
)

def find_image_path(image_id, folder=None, exts=None):
    """Resolve an image id to a file path inside an image folder.

    Every suffix in ``exts`` is tried in order and the first candidate that
    exists on disk is returned.

    Args:
        image_id: Identifier used as the file stem (formatted with ``str()``).
        folder: Directory to search. Defaults to the module-level
            ``image_folder_train``.
        exts: Iterable of suffixes (including the leading dot) to probe.
            Defaults to the module-level ``extensions``.

    Returns:
        The first existing ``<folder>/<image_id><ext>`` path. When no
        candidate exists, the ``.jpg`` path is returned as a fallback; that
        file is not guaranteed to exist.
    """
    # Resolve defaults at call time so the notebook globals can be redefined.
    folder = image_folder_train if folder is None else folder
    exts = extensions if exts is None else exts

    for ext in exts:
        candidate = os.path.join(folder, f"{image_id}{ext}")
        if os.path.exists(candidate):
            return candidate
    # No file found for any suffix: fall back to the default .jpg name.
    return os.path.join(folder, f"{image_id}.jpg")

# Attach the resolved file path to every training row, then keep only the rows
# that also appear in the label table (inner join: image_id == id).
df_train['image_path'] = df_train['image_id'].map(find_image_path)
df_merged_train = df_train.merge(
    df_train_labels,
    how="inner",
    left_on="image_id",
    right_on="id",
)


# Test-set metadata and the directory holding the test images.
image_folder_test = r'/kaggle/input/dpl-2025/testset_images/testset_images'
df_test = pd.read_csv(r'/kaggle/input/dpl-2025/test.csv')

def find_image_path_test(image_id):
    """Resolve a test-set image id to a path under ``image_folder_test``.

    The suffixes in ``extensions`` are probed in order; the first path that
    exists is returned, otherwise the ``.jpg`` path is returned as a fallback.
    """
    fallback = os.path.join(image_folder_test, f"{image_id}.jpg")
    candidates = (
        os.path.join(image_folder_test, f"{image_id}{suffix}")
        for suffix in extensions
    )
    return next((p for p in candidates if os.path.exists(p)), fallback)

# Resolve every test image id to its file path.
df_test['image_path'] = df_test['image_id'].map(find_image_path_test)


In [2]:
# Quick look at the resolved training image paths.
df_train.loc[:, "image_path"]

0       /kaggle/input/dpl-2025/devset_images/devset_im...
1       /kaggle/input/dpl-2025/devset_images/devset_im...
2       /kaggle/input/dpl-2025/devset_images/devset_im...
3       /kaggle/input/dpl-2025/devset_images/devset_im...
4       /kaggle/input/dpl-2025/devset_images/devset_im...
                              ...                        
5275    /kaggle/input/dpl-2025/devset_images/devset_im...
5276    /kaggle/input/dpl-2025/devset_images/devset_im...
5277    /kaggle/input/dpl-2025/devset_images/devset_im...
5278    /kaggle/input/dpl-2025/devset_images/devset_im...
5279    /kaggle/input/dpl-2025/devset_images/devset_im...
Name: image_path, Length: 5280, dtype: object

In [3]:
# Columns kept for modelling: identifier, the three text fields, the label and the image path.
columns = [
    "image_id",
    "title",
    "description",
    "user_tags",
    "label",
    "image_path",
]
# Commented-out alternative selection (adds coordinates, drops image_path):
#columns = ["image_id","title","description", "user_tags", "latitude", "longitude","label"]
df_merged_train = df_merged_train.loc[:, columns]

In [4]:
# Replace every NaN cell of the training frame with the literal placeholder "missing".
df_merged_train = df_merged_train.fillna(value="missing")
def fill_missing(text):
    """Return "missing" for empty or whitespace-only strings; pass any other value through unchanged."""
    is_blank_string = isinstance(text, str) and not text.strip()
    return "missing" if is_blank_string else text
def _map_cells(frame, func):
    """Apply ``func`` to every cell of ``frame`` and return the new frame.

    ``DataFrame.applymap`` is deprecated in favour of ``DataFrame.map``
    (pandas 2.1+); the older method is used only when ``map`` is unavailable.
    """
    # Check on the class so a column that happens to be called "map" cannot interfere.
    if hasattr(pd.DataFrame, "map"):
        return frame.map(func)
    return frame.applymap(func)


# Blank / whitespace-only strings in the training frame become "missing".
df_merged_train = _map_cells(df_merged_train, fill_missing)

# Same cleanup for the test frame: NaN first, then blank strings.
df_test = df_test.fillna("missing")
df_test = _map_cells(df_test, fill_missing)
df_test.head()

  df_merged_train = df_merged_train.applymap(fill_missing)
  df_test = df_test.applymap(fill_missing)


Unnamed: 0,image_id,title,description,user_tags,image_path
0,3483809003,"Flooded Parking Lot At Emily Fowler Library, A...",Denton Creek overflows its banks and floods Oa...,"project, slis 5715, spring 2009",/kaggle/input/dpl-2025/testset_images/testset_...
1,3712805295,L'arc de Barà / The roman arch of Barà,Sembla que fou dedicat a August entorn l'any 1...,"arc, arc_de_berà, arch, archaeology, arco, arq...",/kaggle/input/dpl-2025/testset_images/testset_...
2,379845620,Highest point over the sea level that is reach...,missing,missing,/kaggle/input/dpl-2025/testset_images/testset_...
3,7343264988,Lagos after the rains,"After heavy rain, Lagos (Nigeria) was still fl...","africa, lagos, nigeria",/kaggle/input/dpl-2025/testset_images/testset_...
4,3843337492,flooded Corley Ave,also a local black out due to the tree branch ...,"flood, storm, toronto",/kaggle/input/dpl-2025/testset_images/testset_...


In [5]:
def _concat_text_fields(frame):
    """Join title, description and user_tags of each row with single spaces.

    NaN cells are treated as empty strings. Returns a plain Python list.
    """
    joined = frame['title'].fillna('')
    for field in ('description', 'user_tags'):
        joined = joined + ' ' + frame[field].fillna('')
    return joined.tolist()


# Training inputs: one text, one label and one image path per row.
texts = _concat_text_fields(df_merged_train)
labels = df_merged_train['label'].tolist()
image_paths_train = df_merged_train['image_path'].tolist()

# Test inputs (no labels).
test_ids = df_test['image_id'].tolist()
test_texts = _concat_text_fields(df_test)
image_paths_test = df_test['image_path'].tolist()

# 90/10 train/validation split, stratified on the label, with a fixed seed.
(
    train_texts, val_texts,
    train_labels, val_labels,
    train_image_paths, val_image_paths,
) = train_test_split(
    texts,
    labels,
    image_paths_train,
    test_size=0.1,
    stratify=labels,
    random_state=42,
)

# Early Stopping

In [6]:
class EarlyStopping:
    """Signal when validation loss has stopped improving and checkpoint the best model.

    Call the instance once per epoch with the validation loss and the model.
    On an improvement the model's ``state_dict`` is saved to ``path`` and the
    patience counter is reset; otherwise the counter is incremented and
    ``early_stop`` becomes True once it reaches ``patience``.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        verbose: If True, print a message on every counter increment and save.
        delta: Minimum decrease of the validation loss, relative to the best
            loss seen so far, that counts as an improvement.
        path: File the ``state_dict`` is written to.
    """

    def __init__(self, patience=5, verbose=False, delta=0, path='checkpoint.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # Lowercase np.inf: the np.Inf alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record one validation result and update ``counter`` / ``early_stop``."""
        score = -val_loss

        if score != score:
            # NaN is the only value not equal to itself. A NaN loss must never
            # count as an improvement, otherwise it would overwrite the best
            # checkpoint and make every later comparison meaningless.
            improved = False
        elif self.best_score is None:
            improved = True
        else:
            improved = not (score < self.best_score + self.delta)

        if improved:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        '''Save model when validation loss decreases.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} → {val_loss:.6f}). Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

# Bert + ViT

In [7]:
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from tqdm import tqdm
import torch

class MultiModalDataset(Dataset):
    """Dataset yielding tokenized text plus a transformed image for each sample.

    Args:
        texts: Sequence of input strings; its length defines the dataset size.
        labels: Optional sequence of labels aligned with ``texts``. When None,
            items carry no ``'labels'`` entry (inference mode).
        image_paths: Sequence of image file paths aligned with ``texts``.
        tokenizer: Callable tokenizer invoked with ``max_length``,
            ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors='pt'``.
        image_transform: Callable applied to the RGB PIL image.
        max_length: Token sequence length after padding/truncation.
    """

    def __init__(self, texts, labels=None, image_paths=None, tokenizer=None, image_transform = None, max_length = 40):
        self.tokenizer = tokenizer
        self.image_transform = image_transform
        self.max_length = max_length
        self.texts = texts
        self.labels = labels
        self.image_paths = image_paths

    def __len__(self):
        """Number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask``, ``pixel_values`` and, if available, ``labels``."""
        text = self.texts[idx]
        image_path = self.image_paths[idx]

        # Open inside a context manager so the underlying file handle is
        # released deterministically; convert() returns an independent image.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        image = self.image_transform(image)

        encoding = self.tokenizer(text, max_length=self.max_length,
                                  padding='max_length', truncation=True, return_tensors='pt')
        # Drop the leading dimension added by return_tensors='pt' for a single text.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        item = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'pixel_values': image
        }

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])

        return item
        
# Tokenizer for the text branch (alternative checkpoint noted by the author: roberta-base).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Per-channel statistics handed to Normalize.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

_transform_steps = [
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),  # data augmentation
    # data augmentation; NOTE(review): called with default arguments only —
    # confirm whether explicit jitter strengths were intended.
    transforms.ColorJitter(),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
image_transform = transforms.Compose(_transform_steps)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Deterministic preprocessing for validation and test data. The training
# transform contains a random horizontal flip, which would make validation
# metrics and test predictions vary from run to run if reused here.
eval_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_dataset = MultiModalDataset(train_texts, train_labels, train_image_paths, tokenizer, image_transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)

# Evaluation loaders keep the data order fixed; shuffling is only useful for training.
val_dataset = MultiModalDataset(val_texts, val_labels, val_image_paths, tokenizer, eval_transform)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

# The test set has no labels; order must match df_test so predictions line up with image ids.
test_dataset = MultiModalDataset(test_texts, None, image_paths_test, tokenizer, eval_transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4)

In [9]:
import torch
import torch.nn as nn
from transformers import BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
import timm

class CombinedModel_need_fix(nn.Module):
    """Debugging draft of a BERT + Swin fusion classifier.

    The constructor builds a text encoder, an image encoder, an MLP
    ``classifier`` head and a Transformer ``fusion`` encoder. The active
    ``forward`` code only runs the two encoders and prints tensor shapes: it
    has no ``return`` statement (so it returns None) and never calls
    ``fusion`` or ``classifier``. The completed variant lives in
    ``CombinedModel``.
    """

    def __init__(self):
        """Instantiate the encoders, the classifier head and the fusion encoder."""
        super().__init__()
        # NOTE(review): a 'roberta-base' checkpoint is loaded through BertModel — confirm this is intended.
        self.bert = BertModel.from_pretrained('roberta-base') #bert-base-uncased
        # Load Swin Transformer from timm (pretrained on ImageNet)
        self.swin = timm.create_model('swin_base_patch4_window7_224', pretrained=True)
        # Remember the head's input width before the head is replaced below.
        self.swin_head_dim = self.swin.head.in_features
        self.swin.head = nn.Identity()  # drop the classifier head, keep only the features

        bert_hidden = self.bert.config.hidden_size  # usually 768
        swin_hidden = self.swin_head_dim  # usually 1024 for swin_base

        # MLP head mapping the concatenated features to a single logit.
        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden + swin_hidden, 512),
            # nn.BatchNorm1d(1024),
            # nn.GELU(),
            # nn.Dropout(0.4),
            # nn.Linear(1024,512)
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(0.4),
            nn.Linear(512,256),
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1)
        )

        # Two-layer Transformer encoder sized for the concatenated feature vector.
        self.fusion = nn.TransformerEncoder(    
            nn.TransformerEncoderLayer(
                d_model=bert_hidden + swin_hidden,
                nhead=8,
                dim_feedforward=512,
                dropout=0.1,
                activation='gelu'
            ),
            num_layers=2 
        )  
        
    def forward(self, input_ids, attention_mask, pixel_values):
        """Run both encoders and print intermediate shapes; returns None (no return statement)."""
        # Earlier, commented-out implementation kept for reference:
        # bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # bert_cls = bert_out.last_hidden_state[:, 0, :]  # CLS token
        
        # # SWIN part
        # swin_out = self.swin(pixel_values)  # (batch_size, C, H, W) if timm returns 4D # output shape: (batch_size, swin_hidden)
        # if swin_out.dim() == 4:
        #     swin_out = swin_out.mean(dim=[2,3])  # global average pooling → (batch_size, C)

        # combined = torch.cat((bert_cls, swin_out), dim=1)  # concat two feature vectors: (batch_size, bert_hidden + swin_hidden)
        # # Fusion layer
        # combined_fused = self.fusion(combined.unsqueeze(1)).squeeze(1)
        
        # logits = self.classifier(combined_fused)
        # return logits.squeeze(1) # batch size = 1: return logits.view(-1)
        # #return SequenceClassifierOutput(logits=logits)
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 of every sample.
        bert_cls = bert_out.last_hidden_state[:, 0, :]
        print("bert_cls.shape", bert_cls.shape)

        swin_out = self.swin(pixel_values)
        print("raw swin_out.shape", swin_out.shape)

        if swin_out.dim() == 4:
            # Averages over dims 2 and 3 directly.
            # NOTE(review): CombinedModel permutes to channels-first before the same mean — confirm the layout timm returns.
            swin_out = swin_out.mean(dim=[2, 3])
            print("pooled swin_out.shape", swin_out.shape)


2025-06-27 08:29:57.330604: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751012997.459391      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751012997.497186      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
from transformers import ViltProcessor, ViltModel
class ViLTClassifier(nn.Module):
    """ViLT backbone followed by a linear layer that emits one logit per sample.

    Args:
        hidden_size: Input width of the linear head; it is applied to the
            backbone's ``pooler_output``.
    """

    def __init__(self, hidden_size=768):
        super().__init__()
        self.vilt = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")
        self.classifier = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask, pixel_values, token_type_ids=None):
        """Encode the text/image pair with ViLT and return the head's output on the pooled representation."""
        backbone_inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "pixel_values": pixel_values,
            "token_type_ids": token_type_ids,
        }
        backbone_outputs = self.vilt(**backbone_inputs)
        logits = self.classifier(backbone_outputs.pooler_output)
        return logits

In [11]:
import torch
import torch.nn as nn
from transformers import BertModel
import timm

class CombinedModel(nn.Module):
    """Binary classifier fusing a BERT text encoder with a Swin image encoder.

    The text embedding at sequence position 0 and the spatially pooled image
    embedding are concatenated, passed through a two-layer Transformer encoder
    and then through an MLP head that emits one logit per sample.
    """

    def __init__(self):
        super().__init__()

        # Text encoder: BERT. The checkpoint has to match both the BertModel
        # class and the BertTokenizer('bert-base-uncased') that produces the
        # input ids; a RoBERTa checkpoint uses a different vocabulary and
        # different parameter names.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        bert_hidden = self.bert.config.hidden_size  # 768 per the original note

        # Image encoder: Swin Transformer with its classification head removed.
        self.swin = timm.create_model('swin_base_patch4_window7_224', pretrained=True)
        self.swin.head = nn.Identity()
        swin_hidden = self.swin.num_features  # 1024 per the original note

        self.bert_hidden = bert_hidden
        self.swin_hidden = swin_hidden

        # Fusion layer (Transformer encoder). batch_first=True is required
        # because forward() feeds tensors shaped (batch, 1, features); with the
        # default layout the batch axis would be read as the sequence axis and
        # samples of one batch would attend to each other.
        self.fusion = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=bert_hidden + swin_hidden,
                nhead=8,
                dim_feedforward=512,
                dropout=0.1,
                activation='gelu',
                batch_first=True
            ),
            num_layers=2
        )

        # Classifier head: concatenated features -> 512 -> 256 -> 1 logit.
        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden + swin_hidden, 512),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Dropout(0.4),
            nn.Linear(256, 1)
        )

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return one logit per sample as a 1-D tensor of length batch_size.

        Args:
            input_ids: Token ids for the text encoder.
            attention_mask: Attention mask matching ``input_ids``.
            pixel_values: Image batch for the Swin encoder.
        """
        # Text features: hidden state at sequence position 0.
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_cls = bert_out.last_hidden_state[:, 0, :]  # (batch_size, bert_hidden)

        # Image features.
        swin_out = self.swin(pixel_values)
        if swin_out.dim() == 4:
            # Assumes a channels-last map (batch_size, H, W, C), as the original
            # permute-then-pool code did; average over the two spatial axes.
            swin_out = swin_out.mean(dim=(1, 2))
        elif swin_out.dim() == 3:
            # Assumes a token sequence (batch_size, tokens, C) — TODO confirm
            # against the installed timm version; average over the tokens.
            swin_out = swin_out.mean(dim=1)

        # Concatenate the two feature vectors.
        combined = torch.cat((bert_cls, swin_out), dim=1)  # (batch_size, bert_hidden + swin_hidden)

        # Each sample is a length-1 sequence for the fusion encoder.
        combined_fused = self.fusion(combined.unsqueeze(1)).squeeze(1)

        logits = self.classifier(combined_fused)

        return logits.squeeze(1)  # (batch_size,)


In [12]:
# class ResidualBlock(nn.Module):
#     def __init__(self, in_features):
#         super().__init__()
#         self.block = nn.Sequential(
#             nn.Linear(in_features, in_features),
#             nn.BatchNorm1d(in_features),
#             nn.ReLU(),
#             nn.Dropout(0.1)
#         )

#     def forward(self, x):
#         return x + self.block(x)

# # Apply in the classifier
# self.classifier = nn.Sequential(
#     nn.Linear(bert_hidden + vit_hidden, 512),
#     ResidualBlock(512),
#     nn.Linear(512, 2)
# )


In [13]:
def validation(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` and return ``(avg_loss, accuracy)``.

    The model is switched to eval mode and left in eval mode; callers that
    continue training must call ``model.train()`` again.

    Args:
        model: Callable as ``model(input_ids, attention_mask, pixel_values)``
            returning one logit per sample, shaped ``(batch,)`` or ``(batch, 1)``.
        dataloader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask``, ``pixel_values`` and ``labels``).
        criterion: Loss called as ``criterion(logits, labels)`` with float labels.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple of the sample-weighted mean loss and the accuracy at a 0.5
        sigmoid threshold.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.eval()
    total_loss, total_correct, total_samples = 0.0, 0, 0

    with torch.no_grad():
        # Iterate the loader that was passed in, not a notebook global.
        loop = tqdm(dataloader, leave=True)
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            labels = batch['labels'].to(device).float()

            # reshape(-1) accepts both (batch, 1) and (batch,) model outputs.
            outputs = model(input_ids, attention_mask, pixel_values).reshape(-1)
            loss = criterion(outputs, labels)

            preds = (torch.sigmoid(outputs) >= 0.5).float()

            batch_size = labels.size(0)
            # criterion returns a batch mean, so weight it by the batch size.
            total_loss += loss.item() * batch_size
            total_correct += (preds == labels).sum().item()
            total_samples += batch_size

    if total_samples == 0:
        raise ValueError("validation dataloader yielded no samples")

    avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples
    return avg_loss, accuracy

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#model = CombinedModel()
model = ViLTClassifier()
model = nn.DataParallel(model).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6, weight_decay=0.01) # weight decay to reduce overfitting
# Single-logit binary classification: BCE on raw logits.
criterion = nn.BCEWithLogitsLoss()

num_epochs = 10
early_stopper = EarlyStopping(patience=3, verbose=True)

for epoch in range(num_epochs):
    # validation() leaves the model in eval mode, so training mode has to be
    # re-enabled at the start of every epoch, not just once before the loop.
    model.train()
    loop = tqdm(train_loader, leave=True)
    total_loss, total_accuracy, total_samples = 0, 0, 0
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        labels = batch['labels'].to(device).float()

        optimizer.zero_grad()
        # reshape(-1) accepts both (batch, 1) outputs (ViLTClassifier) and
        # (batch,) outputs (CombinedModel).
        outputs = model(input_ids, attention_mask, pixel_values).reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        probs = torch.sigmoid(outputs)
        preds = (probs > 0.5).float()

        # The loss is a batch mean; weight it by the batch size for the epoch average.
        total_loss += loss.item() * labels.size(0)
        total_accuracy += (preds == labels).sum().item()
        total_samples += labels.size(0)

        loop.set_description(f'Epoch {epoch+1}')
        loop.set_postfix(loss=loss.item())

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {total_loss/total_samples:.4f}, Train Accuracy: {total_accuracy/total_samples:.4f}")

    val_loss, val_acc = validation(model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}: Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}")

    early_stopper(val_loss, model)
    if early_stopper.early_stop:
        print("Early stopping")
        break

# Continue with the best weights seen on validation rather than the last
# epoch's. best_score is only set once a checkpoint was written in this run.
if early_stopper.best_score is not None:
    model.load_state_dict(torch.load(early_stopper.path, map_location=device))


In [16]:
# Resolve the device once instead of repeating the expression per tensor.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.eval()
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        pixel_values = batch['pixel_values'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, pixel_values=pixel_values)

        # reshape(-1) always yields a 1-D vector. A bare squeeze() turns a
        # final batch of size 1 into a 0-d array, which cannot be iterated
        # by list.extend().
        probs = torch.sigmoid(outputs).reshape(-1).cpu().numpy()
        preds = (probs > 0.5).astype(int)
        all_preds.extend(preds.tolist())

# test_loader is not shuffled, so predictions are in df_test row order.
df_test['label'] = all_preds
df_test[['image_id', 'label']].to_csv("test.csv", index=False)