In [1]:
import random
import numpy as np
import torch

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # Nếu dùng GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [2]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from PIL import Image
from tqdm import tqdm
from torchvision import transforms
from timm import create_model
import json

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
b7_model = create_model('tf_efficientnet_b7_ns', pretrained=True, num_classes=0).to(DEVICE)
b7_model.eval()

  model = create_fn(


EfficientNet(
  (conv_stem): Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2), bias=False)
  (bn1): BatchNormAct2d(
    64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
    (drop): Identity()
    (act): SiLU(inplace=True)
  )
  (blocks): Sequential(
    (0): Sequential(
      (0): DepthwiseSeparableConv(
        (conv_dw): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
        (bn1): BatchNormAct2d(
          64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
          (drop): Identity()
          (act): SiLU(inplace=True)
        )
        (aa): Identity()
        (se): SqueezeExcite(
          (conv_reduce): Conv2d(64, 16, kernel_size=(1, 1), stride=(1, 1))
          (act1): SiLU(inplace=True)
          (conv_expand): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
          (gate): Sigmoid()
        )
        (conv_pw): Conv2d(64, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn2): BatchNo

In [4]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/mdeberta-v3-base')
deberta_model = AutoModel.from_pretrained('microsoft/mdeberta-v3-base').to(DEVICE)
deberta_model.eval()



DebertaV2Model(
  (embeddings): DebertaV2Embeddings(
    (word_embeddings): Embedding(251000, 768, padding_idx=0)
    (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): DebertaV2Encoder(
    (layer): ModuleList(
      (0-11): 12 x DebertaV2Layer(
        (attention): DebertaV2Attention(
          (self): DisentangledSelfAttention(
            (query_proj): Linear(in_features=768, out_features=768, bias=True)
            (key_proj): Linear(in_features=768, out_features=768, bias=True)
            (value_proj): Linear(in_features=768, out_features=768, bias=True)
            (pos_dropout): Dropout(p=0.1, inplace=False)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): DebertaV2SelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [5]:
# --- Preprocessing ảnh ---
image_transform = transforms.Compose([
    transforms.Resize((600, 600)),
    transforms.RandomResizedCrop(600, scale=(0.9, 1.0)),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    transforms.RandomRotation(degrees=3),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

In [6]:
def extract_b7_features(image_path):
    try:
        image = Image.open(image_path).convert('RGB')
        image = image_transform(image).unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            features = b7_model(image).squeeze(0)
        return features.cpu().numpy()
    except Exception as e:
        print(f"❌ Lỗi ảnh tại {image_path}: {e}")
        return np.zeros(2560, dtype=np.float32)

In [7]:
def extract_deberta_features(text):
    try:
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(DEVICE)
        with torch.no_grad():
            outputs = deberta_model(**inputs)
        last_hidden = outputs.last_hidden_state  # (1, seq_len, 768)
        features = last_hidden.mean(dim=1).squeeze(0).cpu().numpy()  # (768,)
        return features
    except Exception as e:
        print(f"❌ Lỗi DeBERTa với text: {text} - {e}")
        return np.zeros(768, dtype=np.float32)

In [9]:
with open('../data/devset_images_metadata.json', 'r') as f:
    json_data = json.load(f)

train_data = json_data['images']
train_df = pd.DataFrame(train_data)

cols_needed = ['image_id', 'title', 'description', 'user_tags']
available_cols = [col for col in cols_needed if col in train_df.columns]
train_df = train_df[available_cols].rename(columns={'image_id': 'id'})

train_df['id'] = train_df['id'].astype(int)
train_df.to_csv("../data/devset_images_metadata.csv", index=False)
train_df

Unnamed: 0,id,title,description,user_tags
0,3519864665,Biltmore Estate,,"[2009 road trip, obrero road trip]"
1,4896119055,Chand Minar,,"[daulatabad, daulatabad fort, ellora, road trip]"
2,3468473862,Uplifting Graffiti,"After the flood, the boarded up stores bear up...","[cedarrapids, createsouthroadtrip2009, disaste..."
3,4120853942,DSCF6487,,"[cork, enchente, flood, ireland, irlanda]"
4,4436083254,Oconoe river - flooded,,"[athens georgia, brown, current, flood, mud, r..."
...,...,...,...,...
5275,8119782888,90,,"[550d, camino, canon, canoneos550d, canoneoski..."
5276,6093294301,"Albany's Corning Preserve, day after Irene",,"[albany, ny, flood, walk, water]"
5277,6791185487,IMG_4989,,"[al, the waters in pike road]"
5278,9144682941,IMG_3011,2013 Fair Flood,"[2013, county, fair, flood, linn]"


In [11]:
import os
import pandas as pd
import unicodedata
import re
import ast

# --- Đường dẫn ---
IMG_DIR = os.path.join("../data/devset_images", "devset_images")
FEAT_DIR = os.path.join("../data/devset_images_features", "devset_images_features")
META_CSV = os.path.join("../data/devset_images_metadata.csv")
GT_CSV = os.path.join("../data/devset_images_gt.csv")

# --- Load dữ liệu ---
train_df = pd.read_csv(META_CSV)
label_df = pd.read_csv(GT_CSV)

# --- Làm sạch ID ---
train_df['id'] = train_df['id'].apply(lambda x: int(float(x)) if pd.notnull(x) else x)
label_df['id'] = label_df['id'].apply(lambda x: int(float(x)) if pd.notnull(x) else x)

# --- Chuyển user_tags từ chuỗi → list an toàn ---
def safe_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            val = ast.literal_eval(x)
            if isinstance(val, list):
                return val
        except Exception:
            pass
        return [x] if x else []
    return []

train_df["user_tags"] = train_df["user_tags"].apply(safe_list)

# --- Làm sạch từng cột cơ bản ---
def basic_clean(text):
    if not isinstance(text, str):
        return ""
    
    text = unicodedata.normalize('NFKC', text)                       # Chuẩn Unicode
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)              # Bỏ URL
    text = re.sub(r"\S+@\S+", "", text)                              # Bỏ email
    text = re.sub(r"<.*?>", "", text)                                # Bỏ HTML tags
    text = re.sub(r"[^\w\s\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\-_#]+", "", text)  # Giữ emoji + đa ngôn ngữ
    text = re.sub(r"\s+", " ", text)                                 # Rút gọn khoảng trắng
    return text.lower().strip()  

train_df["title"] = train_df["title"].fillna("").astype(str).apply(basic_clean)
train_df["description"] = train_df["description"].fillna("").astype(str).apply(basic_clean)
train_df["user_tags"] = train_df["user_tags"].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))
train_df["user_tags"] = train_df["user_tags"].apply(basic_clean)

# --- Gộp thành text cho BLIP (không clean lại nữa) ---
train_df["text"] = train_df.apply(
    lambda row: f"Title: {row['title']} | Description: {row['description']} | Tags: {row['user_tags']}",
    axis=1
)

# --- Gộp với label ---
train_df = train_df.merge(label_df, on="id", how="left")

# ✅ Kết quả
train_df[["id", "text", "label"]].head()

Unnamed: 0,id,text,label
0,3519864665,Title: biltmore estate | Description: | Tags:...,0
1,4896119055,Title: chand minar | Description: | Tags: dau...,0
2,3468473862,Title: uplifting graffiti | Description: after...,0
3,4120853942,Title: dscf6487 | Description: | Tags: cork e...,0
4,4436083254,Title: oconoe river - flooded | Description: ...,0


In [12]:
def find_image_path(image_id, exts=[".jpg", ".jpeg", ".png", ".bmp", ".gif" ]):
    for ext in exts:
        path = os.path.join(IMG_DIR, f"{image_id}{ext}")
        if os.path.isfile(path):
            return path
    return None

In [13]:
import os

# Các đuôi ảnh hợp lệ
valid_exts = [".jpg", ".png",  ".gif"]

# Lọc file ảnh hợp lệ
image_files = [
    f for f in os.listdir(IMG_DIR)
    if os.path.isfile(os.path.join(IMG_DIR, f))
    and not f.startswith("._")
    and os.path.splitext(f.lower())[1] in valid_exts
]

# In kết quả
print(f"🖼️ Tổng số ảnh hợp lệ (không bắt đầu bằng '._'): {len(image_files)}")

🖼️ Tổng số ảnh hợp lệ (không bắt đầu bằng '._'): 5280


In [14]:
from tqdm import tqdm

all_features = []
all_labels = []

for idx, row in tqdm(train_df.iterrows(), total=len(train_df), desc="Extracting features"):
    image_id = str(int(row["id"]))
    image_path = find_image_path(image_id)

    # Mặc định là zero vector nếu lỗi
    image_feat = np.zeros(2560, dtype=np.float32)
    text_feat = np.zeros(768, dtype=np.float32)

    # Trích xuất đặc trưng ảnh nếu có ảnh
    if image_path:
        image_feat = extract_b7_features(image_path)
    else:
        print(f"Không tìm thấy ảnh cho id {image_id}")

    # Trích xuất đặc trưng văn bản
    if isinstance(row["text"], str):
        text_feat = extract_deberta_features(row["text"])

    # Gộp lại
    combined_feat = np.concatenate([image_feat, text_feat])
    all_features.append(combined_feat)
    all_labels.append(row["label"])

# Convert sang mảng numpy
all_features = np.stack(all_features)
all_labels = np.array(all_labels)

# Lưu đặc trưng và nhãn
np.save("b7_deberta_features.npy", all_features)
np.save("b7_deberta_labels.npy", all_labels)

print("Đã lưu b7_deberta_features.npy và b7_deberta_labels.npy")
print(f"Shape features: {all_features.shape}, labels: {all_labels.shape}")

Extracting features: 100%|██████████| 5280/5280 [10:11<00:00,  8.63it/s]


Đã lưu b7_deberta_features.npy và b7_deberta_labels.npy
Shape features: (5280, 3328), labels: (5280,)


In [13]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
# import numpy as np

# # 🔶 Focal Loss định nghĩa sẵn (cho binary classification)
# class FocalLoss(nn.Module):
#     def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
#         super(FocalLoss, self).__init__()
#         self.alpha = alpha
#         self.gamma = gamma
#         self.reduction = reduction

#     def forward(self, inputs, targets):
#         BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
#         probs = torch.sigmoid(inputs)
#         pt = torch.where(targets == 1, probs, 1 - probs)
#         focal_weight = self.alpha * (1 - pt) ** self.gamma
#         loss = focal_weight * BCE_loss

#         if self.reduction == 'mean':
#             return loss.mean()
#         elif self.reduction == 'sum':
#             return loss.sum()
#         else:
#             return loss

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Subset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [16]:
X = np.load("b7_deberta_features.npy")           # (N, 3328)
y = np.load("b7_deberta_labels.npy")  

In [17]:
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(1)  # (N, 1)
dataset = TensorDataset(X_tensor, y_tensor)

In [18]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [19]:
pos_weight = torch.tensor(3360 / 1920, dtype=torch.float32).to(DEVICE)  # = 1.75
# pos_weight = torch.tensor(3360 / 1680, dtype=torch.float32).to(DEVICE)  # = 2
# pos_weight = torch.tensor(3360 / 2240, dtype=torch.float32).to(DEVICE)  # = 1.5

In [20]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold

def train_fold(model, train_loader, val_loader, fold_id, total_epochs=50, lr_max=0.00068):
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    # criterion = FocalLoss(alpha=0.25, gamma=2.0)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr_max, weight_decay=0.001)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=3, min_lr=1e-7
    )
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    #     optimizer, T_max=total_epochs, eta_min=1e-7
    # )


    best_f1 = 0.0
    best_threshold = 0.5  # sẽ được cập nhật sau
    patience = 9
    patience_counter = 0

    for epoch in range(total_epochs):
        model.train()
        train_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            preds = model(xb)
            loss = criterion(preds, yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0.0
        all_probs, all_targets = [], []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                preds = model(xb)
                loss = criterion(preds, yb)
                val_loss += loss.item()
                probs = torch.sigmoid(preds).cpu().numpy().flatten()
                all_probs.extend(probs)
                all_targets.extend(yb.cpu().numpy().flatten())

        probs = np.array(all_probs)
        targets = np.array(all_targets)

        # Tìm threshold tối ưu theo F1
        def find_best_threshold(y_true, y_prob):
            thresholds = np.arange(0.1, 0.91, 0.01)
            best_t = 0.5
            best_f1_local = 0.0
            for t in thresholds:
                preds = (y_prob >= t).astype(int)
                f1 = f1_score(y_true, preds, zero_division=0)
                if f1 > best_f1_local:
                    best_f1_local = f1
                    best_t = t
            return best_t, best_f1_local
        
        best_threshold_this_epoch, best_f1_this_epoch = find_best_threshold(targets, probs)

        # F1 khi dùng threshold mặc định 0.5 (cho thống kê)
        preds_bin = (probs >= 0.5).astype(int)
        acc = (preds_bin == targets).mean()
        auc = roc_auc_score(targets, probs)
        precision = precision_score(targets, preds_bin, zero_division=0)
        recall = recall_score(targets, preds_bin, zero_division=0)
        f1 = f1_score(targets, preds_bin, zero_division=0)

        scheduler.step(f1)

        print(f"[Fold {fold_id}] Epoch {epoch+1}: Train Loss = {train_loss:.4f} | "
              f"Val Loss = {val_loss:.4f} | Acc = {acc:.4f} | AUC = {auc:.4f} | "
              f"Precision = {precision:.4f} | Recall = {recall:.4f} | F1 = {f1:.4f} | "
              f"Best Threshold = {best_threshold_this_epoch:.4f} | Best F1 = {best_f1_this_epoch:.4f}")

        # Cập nhật model nếu F1 tốt hơn
        if best_f1_this_epoch > best_f1:
            best_f1 = best_f1_this_epoch
            best_threshold = best_threshold_this_epoch
            patience_counter = 0
            torch.save(model.state_dict(), f"best_model_fold{fold_id}.pt")
            print(f"Fold {fold_id} - New best F1: {best_f1:.4f} at threshold {best_threshold:.4f} → model saved.")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Fold {fold_id} - Early stopping at epoch {epoch+1}")
                break

    return best_threshold, probs, targets


In [21]:
class FusionNetPlus(nn.Module):
    def __init__(self, input_dim=3328):
        super(FusionNetPlus, self).__init__()
        self.bn_input = nn.BatchNorm1d(input_dim)
        self.fc1 = nn.Linear(input_dim, 512)
        self.dropout1 = nn.Dropout(0.5)

        self.bn2 = nn.BatchNorm1d(384)
        self.fc2 = nn.Linear(512, 384)
        self.dropout2 = nn.Dropout(0.3)

        self.bn3 = nn.BatchNorm1d(128)
        self.fc3 = nn.Linear(384, 128)
        self.dropout3 = nn.Dropout(0.2)

        self.bn4 = nn.BatchNorm1d(64)
        self.fc4 = nn.Linear(128, 64)
        self.dropout4 = nn.Dropout(0.1)

        self.out = nn.Linear(64, 1)

    def forward(self, x):
        x = self.bn_input(x)
        x = self.fc1(x)
        x = F.silu(x)
        x = self.dropout1(x)

        x = self.fc2(x)
        x = self.bn2(x)
        x = F.silu(x)
        x = self.dropout2(x)

        x = self.fc3(x)
        x = self.bn3(x)
        x = F.silu(x)
        x = self.dropout3(x)

        x = self.fc4(x)
        x = self.bn4(x)
        x = F.silu(x)
        x = self.dropout4(x)

        return self.out(x)

In [22]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
best_thresholds = []
val_targets_all = []
val_probs_all = []

for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\n📦 Starting Fold {fold_id + 1}")
    
    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(dataset, val_idx)

    train_loader = DataLoader(train_subset, batch_size=128, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=128, shuffle=False)

    model = FusionNetPlus().to(DEVICE)
    best_threshold, val_probs, val_targets = train_fold(model, train_loader, val_loader, fold_id=fold_id + 1)
    
    best_thresholds.append(best_threshold)
    val_probs_all.extend(val_probs)
    val_targets_all.extend(val_targets)

# ✅ In toàn bộ threshold cuối cùng sau tất cả các fold
print("\n🎯 Best thresholds by fold:")
for i, th in enumerate(best_thresholds):
    print(f"Fold {i + 1}: Best threshold = {th:.4f}")


📦 Starting Fold 1
[Fold 1] Epoch 1: Train Loss = 15.5552 | Val Loss = 1.5623 | Acc = 0.8939 | AUC = 0.9742 | Precision = 0.7857 | Recall = 0.9740 | F1 = 0.8698 | Best Threshold = 0.7300 | Best F1 = 0.9072
Fold 1 - New best F1: 0.9072 at threshold 0.7300 → model saved.
[Fold 1] Epoch 2: Train Loss = 9.3121 | Val Loss = 1.1611 | Acc = 0.9261 | AUC = 0.9811 | Precision = 0.8493 | Recall = 0.9688 | F1 = 0.9051 | Best Threshold = 0.5900 | Best F1 = 0.9127
Fold 1 - New best F1: 0.9127 at threshold 0.5900 → model saved.
[Fold 1] Epoch 3: Train Loss = 6.3376 | Val Loss = 1.0885 | Acc = 0.9375 | AUC = 0.9815 | Precision = 0.8698 | Recall = 0.9740 | F1 = 0.9189 | Best Threshold = 0.5700 | Best F1 = 0.9235
Fold 1 - New best F1: 0.9235 at threshold 0.5700 → model saved.
[Fold 1] Epoch 4: Train Loss = 5.1274 | Val Loss = 1.0561 | Acc = 0.9242 | AUC = 0.9825 | Precision = 0.8585 | Recall = 0.9479 | F1 = 0.9010 | Best Threshold = 0.7000 | Best F1 = 0.9147
[Fold 1] Epoch 5: Train Loss = 3.9146 | Val 

In [69]:
# def lr_finder(model, train_loader, optimizer_class, criterion, 
#               lr_start=1e-7, lr_end=1, num_iters=100):
#     model.train()
#     lrs = []
#     losses = []

#     optimizer = optimizer_class(model.parameters(), lr=lr_start)
#     lr_lambda = lambda x: (lr_end/lr_start)**(x/num_iters)
#     scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)

#     iter_loader = iter(train_loader)
#     for i in range(num_iters):
#         try:
#             xb, yb = next(iter_loader)
#         except StopIteration:
#             iter_loader = iter(train_loader)
#             xb, yb = next(iter_loader)

#         xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#         optimizer.zero_grad()
#         preds = model(xb)
#         loss = criterion(preds, yb)
#         loss.backward()
#         optimizer.step()
#         scheduler.step()

#         current_lr = optimizer.param_groups[0]['lr']
#         lrs.append(current_lr)
#         losses.append(loss.item())

#         if i % 10 == 0:
#             print(f"Iter {i:03d} | LR: {current_lr:.6f} | Loss: {loss.item():.4f}")

#     return lrs, losses

# def plot_lr_finder(lrs, losses):
#     import matplotlib.pyplot as plt
#     plt.figure(figsize=(8, 5))
#     plt.plot(lrs, losses)
#     plt.xscale('log')
#     plt.xlabel("Learning Rate (log scale)")
#     plt.ylabel("Loss")
#     plt.title("LR Finder")
#     plt.grid(True)
#     plt.show()

In [70]:
# print("\n🔍 Chạy LR Finder sau khi huấn luyện 5 fold...")

# # Tạo full loader từ toàn bộ dataset
# full_loader = DataLoader(dataset, batch_size=128, shuffle=True)

# model = FusionNetPlus().to(DEVICE)
# criterion = torch.nn.BCEWithLogitsLoss()

# # Gọi hàm LR Finder
# lrs, losses = lr_finder(
#     model=model,
#     train_loader=full_loader,
#     optimizer_class=torch.optim.AdamW,
#     criterion=criterion,
#     lr_start=1e-7,
#     lr_end=1,
#     num_iters=100
# )

# # Vẽ biểu đồ
# plot_lr_finder(lrs, losses)

In [71]:
# learning_rates = [0.0007 , 0.00068, 0.00065]
# best_f1 = 0
# best_lr = None

# for lr in learning_rates:
#     print(f"\n🔍 Testing learning rate: {lr}")
#     model = FusionNetPlus().to(DEVICE)
#     optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-3)
#     criterion = torch.nn.BCEWithLogitsLoss()

#     # Huấn luyện 3 epoch nhanh trên 1 fold (hoặc dùng fold 0 làm mẫu)
#     for epoch in range(3):
#         model.train()
#         for xb, yb in train_loader:
#             xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#             optimizer.zero_grad()
#             loss = criterion(model(xb), yb)
#             loss.backward()
#             optimizer.step()

#     # Validation
#     model.eval()
#     all_probs, all_targets = [], []
#     with torch.no_grad():
#         for xb, yb in val_loader:
#             xb, yb = xb.to(DEVICE), yb.to(DEVICE)
#             preds = model(xb)
#             all_probs.extend(preds.cpu().numpy().flatten())
#             all_targets.extend(yb.cpu().numpy().flatten())

#     probs = np.array(all_probs)
#     targets = np.array(all_targets)
#     preds_bin = (probs >= 0.5).astype(int)
#     f1 = f1_score(targets, preds_bin)

#     print(f"📊 F1 = {f1:.4f} for lr = {lr}")
#     if f1 > best_f1:
#         best_f1 = f1
#         best_lr = lr

# print(f"\n✅ Best Learning Rate: {best_lr} with F1 = {best_f1:.4f}")


In [23]:
import numpy as np
np.std(best_thresholds)

np.float64(0.1794435844492635)

In [24]:
for i in best_thresholds:
    print(f"{i:.4f}", end=", ")

0.6400, 0.7000, 0.7700, 0.6600, 0.1600, 0.7700, 0.4800, 0.7900, 0.6800, 0.5500, 

In [25]:
cleaned_thresh = [t for t in best_thresholds if 0.4 <= t <= 0.8]
threshold2 = np.mean(cleaned_thresh)
print("New Mean:", threshold2)

New Mean: 0.6711111111111108


In [26]:
def find_best_threshold(y_true, y_probs):
    thresholds = np.arange(0.1, 0.91, 0.01)
    best_t = 0.5
    best_f1 = 0.0
    for t in thresholds:
        preds = (y_probs >= t).astype(int)
        f1 = f1_score(y_true, preds, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_t = t
    return best_t, best_f1

In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Đảm bảo model đang ở chế độ eval
model.eval()

# Dự đoán không cần gradient
with torch.no_grad():
    logits = model(X_tensor.to(DEVICE))  # đầu ra là logits
    probs = torch.sigmoid(logits).cpu().numpy().flatten()  # chuyển logits → xác suất
    preds = (probs >= 0.5).astype(int)

# Chuyển ground truth sang numpy
targets = y_tensor.cpu().numpy().flatten()

# Tính các chỉ số đánh giá
acc = accuracy_score(targets, preds)
precision = precision_score(targets, preds, zero_division=0)
recall = recall_score(targets, preds, zero_division=0)
f1 = f1_score(targets, preds, zero_division=0)
auc = roc_auc_score(targets, probs)

# Tìm threshold tốt nhất
val_probs_all = np.array(val_probs_all)
val_targets_all = np.array(val_targets_all)

global_threshold, global_f1 = find_best_threshold(val_targets_all, val_probs_all)

# In kết quả
print(f"Accuracy  : {acc:.6f}")
print(f"Precision : {precision:.6f}")
print(f"Recall    : {recall:.6f}")
print(f"F1 Score  : {f1:.6f}")
print(f"AUC       : {auc:.6f}")
print(f"\nGlobal optimal threshold on all val: {global_threshold:.4f} | F1 = {global_f1:.4f}")

# print(x)

Accuracy  : 0.995265
Precision : 0.990678
Recall    : 0.996354
F1 Score  : 0.993508
AUC       : 0.999517

Global optimal threshold on all val: 0.6700 | F1 = 0.9210


In [29]:
import pandas as pd
import ast
import re
import unicodedata

# --- Đọc dữ liệu ---
test_df = pd.read_csv("../data/test.csv")

# --- Hàm chuyển user_tags từ chuỗi → list an toàn ---
def safe_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            val = ast.literal_eval(x)
            if isinstance(val, list):
                return val
        except Exception:
            pass
        return [x] if x else []
    return []

test_df["user_tags"] = test_df["user_tags"].apply(safe_list)

# --- Hàm làm sạch text chuẩn --- (dùng lại từ train)
def clean_text(text):
    if not isinstance(text, str):
        return ""
    
    text = unicodedata.normalize("NFKC", text)  # chuẩn hóa Unicode
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # xóa URL
    text = re.sub(r"[^\w\s\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af\-\_#]+", "", text)  # giữ lại chữ, số, emoji, - _ #
    text = re.sub(r"\s+", " ", text)  # rút gọn khoảng trắng
    return text.lower().strip()

# --- Làm sạch từng trường ---
test_df["title"] = test_df["title"].fillna("").astype(str).apply(clean_text)
test_df["description"] = test_df["description"].fillna("").astype(str).apply(clean_text)
test_df["user_tags"] = test_df["user_tags"].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))
test_df["user_tags"] = test_df["user_tags"].apply(clean_text)

# --- Tạo văn bản đầu vào cho BLIP ---
test_df["text"] = test_df.apply(
    lambda row: f"Title: {row['title']} | Description: {row['description']} | Tags: {row['user_tags']}",
    axis=1
)

# ✅ Kết quả
test_df[["id", "text"]].head()

Unnamed: 0,id,text
0,3483809003,Title: flooded parking lot at emily fowler lib...
1,3712805295,Title: larc de barà the roman arch of barà | D...
2,379845620,Title: highest point over the sea level that i...
3,7343264988,Title: lagos after the rains | Description: af...
4,3843337492,Title: flooded corley ave | Description: also ...


In [32]:
import os

# Đường dẫn thư mục chứa ảnh
IMG_TEST_DIR = os.path.join("../data/testset_images", "testset_images")

# Các đuôi ảnh hợp lệ
valid_exts = [".jpg", ".png"]

# Lọc file ảnh hợp lệ
image_files = [
    f for f in os.listdir(IMG_TEST_DIR)
    if os.path.isfile(os.path.join(IMG_TEST_DIR, f))
    and not f.startswith("._")
    and os.path.splitext(f.lower())[1] in valid_exts
]

# In kết quả
print(f"Tổng số ảnh hợp lệ (không bắt đầu bằng '._'): {len(image_files)}")

Tổng số ảnh hợp lệ (không bắt đầu bằng '._'): 1320


In [33]:
def find_image_path_test(image_id, exts=[".jpg", ".png"]):
    for ext in exts:
        path = os.path.join(IMG_TEST_DIR, f"{image_id}{ext}")
        if os.path.isfile(path):
            return path
    return None

In [34]:
# --- Preprocessing ảnh ---
image_transform_test = transforms.Compose([
    transforms.Resize((600, 600)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

In [35]:
def extract_b7_features_test(image_path):
    try:
        image = Image.open(image_path).convert('RGB')
        image = image_transform_test(image).unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            features = b7_model(image).squeeze(0)
        return features.cpu().numpy()
    except Exception as e:
        print(f"❌ Lỗi ảnh tại {image_path}: {e}")
        return np.zeros(2560, dtype=np.float32)

In [36]:
import numpy as np
from tqdm import tqdm

all_features = []
error_count = 0

expected_b7 = (2560,)
expected_deberta = (768,)

for idx, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Extracting test features"):
    try:
        image_id = str(int(row["id"]))
        image_path = find_image_path_test(image_id)
        text_input = row["text"]

        # Mặc định zero vector
        img_feat = np.zeros(expected_b7, dtype=np.float32)
        txt_feat = np.zeros(expected_deberta, dtype=np.float32)

        # Image features
        if image_path is not None:
            img_feat = extract_b7_features_test(image_path)
        else:
            print(f"Missing image: {image_id}")

        # Text features
        if isinstance(text_input, str):
            txt_feat = extract_deberta_features(text_input)

        # Validate shape
        if img_feat.shape != expected_b7:
            print(f"Wrong shape for image {image_id}")
            img_feat = np.zeros(expected_b7)
        if txt_feat.shape != expected_deberta:
            print(f"Wrong shape for text {image_id}")
            txt_feat = np.zeros(expected_deberta)

        # Combine features
        combined_feat = np.concatenate([img_feat, txt_feat])
        all_features.append(combined_feat)

    except Exception as e:
        print(f"Error at idx {idx}, id {row['id']}: {e}")
        error_count += 1
        combined_feat = np.concatenate([
            np.zeros(expected_b7, dtype=np.float32),
            np.zeros(expected_deberta, dtype=np.float32)
        ])
        all_features.append(combined_feat)


Extracting test features: 100%|██████████| 1320/1320 [02:22<00:00,  9.29it/s]


In [37]:
# Convert to numpy array and save
all_features = np.stack(all_features)
np.save("b7_deberta_features_test.npy", all_features)
print("Đã lưu b7_deberta_features_test.npy")
print(f"Shape: {all_features.shape}")
print(f"Số lượng lỗi: {error_count}")

Đã lưu b7_deberta_features_test.npy
Shape: (1320, 3328)
Số lượng lỗi: 0


In [38]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

In [39]:
b7_test = np.load("b7_deberta_features_test.npy")  # Đã được đồng bộ (shape: M, 3328)
X_test_tensor = torch.tensor(b7_test, dtype=torch.float32).to(DEVICE)

In [40]:
NUM_FOLDS = 10
all_probs = []

for fold in range(1, NUM_FOLDS + 1):
    print(f"Predicting with Fold {fold}")
    model = FusionNetPlus().to(DEVICE)
    model.load_state_dict(torch.load(f"best_model_fold{fold}.pt", map_location=DEVICE))
    model.eval()

    with torch.no_grad():
        logits = model(X_test_tensor)
        probs = torch.sigmoid(logits).cpu().numpy().flatten()  # Chuyển logits → xác suất
        all_probs.append(probs)

Predicting with Fold 1
Predicting with Fold 2
Predicting with Fold 3
Predicting with Fold 4
Predicting with Fold 5
Predicting with Fold 6
Predicting with Fold 7
Predicting with Fold 8
Predicting with Fold 9
Predicting with Fold 10


In [41]:
ensemble_probs = np.mean(all_probs, axis=0)
ensemble_preds = (ensemble_probs >= threshold2).astype(int)

In [42]:
print(threshold2)
print(global_threshold)

0.6711111111111108
0.6699999999999997


In [47]:
results_df = test_df.copy()  # test_df phải có cột 'id'
results_df["label"] = ensemble_preds
results_df["probability"] = ensemble_probs

In [51]:
results_df = results_df[["id", "label", "probability"]].copy()
results_df.sort_values(by="probability", ascending=False, inplace=True)
results_df.to_csv("../create_output/result_b7+bert.csv", index=False)

In [49]:
submission_df = results_df[["id", "label"]].copy()
submission_df.to_csv("submission_b7+bert.csv", index=False)

In [50]:
num_positives = (results_df["label"] == 1).sum()
print(f"Số lượng dự đoán là 1: {num_positives}")

Số lượng dự đoán là 1: 468
