In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from glob import glob


In [None]:
import pandas as pd
import os

# --- Load the raw FairFace label tables ---
train_df = pd.read_csv("/kaggle/input/fairface/FairFace/train_labels.csv")
val_df   = pd.read_csv("/kaggle/input/fairface/FairFace/val_labels.csv")

# --- Clean up and turn the file column into absolute paths ---
for df, subset in ((train_df, "train"), (val_df, "val")):
    # Dropped in place so the change is visible through train_df / val_df;
    # errors='ignore' keeps this a no-op when the column is absent.
    df.drop(columns=['service_test'], errors='ignore', inplace=True)
    # Prefix the dataset root, then collapse a doubled split folder
    # ("train/train" -> "train") wherever it occurs in the path.
    df['file'] = (
        df['file']
        .apply(lambda f: os.path.join("/kaggle/input/fairface/FairFace", f))
        .str.replace(f"{subset}/{subset}", subset)
    )

# --- Label name -> integer id (ids follow list order) ---
gender_map = {label: code for code, label in enumerate(['Male', 'Female'])}
race_map = {
    label: code
    for code, label in enumerate([
        'White', 'Black', 'Latino_Hispanic',
        'East Asian', 'Southeast Asian', 'Indian', 'Middle Eastern',
    ])
}
age_map = {
    label: code
    for code, label in enumerate([
        '0-2', '3-9', '10-19', '20-29',
        '30-39', '40-49', '50-59', '60-69', 'more than 70',
    ])
}

# Labels missing from a map become NaN (pandas Series.map semantics).
train_df['gender'] = train_df['gender'].map(gender_map)
train_df['race']   = train_df['race'].map(race_map)
train_df['age']    = train_df['age'].map(age_map)

val_df['gender'] = val_df['gender'].map(gender_map)
val_df['race']   = val_df['race'].map(race_map)
val_df['age']    = val_df['age'].map(age_map)

# --- One (file, label) frame per task, as independent copies ---
train_gender_df = train_df.loc[:, ['file', 'gender']].copy()
train_race_df   = train_df.loc[:, ['file', 'race']].copy()
train_age_df    = train_df.loc[:, ['file', 'age']].copy()

val_gender_df = val_df.loc[:, ['file', 'gender']].copy()
val_race_df   = val_df.loc[:, ['file', 'race']].copy()
val_age_df    = val_df.loc[:, ['file', 'age']].copy()


In [None]:
"""
balanced_data_prep_simple.py
- Combines FairFace + UTKFace
- Also merges in the validation data so the training set can be balanced better (age, race, gender)
- Safe dataset (skips problematic files)
- Automatically re-splits off a small validation set as part of balancing
"""

import os, random
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T

# -------------------------
# Utils
# -------------------------
def set_seed(seed=42):
    """Seed Python's `random`, NumPy and PyTorch (CPU and every CUDA device) with `seed`."""
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

# -------------------------
# Load FairFace
# -------------------------
def load_fairface(base_dir: str):
    """Read the FairFace label CSVs and encode their labels as integer ids.

    Reads `train_labels.csv` and `val_labels.csv` from `base_dir`. For each
    table the `file` column is turned into a path under `base_dir` (a doubled
    split folder such as "train/train" is collapsed to "train"), the
    `gender` / `race` / `age` names are mapped to ids, and rows whose label is
    not in a map are dropped.

    Returns:
        (train, val): two DataFrames with columns file, age, race, gender.
        The original row index is kept (it is not reset).
    """
    label_maps = {
        'gender': {'Male': 0, 'Female': 1},
        'race': {
            'White': 0, 'Black': 1, 'Latino_Hispanic': 2,
            'East Asian': 3, 'Southeast Asian': 4,
            'Indian': 5, 'Middle Eastern': 6,
        },
        'age': {
            '0-2': 0, '3-9': 1, '10-19': 2, '20-29': 3, '30-39': 4,
            '40-49': 5, '50-59': 6, '60-69': 7, 'more than 70': 8,
        },
    }
    wanted_columns = ['file', 'age', 'race', 'gender']

    # Read both tables up front, before either one is modified.
    tables = [
        (split, pd.read_csv(os.path.join(base_dir, f"{split}_labels.csv")))
        for split in ("train", "val")
    ]

    for split, table in tables:
        doubled = f"{split}/{split}"
        table['file'] = table['file'].map(
            lambda name: os.path.join(base_dir, name.replace(doubled, split))
        )
        for column, mapping in label_maps.items():
            # Names that are not keys of the mapping become NaN here ...
            table[column] = table[column].map(mapping)
        # ... and those rows are removed.
        table.dropna(subset=['gender', 'race', 'age'], inplace=True)

    (_, train_table), (_, val_table) = tables
    return train_table[wanted_columns], val_table[wanted_columns]

# -------------------------
# Load UTKFace
# -------------------------
def load_utkface(utk_dir: str):
    """Build train/val label tables from UTKFace file names.

    Each image file name is split on "_" and must have at least four parts;
    the first three are parsed as integers: age, gender, race. A file is
    skipped when any of these fails to parse, when the age is outside
    0..100, when the gender is not 0 or 1, or when the race code has no
    entry in the race mapping below.

    Args:
        utk_dir: directory containing the UTKFace images.

    Returns:
        (train, val): DataFrames with columns file, age, race, gender, from
        an 80/20 split stratified on race (random_state=42), index reset.

    Raises:
        ValueError: if no usable image is found in `utk_dir`.
    """
    # Source race code -> race id used by the rest of this file.
    # NOTE(review): assumes UTKFace codes 0..4 = White/Black/Asian/Indian/Others,
    # so "Asian" lands on East Asian (3) and "Others" on Middle Eastern (6) — confirm.
    race_to_id = {0: 0, 1: 1, 2: 3, 3: 5, 4: 6}
    # Inclusive upper age of each age class; the class id is the position in this list.
    age_upper_bounds = [2, 9, 19, 29, 39, 49, 59, 69, 200]

    recs = []
    # sorted(): os.listdir returns entries in arbitrary order, which would make
    # the seeded split below differ between machines.
    for f in sorted(os.listdir(utk_dir)):
        if not f.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue
        parts = f.split('_')
        if len(parts) < 4:
            continue
        try:
            age, gender, race = int(parts[0]), int(parts[1]), int(parts[2])
        except ValueError:
            continue
        if not 0 <= age <= 100:
            continue
        # The gender head has two classes; any other code cannot be a valid target.
        if gender not in (0, 1):
            continue
        race_id = race_to_id.get(race)
        if race_id is None:
            # Previously such rows survived as NaN: dropna() on the Series was
            # undone by index alignment when assigning back to the frame.
            continue
        recs.append([os.path.join(utk_dir, f), age, gender, race_id])

    if not recs:
        raise ValueError(f"No usable UTKFace images found in {utk_dir!r}")

    df = pd.DataFrame(recs, columns=['file', 'age_raw', 'gender', 'race'])
    # First class whose upper bound is >= age (same rule as a linear scan).
    df['age'] = np.searchsorted(age_upper_bounds, df['age_raw'].to_numpy(), side='left')

    train_df, val_df = train_test_split(df[['file', 'age', 'race', 'gender']],
                                        test_size=0.2, random_state=42, stratify=df['race'])
    return train_df.reset_index(drop=True), val_df.reset_index(drop=True)

# -------------------------
# Dataset
# -------------------------
class MultiTaskFaceDataset(Dataset):
    """Image dataset yielding `(image, {'age', 'race', 'gender'})` pairs.

    Rows whose `file` path does not exist are removed at construction time.
    An image that exists but cannot be read is replaced by an all-black RGB
    image while its labels are kept, so one bad file does not abort an epoch.

    Args:
        df: DataFrame with columns file, age, race, gender.
        transform: optional callable applied to the PIL image.
        fallback_size: (width, height) of the black stand-in image.
    """

    def __init__(self, df, transform=None, fallback_size=(112, 112)):
        exists_mask = df['file'].map(os.path.exists)
        if not exists_mask.all():
            print(f"{(~exists_mask).sum()} missing files skipped.")
            df = df[exists_mask]
        self.df = df.reset_index(drop=True)
        self.transform = transform
        self.fallback_size = fallback_size

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        try:
            # Context manager closes the file handle; convert() returns a new image.
            with Image.open(row['file']) as raw:
                img = raw.convert('RGB')
        except Exception:
            # Best-effort fallback for unreadable files. `Exception` (not a bare
            # except) lets KeyboardInterrupt / SystemExit propagate.
            img = Image.new('RGB', self.fallback_size, (0, 0, 0))
        if self.transform:
            img = self.transform(img)
        labels = {k: torch.tensor(int(row[k]), dtype=torch.long) for k in ['age', 'race', 'gender']}
        return img, labels

def balance_train_data(train_df, val_df, val_size=0.1, seed=42):
    """Merge train and val, hold out a validation split, then balance the training part.

    The validation rows are split off BEFORE any resampling. Oversampling with
    replacement duplicates rows, so splitting afterwards would put copies of
    the same image on both sides and inflate the validation metrics.

    Every (age, race, gender) group of the training part is then resampled to
    `target = int((smallest_group + largest_group) / 2)` rows: larger groups
    are undersampled, smaller groups are oversampled with replacement.

    Args:
        train_df, val_df: DataFrames with columns file, age, race, gender.
        val_size: fraction of the merged rows held out for validation.
        seed: random_state used for the split, the resampling and the shuffle.

    Returns:
        (train, val): the balanced, shuffled training frame and the untouched
        (not resampled) validation frame, both with a fresh index.
    """
    label_cols = ['age', 'race', 'gender']
    combined = pd.concat([train_df, val_df], ignore_index=True)
    print(f"Trước cân bằng: {len(combined):,} mẫu")

    # Hold out a small validation set first (stratified on race).
    train_part, val_part = train_test_split(
        combined, test_size=val_size, random_state=seed, stratify=combined['race'])

    # One group per label combination present in the training part.
    grouped = train_part.groupby(label_cols)
    group_counts = grouped.size()
    target = int((group_counts.min() + group_counts.max()) / 2)  # midpoint of the extremes

    balanced_parts = []
    for _, subset in grouped:
        if len(subset) != target:
            # replace=True only when the group is too small to reach the target.
            subset = subset.sample(target, replace=len(subset) < target, random_state=seed)
        balanced_parts.append(subset)

    balanced = pd.concat(balanced_parts, ignore_index=True)
    # The concatenation is ordered group by group; shuffle so row order carries no label signal.
    balanced = balanced.sample(frac=1, random_state=seed).reset_index(drop=True)
    print(f"Sau cân bằng: {len(balanced):,} mẫu ({len(group_counts)} tổ hợp nhãn)")

    return balanced, val_part.reset_index(drop=True)

# -------------------------
# Example
# -------------------------
if __name__ == "__main__":
    # Seed every RNG before any data is loaded or split.
    set_seed(42)

    FAIRFACE_DIR = "/kaggle/input/fairface/FairFace"
    UTK_DIR = "/kaggle/input/utkface-new/UTKFace"

    # Each loader returns its own (train, val) pair; stack them split by split.
    ff_train, ff_val = load_fairface(FAIRFACE_DIR)
    utk_train, utk_val = load_utkface(UTK_DIR)
    train_df = pd.concat([ff_train, utk_train], ignore_index=True)
    val_df = pd.concat([ff_val, utk_val], ignore_index=True)

    # Both frames are replaced by the output of balance_train_data.
    train_df, val_df = balance_train_data(train_df, val_df)

    print("\nPhân bố train:")
    for col in ('age', 'race', 'gender'):
        print(col, train_df[col].value_counts().sort_index().to_dict())

    IMAGE_SIZE = 112
    mean = [0.485,0.456,0.406]
    std = [0.229,0.224,0.225]

    # Training: resize 16 px larger than the target, random-crop back to it, then augment.
    train_tf = T.Compose([
        T.Resize((IMAGE_SIZE + 16, IMAGE_SIZE + 16)),
        T.RandomCrop(IMAGE_SIZE),
        T.RandomHorizontalFlip(),
        T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.15, hue=0.05),
        T.ToTensor(),
        T.Normalize(mean=mean, std=std),
    ])
    # Validation: resize and normalize only, no random augmentation.
    val_tf = T.Compose([
        T.Resize((IMAGE_SIZE, IMAGE_SIZE)),
        T.ToTensor(),
        T.Normalize(mean=mean, std=std),
    ])

    train_ds = MultiTaskFaceDataset(train_df, transform=train_tf)
    val_ds = MultiTaskFaceDataset(val_df, transform=val_tf)

    train_loader = DataLoader(train_ds, batch_size=128, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_ds, batch_size=128, shuffle=False, num_workers=2)

    print(f"\nData ready: {len(train_ds):,} train samples | {len(val_ds):,} val samples")


In [None]:
import matplotlib.pyplot as plt

def plot_label_distributions(df, name="Train"):
    """Draw one bar chart per label column (age, race, gender) showing class counts.

    Class ids are categorical, so each id gets its own bar at its own x
    position. A histogram with `bins = number of distinct ids` splits the
    min..max range evenly instead, which misplaces bars whenever an id is
    absent and leaves the ticks unrelated to the class ids.

    Args:
        df: DataFrame with columns age, race, gender.
        name: prefix used in each subplot title.
    """
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    for ax, col in zip(axes, ["age", "race", "gender"]):
        counts = df[col].value_counts().sort_index()
        ax.bar(counts.index, counts.to_numpy(), width=0.9, color='skyblue', edgecolor='black')
        ax.set_xticks(list(counts.index))  # one tick per class id that is present
        ax.set_title(f"{name} {col} distribution")
        ax.set_xlabel(col)
        ax.set_ylabel("count")
    plt.tight_layout()
    plt.show()

# Plot the label distributions of the current train_df / val_df frames.
plot_label_distributions(train_df, "Train")
plot_label_distributions(val_df, "Validation")


In [None]:
# ============================================================
# Simplified Multi-task Face Model (CrossEntropyLoss)
# ============================================================
import os, random, math
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm.notebook import tqdm

# ============================================================
# Utils
# ============================================================
def set_seed(seed=42):
    """Seed `random`, NumPy and PyTorch (CPU + all CUDA devices) and turn on cuDNN benchmark mode.

    NOTE(review): PyTorch's reproducibility notes say cuDNN benchmark mode can
    pick different algorithms between runs; confirm that trading determinism
    for speed is intended in a seeding helper.
    """
    seeders = [random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all]
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.benchmark = True

# ============================================================
# Model
# ============================================================
class MultiTaskFaceModel(nn.Module):
    """ResNet trunk with a shared embedding and three linear classification heads.

    Args:
        backbone_type: "resnet34" or "resnet50".
        pretrained: when true, the backbone is built with weights="IMAGENET1K_V1".
        shared_dim: width of the shared embedding that feeds all heads.

    `forward` returns a dict of logits with keys "gender" (2 outputs),
    "race" (7 outputs) and "age" (9 outputs), in that order.

    Raises:
        ValueError: for any other `backbone_type`.
    """

    def __init__(self, backbone_type="resnet34", pretrained=True, shared_dim=256):
        super().__init__()

        # Pick the constructor and the width of the pooled feature vector.
        if backbone_type == "resnet34":
            build_backbone, feat_dim = models.resnet34, 512
        elif backbone_type == "resnet50":
            build_backbone, feat_dim = models.resnet50, 2048
        else:
            raise ValueError("Unsupported backbone type")
        base = build_backbone(weights="IMAGENET1K_V1" if pretrained else None)

        # Keep every child module except the last two.
        trunk_layers = list(base.children())[:-2]
        self.backbone = nn.Sequential(*trunk_layers)
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.bn = nn.BatchNorm1d(feat_dim)

        # Shared embedding consumed by all three heads.
        self.shared = nn.Sequential(
            nn.Linear(feat_dim, shared_dim),
            nn.BatchNorm1d(shared_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
        )

        # One linear classifier per task.
        self.gender_head = nn.Linear(shared_dim, 2)
        self.race_head   = nn.Linear(shared_dim, 7)
        self.age_head    = nn.Linear(shared_dim, 9)

    def forward(self, x):
        feature_map = self.backbone(x)
        pooled = self.pool(feature_map).flatten(start_dim=1)
        shared = self.shared(self.bn(pooled))
        logits = {}
        logits["gender"] = self.gender_head(shared)
        logits["race"] = self.race_head(shared)
        logits["age"] = self.age_head(shared)
        return logits

# ============================================================
# Training & Validation
# ============================================================
def compute_loss(outputs, targets, criterions):
    """Add up the per-task losses.

    For every key in `outputs`, calls `criterions[key](outputs[key], targets[key])`
    and accumulates the results starting from 0 (so empty `outputs` returns 0).
    """
    total = 0
    for task in outputs.keys():
        total = total + criterions[task](outputs[task], targets[task])
    return total

def run_epoch(model, loader, device, optimizer, phase, criterions, scheduler=None):
    """Run one pass over `loader` and return loss / accuracy metrics.

    Args:
        model: module returning a dict of logits with keys age, race, gender.
        loader: iterable of `(imgs, labels)` where `labels` is a dict of tensors.
        device: torch.device the batches are moved to.
        optimizer: stepped once per batch when `phase == "train"`; unused otherwise.
        phase: "train" enables training mode, gradients and updates; any other
            value runs the model in eval mode with autograd disabled.
        criterions: dict of per-task loss callables, passed to compute_loss.
        scheduler: optional LR scheduler, stepped once at the end of a training pass.

    Returns:
        dict with `{phase}_loss`, `{phase}_age_acc`, `{phase}_race_acc`,
        `{phase}_gender_acc` and `{phase}_avg_acc`. Every value is NaN when the
        loader yields no samples.
    """
    is_train = (phase == "train")
    model.train(is_train)
    total_loss, total_samples = 0.0, 0
    correct = {"age": 0, "race": 0, "gender": 0}

    if is_train:
        optimizer.zero_grad(set_to_none=True)
    pbar = tqdm(loader, desc=f"{phase}", leave=False)

    # Autograd is only needed while training; disabling it for evaluation
    # avoids building a graph for every validation batch.
    with torch.set_grad_enabled(is_train):
        for imgs, labels in pbar:
            imgs = imgs.to(device)
            labels = {k: v.to(device) for k, v in labels.items()}

            # NOTE(review): autocast is used without a GradScaler; PyTorch's AMP
            # docs pair float16 autocast with gradient scaling — confirm this is intended.
            with torch.autocast("cuda", enabled=device.type == "cuda"):
                outputs = model(imgs)
                loss = compute_loss(outputs, labels, criterions)

            if is_train:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)

            batch = imgs.size(0)
            # loss is treated as a per-batch mean: weight it by batch size so the
            # epoch figure is a per-sample average.
            total_loss += loss.item() * batch
            total_samples += batch

            for t in outputs.keys():
                pred = outputs[t].argmax(dim=1)
                correct[t] += (pred == labels[t]).sum().item()

            pbar.set_postfix({"loss": f"{total_loss / total_samples:.4f}"})

    if is_train and scheduler is not None:
        scheduler.step()

    # An empty loader yields NaN metrics instead of a ZeroDivisionError.
    denom = total_samples if total_samples else float("nan")
    metrics = {
        f"{phase}_loss": total_loss / denom,
        f"{phase}_age_acc": correct["age"] / denom,
        f"{phase}_race_acc": correct["race"] / denom,
        f"{phase}_gender_acc": correct["gender"] / denom,
    }
    metrics[f"{phase}_avg_acc"] = (
        metrics[f"{phase}_age_acc"]
        + metrics[f"{phase}_race_acc"]
        + metrics[f"{phase}_gender_acc"]
    ) / 3
    return metrics

# ============================================================
# Checkpoint
# ============================================================
def save_checkpoint(path, model, optimizer, epoch, best_val_loss, history):
    """Write model/optimizer state plus training bookkeeping to `path` with torch.save.

    The parent directory of `path` is created if needed (the current directory
    is used when `path` has no directory part). The saved dict has the keys
    epoch, model_state_dict, optimizer_state_dict, best_val_loss and history.
    """
    parent_dir = os.path.dirname(path)
    if not parent_dir:
        parent_dir = "."
    os.makedirs(parent_dir, exist_ok=True)

    torch.save(
        dict(
            epoch=epoch,
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
            best_val_loss=best_val_loss,
            history=history,
        ),
        path,
    )
    print(f"Saved checkpoint: {path} | epoch={epoch} | best_val={best_val_loss:.4f}")

def load_checkpoint(path, model, optimizer=None, device="cpu"):
    """Restore a checkpoint into `model` (and `optimizer`, when one is given).

    Returns:
        (epoch, best_val_loss, history) read from the checkpoint, or
        (0, inf, {}) when `path` does not exist. `history` falls back to {}
        if the checkpoint has no such entry.
    """
    if os.path.exists(path):
        state = torch.load(path, map_location=device)
        model.load_state_dict(state["model_state_dict"])
        if optimizer:
            optimizer.load_state_dict(state["optimizer_state_dict"])
        print(f"Loaded checkpoint: epoch={state['epoch']} | best_val={state['best_val_loss']:.4f}")
        return state["epoch"], state["best_val_loss"], state.get("history", {})

    print("No checkpoint found.")
    return 0, float("inf"), {}

# ============================================================
# Train
# ============================================================
def train(model, train_loader, val_loader, device,
          checkpoint_path="./checkpoints/multitask_simple.pt",
          num_epochs=30, base_lr=3e-4, weight_decay=1e-4,
          resume_path="/kaggle/input/resnet-gra/pytorch/default/1/multitask_simple.pt"):
    """Train with AdamW and a cosine LR schedule, saving the best-val-loss checkpoint.

    Args:
        model: network to train; moved to `device` before the optimizer is built.
        train_loader, val_loader: loaders passed to run_epoch.
        device: torch.device used for training.
        checkpoint_path: where the best checkpoint (lowest validation loss) is written.
        num_epochs: last epoch number to run; also T_max of the cosine schedule.
        base_lr: initial learning rate of the schedule.
        weight_decay: AdamW weight decay.
        resume_path: checkpoint to resume from. The default keeps the location
            that was previously hard-coded; pass None to resume from
            `checkpoint_path`. A missing file means training starts from scratch.

    Returns:
        history: dict mapping epoch number to the merged train/val metrics,
        including any history restored from the resume checkpoint.
    """
    import warnings

    set_seed(42)

    # Move the model first so the optimizer (and any state loaded into it)
    # is created against parameters that already live on `device`.
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=weight_decay)
    criterions = {t: nn.CrossEntropyLoss(label_smoothing=0.05) for t in ["age", "race", "gender"]}

    if resume_path is None:
        resume_path = checkpoint_path
    start_epoch, best_val_loss, history = load_checkpoint(resume_path, model, optimizer, device)

    # Re-align the cosine schedule with the resumed epoch. Loading the optimizer
    # state restores whatever lr was stored, while a fresh scheduler starts at
    # step 0; so reset the lr to base_lr and replay the completed steps.
    for group in optimizer.param_groups:
        group["lr"] = base_lr
        group["initial_lr"] = base_lr
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    with warnings.catch_warnings():
        # Replaying steps triggers PyTorch's "scheduler.step() before
        # optimizer.step()" warning, which does not apply to a fast-forward.
        warnings.simplefilter("ignore", UserWarning)
        for _ in range(start_epoch):
            scheduler.step()

    for epoch in range(start_epoch + 1, num_epochs + 1):
        print(f"\nEpoch {epoch}/{num_epochs} | lr={optimizer.param_groups[0]['lr']:.2e}")
        train_m = run_epoch(model, train_loader, device, optimizer, "train", criterions, scheduler)
        val_m = run_epoch(model, val_loader, device, optimizer, "val", criterions)

        history[epoch] = {**train_m, **val_m}
        print(f"Train loss={train_m['train_loss']:.4f} | Val loss={val_m['val_loss']:.4f}")
        print(f"Val Acc → Age={val_m['val_age_acc']:.3f} Race={val_m['val_race_acc']:.3f} Gender={val_m['val_gender_acc']:.3f}")

        # Only a strictly better validation loss overwrites the checkpoint.
        if val_m["val_loss"] < best_val_loss:
            best_val_loss = val_m["val_loss"]
            save_checkpoint(checkpoint_path, model, optimizer, epoch, best_val_loss, history)

    return history


In [None]:

if __name__ == "__main__":
    # Dataset locations - update these to match your environment.
    FAIRFACE_DIR = "/kaggle/input/fairface/FairFace"
    UTK_DIR = "/kaggle/input/utkface-new/UTKFace"

    # Use the GPU when one is visible, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = MultiTaskFaceModel(backbone_type="resnet34")
    model = model.to(device)

    # train_loader / val_loader must already exist in the notebook namespace.
    history = train(
        model,
        train_loader,
        val_loader,
        device,
        checkpoint_path="./checkpoints/multitask_simple.pt",
        num_epochs=100,
        base_lr=3e-4,
    )


In [None]:
# NOTE(review): this cell held only an undefined name, which presumably served as a
# manual "stop here" marker for Run All (it raised NameError). The stop is kept,
# but made explicit and self-explanatory.
raise RuntimeError("Intentional stop marker: run the following cells manually.")

In [None]:
import json
import matplotlib.pyplot as plt

# Read the per-epoch metrics stored inside the saved checkpoint.
ckpt = torch.load("/kaggle/working/checkpoints/multitask_simple.pt", map_location=device)
hist = ckpt.get("history", {})

epochs = sorted(hist.keys())
train_loss = [hist[e]["train_loss"] for e in epochs]
val_loss = [hist[e]["val_loss"] for e in epochs]

# One figure only: the same plot used to be drawn twice from a copy-pasted
# block, followed by an extra plt.show() that had nothing left to display.
fig, ax = plt.subplots()
ax.plot(epochs, train_loss, label="Train Loss")
ax.plot(epochs, val_loss, label="Val Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Training History")
ax.legend()
plt.show()


In [None]:
# NOTE(review): this cell held only an undefined name, which presumably served as a
# manual "stop here" marker for Run All (it raised NameError). The stop is kept,
# but made explicit and self-explanatory.
raise RuntimeError("Intentional stop marker: run the following cells manually.")

In [None]:
# file: predict_multitask.py
import torch
from PIL import Image
import torchvision.transforms as T

# ============================================================
# Configuration
# ============================================================
CKPT_PATH = "/kaggle/working/checkpoints/multitask_simple.pt"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reverse maps: class id -> class name (the id is the position in each list)
gender_map_inv = dict(enumerate(["Male", "Female"]))
race_map_inv = dict(enumerate([
    "White", "Black", "Latino_Hispanic",
    "East Asian", "Southeast Asian", "Indian", "Middle Eastern",
]))
age_map_inv = dict(enumerate([
    "0-2", "3-9", "10-19", "20-29",
    "30-39", "40-49", "50-59", "60-69", "70+",
]))

# ============================================================
# Image preprocessing
# ============================================================

def load_image(path, transform=None):
    """Open an image, convert it to RGB and return it as a one-image batch.

    Args:
        path: image file to read.
        transform: callable turning a PIL image into a tensor. Defaults to the
            notebook-level `val_tf`, which must then already be defined.

    Returns:
        The transformed image with a leading batch dimension added.
    """
    tf = val_tf if transform is None else transform
    # Context manager closes the file handle; convert() returns a new image.
    with Image.open(path) as raw:
        img = raw.convert("RGB")
    return tf(img).unsqueeze(0)  # add the batch dimension

# ============================================================
# Prediction function
# ============================================================
def predict(model, img_tensor):
    """Classify one image and return the predicted class names.

    Puts `model` in eval mode, runs it without gradients on `img_tensor`
    (moved to DEVICE) and takes the argmax of each head.

    Returns:
        dict with keys "gender", "race", "age" holding class-name strings.
    """
    model.eval()
    with torch.no_grad():
        outputs = model(img_tensor.to(DEVICE))
        # .item() requires exactly one prediction per head.
        class_ids = {
            task: outputs[task].argmax(dim=1).item()
            for task in ("gender", "race", "age")
        }

    return {
        "gender": gender_map_inv[class_ids["gender"]],
        "race": race_map_inv[class_ids["race"]],
        "age": age_map_inv[class_ids["age"]],
    }

# ============================================================
# Load model + checkpoint
# ============================================================
# pretrained=False: the checkpoint supplies the weights, so there is no reason
# to fetch ImageNet weights that would be overwritten immediately.
model = MultiTaskFaceModel(pretrained=False)
ckpt = torch.load(CKPT_PATH, map_location=DEVICE)

# strict=False tolerates checkpoints that carry extra entries (e.g. an EMA
# shadow) or a partial state_dict - but a mismatch must not pass silently,
# because missing keys leave those layers randomly initialised.
load_result = model.load_state_dict(ckpt["model_state_dict"], strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        "WARNING: checkpoint does not match the model exactly | "
        f"missing keys: {list(load_result.missing_keys)} | "
        f"unexpected keys: {list(load_result.unexpected_keys)}"
    )
model.to(DEVICE)

# ============================================================
# Example run
# ============================================================
if __name__ == "__main__":
    img_path = "/kaggle/input/fairface/FairFace/val/1000.jpg"

    # Preprocess the image into a one-image batch, then classify it.
    img_tensor = load_image(img_path)
    result = predict(model, img_tensor)

    print(f"Prediction for {img_path}:", result, sep="\n")


In [None]:
!zip -r /kaggle/working/gra.zip /kaggle/working/checkpoints


In [None]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path, download_file_name, work_dir='/kaggle/working/'):
    """Zip `path` into `<work_dir>/<download_file_name>.zip` and display a download link.

    Args:
        path: file or directory to archive (added recursively).
        download_file_name: archive name without the ".zip" extension.
        work_dir: directory that receives the archive. The process also changes
            into it, so the relative FileLink below resolves against it.

    Prints an error and returns without a link when `zip` cannot be started or
    exits with a non-zero status.
    """
    os.chdir(work_dir)
    zip_name = os.path.join(work_dir, f"{download_file_name}.zip")
    # Argument list instead of a shell string: paths containing spaces or shell
    # metacharacters are passed to zip verbatim and are never interpreted.
    command = ["zip", "-r", zip_name, path]
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError as exc:
        # The zip executable itself is not available.
        print("Unable to run zip command!")
        print(exc)
        return
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))
    
# Archive the checkpoints directory as checkpoints.zip and show a download link.
download_file('/kaggle/working/checkpoints', 'checkpoints') 