In [1]:
# Import modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from PIL import Image

# DL
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.optimizer import Optimizer
from torch.nn.modules.loss import _Loss
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms
from transformers import ViTModel, ViTConfig

# scikit-learn
from sklearn.preprocessing import LabelEncoder

# Typing
from typing import Tuple, Optional, Callable, Dict, List

# Short random identifier used to tag the files written by this run
import uuid
RUN_ID = uuid.uuid4().hex[:6]  # first six hex digits of a random UUID
print(f"RUN_ID: {RUN_ID}")

RUN_ID: b01f6c


In [2]:
# ------------------------------------------------------------
# Dataset locations and label metadata
# ------------------------------------------------------------

# Everything lives below a single dataset root folder
dataset_root = Path('./datasets/appa-real-dataset_v2')

# One label CSV per split
labels_md_train = dataset_root.joinpath('labels_metadata_train.csv')
labels_md_valid = dataset_root.joinpath('labels_metadata_valid.csv')
labels_md_test = dataset_root.joinpath('labels_metadata_test.csv')

# One image folder per split
ds_train = dataset_root.joinpath('train_data')
ds_valid = dataset_root.joinpath('valid_data')
ds_test = dataset_root.joinpath('test_data')

# Read the three label files into DataFrames (train, valid, test)
df_md_train, df_md_valid, df_md_test = (
    pd.read_csv(csv_path)
    for csv_path in (labels_md_train, labels_md_valid, labels_md_test)
)

# Quick sanity check of the split sizes
print("Metadata DataFrames:")
print(f"df_md_train shape: {df_md_train.shape}")
print(f"df_md_valid shape: {df_md_valid.shape}")
print(f"df_md_test shape: {df_md_test.shape}\n")

# Only label metadata so far; images are read lazily by the Dataset later on.
df_md_train.head()

Metadata DataFrames:
df_md_train shape: (4065, 5)
df_md_valid shape: (1482, 5)
df_md_test shape: (1978, 5)



Unnamed: 0,imageId,age,gender,ethnicity,emotion
0,1,5.0,male,caucasian,neutral
1,2,20.079365,female,caucasian,neutral
2,3,76.815789,female,caucasian,slightlyhappy
3,4,55.657895,female,caucasian,happy
4,5,17.666667,female,caucasian,slightlyhappy


In [3]:
def _inverse_frequency_weights(counts: pd.Series, cap: float) -> dict:
    """Map every label with a non-zero count to ``min(max_count / count, cap)``."""
    max_count = counts.max()
    return {
        label: float(min(max_count / count, cap))
        for label, count in counts.items()
        if count > 0
    }


def compute_sample_weights(
    df_train: pd.DataFrame,
    max_oversample_multiplier: float = 20.0,
    apply_log_smoothing: bool = False,
    balance_gender: bool = False,
    balance_ethnicity: bool = False
) -> Tuple[pd.DataFrame, Dict[str, int], Dict[str, float], List[float], WeightedRandomSampler]:
    """
    Compute per-sample weights based on age bins, and optionally gender and ethnicity balancing.
    Returns updated dataframe, frequency dicts, sample weights, and a WeightedRandomSampler.

    Each factor is an inverse-frequency ratio ``max_count / count`` capped at
    ``max_oversample_multiplier``. The per-sample weight is the product of the
    age, gender and ethnicity factors, optionally log-smoothed, and capped again.

    Args:
        df_train (pd.DataFrame): DataFrame containing 'age', 'gender', 'ethnicity'.
            The input frame is not modified; a copy is returned.
        max_oversample_multiplier (float): Maximum cap for oversampling multiplier.
        apply_log_smoothing (bool): Apply log smoothing to weights.
        balance_gender (bool): Whether to include gender balancing.
        balance_ethnicity (bool): Whether to include ethnicity balancing.

    Returns:
        Tuple containing:
            - Updated DataFrame with 'age_binned' column.
            - Age bin frequencies dictionary (keys are the interval strings, empty bins included).
            - Oversampling weights dictionary (only bins that contain samples).
            - Per-sample weights list (plain Python floats).
            - WeightedRandomSampler instance for DataLoader.

    Raises:
        ValueError: If ``df_train`` has no rows.
        KeyError: If a row has a missing age (it falls in no bin), or a missing
            gender/ethnicity while the corresponding balancing is enabled.
    """
    if df_train.empty:
        raise ValueError("df_train is empty: cannot compute sample weights without any rows.")

    df_train = df_train.copy()

    # ----- Age Binning -----
    # One-year wide, left-closed bins; "+ 2" makes the last bin cover the maximum age.
    bins = range(int(df_train['age'].min()), int(df_train['age'].max()) + 2)
    df_train['age_binned'] = pd.cut(df_train['age'], bins=bins, right=False)

    age_counts = df_train['age_binned'].value_counts().sort_index()
    age_bin_dict = {str(interval): int(count) for interval, count in age_counts.items()}

    # Inverse frequencies for age bins, keyed by the interval's string form
    age_weights = {
        str(interval): weight
        for interval, weight in _inverse_frequency_weights(age_counts, max_oversample_multiplier).items()
    }

    # ----- Gender Balancing -----
    if balance_gender:
        gender_weights = _inverse_frequency_weights(
            df_train['gender'].value_counts(), max_oversample_multiplier
        )
    else:
        gender_weights = {gender: 1.0 for gender in df_train['gender'].unique()}

    # ----- Ethnicity Balancing -----
    if balance_ethnicity:
        ethnicity_weights = _inverse_frequency_weights(
            df_train['ethnicity'].value_counts(), max_oversample_multiplier
        )
    else:
        ethnicity_weights = {ethnicity: 1.0 for ethnicity in df_train['ethnicity'].unique()}

    # ----- Compute Final Sample Weights -----
    # Iterate the three columns directly instead of iterrows(), which builds a
    # full Series object for every row.
    sample_weights: List[float] = []
    for age_bin, gender, ethnicity in zip(
        df_train['age_binned'], df_train['gender'], df_train['ethnicity']
    ):
        combined_weight = (
            age_weights[str(age_bin)] * gender_weights[gender] * ethnicity_weights[ethnicity]
        )

        if apply_log_smoothing:
            # Equivalent to 1 + ln(weight): keeps weight 1 at 1 and compresses large weights.
            combined_weight = 1 + np.log1p(combined_weight - 1)

        # Cast so the list holds plain floats, as the return annotation promises.
        sample_weights.append(float(min(combined_weight, max_oversample_multiplier)))

    # ----- Weighted Sampler -----
    sampler = WeightedRandomSampler(
        weights=torch.tensor(sample_weights, dtype=torch.double),
        num_samples=len(sample_weights),
        replacement=True
    )

    return df_train, age_bin_dict, age_weights, sample_weights, sampler


# Build the per-sample weights and the weighted sampler from the training metadata only.
df_train_binned, age_bin_dict, age_weights, sample_weights, train_sampler = compute_sample_weights(
    df_md_train,  # Calculate the weights from the metadata descriptions
    max_oversample_multiplier=20.0,
    apply_log_smoothing=True,
    balance_gender=True,
    balance_ethnicity=True
)

print("Age Bin Frequencies:", age_bin_dict)
print("Age Oversampling Weights:", age_weights)
print("Sample Weights Example:", sample_weights[:20])  # First 20 samples

# NOTE: the gender/ethnicity weight dictionaries are local to
# compute_sample_weights and are not part of its return value, so they cannot
# be printed from this cell.
#
# `train_sampler` is passed to the training DataLoader in a later cell
# (DataLoader(train_dataset, ..., sampler=train_sampler)).

Age Bin Frequencies: {'[0, 1)': 1, '[1, 2)': 85, '[2, 3)': 33, '[3, 4)': 32, '[4, 5)': 53, '[5, 6)': 45, '[6, 7)': 39, '[7, 8)': 28, '[8, 9)': 29, '[9, 10)': 19, '[10, 11)': 13, '[11, 12)': 17, '[12, 13)': 13, '[13, 14)': 12, '[14, 15)': 28, '[15, 16)': 35, '[16, 17)': 65, '[17, 18)': 70, '[18, 19)': 68, '[19, 20)': 111, '[20, 21)': 132, '[21, 22)': 126, '[22, 23)': 156, '[23, 24)': 157, '[24, 25)': 181, '[25, 26)': 174, '[26, 27)': 161, '[27, 28)': 139, '[28, 29)': 145, '[29, 30)': 134, '[30, 31)': 126, '[31, 32)': 110, '[32, 33)': 95, '[33, 34)': 97, '[34, 35)': 74, '[35, 36)': 90, '[36, 37)': 75, '[37, 38)': 98, '[38, 39)': 61, '[39, 40)': 58, '[40, 41)': 53, '[41, 42)': 60, '[42, 43)': 46, '[43, 44)': 33, '[44, 45)': 44, '[45, 46)': 36, '[46, 47)': 46, '[47, 48)': 33, '[48, 49)': 33, '[49, 50)': 25, '[50, 51)': 37, '[51, 52)': 39, '[52, 53)': 48, '[53, 54)': 35, '[54, 55)': 30, '[55, 56)': 36, '[56, 57)': 26, '[57, 58)': 27, '[58, 59)': 18, '[59, 60)': 27, '[60, 61)': 20, '[61, 62)

In [4]:
# One LabelEncoder per categorical label column
gender_encoder = LabelEncoder()
ethnicity_encoder = LabelEncoder()
emotion_encoder = LabelEncoder()

# Fit every encoder on the TRAIN split only, then write the integer codes
# into a new column of each split (train, valid, test).
for source_col, encoded_col, label_encoder in (
    ('gender', 'gender_encoded', gender_encoder),
    ('ethnicity', 'ethnicity_encoded', ethnicity_encoder),
    ('emotion', 'emotion_encoded', emotion_encoder),
):
    label_encoder.fit(df_md_train[source_col])
    for split_df in (df_md_train, df_md_valid, df_md_test):
        split_df[encoded_col] = label_encoder.transform(split_df[source_col])

# Show which integer code stands for which class name
print("Gender Labels:", dict(enumerate(gender_encoder.classes_)))
print("Ethnicity Labels:", dict(enumerate(ethnicity_encoder.classes_)))
print("Emotion Labels:", dict(enumerate(emotion_encoder.classes_)))

Gender Labels: {0: 'female', 1: 'male'}
Ethnicity Labels: {0: 'afroamerican', 1: 'asian', 2: 'caucasian'}
Emotion Labels: {0: 'happy', 1: 'neutral', 2: 'other', 3: 'slightlyhappy'}


In [5]:
# Min-max statistics are taken from the training split only, so validation and
# test ages are scaled with the same train-derived range.
age_min = df_md_train['age'].min()
age_max = df_md_train['age'].max()

# Add the scaled age as a new column to every split
for split_df in (df_md_train, df_md_valid, df_md_test):
    split_df['age_normalized'] = (split_df['age'] - age_min) / (age_max - age_min)

In [6]:
# Preview the training metadata, now including the encoded-label and normalized-age columns
df_md_train.head()

Unnamed: 0,imageId,age,gender,ethnicity,emotion,gender_encoded,ethnicity_encoded,emotion_encoded,age_normalized
0,1,5.0,male,caucasian,neutral,1,2,1,0.046207
1,2,20.079365,female,caucasian,neutral,0,2,1,0.217116
2,3,76.815789,female,caucasian,slightlyhappy,0,2,3,0.860162
3,4,55.657895,female,caucasian,happy,0,2,0,0.62036
4,5,17.666667,female,caucasian,slightlyhappy,0,2,3,0.18977


In [7]:
# Preview the validation metadata with the same added columns
df_md_valid.head()

Unnamed: 0,imageId,age,gender,ethnicity,emotion,gender_encoded,ethnicity_encoded,emotion_encoded,age_normalized
0,1,26.230769,male,afroamerican,happy,1,0,0,0.286835
1,2,27.25641,male,caucasian,other,1,2,2,0.29846
2,3,23.142857,male,caucasian,slightlyhappy,1,2,3,0.251837
3,4,73.289474,female,caucasian,happy,0,2,0,0.820195
4,5,20.142857,female,caucasian,happy,0,2,0,0.217835


In [8]:
# Preview the test metadata with the same added columns
df_md_test.head()

Unnamed: 0,imageId,age,gender,ethnicity,emotion,gender_encoded,ethnicity_encoded,emotion_encoded,age_normalized
0,1,23.205128,female,caucasian,neutral,0,2,1,0.252543
1,2,70.736842,male,asian,slightlyhappy,1,1,3,0.791263
2,3,55.368421,male,asian,happy,1,1,0,0.617079
3,4,24.277778,male,caucasian,neutral,1,2,1,0.2647
4,5,25.230769,female,caucasian,neutral,0,2,1,0.275501


In [9]:
class ImageWithMetadataDataset(Dataset):
    """Dataset that pairs each image with its age label.

    Every row of ``df_md`` must provide an 'imageId' and an 'age' column. The
    image file is looked up as ``<images_dir>/<imageId zero-padded to 6 digits>.jpg``.

    Args:
        df_md: Metadata frame, one row per image.
        images_dir: Folder containing the image files (str or Path).
        transform: Optional callable applied to the RGB PIL image.
        age_min, age_max: When both are given, the age is min-max scaled as
            ``(age - age_min) / (age_max - age_min)``; otherwise the raw age is returned.
    """

    def __init__(self, df_md: pd.DataFrame, images_dir: Path, transform=None, age_min=None, age_max=None):
        self.df = df_md
        self.images_dir = Path(images_dir)  # accept plain strings as well
        self.transform = transform
        self.age_min = age_min
        self.age_max = age_max

    def __len__(self):
        """Number of rows (images) in the metadata frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, age)`` for the row at position ``idx``.

        ``age`` is a float32 tensor of shape (1,). Only the 'imageId' and 'age'
        columns are read, so the dataset also works on frames that carry no
        encoded gender/ethnicity/emotion columns.
        """
        row = self.df.iloc[idx]
        img_path = self.images_dir / f"{int(row['imageId']):06d}.jpg"

        # The context manager releases the file handle deterministically;
        # convert() returns a loaded image that stays valid after the file is closed.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform:
            image = self.transform(image)

        # Normalize age if parameters are set
        age = row['age']
        if self.age_min is not None and self.age_max is not None:
            age = (age - self.age_min) / (self.age_max - self.age_min)

        return image, torch.tensor([age], dtype=torch.float32)

In [10]:
# Training pipeline: tensor conversion, resize, random augmentation, normalization.
train_transform: Callable = transforms.Compose([
    transforms.ToTensor(),  # PIL image -> float tensor (C, H, W) scaled to [0, 1]
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),  # [0, 1] -> [-1, 1]
])

# Evaluation pipeline: same preprocessing, but NO random augmentation.
val_transform: Callable = transforms.Compose([
    transforms.ToTensor(),  # PIL image -> float tensor (C, H, W) scaled to [0, 1]
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Datasets: every split is scaled with the train-derived age range
age_scaling = {'age_min': age_min, 'age_max': age_max}
train_dataset = ImageWithMetadataDataset(df_md_train, ds_train, transform=train_transform, **age_scaling)
valid_dataset = ImageWithMetadataDataset(df_md_valid, ds_valid, transform=val_transform, **age_scaling)
test_dataset = ImageWithMetadataDataset(df_md_test, ds_test, transform=val_transform, **age_scaling)

# Batch size used by all loaders (also recorded in the artifacts config)
load_batch_size = 48

# DataLoaders: the training loader draws indices from the weighted sampler, so
# no shuffle flag is passed; validation and test are read in order.
train_loader = DataLoader(train_dataset, batch_size=load_batch_size, sampler=train_sampler)
valid_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=load_batch_size)
    for eval_dataset in (valid_dataset, test_dataset)
)

In [11]:
import json

# Augmentation settings, mirroring the values used in train_transform
augmentation_config = {
    'random_horizontal_flip': 0.5,
    'color_jitter': {
        'brightness': 0.2,
        'contrast': 0.2,
        'saturation': 0.2,
        'hue': 0.1
    }
}

# Preprocessing settings of the training pipeline
train_transform_config = {
    'resize': (224, 224),
    'normalize_mean': [0.5, 0.5, 0.5],
    'normalize_std': [0.5, 0.5, 0.5],
    'augmentation': augmentation_config
}

# Values recorded alongside the run: age scaling range, label class names,
# preprocessing settings and the loader batch size.
artifacts = {
    'run_id': RUN_ID,
    'age_min': float(age_min),  # plain floats so the values are JSON serializable
    'age_max': float(age_max),
    'gender_classes': gender_encoder.classes_.tolist(),
    'ethnicity_classes': ethnicity_encoder.classes_.tolist(),
    'emotion_classes': emotion_encoder.classes_.tolist(),
    'train_transform_config': train_transform_config,
    'load_batch_size': load_batch_size
}

with open(f'artifacts_config_{RUN_ID}.json', 'w') as config_file:
    json.dump(artifacts, config_file, indent=4)

print(f"Saved artifacts_config_{RUN_ID}.json")

Saved artifacts_config_b01f6c.json


In [12]:
# https://huggingface.co/google/vit-base-patch16-224-in21k

class ViTRegressionModel(nn.Module):
    """Pretrained ViT backbone followed by a single linear output unit."""

    def __init__(self, pretrained_model_name='google/vit-base-patch16-224-in21k'):
        super().__init__()
        # Backbone is loaded from the given pretrained checkpoint name
        self.vit = ViTModel.from_pretrained(pretrained_model_name)
        # Regression head: backbone hidden size -> one scalar per sample
        hidden_size = self.vit.config.hidden_size
        self.regressor = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Run the backbone on ``x`` and map its pooled output through the head."""
        backbone_out = self.vit(x)
        return self.regressor(backbone_out.pooler_output)
    
def validate(model, loader, criterion, device):
    """Return the mean of the per-batch losses of ``model`` over ``loader``.

    The model is switched to eval mode and no gradients are tracked. Each batch
    contributes equally to the mean, whatever its size.
    """
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch_images, batch_ages in loader:
            batch_images, batch_ages = batch_images.to(device), batch_ages.to(device)
            batch_loss = criterion(model(batch_images), batch_ages)
            total_loss += batch_loss.item()
    return total_loss / len(loader)

def log_training_history(epoch: int, train_loss: float, val_loss: float, log_file: str = f'training_history_{RUN_ID}.json'):
    """
    Append one (epoch, train_loss, val_loss) record to a JSON history file.

    The file holds three parallel lists under the keys 'epoch', 'train_loss'
    and 'val_loss'. It is created on the first call and rewritten in full on
    every call.
    """
    history_path = Path(log_file)

    # Start from the stored history when there is one, otherwise from empty lists
    if history_path.exists():
        with open(history_path, 'r') as history_file:
            history = json.load(history_file)
    else:
        history = {'epoch': [], 'train_loss': [], 'val_loss': []}

    # Add this epoch's values to the matching lists
    for key, value in (('epoch', epoch), ('train_loss', train_loss), ('val_loss', val_loss)):
        history[key].append(value)

    # Write the complete history back
    with open(history_path, 'w') as history_file:
        json.dump(history, history_file, indent=4)

    print(f"Appended Epoch {epoch} to {log_file}")

In [None]:
# Phase 1: freeze the ViT backbone and train only the regression head, so the
# freshly initialized head is fitted before any pretrained weight is changed
# (potentially stabilizing training and accuracy).
# The next cell then trains the whole model to propagate fine-tuning through all layers.

# Using Mixed Precision for Lower Memory Footprint and higher speed.
from torch.amp import autocast, GradScaler

# Device Setup
device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Mixed precision is only switched on for CUDA; on the CPU fallback the loop
# runs in full precision instead of requesting a CUDA autocast context.
use_amp: bool = device.type == 'cuda'

# Initialize Model and Move to Device
model: ViTRegressionModel = ViTRegressionModel()
model = model.to(device)

# Freeze ViT backbone: Only train the regression head initially
for param in model.vit.parameters():
    param.requires_grad = False

# Loss and Optimizer (only the head's parameters are optimized)
criterion: _Loss = nn.MSELoss()
optimizer: Optimizer = optim.AdamW(model.regressor.parameters(), lr=1e-4)

# Training loop (head-only) - 5 epochs
dtype_to_use = torch.bfloat16  # <- USE bfloat16 instead of float16
best_val_loss = float('inf') # Initialize to infinity
epochs: int = 5
scaler = GradScaler(device.type, enabled=use_amp) # For AMP

for epoch in range(epochs):
    model.train()
    running_loss: float = 0.0

    for images, ages in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        # Move data to device
        images, ages = images.to(device), ages.to(device)

        optimizer.zero_grad()             # Reset gradients for this step
        with autocast(device_type=device.type, dtype=dtype_to_use, enabled=use_amp):
            outputs: torch.Tensor = model(images)          # Forward pass
            loss: torch.Tensor = criterion(outputs, ages)  # Compute loss

        scaler.scale(loss).backward()                      # Backpropagation
        scaler.step(optimizer)                             # Optimizer step
        scaler.update()

        running_loss += loss.item()     # Accumulate loss for reporting

    # Epoch-level training loss (mean over batches)
    avg_loss: float = running_loss / len(train_loader)

    # Validation loss is computed
    val_loss = validate(model, valid_loader, criterion, device)
    print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_loss:.6f}, Val Loss: {val_loss:.6f}")

    # Save checkpoint only if validation loss improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': val_loss,
            # Plain floats (as in the JSON artifacts) instead of numpy scalars,
            # so the checkpoint holds only tensors and built-in Python types.
            'age_min': float(age_min),
            'age_max': float(age_max),
            'gender_encoder_classes': gender_encoder.classes_.tolist(),
            'ethnicity_encoder_classes': ethnicity_encoder.classes_.tolist(),
            'emotion_encoder_classes': emotion_encoder.classes_.tolist()
        }, f'best_vit_regressor_partial_checkpoint_{RUN_ID}.pth')
        print(f"Saved BEST checkpoint at epoch {epoch+1} with val_loss: {val_loss:.6f}")

    # Save train history every epoch
    log_training_history(epoch + 1, avg_loss, val_loss)

Epoch 1/5:  14%|█▍        | 12/85 [00:06<00:34,  2.09it/s]

In [None]:
# Phase 2: unfreeze the backbone and fine-tune the whole model, continuing from
# the weights left in memory by the head-only phase.

# Using Mixed Precision for Lower Memory Footprint and higher speed.
from torch.amp import autocast, GradScaler

# Mixed precision is only switched on for CUDA; on the CPU fallback the loop
# runs in full precision instead of requesting a CUDA autocast context.
use_amp: bool = device.type == 'cuda'

# Unfreeze ViT backbone for fine-tuning
for param in model.vit.parameters():
    param.requires_grad = True

# Re-initialize optimizer to update ALL model parameters now
optimizer: Optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Loss function remains the same
criterion: _Loss = nn.MSELoss()

# Training loop (full fine-tuning)
dtype_to_use = torch.bfloat16  # <- USE bfloat16 instead of float16
epochs: int = 15
best_val_loss = float('inf') # Initialize to infinity
scaler = GradScaler(device.type, enabled=use_amp) # For AMP

for epoch in range(epochs):
    model.train()
    running_loss: float = 0.0

    for images, ages in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        images = images.to(device)
        ages = ages.to(device)

        optimizer.zero_grad()             # Reset gradients for this step
        with autocast(device_type=device.type, dtype=dtype_to_use, enabled=use_amp):
            outputs: torch.Tensor = model(images)          # Forward pass
            loss: torch.Tensor = criterion(outputs, ages)  # Compute loss

        scaler.scale(loss).backward()                      # Backpropagation
        scaler.step(optimizer)                             # Optimizer step
        scaler.update()

        running_loss += loss.item()       # Accumulate batch loss

    # Epoch-level training loss (mean over batches)
    avg_loss: float = running_loss / len(train_loader)

    # Validation loss is computed
    val_loss = validate(model, valid_loader, criterion, device)
    print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_loss:.6f}, Val Loss: {val_loss:.6f}")

    # Save checkpoint only if validation loss improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': val_loss,
            # Plain floats (as in the JSON artifacts) instead of numpy scalars,
            # so the checkpoint holds only tensors and built-in Python types.
            'age_min': float(age_min),
            'age_max': float(age_max),
            'gender_encoder_classes': gender_encoder.classes_.tolist(),
            'ethnicity_encoder_classes': ethnicity_encoder.classes_.tolist(),
            'emotion_encoder_classes': emotion_encoder.classes_.tolist()
        }, f'best_vit_regressor_full_checkpoint_{RUN_ID}.pth')
        print(f"Saved BEST checkpoint at epoch {epoch+1} with val_loss: {val_loss:.6f}")

    # Save train history every epoch.
    # NOTE: epochs are numbered from 1 again here and go to the same default
    # history file as the head-only phase, so that file holds both phases back to back.
    log_training_history(epoch + 1, avg_loss, val_loss)


In [None]:
# Final Test Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error

def evaluate(model, loader, device, age_min, age_max):
    """Return ``(mse, mae)`` of ``model`` on ``loader``, measured on the original age scale.

    Predictions and targets are collected for the whole loader, mapped back from
    the min-max scaled range with ``value * (age_max - age_min) + age_min``, and
    then scored.
    """
    model.eval()
    collected_preds = []
    collected_targets = []
    with torch.no_grad():
        for batch_images, batch_ages in loader:
            batch_outputs = model(batch_images.to(device))
            collected_preds.extend(batch_outputs.cpu().numpy().flatten())
            collected_targets.extend(batch_ages.numpy().flatten())

    # Denormalize predictions and targets back to actual ages
    def to_years(scaled_values):
        return np.array(scaled_values) * (age_max - age_min) + age_min

    preds_years = to_years(collected_preds)
    targets_years = to_years(collected_targets)

    return (
        mean_squared_error(targets_years, preds_years),
        mean_absolute_error(targets_years, preds_years),
    )

# Evaluate the checkpoint that was selected on validation loss, not whatever
# weights the last fine-tuning epoch left in memory. If no such checkpoint
# exists for this run, the in-memory model is evaluated as before.
best_checkpoint_path = Path(f'best_vit_regressor_full_checkpoint_{RUN_ID}.pth')
if best_checkpoint_path.exists():
    # weights_only=False: the file was written by this notebook (trusted) and
    # stores non-tensor metadata next to the state dicts.
    best_checkpoint = torch.load(best_checkpoint_path, map_location=device, weights_only=False)
    model.load_state_dict(best_checkpoint['model_state_dict'])
    print(f"Loaded best checkpoint from epoch {best_checkpoint['epoch']} "
          f"(val_loss: {best_checkpoint['val_loss']:.6f})")

# Run evaluation with denormalization
mse, mae = evaluate(model, test_loader, device, age_min, age_max)
print(f"Test MSE: {mse:.6f}, MAE: {mae:.6f}")