In [1]:
# Dependencies
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import joblib
from PIL import Image
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import cohen_kappa_score, accuracy_score
from sklearn.linear_model import LogisticRegression

# For handling module in diff dir
import sys
import os 

# Config
TEST_DATA_PATH = '../data/test/test.csv'
IMAGE_DIR = '../data/test_images/'
OUTPUT_PATH = '../submission_v1.csv'

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_DIR = '../data'
MODELS_DIR = '../models/v1_stratify'
IMG_DIR = os.path.join(DATA_DIR, 'train_images') 

sys.path.append(os.path.abspath('..'))

In [2]:
# MODEL DEFINITIONS

# 1. Text Model (Code from k4)
class TransformerPetClassifier(nn.Module):
    """Transformer-based classifier for pet adoption speed prediction"""
    
    def __init__(self, model_name='bert-base-uncased', num_classes=5, dropout=0.3):
        super(TransformerPetClassifier, self).__init__()
        
        self.transformer = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        
        # Get hidden size from transformer config
        hidden_size = self.transformer.config.hidden_size
        
        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes)
        )
        
    def forward(self, input_ids, attention_mask, return_features=False):
        # Get transformer outputs
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        # Use [CLS] token representation
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        
        if return_features:
            # Pass through the first part of classification head (up to 128 dim)
            x = self.classifier[0](pooled_output) # Linear 256
            x = self.classifier[1](x) # ReLU
            x = self.classifier[2](x) # Dropout
            x = self.classifier[3](x) # Linear 128
            return x # Return the 128-dim embedding
        
        return self.classifier(pooled_output)

# 2. Image Model
class ResNet(nn.Module):
    def __init__(self):
        super(ResNet, self).__init__()
        self.resnet = models.resnet50(pretrained=True)
        self.fc = nn.Linear(self.resnet.fc.in_features, 5) # Save the Fully connected layer seperatedly for full prediction
        self.resnet.fc = nn.Identity() # Remove the original head
        
    def forward(self, x, return_features=False): 
        features = self.resnet(x) # 2048-dim embedding
        if return_features:
            return features
        return self.fc(features)

# 3. Tabular Model
# no need to define xgboost

In [3]:
# DATASET CLASS
class EnsembleDataset(Dataset):
    def __init__(self, df, img_dir, tokenizer, transform=None):
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        # 1. Image Processing
        img_path = os.path.join(self.img_dir, f"{row['PetID']}-1.jpg") 
        image = Image.new('RGB', (224, 224), (0, 0, 0)) 
        if os.path.exists(img_path):
            try:
                image = Image.open(img_path).convert('RGB')
            except:
                pass 
        if self.transform: image = self.transform(image)

        # 2. Text Processing
        desc = str(row['Description']) if pd.notna(row['Description']) else "no description"
        encoding = self.tokenizer(
            desc, max_length=64, padding='max_length', truncation=True, return_tensors='pt'
        )

        return {
            'image': image,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }


In [4]:
def featurize_table(tabular_df):
    # Namelength
    tabular_df["name_length"] = tabular_df['Name'].str.len().fillna(0)
    
    # Description length
    tabular_df['description_length'] = tabular_df['Description'].str.len().fillna(0)
    
    
    # Is Mixed Breed? (Breed2 is not 0)
    tabular_df['is_mixed_breed'] = (tabular_df['Breed2'] != 0).astype(int)
    
    # Number of Colors (Count non-zero color columns)
    tabular_df['num_colors'] = (tabular_df[['Color1', 'Color2', 'Color3']] != 0).sum(axis=1)
    
    # Is Free? (Fee is 0)
    tabular_df['is_free'] = (tabular_df['Fee'] == 0).astype(int)

    # Fee per Pet (Normizalized for litters)
    tabular_df['fee_per_pet'] = tabular_df['Fee'] / tabular_df['Quantity'].replace(0, 1)

    # Total Media (Engagement proxy)
    tabular_df['total_media'] = tabular_df['PhotoAmt'] + tabular_df['VideoAmt']

    # Health Issue Flag (Health > 1 implies injury or condition)
    tabular_df['has_health_issue'] = (tabular_df['Health'] > 1).astype(int)
    # --------------------
    
    # Encode state/breed as categories
    # ADDED 'Type' to this list
    cat_cols = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 
                    'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 
                    'Sterilized', 'Health', 'State']
    for col in cat_cols:
        if col in tabular_df.columns:
            tabular_df[col] = tabular_df[col].astype('category')
    return tabular_df


In [5]:
def generate_ensemble_features(df, img_dir=IMG_DIR):
    print(f"Generating features for {len(df)} samples...")
    
    # 1. XGBoost Inference
    print("Loading XGBoost...")
    xgb_model = joblib.load(os.path.join(MODELS_DIR, 'xgb_stratify_optuna.pkl'))
    
    # Preprocess Tabular (Manual replication of src/tabular_model.py logic if not exposed as static method)
    # Ideally: from src.tabular_model import TabularModel; TabularModel.preprocess(df)
    
    df_tab = featurize_table(df)
    
    drop_cols = ['Name', 'PetID', 'RescuerID', 'Description', 'AdoptionSpeed']
    df_tab = df_tab.drop([c for c in drop_cols if c in df_tab.columns], axis=1)
    
    
    xgb_probs = xgb_model.predict_proba(df_tab) # Shape (N, 5)

    # 2. Load PyTorch Models
    print("Loading DL Models...")
    # Image
    img_model = ResNet().to(DEVICE)
    img_state = torch.load(os.path.join(MODELS_DIR, 'pet_pred_resnet50.pth'), map_location=DEVICE)
    if 'state_dict' in img_state: img_state = img_state['state_dict']
    # FIX: Add 'resnet.' prefix to match the class definition
    new_state_dict = {}
    for k, v in img_state.items():
        
        # Case 1: the key is for FC layer
        if "resnet.fc." in k:
            new_key = k.replace("resnet.fc.", "fc.")
            new_state_dict[new_key] = v
            continue
        # Case 2: the key is for the backbone
        if not k.startswith('resnet.') and 'fc.' not in k:
            new_state_dict['resnet.' + k] = v
        else:
            # It already matches
            new_state_dict[k] = v
    
    img_model.load_state_dict(new_state_dict)
    img_model.eval()

    # Text
    text_model = TransformerPetClassifier(num_classes=5).to(DEVICE)
    txt_state = torch.load(os.path.join(MODELS_DIR, 'best_transformer_model.pth'), map_location=DEVICE)
    if 'state_dict' in txt_state: txt_state = txt_state['state_dict']
    text_model.load_state_dict(txt_state, strict=False) 
    text_model.eval()

    # 3. Inference Loop
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    
    dl = DataLoader(EnsembleDataset(df, img_dir, tokenizer, transform), batch_size=32, shuffle=False)

    img_features, text_features = [], []
    with torch.no_grad():
        for batch in tqdm(dl):
            imgs = batch['image'].to(DEVICE)
            input_ids, masks = batch['input_ids'].to(DEVICE), batch['attention_mask'].to(DEVICE)
            
            # Changed extrect emb instead of probs
            # Image: Returns (Batch, 2048)
            img_emb = img_model(imgs, return_features=True) 
            # Flatten 4D tensor (N, 2048, 1, 1) -> (N, 2048)
            img_emb = img_emb.view(img_emb.size(0), -1)
            img_features.extend(img_emb.cpu().numpy())
            
            # Text: Returns (Batch, 128)
            text_emb = text_model(input_ids, masks, return_features=True)
            text_features.extend(text_emb.cpu().numpy())

    # 4. Concatenate Features: XGB(5) + Text(128) + Image(2048)
    # FIX: Removed reshape(-1, 1) and flatten() to align dimensions
    return np.hstack([xgb_probs, np.array(text_features), np.array(img_features)])


In [None]:
# Regressor instead?
class IntermediateFusionMetaModel(nn.Module):
    def __init__(self, xgb_dim=5, text_dim=128, img_dim=2048):
        super(IntermediateFusionMetaModel, self).__init__()
        
        # IDK: Normalize inputs for scaling
        self.norm_xgb = nn.BatchNorm1d(xgb_dim)
        self.norm_text = nn.BatchNorm1d(text_dim)
        self.norm_img = nn.BatchNorm1d(img_dim)
        
        # Projectors to reduce dimensionality before fusion
        self.img_projector = nn.Sequential(
            nn.Linear(img_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        self.text_projector = nn.Sequential(
            nn.Linear(text_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU()
        )
        
        # Fusion Layer
        self.fusion = nn.Sequential(
            nn.Linear(256 + 64 + xgb_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64,1) #1 for single continuous value (reg)
            
        )
        
    def forward(self, x):
        # Slice the input back into components
        # x is [xgb(5), text(128), img(2048)]
        xgb_start, xgb_end = 0, 5
        text_start, text_end = 5, 5+128
        img_start = 5+128
        
        xgb_data = x[:, xgb_start:xgb_end]
        text_data = x[:, text_start:text_end]
        img_data = x[:, img_start:]
        
        # IDK Apply Normalization
        xgb_data = self.norm_xgb(xgb_data)
        text_data = self.norm_text(text_data)
        img_data = self.norm_img(img_data)
        
        # Project
        img_emb = self.img_projector(img_data)
        text_emb = self.text_projector(text_data)
        
        # Concatenate and Classify
        combined = torch.cat([xgb_data, text_emb, img_emb], dim=1)
        return self.fusion(combined)

# Main Execution

In [None]:
full_df = pd.read_csv(os.path.join(DATA_DIR, 'train/train.csv'))

# Create Splits
train_df, test_df, y_train, y_test = train_test_split(
    full_df, full_df['AdoptionSpeed'], test_size=0.2, random_state=42, stratify=full_df['AdoptionSpeed']  # Add stratify? , stratify=full_df['AdoptionSpeed']
)
# TODO: create a mapping for count encoding rescuerID -> for inference use the rescuer_counts to map 
rescuer_mapping = pd.read_csv("../data/experimental/rescuer_counts_mapping.csv")
rescuer_counts = rescuer_mapping.set_index('RescuerID')['count']

train_df['rescuer_count'] = train_df['RescuerID'].map(rescuer_counts).fillna(1)
test_df['rescuer_count'] = test_df['RescuerID'].map(rescuer_counts).fillna(1)

train_df.drop('RescuerID', axis=1, inplace=True)
test_df.drop('RescuerID', axis=1, inplace=True)

# Generate Features (The "Level 1" Predictions)
print("Generating Meta-Features for Train...")
X_train_meta = generate_ensemble_features(train_df) 

print("Generating Meta-Features for Test...")
X_test_meta = generate_ensemble_features(test_df)


# Prepare Data for PyTorch
X_train_tensor = torch.FloatTensor(X_train_meta).to(DEVICE)
y_train_tensor = torch.LongTensor(y_train.values).to(DEVICE)
X_test_tensor = torch.FloatTensor(X_test_meta).to(DEVICE)

y_train_float = y_train_tensor.float().unsqueeze(1) 
# Training Loop
meta_model = IntermediateFusionMetaModel().to(DEVICE)
criterion = nn.SmoothL1Loss()
optimizer = torch.optim.Adam(meta_model.parameters(), lr=0.001)

print("Training MLP Meta-Learner...")
for epoch in range(150): # epochs
    optimizer.zero_grad()
    outputs = meta_model(X_train_tensor)
    loss = criterion(outputs, y_train_float)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0: print(f"Epoch {epoch} Loss: {loss.item():.4f}", end='\r')


# Evaluate w/rounding
meta_model.eval()
with torch.no_grad():
    outputs = meta_model(X_test_tensor)
    #test_preds = torch.argmax(outputs, dim=1).cpu().numpy()
    raw_preds = outputs.cpu().numpy().flatten()
    test_preds = np.clip(np.round(raw_preds), 0, 4).astype(int)

print(f"\nIntermediate Fusion Test Kappa: {cohen_kappa_score(y_test, test_preds, weights='quadratic'):.4f}")

Generating Meta-Features for Train...
Generating features for 11994 samples...
Loading XGBoost...
Loading DL Models...




Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
100%|██████████| 375/375 [07:25<00:00,  1.19s/it]


Generating Meta-Features for Test...
Generating features for 2999 samples...
Loading XGBoost...
Loading DL Models...




Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
100%|██████████| 94/94 [01:52<00:00,  1.19s/it]
  y_train_tensor = torch.LongTensor(y_train.values).to(DEVICE)


Training MLP Meta-Learner...
Epoch 140 Loss: 0.4551

NameError: name 'raw_preds' is not defined

In [18]:
from scipy.optimize import minimize
from functools import partial

class OptimizedRounder:
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]: X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]: X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]: X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]: X_p[i] = 3
            else: X_p[i] = 4
        
        ll = cohen_kappa_score(y, X_p, weights='quadratic')
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]: X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]: X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]: X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]: X_p[i] = 3
            else: X_p[i] = 4
        return X_p


# 1. Get raw float predictions from training set to learn thresholds
meta_model.eval()
with torch.no_grad():
    train_raw_preds = meta_model(X_train_tensor).cpu().numpy().flatten()
    test_raw_preds = meta_model(X_test_tensor).cpu().numpy().flatten()

# 2. Fit Rounder
optR = OptimizedRounder()
optR.fit(train_raw_preds, y_train)

# 3. Predict with dynamic thresholds
test_preds_optimized = optR.predict(test_raw_preds, optR.coef_['x']).astype(int)
print ("Rounding with own Function")
print(f"Standard Rounding Kappa: {cohen_kappa_score(y_test, np.clip(np.round(test_raw_preds), 0, 4), weights='quadratic'):.4f}")
print(f"Optimized Rounding Kappa: {cohen_kappa_score(y_test, test_preds_optimized, weights='quadratic'):.4f}")
print(f"Learned Thresholds: {optR.coef_['x']}")

Rounding with own Function
Standard Rounding Kappa: 0.3314
Optimized Rounding Kappa: 0.4205
Learned Thresholds: [0.55799874 1.72213393 2.2562546  2.83002484]


In [None]:
from oprounder import OptimizedRounder
import numpy as np
from sklearn.metrics import cohen_kappa_score

meta_model.eval()
with torch.no_grad():
    train_raw_preds = meta_model(X_train_tensor).cpu().numpy().flatten()
    test_raw_preds = meta_model(X_test_tensor).cpu().numpy().flatten()

# Fit the Optimized Rounder on Training Data
rounder = OptimizedRounder(n_classes=y_train.nunique(), n_trials=100)
rounder.fit(train_raw_preds, y_train) #TODO: Change this appro.

print ("Rounding with oprounder library")
# View the learned thresholds
print(f'Optimal thresholds: {rounder.thresholds}')

# Predict on Test Data using the new thresholds
prediction_reg_optimized = rounder.predict(test_raw_preds) # use the new threshold to pick label

# Compare how the new threshold improve kappa
kappa = cohen_kappa_score(y_test, prediction_reg_optimized, weights='quadratic')
print(f'Optimal Rounding QWK: {kappa:.4f}')

kappa = cohen_kappa_score(y_test, np.clip(np.round(test_raw_preds), 0, 4), weights='quadratic')
print(f'Standard Rounding QWK: {kappa:.4f}')


Rounding with oprounder library
Optimal thresholds: [1.0198270690051536, 1.8038770526156236, 2.190740728654829, 2.70849811533068]
Optimal Rounding QWK: 0.4145
Standard Rounding QWK: 0.3314


In [11]:
# Evaluate prediction on test.csv!
# Load test data
inference_df = pd.read_csv(TEST_DATA_PATH)

# Generate meta-features for test data
print("Generating Meta-Features for Test Data...")
rescuer_counts = pd.read_csv("../data/experimental/rescuer_counts_mapping.csv")
inference_df['rescuer_count'] = inference_df['RescuerID'].map(rescuer_counts["RescuerID"]).fillna(0)
inference_df.drop(['RescuerID'],axis=1,  inplace=True)

X_test_final_meta = generate_ensemble_features(inference_df, img_dir=IMAGE_DIR)

# Convert to tensor
X_test_final_tensor = torch.FloatTensor(X_test_final_meta).to(DEVICE)

# Make predictions using trained meta model
meta_model.eval()
with torch.no_grad():
    outputs = meta_model(X_test_final_tensor)
    final_test_preds = torch.argmax(outputs, dim=1).cpu().numpy()

# Create submission dataframe
submission_df = pd.DataFrame({
    'PetID': inference_df['PetID'],
    'AdoptionSpeed': final_test_preds
})

# Save to CSV
submission_df.to_csv("../submission_v1_intermediate.csv", index=False)
print(f"Submission saved to {"../submission_v1_intermediate.csv"}")

Generating Meta-Features for Test Data...
Generating features for 3972 samples...
Loading XGBoost...
Loading DL Models...




Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
100%|██████████| 125/125 [02:26<00:00,  1.17s/it]


Submission saved to ../submission_v1_intermediate.csv
