In [1]:
# Dependencies
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import joblib
from PIL import Image
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import cohen_kappa_score, accuracy_score
from sklearn.linear_model import LogisticRegression

# For handling module in diff dir
import sys
import os 

# Config
TEST_DATA_PATH = '../data/test/test.csv'
IMAGE_DIR = '../data/test_images/'
OUTPUT_PATH = '../submission_v1.csv'

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_DIR = '../data'
MODELS_DIR = '../models/v1_stratify'
IMG_DIR = os.path.join(DATA_DIR, 'train_images') 

sys.path.append(os.path.abspath('..'))

In [2]:
# MODEL DEFINITIONS

# 1. Text Model (Code from k4)
class TransformerPetClassifier(nn.Module):
    """Transformer-based classifier for pet adoption speed prediction"""
    
    def __init__(self, model_name='bert-base-uncased', num_classes=5, dropout=0.3):
        super(TransformerPetClassifier, self).__init__()
        
        self.transformer = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        
        # Get hidden size from transformer config
        hidden_size = self.transformer.config.hidden_size
        
        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes)
        )
        
    def forward(self, input_ids, attention_mask):
        # Get transformer outputs
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        # Use [CLS] token representation
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        
        # Classification
        logits = self.classifier(pooled_output)
        
        return logits

# 2. Image Model
class ResNet(nn.Module):
    def __init__(self):
        super(ResNet, self).__init__()
        self.resnet = models.resnet50(pretrained=True)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 5)
    def forward(self, x): return self.resnet(x)

# 3. Tabular Model
# no need to define xgboost

In [3]:
# DATASET CLASS
class EnsembleDataset(Dataset):
    def __init__(self, df, img_dir, tokenizer, transform=None):
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        # 1. Image Processing
        img_path = os.path.join(self.img_dir, f"{row['PetID']}-1.jpg") 
        image = Image.new('RGB', (224, 224), (0, 0, 0)) 
        if os.path.exists(img_path):
            try:
                image = Image.open(img_path).convert('RGB')
            except:
                pass 
        if self.transform: image = self.transform(image)

        # 2. Text Processing
        desc = str(row['Description']) if pd.notna(row['Description']) else "no description"
        encoding = self.tokenizer(
            desc, max_length=64, padding='max_length', truncation=True, return_tensors='pt'
        )

        return {
            'image': image,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }


In [4]:
def featurize_table(tabular_df):
    # Namelength
    tabular_df["name_length"] = tabular_df['Name'].str.len().fillna(0)
    
    # Description length
    tabular_df['description_length'] = tabular_df['Description'].str.len().fillna(0)
    
    
    # Is Mixed Breed? (Breed2 is not 0)
    tabular_df['is_mixed_breed'] = (tabular_df['Breed2'] != 0).astype(int)
    
    # Number of Colors (Count non-zero color columns)
    tabular_df['num_colors'] = (tabular_df[['Color1', 'Color2', 'Color3']] != 0).sum(axis=1)
    
    # Is Free? (Fee is 0)
    tabular_df['is_free'] = (tabular_df['Fee'] == 0).astype(int)

    # Fee per Pet (Normizalized for litters)
    tabular_df['fee_per_pet'] = tabular_df['Fee'] / tabular_df['Quantity'].replace(0, 1)

    # Total Media (Engagement proxy)
    tabular_df['total_media'] = tabular_df['PhotoAmt'] + tabular_df['VideoAmt']

    # Health Issue Flag (Health > 1 implies injury or condition)
    tabular_df['has_health_issue'] = (tabular_df['Health'] > 1).astype(int)
    # --------------------
    
    # Encode state/breed as categories
    # ADDED 'Type' to this list
    cat_cols = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 
                    'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 
                    'Sterilized', 'Health', 'State']
    for col in cat_cols:
        if col in tabular_df.columns:
            tabular_df[col] = tabular_df[col].astype('category')
    return tabular_df


In [5]:
def generate_ensemble_features(df, img_dir=IMG_DIR):
    print(f"Generating features for {len(df)} samples...")
    
    # 1. XGBoost Inference
    print("Loading XGBoost...")
    xgb_model = joblib.load(os.path.join(MODELS_DIR, 'xgb_stratify_optuna.pkl'))
    
    # Preprocess Tabular (Manual replication of src/tabular_model.py logic if not exposed as static method)
    # Ideally: from src.tabular_model import TabularModel; TabularModel.preprocess(df)
    
    df_tab = featurize_table(df)
    
    drop_cols = ['Name', 'PetID', 'RescuerID', 'Description', 'AdoptionSpeed']
    df_tab = df_tab.drop([c for c in drop_cols if c in df_tab.columns], axis=1)
    
    
    xgb_probs = xgb_model.predict_proba(df_tab) # Shape (N, 5)

    # 2. Load PyTorch Models
    print("Loading DL Models...")
    # Image
    img_model = ResNet().to(DEVICE)
    img_state = torch.load(os.path.join(MODELS_DIR, 'pet_pred_resnet50.pth'), map_location=DEVICE)
    if 'state_dict' in img_state: img_state = img_state['state_dict']
    # FIX: Add 'resnet.' prefix to match the class definition
    new_state_dict = {}
    for k, v in img_state.items():
        if not k.startswith('resnet.'):
            new_state_dict['resnet.' + k] = v
        else:
            new_state_dict[k] = v
    
    img_model.load_state_dict(new_state_dict)
    img_model.eval()

    # Text
    text_model = TransformerPetClassifier(num_classes=5).to(DEVICE)
    txt_state = torch.load(os.path.join(MODELS_DIR, 'best_transformer_model.pth'), map_location=DEVICE)
    if 'state_dict' in txt_state: txt_state = txt_state['state_dict']
    text_model.load_state_dict(txt_state, strict=False) 
    text_model.eval()

    # 3. Inference Loop
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    
    dl = DataLoader(EnsembleDataset(df, img_dir, tokenizer, transform), batch_size=32, shuffle=False)

    img_preds, text_probs = [], []
    with torch.no_grad():
        for batch in tqdm(dl):
            imgs = batch['image'].to(DEVICE)
            input_ids, masks = batch['input_ids'].to(DEVICE), batch['attention_mask'].to(DEVICE)
            
            img_out = img_model(imgs)
            # FIX: Use softmax and extend with correct shape (N, 5) instead of flattening
            img_preds.extend(torch.softmax(img_out, dim=1).cpu().numpy())
            
            text_out = text_model(input_ids, masks)
            text_probs.extend(torch.softmax(text_out, dim=1).cpu().numpy())

    # 4. Concatenate Features: XGB(5) + Text(5) + Image(5) = 15 Features
    # FIX: Removed reshape(-1, 1) and flatten() to align dimensions
    return np.hstack([xgb_probs, np.array(text_probs), np.array(img_preds)])


# Main Execution

In [None]:
full_df = pd.read_csv(os.path.join(DATA_DIR, 'train/train.csv'))

# Create Splits
train_df, test_df, y_train, y_test = train_test_split(
    full_df, full_df['AdoptionSpeed'], test_size=0.2, random_state=42, stratify=full_df['AdoptionSpeed']  # Add stratify? , stratify=full_df['AdoptionSpeed']
)
# TODO: create a mapping for count encoding rescuerID -> for inference use the rescuer_counts to map 
rescuer_counts = train_df["RescuerID"].value_counts()
#rescuer_counts.to_csv('rescuer_counts.csv')
train_df['rescuer_count'] = train_df['RescuerID'].map(rescuer_counts)

test_df['rescuer_count'] = test_df['RescuerID'].map(rescuer_counts).fillna(0)

train_df.drop('RescuerID', axis=1, inplace=True)
test_df.drop('RescuerID', axis=1, inplace=True)
# Generate Features (The "Level 1" Predictions)
print("Generating Meta-Features for Train...")
X_train_meta = generate_ensemble_features(train_df) 

print("Generating Meta-Features for Test...")
X_test_meta = generate_ensemble_features(test_df)

# - Use a Neural Network (Dense Layer) to combine & decide -
class MetaModelMLP(nn.Module):
    def __init__(self, input_dim=15, num_classes=5):
        super(MetaModelMLP, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, num_classes)
        )
        
    def forward(self, x):
        return self.network(x)

# Prepare Data for PyTorch
X_train_tensor = torch.FloatTensor(X_train_meta).to(DEVICE)
y_train_tensor = torch.LongTensor(y_train.values).to(DEVICE)
X_test_tensor = torch.FloatTensor(X_test_meta).to(DEVICE)

# Training Loop
meta_model = MetaModelMLP().to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(meta_model.parameters(), lr=0.001)

print("Training MLP Meta-Learner...")
for epoch in range(100): # 100 epochs
    optimizer.zero_grad()
    outputs = meta_model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0: print(f"Epoch {epoch} Loss: {loss.item():.4f}", end='\r')

# Evaluate
meta_model.eval()
with torch.no_grad():
    outputs = meta_model(X_test_tensor)
    test_preds = torch.argmax(outputs, dim=1).cpu().numpy()

print(f"\nMeta-Model (MLP) Test Kappa: {cohen_kappa_score(y_test, test_preds, weights='quadratic'):.4f}")


Generating Meta-Features for Train...
Generating features for 11994 samples...
Loading XGBoost...
Loading DL Models...




Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
100%|██████████| 375/375 [07:52<00:00,  1.26s/it]


Generating Meta-Features for Test...
Generating features for 2999 samples...
Loading XGBoost...
Loading DL Models...




Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
100%|██████████| 94/94 [01:50<00:00,  1.18s/it]
  y_train_tensor = torch.LongTensor(y_train.values).to(DEVICE)


Training MLP Meta-Learner...
Epoch 90 Loss: 1.1024
Meta-Model (MLP) Test Kappa: 0.4163


In [18]:
rescuer_counts

Unnamed: 0,RescuerID,count
0,fa90fa5b1ee11c86938398b60abc32cb,377
1,aa66486163b6cbc25ea62a34b11c9b91,262
2,b53c34474d9e24574bcec6a3d3306a0d,180
3,c00756f2bdd8fa88fc9f07a8309f7d5d,179
4,ee2747ce26468ec44c7194e7d1d9dad9,123
...,...,...
4784,e2af7d5c733a20fd2b1a273283986974,1
4785,089e417709c6a37839fc155af6b63196,1
4786,bc599c86ccd17d15a1c758b12d7e851b,1
4787,48d06353f65ac65dd35a8875b70962c5,1


In [23]:
# Evaluate prediction on test.csv!
# Load test data
inference_df = pd.read_csv(TEST_DATA_PATH)

# Generate meta-features for test data
print("Generating Meta-Features for Test Data...")
rescuer_counts = pd.read_csv("../data/experimental/rescuer_counts_mapping.csv")
inference_df['rescuer_count'] = inference_df['RescuerID'].map(rescuer_counts["RescuerID"]).fillna(0)
inference_df.drop(['RescuerID'],axis=1,  inplace=True)

X_test_final_meta = generate_ensemble_features(inference_df, img_dir=IMAGE_DIR)

# Convert to tensor
X_test_final_tensor = torch.FloatTensor(X_test_final_meta).to(DEVICE)

# Make predictions using trained meta model
meta_model.eval()
with torch.no_grad():
    outputs = meta_model(X_test_final_tensor)
    final_test_preds = torch.argmax(outputs, dim=1).cpu().numpy()

# Create submission dataframe
submission_df = pd.DataFrame({
    'PetID': inference_df['PetID'],
    'AdoptionSpeed': final_test_preds
})

# Save to CSV
submission_df.to_csv("../submission_v1.csv", index=False)
print(f"Submission saved to {"../submission_v1.csv"}")

Generating Meta-Features for Test Data...
Generating features for 3972 samples...
Loading XGBoost...
Loading DL Models...




Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
100%|██████████| 125/125 [02:25<00:00,  1.16s/it]


Submission saved to ../submission_v1.csv


In [24]:
torch.save({
    'model_state_dict': model.state_dict(),
    'model_name': MODEL_NAME,
    'num_classes': NUM_CLASSES,
    'max_length': MAX_LENGTH,
    'best_kappa': best_kappa
}, 'metamodel.pth')

NameError: name 'model' is not defined