In [1]:
# Dependencies
import os
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from PIL import Image
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import cohen_kappa_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.class_weight import compute_sample_weight

# For handling module in diff dir
import sys
import os 

# Config
TEST_DATA_PATH = '/kaggle/input/petfinder-adoption-prediction/test/test.csv'
IMAGE_DIR = '/kaggle/input/petfinder-adoption-prediction/test_images'
OUTPUT_PATH = '../submission.csv'
MODELS_DIR = "/kaggle/input/datasets/thanaphonnaksri/testing/models/v1_stratify"
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DATA_DIR = '/kaggle/input/petfinder-adoption-prediction'
IMG_DIR = os.path.join(DATA_DIR, 'train_images') 

sys.path.append(os.path.abspath('..'))

In [2]:
# Advanced Features
def extract_sentiment_from_json(pet_id, sentiment_dir="/kaggle/input/petfinder-adoption-prediction/train_sentiment"):
    # This assumes the sentiment files follow the pattern {PetID}.json
    filename = f"{sentiment_dir}/{pet_id}.json"
    try:
        if os.path.exists(filename):
            with open(filename, 'r') as f:
                data = json.load(f)
            # Usually 'documentSentiment' holds the overall score
            if 'documentSentiment' in data:
                return data['documentSentiment']['score'], data['documentSentiment']['magnitude']
    except:
        pass
    return 0, 0 # Default if missing


def generate_text_features(df, svd_components=20, is_train=True, fit_on_text=None):
    """
    df: The dataframe (containing 'Description' and 'PetID')
    svd_components: Number of latent features to keep
    is_train: Boolean, used to decide whether to fit or transform
    fit_on_text: If is_train=False, pass the vectorizers here (tuple: tfidf, svd)
    """
    df_text = df.copy()
    
    # 1. TF-IDF + SVD (Latent Semantic Analysis)
    print("Generating TF-IDF SVD features...")
    descriptions = df_text['Description'].fillna("none").astype(str)
    
    if is_train:
        # Fit on TRAINING descriptions
        tfidf = TfidfVectorizer(min_df=3,  max_features=1000, 
                                strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
                                stop_words = 'english')
        
        svd = TruncatedSVD(n_components=svd_components, random_state=42)
        
        # Fit Transform
        tf_vecs = tfidf.fit_transform(descriptions)
        svd_vecs = svd.fit_transform(tf_vecs)
        
        # Save vectorizers for inference later
        vectorizers = (tfidf, svd)
    else:
        # Load from passed tuple
        tfidf, svd = fit_on_text
        tf_vecs = tfidf.transform(descriptions)
        svd_vecs = svd.transform(tf_vecs)
        vectorizers = fit_on_text

    # Create Columns
    svd_df = pd.DataFrame(svd_vecs, columns=[f'svd_desc_{i}' for i in range(svd_components)])
    svd_df.index = df_text.index
    # We reset index to make sure concat aligns correctly row-by-row
    df_text = pd.concat([df_text, svd_df], axis=1)

    # 2. Sentiment Analysis (File-based lookup)
    # Determine directory
    sent_dir = "/kaggle/input/petfinder-adoption-prediction/train_sentiment" if is_train else "/kaggle/input/petfinder-adoption-prediction/test_sentiment"
    
    print("Extracting Sentiment...")
    # Apply row-wise (can be slow, maybe parallelize with pandarallel if needed)
    sent_data = df_text['PetID'].apply(lambda x: extract_sentiment_from_json(x, sent_dir))
    
    df_text['sentiment_score'] = [x[0] for x in sent_data]
    df_text['sentiment_magnitude'] = [x[1] for x in sent_data]
    df_text['sentiment_polarity'] = df_text['sentiment_score'] * df_text['sentiment_magnitude']

    return df_text, vectorizers

# TODO: forward selection: use only features that improves kappa
def featurize_table(data_df):
    tabular_df = data_df.copy()
    # Namelength
    tabular_df["name_length"] = tabular_df['Name'].str.len().fillna(0)
    
    # Description length
    tabular_df['description_length'] = tabular_df['Description'].str.len().fillna(0)
    
    # Is Mixed Breed? (Breed2 is not 0)
    tabular_df['is_mixed_breed'] = (tabular_df['Breed2'] != 0).astype(int)
    
    
    
    # 1. Text
    tabular_df['word_count'] = tabular_df['Description'].str.split().str.len().fillna(0)
    tabular_df['char_count'] = tabular_df['Description'].str.len().fillna(0)
    tabular_df['avg_word_len'] = tabular_df['char_count'] / (tabular_df['word_count'] + 1)
    tabular_df['num_digits'] = tabular_df['Description'].apply(lambda x: sum(c.isdigit() for c in str(x)))
    tabular_df['all_caps_ratio'] = tabular_df['Description'].apply(lambda x: sum(1 for c in str(x) if c.isupper()) / max(1, len(str(x))))

    # 2. Measures
    tabular_df['fee_per_pet'] = tabular_df['Fee'] / tabular_df['Quantity'].replace(0,1)
    tabular_df['photo_per_pet'] = tabular_df['PhotoAmt'] / tabular_df['Quantity']
    tabular_df['age_per_size'] = tabular_df['Age'] / tabular_df['MaturitySize'] # Needs careful handling of 0s
    tabular_df['total_media'] = tabular_df['PhotoAmt'] + tabular_df['VideoAmt'] # Total Media (Engagement proxy)
    tabular_df['num_colors'] = (tabular_df[['Color1', 'Color2', 'Color3']] != 0).sum(axis=1) # Number of Colors (Count non-zero color columns)
    
    
    # 3. Simple Interactions
    tabular_df['is_mixed_breed'] = (tabular_df['Breed2'] != 0) & (tabular_df['Breed2'].notnull())
    tabular_df['is_specific_color'] = (tabular_df['Color2'] != 0) # Has more than 1 color    
    tabular_df['is_free'] = (tabular_df['Fee'] == 0).astype(int)    # Is Free? (Fee is 0)
    tabular_df['has_health_issue'] = (tabular_df['Health'] > 1).astype(int)   # Health Issue Flag (Health > 1 implies injury or condition)
    
    # log transform for shit and giggles
    tabular_df['Fee'] = np.log1p(tabular_df['Fee'])
    tabular_df['PhotoAmt'] = np.log1p(tabular_df['PhotoAmt'])
    
    # Check whether this
     
    # Drop useless features -> does this actually works?
    features_to_drop = [""]
    
    # Encode categories
    cat_cols = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 
                    'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 
                    'Sterilized', 'Health', 'State']
    """
    for col in cat_cols:
        if col in tabular_df.columns:
            tabular_df[col] = tabular_df[col].astype('category')
    """    

    tabular_df.drop(['Name', 'PetID', 'Description'], axis=1, inplace=True)
    return tabular_df


In [3]:
# MODEL DEFINITIONS

# 1. Text Model (Code from k4)
class TransformerPetClassifier(nn.Module):
    """Transformer-based classifier for pet adoption speed prediction"""
    # Fix this! make it local
    def __init__(self, model_name='bert-base-uncased', num_classes=5, dropout=0.3):
        super(TransformerPetClassifier, self).__init__()
        
        try:
            self.transformer = AutoModel.from_pretrained("/kaggle/input/datasets/thanaphonnaksri/testing/models/bert_base")
        except OSError:
            # Fallback: If only config exists (lighter upload), load config -> init model
            from transformers import AutoConfig
            config = AutoConfig.from_pretrained("/kaggle/input/datasets/thanaphonnaksri/testing/models/bert_config")
            self.transformer = AutoModel.from_config(config)
            
        self.dropout = nn.Dropout(dropout)
        
        # Get hidden size from transformer config
        hidden_size = self.transformer.config.hidden_size
        
        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, num_classes)
        )
        
    def forward(self, input_ids, attention_mask, return_features=False):
        # Get transformer outputs
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        # Use [CLS] token representation
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        
        if return_features:
            # Pass through the first part of classification head (up to 128 dim)
            x = self.classifier[0](pooled_output) # Linear 256
            x = self.classifier[1](x) # ReLU
            x = self.classifier[2](x) # Dropout
            x = self.classifier[3](x) # Linear 128
            return x # Return the 128-dim embedding
        
        return self.classifier(pooled_output)

# 2. Image Model
class ResNet(nn.Module):
    def __init__(self):
        super(ResNet, self).__init__()
        self.resnet = models.resnet50(pretrained=False)
        #self.resnet.load_state_dict(torch.load("/kaggle/input/datasets/thanaphonnaksri/testing/models/resnet50.pth"))
        self.fc = nn.Linear(self.resnet.fc.in_features, 5) # Save the Fully connected layer seperatedly for full prediction
        self.resnet.fc = nn.Identity() # Identity: Remove the original head
        
    def forward(self, x, return_features=False): 
        features = self.resnet(x) # 2048-dim embedding
        if return_features:
            return features
        return self.fc(features)

# 3. Tabular Model
# no need to define xgboost

In [4]:
# DATASET CLASS
class EnsembleDataset(Dataset):
    def __init__(self, df, img_dir, tokenizer, transform=None):
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        # 1. Image Processing
        img_path = os.path.join(self.img_dir, f"{row['PetID']}-1.jpg") 
        image = Image.new('RGB', (224, 224), (0, 0, 0)) 
        if os.path.exists(img_path):
            try:
                image = Image.open(img_path).convert('RGB')
            except:
                pass 
        if self.transform: image = self.transform(image)

        # 2. Text Processing
        desc = str(row['Description']) if pd.notna(row['Description']) else "no description"
        encoding = self.tokenizer(
            desc, max_length=64, padding='max_length', truncation=True, return_tensors='pt'
        )

        return {
            'image': image,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }


In [5]:
# TODO: forward selection: use only features that improves kappa
def featurize_table(data_df):
    tabular_df = data_df.copy()
    # Namelength
    tabular_df["name_length"] = tabular_df['Name'].str.len().fillna(0)
    
    # Description length
    tabular_df['description_length'] = tabular_df['Description'].str.len().fillna(0)
    
    # Is Mixed Breed? (Breed2 is not 0)
    tabular_df['is_mixed_breed'] = (tabular_df['Breed2'] != 0).astype(int)
    
    
    
    # 1. Text
    tabular_df['word_count'] = tabular_df['Description'].str.split().str.len().fillna(0)
    tabular_df['char_count'] = tabular_df['Description'].str.len().fillna(0)
    tabular_df['avg_word_len'] = tabular_df['char_count'] / (tabular_df['word_count'] + 1)
    tabular_df['num_digits'] = tabular_df['Description'].apply(lambda x: sum(c.isdigit() for c in str(x)))
    tabular_df['all_caps_ratio'] = tabular_df['Description'].apply(lambda x: sum(1 for c in str(x) if c.isupper()) / max(1, len(str(x))))

    # 2. Measures
    tabular_df['fee_per_pet'] = tabular_df['Fee'] / tabular_df['Quantity'].replace(0,1)
    tabular_df['photo_per_pet'] = tabular_df['PhotoAmt'] / tabular_df['Quantity']
    tabular_df['age_per_size'] = tabular_df['Age'] / tabular_df['MaturitySize'] # Needs careful handling of 0s
    tabular_df['total_media'] = tabular_df['PhotoAmt'] + tabular_df['VideoAmt'] # Total Media (Engagement proxy)
    tabular_df['num_colors'] = (tabular_df[['Color1', 'Color2', 'Color3']] != 0).sum(axis=1) # Number of Colors (Count non-zero color columns)
    
    
    # 3. Simple Interactions
    tabular_df['is_mixed_breed'] = (tabular_df['Breed2'] != 0) & (tabular_df['Breed2'].notnull())
    tabular_df['is_specific_color'] = (tabular_df['Color2'] != 0) # Has more than 1 color    
    tabular_df['is_free'] = (tabular_df['Fee'] == 0).astype(int)    # Is Free? (Fee is 0)
    tabular_df['has_health_issue'] = (tabular_df['Health'] > 1).astype(int)   # Health Issue Flag (Health > 1 implies injury or condition)
    
    # log transform for shit and giggles
    tabular_df['Fee'] = np.log1p(tabular_df['Fee'])
    tabular_df['PhotoAmt'] = np.log1p(tabular_df['PhotoAmt'])
    
    # Check whether this
     
    # Drop useless features -> does this actually works?
    features_to_drop = [""]
    
    # Encode categories
    cat_cols = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 
                    'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 
                    'Sterilized', 'Health', 'State']
      

    tabular_df.drop(['Name', 'PetID', 'Description', "RescuerID"], axis=1, inplace=True)
    return tabular_df


In [6]:
# Advanced Features
def extract_sentiment_from_json(pet_id, sentiment_dir="../data/train_sentiment/"):
    # This assumes the sentiment files follow the pattern {PetID}.json
    filename = f"{sentiment_dir}/{pet_id}.json"
    try:
        if os.path.exists(filename):
            with open(filename, 'r') as f:
                data = json.load(f)
            # Usually 'documentSentiment' holds the overall score
            if 'documentSentiment' in data:
                return data['documentSentiment']['score'], data['documentSentiment']['magnitude']
    except:
        pass
    return 0, 0 # Default if missing


def generate_text_features(df, svd_components=20, is_train=True, fit_on_text=None):
    """
    df: The dataframe (containing 'Description' and 'PetID')
    svd_components: Number of latent features to keep
    is_train: Boolean, used to decide whether to fit or transform
    fit_on_text: If is_train=False, pass the vectorizers here (tuple: tfidf, svd)
    """
    df_text = df.copy()
    
    # 1. TF-IDF + SVD (Latent Semantic Analysis)
    print("Generating TF-IDF SVD features...")
    descriptions = df_text['Description'].fillna("none").astype(str)
    
    if is_train:
        # Fit on TRAINING descriptions
        tfidf = TfidfVectorizer(min_df=3,  max_features=1000, 
                                strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
                                stop_words = 'english')
        
        svd = TruncatedSVD(n_components=svd_components, random_state=42)
        
        # Fit Transform
        tf_vecs = tfidf.fit_transform(descriptions)
        svd_vecs = svd.fit_transform(tf_vecs)
        
        # Save vectorizers for inference later
        vectorizers = (tfidf, svd)
    else:
        # Load from passed tuple
        tfidf, svd = fit_on_text
        tf_vecs = tfidf.transform(descriptions)
        svd_vecs = svd.transform(tf_vecs)
        vectorizers = fit_on_text

    # Create Columns
    svd_df = pd.DataFrame(svd_vecs, columns=[f'svd_desc_{i}' for i in range(svd_components)])
    # We reset index to make sure concat aligns correctly row-by-row
    df_text = pd.concat([df_text.reset_index(drop=True), svd_df], axis=1)

    # 2. Sentiment Analysis (File-based lookup)
    # Determine directory
    sent_dir = "../data/train_sentiment" if is_train else "../data/test_sentiment"
    
    print("Extracting Sentiment...")
    # Apply row-wise (can be slow, maybe parallelize with pandarallel if needed)
    sent_data = df_text['PetID'].apply(lambda x: extract_sentiment_from_json(x, sent_dir))
    
    df_text['sentiment_score'] = [x[0] for x in sent_data]
    df_text['sentiment_magnitude'] = [x[1] for x in sent_data]
    df_text['sentiment_polarity'] = df_text['sentiment_score'] * df_text['sentiment_magnitude']

    return df_text, vectorizers

In [7]:
def generate_ensemble_features(df, img_dir=IMG_DIR):
    print(f"Generating features for {len(df)} samples...")
    
    # 1. XGBoost Inference
    print("Loading XGBoost...")
    xgb_model = xgb.XGBRegressor()
    xgb_model.load_model("/kaggle/input/datasets/thanaphonnaksri/testing/models/v2/xgb_v2_reg_kfold.json")
    

    df_tab = featurize_table(df)
    
    drop_cols = ['Name', 'PetID', 'RescuerID', 'Description', 'AdoptionSpeed']
    df_tab = df_tab.drop([c for c in drop_cols if c in df_tab.columns], axis=1)

    xgb_preds = xgb_model.predict(df_tab)
    
    # 2. Load PyTorch Models
    print("Loading DL Models...")
    # Image
    img_model = ResNet().to(DEVICE)
    img_state = torch.load(os.path.join(MODELS_DIR, 'pet_pred_resnet50.pth'), map_location=DEVICE)
    if 'state_dict' in img_state: img_state = img_state['state_dict']
    # FIX: Add 'resnet.' prefix to match the class definition
    new_state_dict = {}
    for k, v in img_state.items():
        
        # Case 1: the key is for FC layer
        if "resnet.fc." in k:
            new_key = k.replace("resnet.fc.", "fc.")
            new_state_dict[new_key] = v
            continue
        # Case 2: the key is for the backbone
        if not k.startswith('resnet.') and 'fc.' not in k:
            new_state_dict['resnet.' + k] = v
        else:
            # It already matches
            new_state_dict[k] = v
    
    img_model.load_state_dict(new_state_dict)
    img_model.eval()

    # Text
    text_model = TransformerPetClassifier(num_classes=5).to(DEVICE)
    txt_state = torch.load(os.path.join(MODELS_DIR, 'best_transformer_model.pth'), map_location=DEVICE)
    if 'state_dict' in txt_state: txt_state = txt_state['state_dict']
    text_model.load_state_dict(txt_state, strict=False) 
    text_model.eval()

    # 3. Inference Loop
    # CHANGE: Point to the local directory containing tokenizer_config.json
    tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/datasets/thanaphonnaksri/testing/models/tokenizer")
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    
    dl = DataLoader(EnsembleDataset(df, img_dir, tokenizer, transform), batch_size=32, shuffle=False)
    
    img_features, text_features = [], []

    img_preds_list, text_preds_list = [], []
    
    # Softmax for converting logits to probs
    softmax = nn.Softmax(dim=1)
    with torch.no_grad():
        for batch in tqdm(dl):
            imgs = batch['image'].to(DEVICE)
            input_ids, masks = batch['input_ids'].to(DEVICE), batch['attention_mask'].to(DEVICE)

            # --- IMAGE PREDICTION ---
            img_logits = img_model(imgs) # Output: [Batch, 5]
            img_probs = softmax(img_logits).cpu().numpy()
            # Convert to scalar: sum(prob * class_idx)
            img_scores = np.sum(img_probs * np.arange(5), axis=1)
            img_preds_list.extend(img_scores)
            
            # --- TEXT PREDICTION ---
            text_logits = text_model(input_ids, masks) # Output: [Batch, 5]
            text_probs = softmax(text_logits).cpu().numpy()
            text_scores = np.sum(text_probs * np.arange(5), axis=1)
            text_preds_list.extend(text_scores)
            """
            # Changed extrect emb instead of probs
            # Image: Returns (Batch, 2048)
            img_emb = img_model(imgs, return_features=True) 
            # Flatten 4D tensor (N, 2048, 1, 1) -> (N, 2048)
            img_emb = img_emb.view(img_emb.size(0), -1)
            img_features.extend(img_emb.cpu().numpy())
            
            # Text: Returns (Batch, 128)
            text_emb = text_model(input_ids, masks, return_features=True)
            text_features.extend(text_emb.cpu().numpy())
            
    # 4. Concatenate Features: XGB(5) + Text(128) + Image(2048)
    # FIX: Removed reshape(-1, 1) and flatten() to align dimensions
    return np.hstack([xgb_probs, np.array(text_features), np.array(img_features)])
            """
    return xgb_preds, np.array(img_preds_list), np.array(text_preds_list)


In [8]:
# Regressor instead?
class IntermediateFusionMetaModel(nn.Module):
    def __init__(self, xgb_dim=5, text_dim=128, img_dim=2048):
        super(IntermediateFusionMetaModel, self).__init__()
        
        # IDK: Normalize inputs for scaling
        self.norm_xgb = nn.BatchNorm1d(xgb_dim)
        self.norm_text = nn.BatchNorm1d(text_dim)
        self.norm_img = nn.BatchNorm1d(img_dim)
        
        # Projectors to reduce dimensionality before fusion
        self.img_projector = nn.Sequential(
            nn.Linear(img_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        self.text_projector = nn.Sequential(
            nn.Linear(text_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU()
        )
        
        # Fusion Layer
        self.fusion = nn.Sequential(
            nn.Linear(256 + 64 + xgb_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64,1) #1 for single continuous value (reg)
            
        )
        
    def forward(self, x):
        # Slice the input back into components
        # x is [xgb(5), text(128), img(2048)]
        xgb_start, xgb_end = 0, 5
        text_start, text_end = 5, 5+128
        img_start = 5+128
        
        xgb_data = x[:, xgb_start:xgb_end]
        text_data = x[:, text_start:text_end]
        img_data = x[:, img_start:]
        
        # IDK Apply Normalization
        xgb_data = self.norm_xgb(xgb_data)
        text_data = self.norm_text(text_data)
        img_data = self.norm_img(img_data)
        
        # Project
        img_emb = self.img_projector(img_data)
        text_emb = self.text_projector(text_data)
        
        # Concatenate and Classify
        combined = torch.cat([xgb_data, text_emb, img_emb], dim=1)
        return self.fusion(combined)

In [9]:
import scipy as sp
from functools import partial

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4

        ll = cohen_kappa_score(y, X_p, weights='quadratic')
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead') # Optimizer

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        return X_p.astype(int)

    def coefficients(self):
        return self.coef_['x']

# Main Execution

In [10]:
from sklearn.linear_model import LogisticRegression
full_df = pd.read_csv(os.path.join(DATA_DIR, 'train/train.csv'))

X_train, X_eval, y_train, y_test = train_test_split(
    full_df, full_df['AdoptionSpeed'], test_size=0.2, random_state=42, stratify=full_df['AdoptionSpeed']
)

print("Generating Train Features...")
train_text, vec_tuple = generate_text_features(X_train, is_train=True)
X_train_meta = generate_ensemble_features(train_text)

Generating Train Features...
Generating TF-IDF SVD features...
Extracting Sentiment...
Generating features for 11994 samples...
Loading XGBoost...
Loading DL Models...


2026-02-17 20:01:39.759000: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771358499.776142    1069 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771358499.781149    1069 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771358499.794808    1069 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771358499.794832    1069 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771358499.794835    1069 computation_placer.cc:177] computation placer alr

In [11]:
def shaping(meta_feature):
    if isinstance(meta_feature, tuple):
        p1, p2, p3 = meta_feature
        
        # Ensure all are 2D
        if p1.ndim == 1: p1 = p1.reshape(-1, 1)
        if p2.ndim == 1: p2 = p2.reshape(-1, 1)
        if p3.ndim == 1: p3 = p3.reshape(-1, 1)
        
        meta_feature = np.hstack([p1, p2, p3])
        print("New shape:", meta_feature.shape)
        return meta_feature
X_train_meta = shaping(X_train_meta)
print("Fitting Meta Model...")
meta_model = LogisticRegression(C=1.0) # You can tune C
meta_model.fit(X_train_meta, y_train)

print("Optimizing Rounder Thresholds...")
# Need val features
val_text, _ = generate_text_features(X_eval, is_train=False, fit_on_text=vec_tuple)
X_val_meta = generate_ensemble_features(val_text)
X_val_meta = shaping(X_val_meta)
val_probs = meta_model.predict_proba(X_val_meta)
val_scores = np.sum(val_probs * np.arange(5), axis=1) # Expected Value

New shape: (11994, 3)
Fitting Meta Model...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Optimizing Rounder Thresholds...
Generating TF-IDF SVD features...
Extracting Sentiment...
Generating features for 2999 samples...
Loading XGBoost...
Loading DL Models...


100%|██████████| 94/94 [00:26<00:00,  3.59it/s]

New shape: (2999, 3)





In [12]:
optR = OptimizedRounder()
optR.fit(val_scores, y_test.values)
coefficients = optR.coefficients()
print(f"Optimized Coefficients: {coefficients}")

# Check Val Score
val_preds = optR.predict(val_scores, coefficients)
print(f"Optimized Validation Kappa: {cohen_kappa_score(y_test, val_preds, weights='quadratic'):.4f}")

Optimized Coefficients: [0.52120002 1.58311194 2.59319347 3.12369591]
Optimized Validation Kappa: 0.4621


In [13]:
# --- INFERENCE PHASE ---
print("Starting Inference on Test Data...")
inference_df = pd.read_csv(TEST_DATA_PATH)

# 1. Feature Gen (Test)
test_final_text, _ = generate_text_features(inference_df, is_train=False, fit_on_text=vec_tuple)
X_test_final_meta = generate_ensemble_features(test_final_text, img_dir=IMAGE_DIR)
X_test_final_meta = shaping(X_test_final_meta)
# 2. Predict Probabilities -> Regression Score
test_probs = meta_model.predict_proba(X_test_final_meta)
test_scores = np.sum(test_probs * np.arange(5), axis=1)

# 3. Apply Thresholds
final_test_preds = optR.predict(test_scores, coefficients)

# 4. Save
submission_df = pd.DataFrame({
    'PetID': inference_df['PetID'],
    'AdoptionSpeed': final_test_preds
})
submission_df.to_csv("submission.csv", index=False)
print("Submission saved.")

Starting Inference on Test Data...
Generating TF-IDF SVD features...
Extracting Sentiment...
Generating features for 3972 samples...
Loading XGBoost...
Loading DL Models...


100%|██████████| 125/125 [00:33<00:00,  3.68it/s]

New shape: (3972, 3)
Submission saved.



