In [1]:
# This Notebook is for experimenting with features & trying to improve boosting models
# Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedGroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import make_scorer, cohen_kappa_score, accuracy_score

pd.set_option('display.max_columns', None)
import joblib
import optuna
import torch
import os
import json


In [2]:
# TODO: forward selection: use only features that improves kappa
def featurize_table(data_df):
    tabular_df = data_df.copy()
    # Namelength
    tabular_df["name_length"] = tabular_df['Name'].str.len().fillna(0)
    
    # Description length
    tabular_df['description_length'] = tabular_df['Description'].str.len().fillna(0)
    
    # Is Mixed Breed? (Breed2 is not 0)
    tabular_df['is_mixed_breed'] = (tabular_df['Breed2'] != 0).astype(int)
    
    
    
    # 1. Text
    tabular_df['word_count'] = tabular_df['Description'].str.split().str.len().fillna(0)
    tabular_df['char_count'] = tabular_df['Description'].str.len().fillna(0)
    tabular_df['avg_word_len'] = tabular_df['char_count'] / (tabular_df['word_count'] + 1)
    tabular_df['num_digits'] = tabular_df['Description'].apply(lambda x: sum(c.isdigit() for c in str(x)))
    tabular_df['all_caps_ratio'] = tabular_df['Description'].apply(lambda x: sum(1 for c in str(x) if c.isupper()) / max(1, len(str(x))))

    # 2. Measures
    tabular_df['fee_per_pet'] = tabular_df['Fee'] / tabular_df['Quantity'].replace(0,1)
    tabular_df['photo_per_pet'] = tabular_df['PhotoAmt'] / tabular_df['Quantity']
    tabular_df['age_per_size'] = tabular_df['Age'] / tabular_df['MaturitySize'] # Needs careful handling of 0s
    tabular_df['total_media'] = tabular_df['PhotoAmt'] + tabular_df['VideoAmt'] # Total Media (Engagement proxy)
    tabular_df['num_colors'] = (tabular_df[['Color1', 'Color2', 'Color3']] != 0).sum(axis=1) # Number of Colors (Count non-zero color columns)
    
    
    # 3. Simple Interactions
    tabular_df['is_mixed_breed'] = (tabular_df['Breed2'] != 0) & (tabular_df['Breed2'].notnull())
    tabular_df['is_specific_color'] = (tabular_df['Color2'] != 0) # Has more than 1 color    
    tabular_df['is_free'] = (tabular_df['Fee'] == 0).astype(int)    # Is Free? (Fee is 0)
    tabular_df['has_health_issue'] = (tabular_df['Health'] > 1).astype(int)   # Health Issue Flag (Health > 1 implies injury or condition)
    
    # log transform for shit and giggles
    tabular_df['Fee'] = np.log1p(tabular_df['Fee'])
    tabular_df['PhotoAmt'] = np.log1p(tabular_df['PhotoAmt'])
    
    
    # Encode categories
    """
    cat_cols = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 
                    'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 
                    'Sterilized', 'Health', 'State']
    tabular_df[cat_cols] = tabular_df[cat_cols].astype('category')
    """
    
    # Drop useless features
    features_to_drop = []
    #features_to_drop = ['svd_desc_9', 'Color3', 'VideoAmt', 'name_length', 'sentiment_magnitude','Color3', 'Health', 'num_digits', 'Dewormed', 'sentiment_polarity', 'is_specific_color', 'num_colors', 'Color2', 'svd_desc_11', 'svd_desc_18', 'svd_desc_16', 'svd_desc_14', 'svd_desc_2', 'sentiment_score', 'word_count']
    print ("Dropping", len(features_to_drop), "features")
    tabular_df = tabular_df.drop(features_to_drop, axis=1, inplace=False)
    # Drop text and ID columns
    tabular_df.drop(['Name', 'PetID', 'Description'], axis=1, inplace=True)
    return tabular_df


In [3]:
# Advanced Features
def extract_sentiment_from_json(pet_id, sentiment_dir="../data/train_sentiment/"):
    # This assumes the sentiment files follow the pattern {PetID}.json
    filename = f"{sentiment_dir}/{pet_id}.json"
    try:
        if os.path.exists(filename):
            with open(filename, 'r') as f:
                data = json.load(f)
            # Usually 'documentSentiment' holds the overall score
            if 'documentSentiment' in data:
                return data['documentSentiment']['score'], data['documentSentiment']['magnitude']
    except:
        pass
    return 0, 0 # Default if missing

# TODO: decrease svd bcuz small data
def generate_text_features(df, svd_components=5, is_train=True, fit_on_text=None):
    """
    df: The dataframe (containing 'Description' and 'PetID')
    svd_components: Number of latent features to keep
    is_train: Boolean, used to decide whether to fit or transform
    fit_on_text: If is_train=False, pass the vectorizers here (tuple: tfidf, svd)
    """
    df_text = df.copy()
    
    # 1. TF-IDF + SVD (Latent Semantic Analysis)
    print("Generating TF-IDF SVD features...")
    descriptions = df_text['Description'].fillna("none").astype(str)
    
    if is_train:
        # Fit on TRAINING descriptions
        tfidf = TfidfVectorizer(min_df=3,  max_features=1000, 
                                strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                                ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
                                stop_words = 'english')
        
        svd = TruncatedSVD(n_components=svd_components, random_state=42)
        
        # Fit Transform
        tf_vecs = tfidf.fit_transform(descriptions)
        svd_vecs = svd.fit_transform(tf_vecs)
        
        # Save vectorizers for inference later
        vectorizers = (tfidf, svd)
    else:
        # Load from passed tuple
        tfidf, svd = fit_on_text
        tf_vecs = tfidf.transform(descriptions)
        svd_vecs = svd.transform(tf_vecs)
        vectorizers = fit_on_text

    # Create Columns
    svd_df = pd.DataFrame(svd_vecs, columns=[f'svd_desc_{i}' for i in range(svd_components)])
    # We reset index to make sure concat aligns correctly row-by-row
    df_text = pd.concat([df_text.reset_index(drop=True), svd_df], axis=1)

    # 2. Sentiment Analysis (File-based lookup)
    # Determine directory
    sent_dir = "../data/train_sentiment" if is_train else "../data/test_sentiment"
    
    print("Extracting Sentiment...")
    # Apply row-wise (can be slow, maybe parallelize with pandarallel if needed)
    sent_data = df_text['PetID'].apply(lambda x: extract_sentiment_from_json(x, sent_dir))
    
    df_text['sentiment_score'] = [x[0] for x in sent_data]
    df_text['sentiment_magnitude'] = [x[1] for x in sent_data]
    df_text['sentiment_polarity'] = df_text['sentiment_score'] * df_text['sentiment_magnitude']

    return df_text, vectorizers

In [4]:
# Load Data
full_df = pd.read_csv("../data/train/train.csv") #Beware of directory

# Splitting the data from train.csv
X = full_df.drop(['AdoptionSpeed'], axis=1)
y = full_df['AdoptionSpeed'] 
X_train_raw, X_eval_raw, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Generate  Features
X_train_text, vec_tuple = generate_text_features(X_train_raw, is_train=True)
X_eval_text, _ = generate_text_features(X_eval_raw,is_train=False, fit_on_text=vec_tuple)


X_train = featurize_table(X_train_text)
X_eval = featurize_table(X_eval_text)
X_train

Generating TF-IDF SVD features...
Extracting Sentiment...
Generating TF-IDF SVD features...
Extracting Sentiment...
Dropping 0 features
Dropping 0 features


Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,RescuerID,VideoAmt,PhotoAmt,svd_desc_0,svd_desc_1,svd_desc_2,svd_desc_3,svd_desc_4,sentiment_score,sentiment_magnitude,sentiment_polarity,name_length,description_length,is_mixed_breed,word_count,char_count,avg_word_len,num_digits,all_caps_ratio,fee_per_pet,photo_per_pet,age_per_size,total_media,num_colors,is_specific_color,is_free,has_health_issue
0,1,2,307,307,1,1,0,0,2,2,2,1,2,1,3,0.0,41326,ba248f761903dcd4c4342cc724a52145,0,2.197225,0.244532,-0.037003,0.009080,0.022057,0.125729,0.3,1.0,0.30,17.0,450.0,True,80.0,450.0,5.555556,4,0.008889,0.0,2.666667,1.0,8.0,1,False,1,0
1,1,12,307,0,2,2,0,0,1,1,1,1,1,1,1,0.0,41326,744fa4278196568a18fcf5cd7d324ed2,0,1.386294,0.129967,-0.014343,-0.011156,0.216831,-0.013246,0.6,1.8,1.08,5.0,77.0,False,12.0,77.0,5.923077,0,0.038961,0.0,3.000000,12.0,3.0,1,False,1,0
2,1,2,307,307,1,1,2,7,2,1,2,2,2,1,1,0.0,41326,4d2400be2e2e78265f5c84345b7a3415,0,0.693147,0.036106,0.006342,0.009166,-0.041622,0.041715,0.0,0.0,0.00,11.0,29.0,True,5.0,29.0,4.833333,0,0.137931,0.0,1.000000,1.0,1.0,3,True,1,0
3,1,2,307,307,1,3,0,0,2,2,1,1,2,1,1,0.0,41326,b53c34474d9e24574bcec6a3d3306a0d,0,1.098612,0.230322,0.946173,0.060399,0.014193,-0.104804,0.0,0.0,0.00,6.0,12.0,True,2.0,12.0,4.000000,0,0.083333,0.0,2.000000,1.0,2.0,1,False,1,0
4,2,4,265,0,3,1,2,0,2,2,2,2,2,1,4,0.0,41326,b953d651238f379c63e732925f71a5a2,0,1.098612,0.136872,0.353946,-0.019203,-0.115202,-0.007164,0.2,0.2,0.04,11.0,20.0,False,3.0,20.0,5.000000,0,0.050000,0.0,0.500000,2.0,2.0,2,True,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11989,2,8,265,299,2,1,2,0,2,2,1,1,2,1,1,0.0,41326,bc599c86ccd17d15a1c758b12d7e851b,0,1.098612,0.205444,-0.049383,-0.056765,0.033504,0.039277,0.5,2.1,1.05,6.0,340.0,True,66.0,340.0,5.074627,1,0.014706,0.0,2.000000,4.0,2.0,2,True,1,0
11990,1,2,307,0,2,6,7,0,1,1,2,1,3,1,1,0.0,41326,48d06353f65ac65dd35a8875b70962c5,0,0.693147,0.224185,-0.049285,0.025776,0.000251,0.107381,0.2,2.1,0.42,6.0,313.0,False,63.0,313.0,4.890625,1,0.015974,0.0,1.000000,2.0,1.0,2,True,1,0
11991,1,2,307,307,1,1,0,0,2,2,1,1,2,1,1,0.0,41401,a52ad34e621f25688c3a0a579c31ca4d,0,1.098612,0.161576,-0.049220,0.056163,0.033891,-0.002027,0.4,1.9,0.76,8.0,284.0,True,50.0,284.0,5.568627,0,0.021127,0.0,2.000000,1.0,2.0,1,False,1,0
11992,1,2,307,307,1,1,2,7,2,2,2,2,2,1,1,0.0,41326,95481e953f8aed9ec3d16fc4509537e8,0,1.098612,0.163324,-0.050172,-0.018623,0.013714,0.013797,0.3,1.4,0.42,5.0,219.0,True,42.0,219.0,5.093023,0,0.027397,0.0,2.000000,1.0,2.0,3,True,1,0


In [5]:
# Plan 1: normal classification
# Hyperparameter-tuning w/Optuna
"""
# Calculate weights inversely proportional to class frequencies
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)
def objective(trial):
    params = {
        # Config for how to predict
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'num_class': 5,
        'tree_method': 'hist', # Faster training
        #'enable_categorical': True,
        'device': 'cuda' if torch.cuda.is_available() else 'cpu', # Use GPU if available
        
        # Tuning parameters
        'n_estimators': trial.suggest_int('n_estimators', 200, 1000), # More trees, but early stopping handles it
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0), # % of feature used per tree
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
    }

    # Split for early stopping (Optuna needs a validation set)
    # Using specific validation set (futher split from train set)
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, 
                                                stratify=y_train
                                                )

    # Choose regressor if trying to use with optimied rounder
    model = xgb.XGBClassifier(**params, early_stopping_rounds=50)
    
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    preds = model.predict(X_val)
    kappa = cohen_kappa_score(y_val, preds, weights='quadratic')
    return kappa

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50) # Run 50 smart trials

print(f"Best trial value: {study.best_value}")
print(f"Best params: {study.best_params}")

# Use best params
best_params = study.best_params
# Add fixed params back
best_params['objective'] = 'multi:softprob'
best_params['num_class'] = 5

xgb_optuna = xgb.XGBClassifier(**best_params)
xgb_optuna.fit(X_train, y_train, sample_weight=sample_weights) 
pred_xgb = xgb_optuna.predict(X_test)
#joblib.dump(xgb_optuna, 'xgb_optuna_full.pkl')
"""

'\n# Calculate weights inversely proportional to class frequencies\nsample_weights = compute_sample_weight(class_weight=\'balanced\', y=y_train)\ndef objective(trial):\n    params = {\n        # Config for how to predict\n        \'objective\': \'multi:softprob\',\n        \'eval_metric\': \'mlogloss\',\n        \'num_class\': 5,\n        \'tree_method\': \'hist\', # Faster training\n        #\'enable_categorical\': True,\n        \'device\': \'cuda\' if torch.cuda.is_available() else \'cpu\', # Use GPU if available\n\n        # Tuning parameters\n        \'n_estimators\': trial.suggest_int(\'n_estimators\', 200, 1000), # More trees, but early stopping handles it\n        \'learning_rate\': trial.suggest_float(\'learning_rate\', 0.005, 0.2, log=True),\n        \'max_depth\': trial.suggest_int(\'max_depth\', 3, 12),\n        \'subsample\': trial.suggest_float(\'subsample\', 0.6, 1.0),\n        \'colsample_bytree\': trial.suggest_float(\'colsample_bytree\', 0.6, 1.0), # % of feature used

XGB Regressor

In [6]:
# Plan 2: reg with optimized rounder
# Hyperparameter-tuning w/Optuna (Regressor)
groups = X_train['RescuerID']
X_train_features = X_train.drop(['RescuerID'], axis=1)
X_eval_features = X_eval.drop(['RescuerID'], axis=1)
# Spaghetti split  (1 for xgb 1 for rounder)
#X_train_model, X_val_rounder, y_train_model, y_val_rounder = train_test_split(X_train_features, y_train, test_size=0.1, random_state=42, stratify=y_train)
def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist', # Faster training
        'device': 'cuda' if torch.cuda.is_available() else 'cpu', # Use GPU if available
        
        # Tuning parameters
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000), # More trees, but early stopping handles it
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 5),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 3, 15), # Higher = prevent isolating outliers
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 10), # Regularizations (a,b,g)
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0), # 
    }

    sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
    
    kappa_scores = []

    for train_idx, val_idx in sgkf.split(X_train_features, y_train, groups=groups):
        X_tr_fold = X_train_features.iloc[train_idx]
        y_tr_fold = y_train.iloc[train_idx]
        
        X_val_fold = X_train_features.iloc[val_idx]
        y_val_fold = y_train.iloc[val_idx]
        
        model = xgb.XGBRegressor(**params, early_stopping_rounds=50)
        
        model.fit(
            X_tr_fold, y_tr_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            verbose=False
        )
        
        preds = model.predict(X_val_fold)
        preds_rounded = np.rint(preds).astype(int).clip(0, 4)
        fold_kappa = cohen_kappa_score(y_val_fold, preds_rounded, weights='quadratic')
        kappa_scores.append(fold_kappa)
    
    return np.mean(kappa_scores)

# 1. Run Optuna to get best parameters
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25) # Run 50 smart trials

print(f"Best trial value: {study.best_value}")
print(f"Best params: {study.best_params}")

# Use best params
best_params = study.best_params
# Add fixed params back
best_params['objective'] = 'reg:squarederror'

[32m[I 2026-02-18 05:11:30,446][0m A new study created in memory with name: no-name-87913d75-d1a6-4d8d-a791-dbc0415b1011[0m
[32m[I 2026-02-18 05:11:36,009][0m Trial 0 finished with value: 0.244375343013171 and parameters: {'n_estimators': 992, 'learning_rate': 0.005072180964762284, 'max_depth': 3, 'subsample': 0.7170060240138587, 'colsample_bytree': 0.6308962850887303, 'min_child_weight': 11, 'reg_alpha': 5.213591924603724, 'reg_lambda': 7.0156860095858855, 'gamma': 0.0489982839126879}. Best is trial 0 with value: 0.244375343013171.[0m
[32m[I 2026-02-18 05:11:42,383][0m Trial 1 finished with value: 0.2593623069106171 and parameters: {'n_estimators': 999, 'learning_rate': 0.007581658626109493, 'max_depth': 5, 'subsample': 0.8085953208066867, 'colsample_bytree': 0.8246705885156356, 'min_child_weight': 3, 'reg_alpha': 5.174355588552064, 'reg_lambda': 9.102319978755805, 'gamma': 0.12159376200198913}. Best is trial 1 with value: 0.2593623069106171.[0m
[32m[I 2026-02-18 05:11:47,17

Best trial value: 0.2699798635356395
Best params: {'n_estimators': 1757, 'learning_rate': 0.0583429663217349, 'max_depth': 3, 'subsample': 0.6480151830745275, 'colsample_bytree': 0.6394123307818426, 'min_child_weight': 3, 'reg_alpha': 1.6198673843535316, 'reg_lambda': 6.539737725211661, 'gamma': 3.9203654525127636}


In [7]:
xgb_optuna_reg = xgb.XGBRegressor(**best_params, early_stopping_rounds=50)

# 2. FINAL TRAINING with StratifiedGroupKFold
# We retrain on 5 folds to get OOF predictions for the Rounder
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X_train)) #TODO: change the origin
test_preds_list = []

# Directory
save_dir = "../models/v2_stratify"
os.makedirs(save_dir, exist_ok=True)

print("Training Final Models...")
for fold, (train_idx, val_idx) in enumerate(sgkf.split(X_train_features, y_train, groups=groups)):
    X_tr, y_tr = X_train_features.iloc[train_idx], y_train.iloc[train_idx]
    X_val, y_val = X_train_features.iloc[val_idx], y_train.iloc[val_idx]
    
    # Train fold model
    model = xgb.XGBRegressor(**best_params, early_stopping_rounds=50)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
    
    # Save Model
    model.save_model(f"{save_dir}/xgb_fold_{fold}.json")
    print(f"Saved model for Fold {fold}")
    
    # Save OOF preds (for Rounder optimization)
    # We must map pandas index back to position for oof array
    val_pos_idx = X_train.index.get_indexer(X_val.index) #TODO: change the origin
    oof_preds[val_pos_idx] = model.predict(X_val)
    
    # Predict on unseen test set (average later)
    test_preds_list.append(model.predict(X_eval_features))

# Average the test predictions from all 5 models (Bagging)
pred_xgb_reg_raw = np.mean(test_preds_list, axis=0) # This is your final raw prediction

# 3. Rename variables for the next cells to work
pred_xgb_reg_val = oof_preds # These are your "clean" validation predictions
y_val_rounder = y_train    # The labels matching your OOF preds

# Round for evaluation
pred_xgb_reg_rounded = np.rint(pred_xgb_reg_val).astype(int).clip(0, 4)
print ("Reg QWK:", cohen_kappa_score(pred_xgb_reg_rounded, y_train, weights="quadratic"))

Training Final Models...
Saved model for Fold 0
Saved model for Fold 1
Saved model for Fold 2
Saved model for Fold 3
Saved model for Fold 4
Reg QWK: 0.27019001746534055


In [8]:
import scipy as sp
from functools import partial

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4

        ll = cohen_kappa_score(y, X_p, weights='quadratic')
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead') # Optimizer

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 0
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 1
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 2
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 3
            else:
                X_p[i] = 4
        return X_p.astype(int)

    def coefficients(self):
        return self.coef_['x']

In [9]:
# Optimize Thresholds
pred_eval_raw = model.predict(X_eval_features)
optR = OptimizedRounder()
optR.fit(pred_xgb_reg_val, y_val_rounder)
res = optR.coefficients()
print(f"Optimized Thresholds: {res}")

# Final Predictions
pred_eval = optR.predict(pred_eval_raw, res)
print(f"Optimized QWK: {cohen_kappa_score(y_eval, pred_eval, weights='quadratic'):.4f}")

Optimized Thresholds: [0.44387645 2.0385008  2.48836424 2.7660213 ]
Optimized QWK: 0.3743


In [10]:
from oprounder import OptimizedRounder

# for fitting the rounder
#pred_train_reg = xgb_optuna_reg.predict(X_train)
#pred_val_rounder = xgb_optuna_reg.predict(pred_xgb_reg_val) 

# what we want to predict (eval)
pred_eval_reg = model.predict(X_eval_features)

# Fit the Optimized Rounder on Training Data
rounder = OptimizedRounder(n_classes=y_train.nunique(), n_trials=100)
rounder.fit(oof_preds, y_val_rounder) 


# View the learned thresholds
print(f'Optimal thresholds: {rounder.thresholds}')

# Predict on Test Data using the new thresholds
pred_reg_optimized = rounder.predict(pred_eval_reg) # use the new threshold to pick label

# Compare how the new threshold improve kappa
kappa = cohen_kappa_score(y_eval, pred_reg_optimized, weights='quadratic')
print(f'Optimal Quadratic kappa: {kappa:.4f}')

kappa = cohen_kappa_score(y_eval, np.rint(pred_eval).astype(int).clip(0, 4), weights='quadratic')
print(f'Original Quadratic kappa: {kappa:.4f}')

Optimal thresholds: [1.8248840243475104, 2.059340081076014, 2.5441489214855713, 2.7531737285340787]
Optimal Quadratic kappa: 0.3900
Original Quadratic kappa: 0.3743


CatBoost

In [11]:
"""
from catboost import CatBoostClassifier, Pool

# Define categorical features indices
cat_feature_names = list(X_train.select_dtypes(include=['category']).columns)
print(f"Categorical features for CatBoost: {cat_feature_names}")

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, 
                                                stratify=y_train
                                                )

# CatBoost handles the categories automatically (no need for OHE)
clf = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    eval_metric='Kappa',
    loss_function='MultiClass',
    cat_features=cat_feature_names,
    verbose=100
)

clf.fit(X_tr, y_tr, eval_set=(X_val, y_val))

pred_cat = clf.predict(X_eval)
"""

'\nfrom catboost import CatBoostClassifier, Pool\n\n# Define categorical features indices\ncat_feature_names = list(X_train.select_dtypes(include=[\'category\']).columns)\nprint(f"Categorical features for CatBoost: {cat_feature_names}")\n\nX_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, \n                                                stratify=y_train\n                                                )\n\n# CatBoost handles the categories automatically (no need for OHE)\nclf = CatBoostClassifier(\n    iterations=1000,\n    learning_rate=0.05,\n    eval_metric=\'Kappa\',\n    loss_function=\'MultiClass\',\n    cat_features=cat_feature_names,\n    verbose=100\n)\n\nclf.fit(X_tr, y_tr, eval_set=(X_val, y_val))\n\npred_cat = clf.predict(X_eval)\n'

In [12]:
# Save the models
#joblib.dump(xgb_optuna, 'xgb_v2.pkl')
#joblib.dump(xgb_optuna_reg, 'xgb_v2_reg.pkl')
#joblib.dump(vec_tuple[0], 'tfidf_vectorizer.pkl')
#joblib.dump(vec_tuple[1], 'svd_transformer.pkl')

#xgb_optuna.save_model("xgb_v2.json") 
#xgb_optuna_reg.save_model("xgb_v2_reg_kfold.json") 

# Evaluate

In [13]:
from sklearn.metrics import cohen_kappa_score, accuracy_score
# Testing if loaded models' working
model_testing = joblib.load("../models/v2/xgb_v2.pkl")
loaded_tfidf = joblib.load("../models/v2/tfidf_vectorizer.pkl")
loaded_svd = joblib.load("../models/v2/svd_transformer.pkl")
vec_tuple = (loaded_tfidf, loaded_svd)
X_testing_raw = X_eval_raw.copy()
X_testing_text, _ = generate_text_features(X_testing_raw, is_train=False, fit_on_text=vec_tuple)
X_testing = featurize_table(X_testing_text)
pred_testing = model_testing.predict(X_eval)


def evaluate_model(model, model_prediction):
    print(f"Model: {model.__class__.__name__}")
    print(f"Kappa Score: {cohen_kappa_score(model_prediction, y_eval, weights='quadratic'):.4f}")
    print(f"Accuracy Score: {accuracy_score(model_prediction, y_eval):.4f}")
   

print (evaluate_model(xgb_optuna, pred_xgb))
print("")
print (evaluate_model(xgb_optuna_reg, pred_xgb_reg))
#print(f'Optimal Quadratic kappa: {cohen_kappa_score(y_test, pred_reg_optimized, weights='quadratic'):.4f}')
#print ("Loaded QWK:" , cohen_kappa_score(y_test, pred_testing, weights='quadratic'))

#print (evaluate_model(clf, pred_cat))


Generating TF-IDF SVD features...


ValueError: Shape of passed values is (2999, 20), indices imply (2999, 5)

In [None]:
# Feature Importance: which factors benefit the prediction the most (extracted from XGBoost)
# ensure usage of original feature names not just np array indices
"""
xgb_optuna.get_booster().feature_names = list(X_train.columns)

plt.figure(figsize=(12,6))
xgb.plot_importance(xgb_optuna, max_num_features=15, height=0.5, importance_type="gain", values_format = "{v:.2f}") # Weight ใช้อะไรมากที่สุดเป็นส่วนประกอบ tree, Gain อะไรแบ่งได้มากสุด
plt.title("Feature Importance (Gain)")
plt.xlabel("Importance Score")
plt.ylabel("")
plt.grid(False)
plt.show()

"""
# Also plot for the regressor model
xgb_optuna_reg.get_booster().feature_names = list(X_train.columns)
plt.figure(figsize=(12,6))
xgb.plot_importance(xgb_optuna_reg, max_num_features=15, height=0.5, importance_type="gain", values_format="{v:.2f}")
plt.title("Feature Importance (Gain) - Regressor")
plt.xlabel("Importance Score")
plt.ylabel("")
plt.grid(False)
plt.show()

In [None]:
# Get feature importance from the classifier model
xgb_optuna_reg.get_booster().feature_names = list(X_train.columns)
importance_dict = xgb_optuna_reg.get_booster().get_score(importance_type='gain')

# Sort by importance (ascending to get the least important features)
sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1])

# Get bottom n features
n = 10
bottom_n = sorted_importance[:n]

print("Bottom 5 least important features (by gain):")
for feature, importance in bottom_n:
    print(f"{feature}: {importance:.4f}")


print("Shit to drop:", [f for f, _ in bottom_n])