In [1]:
import numpy as np
import pandas as pd
import os
import sys

# Config

In [2]:
# Run switches for this notebook.
HYPER_OPT = False            # True: run the Optuna search; False: use the stored parameters
USE_ORIGINAL_DATA = False    # forwarded to extract_data; also gates preprocessing of X_orig
SUBMIT_TO_KAGGLE = False     # flag intended to control the Kaggle submission step

# Budget passed as `timeout` to study.optimize (8 hours, in seconds).
HYPER_OPT_TIME = 8 * 60 * 60

In [3]:
# Project root: two directory levels above the current working directory.
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Data and model folders live directly under the project root.
data_dir, model_dir = (os.path.join(base_dir, folder) for folder in ("data", "models"))

# Make packages located in the project root importable from this notebook.
sys.path.append(base_dir)

# Data Extraction

In [4]:
from axyom_utilities.data_extraction import extract_data

# extract_data reads from data_dir and receives the USE_ORIGINAL_DATA flag;
# its five return values are unpacked in order.
# NOTE(review): presumably X_orig / y_orig hold the original dataset - confirm.
(
    X_train,
    y_train,
    X_test,
    X_orig,
    y_orig,
) = extract_data(data_dir, USE_ORIGINAL_DATA)

In [5]:
def _read_level0_preds(model_folder, file_name):
    """Read one prediction CSV written by a base-model notebook.

    Args:
        model_folder: Name of the notebook folder under ``<base_dir>/notebooks``.
        file_name: CSV file name inside that folder.

    Returns:
        The CSV contents as a pandas DataFrame.
    """
    # Each path component is passed separately so the path is valid on every
    # OS; a literal "\\" inside a component only works on Windows.
    return pd.read_csv(os.path.join(base_dir, "notebooks", model_folder, file_name))


# Out-of-fold predictions of the base models (one row per training sample).
oof_clean_xgboost = _read_level0_preds("Clean_XGBoost", "oof_preds.csv")
oof_clean_lgbm = _read_level0_preds("Clean_LGBM", "oof_preds.csv")
oof_clean_catboost = _read_level0_preds("Clean_Catboost", "oof_preds.csv")
oof_clean_autogluon = _read_level0_preds("AutoGluon", "oof_preds.csv")

# Test-set predictions of the same base models.
test_clean_xgboost = _read_level0_preds("Clean_XGBoost", "test_preds.csv")
test_clean_lgbm = _read_level0_preds("Clean_LGBM", "test_preds.csv")
test_clean_catboost = _read_level0_preds("Clean_Catboost", "test_preds.csv")
test_clean_autogluon = _read_level0_preds("AutoGluon", "test_preds.csv")

# Data Cleaning

In [6]:
from axyom_utilities.preprocessing import preprocess

# Each frame is replaced by its preprocessed version, so re-running this cell
# applies preprocess to already-preprocessed data.
X_train = preprocess(X_train)
X_test = preprocess(X_test)

# The original dataset is only preprocessed when it is actually in use.
X_orig = preprocess(X_orig) if USE_ORIGINAL_DATA else X_orig

# Joining Data

In [35]:
# Level-1 feature matrices: the preprocessed features plus one prediction
# column block per base model, joined column-wise.
X_train_L1 = pd.concat([X_train, oof_clean_xgboost, oof_clean_lgbm, oof_clean_catboost, oof_clean_autogluon], axis=1)
# The test index is reset before joining so it lines up with the freshly read
# prediction frames.
X_test_L1 = pd.concat([X_test.reset_index(drop=True), test_clean_xgboost, test_clean_lgbm, test_clean_catboost, test_clean_autogluon], axis=1)

# pd.concat(axis=1) aligns on the index with an outer join, so mismatched
# indexes would silently add NaN-filled rows instead of failing. Check that
# the row counts are unchanged.
assert len(X_train_L1) == len(X_train), "train features and OOF predictions are misaligned"
assert len(X_test_L1) == len(X_test), "test features and test predictions are misaligned"

In [36]:
# Display the level-1 test matrix to check that the base-model prediction
# columns were appended to the original features.
X_test_L1

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,...,Weekday,DaySin,DayCos,WeekdaySin,WeekdayCos,DaysSinceStart,preds_clean_XGBoost,preds_clean_lgbm,preds_clean_catboost,preds_clean_autogluon
0,28.0,Female,2310.0,Unknown,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,...,6,7.431448e-01,0.669131,-0.781831,0.623490,576.0,6.723132,6.726636,6.710793,6.742937
1,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,...,0,-9.945219e-01,-0.104528,0.000000,1.000000,253.0,6.683443,6.683843,6.663115,6.689930
2,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,...,2,8.660254e-01,0.500000,0.974928,-0.222521,636.0,6.670720,6.677606,6.667378,6.692963
3,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,...,2,-8.660254e-01,0.500000,0.974928,-0.222521,433.0,6.687591,6.690707,6.682231,6.690712
4,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,...,4,-7.431448e-01,0.669131,-0.433884,-0.900969,1131.0,6.623063,6.626821,6.615705,6.616789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,50.0,Female,38782.0,Married,1.0,Bachelor's,Unknown,14.498639,Rural,Premium,...,4,9.510565e-01,-0.309017,-0.433884,-0.900969,1271.0,6.876002,6.877139,6.827030,6.879146
799996,,Female,73462.0,Single,0.0,Master's,Unknown,8.145748,Rural,Basic,...,1,-4.067366e-01,0.913545,0.781831,0.623490,644.0,6.375881,6.356761,6.368920,6.384323
799997,26.0,Female,35178.0,Single,0.0,Master's,Employed,6.636583,Urban,Comprehensive,...,0,-1.133108e-15,1.000000,0.000000,1.000000,1919.0,6.722048,6.713081,6.706430,6.718744
799998,34.0,Female,45661.0,Single,3.0,Master's,Unknown,15.937248,Urban,Premium,...,0,9.510565e-01,-0.309017,0.000000,1.000000,967.0,6.707286,6.697231,6.683486,6.695484


# HyperOpt

In [9]:
import json
from axyom_utilities.wrappers import XGBRegressorWrapper
from axyom_utilities.training import train_model_cv
import optuna
import torch
from optuna.samplers import TPESampler
from optuna.visualization.matplotlib import (
    plot_optimization_history, 
    plot_param_importances, 
    plot_parallel_coordinate,
    plot_slice,
    plot_contour
)
import matplotlib.pyplot as plt

# Request GPU histogram training only when CUDA is available, so that both the
# search branch and the stored-parameter branch also run on CPU-only machines.
xgb_tree_method = "gpu_hist" if torch.cuda.is_available() else "auto"

if HYPER_OPT:
    # Parameters shared by every trial. n_estimators is a high cap because each
    # CV fit is run with early stopping.
    fixed_params = {
        "n_estimators": 10000,
        "objective": "reg:squarederror",  # XGBoost regression objective
        "tree_method": xgb_tree_method,
        "verbosity": 0,
        "enable_categorical": True,
    }

    def objective(trial):
        """Score one Optuna trial for the level-1 XGBoost model.

        Args:
            trial: Optuna trial used to sample the hyperparameters.

        Returns:
            The mean of ``results['cv_scores']`` from a 5-split CV run; the
            study minimizes this value.
        """
        # Search space sampled for this trial.
        varying_params = {
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 8, 15),
            "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 50, log=True),
            "subsample": trial.suggest_float("subsample", 0.4, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 5),
            "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
            "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
        }

        xgb_params = {**fixed_params, **varying_params}

        # Initialize the regressor with the trial parameters.
        model = XGBRegressorWrapper(**xgb_params)

        # Evaluate using K-Fold CV with early stopping.
        results = train_model_cv(
            model,
            X_train_L1,
            y_train,
            X_test_L1,
            X_orig,
            cv_splits=5,
            early_stopping_rounds=50,
        )
        score = results['cv_scores'].mean()

        # Keep the iteration count with the trial so it can be reused as
        # n_estimators once the best trial is known.
        trial.set_user_attr("best_iteration", results['best_iteration'])

        return score

    # The study is persisted in SQLite and resumed when it already exists.
    study = optuna.create_study(
        direction="minimize",
        study_name="XGB_v1",
        storage="sqlite:///xgb_study_v1.db",
        load_if_exists=True,
        sampler=TPESampler(seed=666),
    )
    study.optimize(objective, n_trials=100, timeout=HYPER_OPT_TIME)

    # Best parameters and result
    print("Best Trial: ", study.best_trial.params)
    print("Best RMSE: ", study.best_value)

    xgb_best_params = {**fixed_params, **study.best_trial.params}

    # Only replace the n_estimators cap when the best trial recorded an
    # iteration count; otherwise the cap from fixed_params is kept rather than
    # being overwritten with None.
    best_iteration = study.best_trial.user_attrs.get("best_iteration")
    if best_iteration is not None:
        xgb_best_params["n_estimators"] = best_iteration

    with open("xgb_best_params.json", "w") as f:
        json.dump(xgb_best_params, f, indent=4)

    plot_optimization_history(study)
    plt.show()

    plot_param_importances(study)
    plt.show()

    plot_slice(study)
    plt.show()

else:
    # Stored parameters used when the search is skipped.
    xgb_best_params = {
        "n_estimators": 2936,
        "objective": "reg:squarederror",
        "tree_method": xgb_tree_method,
        "verbosity": 0,
        "enable_categorical": True,
        "learning_rate": 0.0015809559369696921,
        "max_depth": 8,
        "min_child_weight": 0.0010519063716596696,
        "subsample": 0.9259500688120887,
        "colsample_bytree": 0.5156374474989557,
        "gamma": 4.750602896904545,
        "lambda": 7.589009467266231,
        "alpha": 3.6958778093250424
    }


In [10]:
# Display the parameter set chosen by the previous cell (searched or stored).
xgb_best_params

{'n_estimators': 2936,
 'objective': 'reg:squarederror',
 'tree_method': 'gpu_hist',
 'verbosity': 0,
 'enable_categorical': True,
 'learning_rate': 0.0015809559369696921,
 'max_depth': 8,
 'min_child_weight': 0.0010519063716596696,
 'subsample': 0.9259500688120887,
 'colsample_bytree': 0.5156374474989557,
 'gamma': 4.750602896904545,
 'lambda': 7.589009467266231,
 'alpha': 3.6958778093250424}

In [11]:
# Raise the tree cap in place; the training cell below passes
# early_stopping_rounds to train_model_cv.
xgb_best_params.update({"n_estimators": 10000})

# Model Training

In [37]:
# Build the level-1 XGBoost regressor from the selected parameters.
model = XGBRegressorWrapper(**xgb_best_params)

# 7-split cross-validated training on the stacked features, with early
# stopping; the returned dict is reused by the cells below.
results = train_model_cv(
    model=model,
    X_train=X_train_L1,
    y_train=y_train,
    X_test=X_test_L1,
    X_orig=X_orig,
    cv_splits=7,
    early_stopping_rounds=100,
)

Training fold 1...
Fold 1 RMSE: 1.0484
Training fold 2...
Fold 2 RMSE: 1.0415
Training fold 3...
Fold 3 RMSE: 1.0457
Training fold 4...
Fold 4 RMSE: 1.0421
Training fold 5...
Fold 5 RMSE: 1.0461
Training fold 6...
Fold 6 RMSE: 1.0475
Training fold 7...
Fold 7 RMSE: 1.0416
Mean CV RMSE: 1.0447 ± 0.0027


In [38]:
# Summarize the per-fold scores; both values are reused by later cells.
mean_score, std_score = np.mean(results["cv_scores"]), np.std(results["cv_scores"])

# Summary that gets persisted next to the notebook.
data = dict(mean_score=mean_score, std_score=std_score)

# Write the summary as indented JSON.
with open("score.json", "w") as score_file:
    score_file.write(json.dumps(data, indent=4))

In [39]:
import joblib

# Single-column frames holding the stacker's out-of-fold and test predictions.
oof_preds_df = pd.DataFrame({"oof_preds_stacking_L1_XGB": results["oof_preds"]})
test_preds_df = pd.DataFrame({"test_preds_stacking_L1_XGB": results["test_preds"]})

# Write each frame to its CSV file without the index column.
for preds_frame, csv_path in (
    (oof_preds_df, "oof_preds_stacking_L1_XGB.csv"),
    (test_preds_df, "test_preds_stacking_L1_XGB.csv"),
):
    preds_frame.to_csv(csv_path, index=False)

# Persist the fitted models returned by the CV run.
joblib.dump(results["models"], "xgboost_models.pkl")

['xgboost_models.pkl']

# Submission

In [40]:
# Report the range of the training target.
y_min, y_max = y_train.min(), y_train.max()
print(f"y_min = {y_min}")
print(f"y_max = {y_max}")

y_min = 0    3.044522
dtype: float64
y_max = 0    8.517193
dtype: float64


In [42]:
# The submission file name carries the mean CV score.
FILE_PATH = f"stacking_v1_{mean_score:.4f}.csv"

# Apply expm1 to the test predictions before writing them out.
# NOTE(review): presumably the target was trained on a log1p scale - confirm.
y_pred = np.expm1(results['test_preds'])

# The ids are taken from the index of X_test.
submission = pd.DataFrame({'id': X_test.index, 'Premium Amount': y_pred})
submission.to_csv(FILE_PATH, index=False)

In [43]:
import os

# Competition identifier and the message attached to the submission.
COMP_NAME = "playground-series-s4e12"


SUBMIT_MESSAGE = f"Stacking v1: Mean score: {mean_score:.4f} +/- {std_score:.4f}"

# Submit to Kaggle only when enabled in the config cell. The previous
# `if True:` submitted on every run regardless of SUBMIT_TO_KAGGLE.
if SUBMIT_TO_KAGGLE:
    # All interpolated values are built in this notebook (a constant and
    # formatted floats), so the command line contains no external input.
    exit_status = os.system(f'kaggle competitions submit -c {COMP_NAME} -f {FILE_PATH} -m "{SUBMIT_MESSAGE}"')
    # os.system does not raise on failure, so report a non-zero status.
    if exit_status != 0:
        print(f"kaggle submit command exited with status {exit_status}")