In [5]:
import numpy as np
import pandas as pd

# Config

In [26]:
HYPER_OPT = False 
HYPER_OPT_TIME = 3600*5
USE_ORIGINAL_DATA = False 
SUBMIT_TO_KAGGLE = False

# Data Extraction

In [7]:
import os
import pandas as pd

# Define the base directory (where the notebook is running)
base_dir = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Define the data directory
data_dir = os.path.join(base_dir, "data")

model_dir = os.path.join(base_dir, "models")

# Construct file paths
train_file = os.path.join(data_dir, "train.csv")
test_file = os.path.join(data_dir, "test.csv")
original_file = os.path.join(data_dir, "Insurance Premium Prediction Dataset.csv")

# Load the datasets
train_df = pd.read_csv(train_file, index_col="id")
test_df = pd.read_csv(test_file, index_col="id")
original_df = pd.read_csv(original_file)

# Data Cleaning

In [8]:
original_df = original_df.dropna(subset=["Premium Amount"])

In [9]:
def preprocess(df_in):
    df = df_in.copy()
    categorical_features = df.select_dtypes(include=['object', 'category']).columns
    
    df[categorical_features] = df[categorical_features].fillna("Unknown")

    for col in categorical_features:
        df[col] = df[col].astype('category')
    
    df["Policy Start Date"] = pd.to_datetime(df["Policy Start Date"])
    df["Month"]       = df["Policy Start Date"].dt.month
    df["Day"]         = df["Policy Start Date"].dt.day
    df["Week"]        = df["Policy Start Date"].dt.isocalendar().week
    df["Weekday"]     = df["Policy Start Date"].dt.weekday
    df['DaySin']      = np.sin(2 * np.pi * df['Day'] / 30)  
    df['DayCos']      = np.cos(2 * np.pi * df['Day'] / 30)
    df['WeekdaySin']  = np.sin(2 * np.pi * df['Weekday'] / 7)
    df['WeekdayCos']  = np.cos(2 * np.pi * df['Weekday'] / 7)
    
    df['DaysSinceStart']  = \
    np.ceil(
        (pd.to_datetime("12-31-2024") - df["Policy Start Date"])/ pd.Timedelta(1, "d")
    )

    df = df.drop("Policy Start Date", axis=1, errors = "ignore")

    return df

train_df = preprocess(train_df)
test_df = preprocess(test_df)
original_df = preprocess(original_df)

# Data Splitting

In [None]:
X_train = train_df.drop('Premium Amount', axis=1)
#X_train["Premium Amount"] = np.log1p(X_train['Premium Amount'].values)
y_train = pd.DataFrame(np.log1p(train_df['Premium Amount'].values))

X_test = test_df

if USE_ORIGINAL_DATA:
    X_train["Synthetic"] = 1
    X_test["Synthetic"] = 1
    X_orig = original_df.drop('Premium Amount', axis=1)
    X_orig["Synthetic"] = 0
    y_orig = pd.DataFrame(np.log1p(original_df['Premium Amount'].values))
    
else:
    X_orig = None

# Training Function

In [11]:
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
import numpy as np

def train_model_cv(model, X_train, y_train, X_test, X_orig, cv_splits=7, early_stopping_rounds=None):
    # Initialize the K-Fold for CV
    kf = KFold(n_splits=cv_splits, shuffle=True, random_state=84)
    
    # Initialize placeholders for results
    oof_preds = np.zeros(X_train.shape[0])
    test_preds = np.zeros(X_test.shape[0])
    cv_scores = np.zeros(cv_splits)
    best_iterations = np.zeros(cv_splits)
    models = []
    
    # Loop through each fold
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
        print(f"Training fold {fold + 1}...")
        
        # Split data
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        if X_orig is not None:
            # Append rows
            X_train_fold = pd.concat([X_train_fold, X_orig], ignore_index=True)
            y_train_fold = pd.concat([y_train_fold, y_orig], ignore_index=True)
        
        # Fit the model on training data
        if early_stopping_rounds:
            model.fit(
                X_train_fold, y_train_fold, 
                eval_set=(X_val_fold, y_val_fold),
                early_stopping_rounds=early_stopping_rounds,
                verbose=False
            )
            best_iterations[fold]=model.get_best_iteration()
        else:
            model.fit(X_train_fold, y_train_fold)
        
        # Predict on validation and test data
        oof_preds[val_idx] = model.predict(X_val_fold)
        test_preds += model.predict(X_test)
        
        # Calculate score for this fold
        fold_score = root_mean_squared_error(y_val_fold, oof_preds[val_idx])
        cv_scores[fold] = fold_score
        models.append(model)
        
        print(f"Fold {fold + 1} RMSE: {fold_score:.4f}")
    
    # Summary statistics
    test_preds /= cv_splits
    mean_score = np.mean(cv_scores)
    std_score = np.std(cv_scores)
    best_iteration = best_iterations.mean()
    print(f"Mean CV RMSE: {mean_score:.4f} ± {std_score:.4f}")

    return {\
        "oof_preds": oof_preds,
        "test_preds": test_preds,
        "cv_scores": cv_scores,
        "models": models,
        "best_iteration": int(best_iterations.mean())
    }


# XGBOOST Wrapper

In [12]:
from xgboost import XGBRegressor
from sklearn.base import BaseEstimator, RegressorMixin
import pandas as pd
from xgboost.callback import EarlyStopping

class XGBRegressorWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.params = kwargs

    def fit(self, X, y, eval_set=None, early_stopping_rounds=None, verbose=False):
        """
        Train the XGBRegressor model.

        Parameters:
        - X: pd.DataFrame or array-like
          Training features.
        - y: array-like
          Training labels.
        - eval_set: tuple or None
          Optional validation set for early stopping, in the form [(X_val, y_val)].
        - early_stopping_rounds: int or None
          Number of rounds for early stopping. Set to None to disable.
        - verbose: bool
          Whether to print training progress.
        """
        # Ensure X is a DataFrame
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # Initialize and train the XGB Regressor
        self.params["early_stopping_rounds"] = early_stopping_rounds 
        
        self.xgb_model_ = XGBRegressor(**self.params)
        
        # callbacks = []
        # if early_stopping_rounds and eval_set:
        #     callbacks.append(EarlyStopping(rounds=early_stopping_rounds, save_best=True, maximize=False, metric_name="rmse"))


        # Train the model with early stopping if validation set is provided
        self.xgb_model_.fit(
            X,
            y,
            eval_set=[eval_set],
            #early_stopping_rounds=early_stopping_rounds,
            #callbacks=callbacks,
            verbose=verbose
        )
        
        return self

    def predict(self, X):
        return self.xgb_model_.predict(X)

    def get_best_iteration(self):
        """
        Get the best iteration for early stopping.
        """
        return self.xgb_model_.best_iteration if hasattr(self.xgb_model_, "best_iteration") else None

    def get_params(self, deep=True):
        return self.params
    
    def set_params(self, **parameters):
        self.params.update(parameters)
        return self


# HyperOpt

In [13]:
import json
import optuna
import torch
from optuna.samplers import TPESampler
from optuna.visualization.matplotlib import (
    plot_optimization_history, 
    plot_param_importances, 
    plot_parallel_coordinate,
    plot_slice,
    plot_contour
)
import matplotlib.pyplot as plt

if HYPER_OPT:
    fixed_params = {
        "n_estimators": 10000,
        "objective": "reg:squarederror",  # XGBoost regression objective
        "tree_method": "gpu_hist" if torch.cuda.is_available() else "auto",
        "verbosity": 0,
        "enable_categorical": True
    }
    
    # Define the Optuna objective function
    def objective(trial):
        # Define hyperparameter space
        varying_params = {
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 8, 15),
            "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 50, log=True),
            "subsample": trial.suggest_float("subsample", 0.4, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 5),
            "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
            "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True)
        }

        xgb_params = {**fixed_params, **varying_params}
        
        # Initialize XGBRegressor with trial parameters
        model = XGBRegressorWrapper(**xgb_params)
        
        # Evaluate using K-Fold CV with early stopping
        results = train_model_cv(\
            model, 
            X_train, 
            y_train, 
            X_test, 
            X_orig,
            cv_splits=5, 
            early_stopping_rounds=50
        )
        score = results['cv_scores'].mean()

        trial.set_user_attr("best_iteration", results['best_iteration'])
        
        return score
    
    # Prepare data
    # Replace X_train, y_train, and X_test with your data
    # Example:
    # X_train, X_test, y_train = ...
    
    # Run Optuna optimization
    study = optuna.create_study( \
        direction="minimize", 
        study_name="XGB_v1", 
        storage="sqlite:///xgb_study_v1.db", 
        load_if_exists=True,
        sampler=TPESampler(seed=666)
    )
    study.optimize(objective, n_trials=100, timeout=HYPER_OPT_TIME)
    
    # Best parameters and result
    print("Best Trial: ", study.best_trial.params)
    print("Best RMSE: ", study.best_value)

    xgb_best_params = {**fixed_params, **study.best_trial.params}

    xgb_best_params["n_estimators"] = study.best_trial.user_attrs.get("best_iteration", None)

    with open("xgb_best_params.json", "w") as f:
        json.dump(xgb_best_params, f, indent=4)
   
    plot_optimization_history(study)
    plt.show()
    
    plot_param_importances(study)
    plt.show()
    
    plot_slice(study)
    plt.show()

else:
    xgb_best_params = {
        'n_estimators': 2225,
        'objective': 'reg:squarederror',
        'tree_method': 'gpu_hist',
        'verbosity': 0,
        'enable_categorical': True,
        'learning_rate': 0.003059929305190928,
        'max_depth': 8,
        'min_child_weight': 12.496270561250991,
        'subsample': 0.8428246186530037,
        'colsample_bytree': 0.9999895920675128,
        'gamma': 2.937438656382514,
        'lambda': 1.5752155403171972,
        'alpha': 0.4038060866963702
    }


  from .autonotebook import tqdm as notebook_tqdm


In [14]:
xgb_best_params

{'n_estimators': 2225,
 'objective': 'reg:squarederror',
 'tree_method': 'gpu_hist',
 'verbosity': 0,
 'enable_categorical': True,
 'learning_rate': 0.003059929305190928,
 'max_depth': 8,
 'min_child_weight': 12.496270561250991,
 'subsample': 0.8428246186530037,
 'colsample_bytree': 0.9999895920675128,
 'gamma': 2.937438656382514,
 'lambda': 1.5752155403171972,
 'alpha': 0.4038060866963702}

In [15]:
xgb_best_params["n_estimators"] = 10000

# Model Training

In [16]:
# Initialize a CatBoost Regressor
model = XGBRegressorWrapper(**xgb_best_params)

# Use the train_model function to train and evaluate the model
results = train_model_cv(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    X_orig=X_orig,
    early_stopping_rounds=100,
    cv_splits=7
)

Training fold 1...
Fold 1 RMSE: 1.0491
Training fold 2...


KeyboardInterrupt: 

In [None]:
mean_score = np.mean(results["cv_scores"])
std_score = np.std(results["cv_scores"])

# Prepare the data
data = {
    "mean_score": mean_score,
    "std_score": std_score
}

# Save to a JSON file
with open("score.json", "w") as json_file:
    json.dump(data, json_file, indent=4)  # Use `indent` for readability

In [None]:
import joblib

# Save OOF predictions as a CSV file
oof_preds_df = pd.DataFrame({"oof_preds": results["oof_preds"]})
oof_preds_df.to_csv("oof_preds", index=False)
print("OOF predictions saved to oof_preds.csv.")

test_preds_df = pd.DataFrame({"test_preds": results["test_preds"]})
test_preds_df.to_csv("test_preds", index=False)
print("Test predictions saved to test_preds.csv.")

joblib.dump(results["models"], "xgboost_models.pkl")

OOF predictions saved to oof_preds.csv.
Test predictions saved to oof_preds.csv.


['xgboost_models.pkl']

# Submission

In [None]:
y_pred = np.expm1(results['test_preds'])

submission = pd.DataFrame({
    'id': X_test.index,  
    'Premium Amount': y_pred
})

submission.to_csv('submission.csv', index=False)

In [28]:
import os

# Define your message and file paths
COMP_NAME = "playground-series-s4e12"
FILE_PATH = "submission.csv"
mean_score = 155.0
std_score = 2.0
test_var=0

SUBMIT_MESSAGE = f"Clean XGB: Mean score: {mean_score:.4f} +/- {std_score:.4f}"

# Submit to Kaggle
if SUBMIT_TO_KAGGLE: 
    os.system(f'kaggle competitions submit -c {COMP_NAME} -f {FILE_PATH} -m "{SUBMIT_MESSAGE}"')

# Git commit and push
GIT_COMMIT_MESSAGE = f"Submission: {SUBMIT_MESSAGE}"

# Commands for Git
os.system("code --command workbench.action.files.save")
os.system("git add .")  # Stage all changes (adjust if you only want specific files)
os.system(f'git commit -m "{GIT_COMMIT_MESSAGE}"')  # Commit changes with a message
os.system("git push origin main")  # Push to the main branch (change branch if needed)

0