In [1]:
import numpy as np
import pandas as pd
import os
import pandas as pd
import sys

# Config

In [2]:
# Run switches for the whole notebook.
HYPER_OPT = False  # True: run the Optuna search; False: use the stored parameter set
HYPER_OPT_TIME = 5 * 60 * 60  # budget handed to the search as its timeout (5 hours, in seconds)
USE_ORIGINAL_DATA = False  # forwarded to extract_data; also gates preprocessing of X_orig
SUBMIT_TO_KAGGLE = False  # True: upload submission.csv with the kaggle CLI

In [3]:
# Project root: two directory levels above the current working directory
# (this assumes the kernel was started from the notebook's own folder).
base_dir = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Locations derived from the project root.
data_dir = os.path.join(base_dir, "data")
model_dir = os.path.join(base_dir, "models")

# Put the project root on the import path. The membership check keeps the
# cell idempotent: re-running it no longer appends the same entry again.
if base_dir not in sys.path:
    sys.path.append(base_dir)

# Data Extraction

In [4]:

# # Construct file paths
# train_file = os.path.join(data_dir, "train.csv")
# test_file = os.path.join(data_dir, "test.csv")
# original_file = os.path.join(data_dir, "Insurance Premium Prediction Dataset.csv")

# # Load the datasets
# train_df = pd.read_csv(train_file, index_col="id")
# test_df = pd.read_csv(test_file, index_col="id")
# original_df = pd.read_csv(original_file)

In [5]:
from axyom_utilities.data_extraction import extract_data

# Load everything through the shared project helper. The five returned
# objects are unpacked in the order the helper yields them.
# NOTE(review): X_orig / y_orig presumably carry no data when
# USE_ORIGINAL_DATA is False — confirm against extract_data.
(
    X_train,
    y_train,
    X_test,
    X_orig,
    y_orig,
) = extract_data(data_dir, USE_ORIGINAL_DATA)

# Data Cleaning

In [6]:
from axyom_utilities.preprocessing import preprocess

# Run the shared preprocessing routine on each feature frame. The names are
# rebound on purpose so that every later cell sees the processed version.
X_train = preprocess(X_train)
X_test = preprocess(X_test)

# The original-data frame is only processed when it is actually in use.
if USE_ORIGINAL_DATA:
    X_orig = preprocess(X_orig)

# Data Splitting

In [7]:
# X_train = train_df.drop('Premium Amount', axis=1)
# y_train = pd.DataFrame(np.log1p(train_df['Premium Amount'].values)) # Log Space
# X_test = test_df

# if USE_ORIGINAL_DATA:
#     X_train["Synthetic"] = 1
#     X_test["Synthetic"] = 1
#     X_orig = original_df.drop('Premium Amount', axis=1)
#     X_orig["Synthetic"] = 0
#     y_orig = pd.DataFrame(np.log1p(original_df['Premium Amount'].values))
    
# else:
#     X_orig = None

# HyperOpt

In [8]:
import json
from axyom_utilities.wrappers import XGBRegressorWrapper
from axyom_utilities.training import train_model_cv
import optuna
import torch
from optuna.samplers import TPESampler
from optuna.visualization.matplotlib import (
    plot_optimization_history, 
    plot_param_importances, 
    plot_parallel_coordinate,
    plot_slice,
    plot_contour
)
import matplotlib.pyplot as plt

# Choose the XGBoost tree method once, so that the tuning branch and the
# stored-parameters branch agree and the cell also runs on a CPU-only machine
# (previously the stored parameters hard-coded "gpu_hist").
TREE_METHOD = "gpu_hist" if torch.cuda.is_available() else "auto"

if HYPER_OPT:
    # Settings shared by every trial. n_estimators is an upper bound; the
    # early stopping requested from train_model_cv is expected to cut it short.
    fixed_params = {
        "n_estimators": 10000,
        "objective": "reg:squarederror",  # XGBoost regression objective
        "tree_method": TREE_METHOD,
        "verbosity": 0,
        "enable_categorical": True,
    }

    def objective(trial):
        """Optuna objective: cross-validated score of one sampled configuration.

        Args:
            trial: The Optuna trial used to sample the hyperparameters.

        Returns:
            The mean of the per-fold scores reported by ``train_model_cv``.
            The study is created with ``direction="minimize"``, so lower is
            better.
        """
        # Search space sampled for this trial.
        varying_params = {
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 8, 15),
            "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 50, log=True),
            "subsample": trial.suggest_float("subsample", 0.4, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 5),
            "lambda": trial.suggest_float("lambda", 1e-3, 10, log=True),
            "alpha": trial.suggest_float("alpha", 1e-3, 10, log=True),
        }

        xgb_params = {**fixed_params, **varying_params}

        # Initialize the wrapped regressor with the trial parameters.
        model = XGBRegressorWrapper(**xgb_params)

        # Evaluate with 5-fold CV and early stopping.
        results = train_model_cv(
            model,
            X_train,
            y_train,
            X_test,
            X_orig,
            cv_splits=5,
            early_stopping_rounds=50,
        )

        # Remember the iteration count reported for this trial so the best
        # trial's value can be reused as n_estimators afterwards.
        trial.set_user_attr("best_iteration", results['best_iteration'])

        # np.mean accepts both lists and arrays, matching how the final CV
        # scores are summarised later in the notebook.
        return float(np.mean(results['cv_scores']))

    # Create the study, or resume the one already stored in the SQLite file
    # under the same name (load_if_exists=True).
    study = optuna.create_study(
        direction="minimize",
        study_name="XGB_v1",
        storage="sqlite:///xgb_study_v1.db",
        load_if_exists=True,
        sampler=TPESampler(seed=666),
    )
    # Stops after 100 trials or HYPER_OPT_TIME seconds, whichever comes first.
    study.optimize(objective, n_trials=100, timeout=HYPER_OPT_TIME)

    # Best parameters and result
    print("Best Trial: ", study.best_trial.params)
    print("Best RMSE: ", study.best_value)

    xgb_best_params = {**fixed_params, **study.best_trial.params}

    # Reuse the iteration count recorded for the best trial. If the attribute
    # is missing (for example a trial loaded from an older study), keep the
    # configured budget instead of silently storing None.
    xgb_best_params["n_estimators"] = study.best_trial.user_attrs.get(
        "best_iteration", fixed_params["n_estimators"]
    )

    with open("xgb_best_params.json", "w") as f:
        json.dump(xgb_best_params, f, indent=4)

    # Diagnostic plots of the search.
    plot_optimization_history(study)
    plt.show()

    plot_param_importances(study)
    plt.show()

    plot_slice(study)
    plt.show()

else:
    # Parameter set found by an earlier search; used when tuning is skipped.
    xgb_best_params = {
        'n_estimators': 2225,
        'objective': 'reg:squarederror',
        'tree_method': TREE_METHOD,
        'verbosity': 0,
        'enable_categorical': True,
        'learning_rate': 0.003059929305190928,
        'max_depth': 8,
        'min_child_weight': 12.496270561250991,
        'subsample': 0.8428246186530037,
        'colsample_bytree': 0.9999895920675128,
        'gamma': 2.937438656382514,
        'lambda': 1.5752155403171972,
        'alpha': 0.4038060866963702
    }


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Show the parameter set selected above (tuned in this run or the stored one).
xgb_best_params

{'n_estimators': 2225,
 'objective': 'reg:squarederror',
 'tree_method': 'gpu_hist',
 'verbosity': 0,
 'enable_categorical': True,
 'learning_rate': 0.003059929305190928,
 'max_depth': 8,
 'min_child_weight': 12.496270561250991,
 'subsample': 0.8428246186530037,
 'colsample_bytree': 0.9999895920675128,
 'gamma': 2.937438656382514,
 'lambda': 1.5752155403171972,
 'alpha': 0.4038060866963702}

In [10]:
# Reset the tree budget to 10000 regardless of the value selected above.
# NOTE(review): presumably the early stopping used during training decides
# the effective number of trees — confirm against train_model_cv.
xgb_best_params.update(n_estimators=10000)

# Model Training

In [11]:
# Build the XGBoost wrapper from the selected parameter set.
model = XGBRegressorWrapper(**xgb_best_params)

# Cross-validated training with cv_splits=7 and early_stopping_rounds=100.
# Later cells read the "cv_scores", "oof_preds", "test_preds" and "models"
# entries of `results`.
results = train_model_cv(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    X_orig=X_orig,
    cv_splits=7,
    early_stopping_rounds=100,
)

Training fold 1...
Fold 1 RMSE: 1.0491
Training fold 2...


KeyboardInterrupt: 

In [None]:
# Summarise the per-fold CV scores: mean and standard deviation
# (np.std with its default settings).
std_score = np.std(results["cv_scores"])
mean_score = np.mean(results["cv_scores"])

# Collect both statistics under the keys used in the output file.
data = dict(mean_score=mean_score, std_score=std_score)

# Write the summary to score.json, indented for readability.
with open("score.json", "w") as score_file:
    score_file.write(json.dumps(data, indent=4))

In [None]:
import joblib

# Output locations. NOTE(review): the two CSV files are written WITHOUT a
# ".csv" extension. The names are kept unchanged because other tooling may
# already read these exact paths; the log messages below now report the real
# file names instead of claiming a ".csv" suffix that was never written.
OOF_PREDS_PATH = "oof_preds"
TEST_PREDS_PATH = "test_preds"
MODELS_PATH = "xgboost_models.pkl"

# Save OOF predictions as a single-column CSV file (no index column).
oof_preds_df = pd.DataFrame({"oof_preds": results["oof_preds"]})
oof_preds_df.to_csv(OOF_PREDS_PATH, index=False)
print(f"OOF predictions saved to {OOF_PREDS_PATH}.")

# Save test predictions in the same single-column layout.
test_preds_df = pd.DataFrame({"test_preds": results["test_preds"]})
test_preds_df.to_csv(TEST_PREDS_PATH, index=False)
print(f"Test predictions saved to {TEST_PREDS_PATH}.")

# Persist whatever train_model_cv stored under "models". As the last
# expression of the cell, the return value of joblib.dump is displayed.
joblib.dump(results["models"], MODELS_PATH)

# Submission

In [None]:
# Map the test predictions through expm1, the inverse of log1p.
# NOTE(review): this assumes the target was log1p-transformed upstream —
# confirm against extract_data.
y_pred = np.expm1(results["test_preds"])

# Two-column submission: the test index as "id" plus the predicted premium.
submission = pd.DataFrame({"id": X_test.index, "Premium Amount": y_pred})

# Written without the DataFrame index, since "id" is already a column.
submission.to_csv("submission.csv", index=False)

In [None]:
import os
import subprocess
from IPython.display import display, Javascript


def _run_command(args):
    """Run an external command without a shell and report failures.

    Passing the arguments as a list avoids shell quoting problems when a
    message contains quotes or other special characters.

    Args:
        args: The command and its arguments as a list of strings.

    Returns:
        The exit code of the command, or None when it could not be started.
    """
    try:
        completed = subprocess.run(args, check=False)
    except OSError as exc:
        # Best effort, as before: a missing executable must not abort the cell.
        print(f"Could not run {args[0]!r}: {exc}")
        return None
    if completed.returncode != 0:
        print(f"Command {' '.join(args)!r} exited with code {completed.returncode}")
    return completed.returncode


# Define your message and file paths
COMP_NAME = "playground-series-s4e12"
FILE_PATH = "submission.csv"

SUBMIT_MESSAGE = f"Clean XGB: Mean score: {mean_score:.4f} +/- {std_score:.4f}"

# Submit to Kaggle
if SUBMIT_TO_KAGGLE:
    _run_command([
        "kaggle", "competitions", "submit",
        "-c", COMP_NAME,
        "-f", FILE_PATH,
        "-m", SUBMIT_MESSAGE,
    ])

# Git commit and push
# NOTE(review): these steps run even when SUBMIT_TO_KAGGLE is False, so the
# commit message says "Submission" although nothing was uploaded — confirm
# that this is intended.
GIT_COMMIT_MESSAGE = f"Submission: {SUBMIT_MESSAGE}"

# save notebook
# display(Javascript('IPython.notebook.save_checkpoint()'))

# Commands for Git
_run_command(["git", "add", "."])  # Stage all changes (adjust if you only want specific files)
_run_command(["git", "commit", "-m", GIT_COMMIT_MESSAGE])  # Commit changes with a message
_run_command(["git", "push", "origin", "main"])  # Push to the main branch (change branch if needed)