In [None]:
import numpy as np
import pandas as pd
import os
import pandas as pd
import sys

# Config

In [None]:
# --- Run switches -----------------------------------------------------------
# HYPER_OPT is not referenced by any other cell visible in this notebook.
HYPER_OPT = False
# Forwarded to extract_data() and gates preprocessing of X_orig.
USE_ORIGINAL_DATA = False
# When True, the submission cell invokes the Kaggle CLI.
SUBMIT_TO_KAGGLE = False

# --- Time budgets (hours * 60 * 60) -----------------------------------------
# HYPER_OPT_TIME is presumably in seconds like AUTOGLUON_TIME; it is unused here.
HYPER_OPT_TIME = 5 * 60 * 60
# Passed to predictor.fit(time_limit=...), i.e. seconds.
AUTOGLUON_TIME = 11 * 60 * 60

In [None]:
# Project root: two levels above the directory the notebook is running in.
base_dir = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Input data and model artefact directories, both directly under the project root.
data_dir = os.path.join(base_dir, "data")
model_dir = os.path.join(base_dir, "models")

# Put the project root on the import path so modules located under it can be
# imported. Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
if base_dir not in sys.path:
    sys.path.append(base_dir)

# Data Extraction

In [None]:
# Project-local helper (presumably importable thanks to the base_dir entry
# added to sys.path earlier -- confirm where axyom_utilities lives).
from axyom_utilities.data_extraction import extract_data

# Load the training features/target, the test features and the "original"
# dataset counterparts. USE_ORIGINAL_DATA is forwarded to the loader; what
# X_orig / y_orig contain when the flag is False is not visible from here.
X_train, y_train, X_test, X_orig, y_orig = extract_data(data_dir, USE_ORIGINAL_DATA)

In [None]:
# The predictor is configured with label="Premium Amount" and is fitted on
# X_train alone, so the target has to live inside the training frame.
# Note: this mutates X_train in place.
X_train["Premium Amount"] = y_train # for autogluon

# Data Cleaning

In [None]:
from axyom_utilities.preprocessing import preprocess 

# Run the shared project preprocessing on every frame that is in use.
# X_train already carries the "Premium Amount" label column at this point.
# Each name is rebound to the preprocessed result, so re-running this cell
# feeds already-preprocessed frames back into preprocess().
X_train = preprocess(X_train)
X_test = preprocess(X_test)
# The original dataset is only preprocessed when it was requested in the config.
if USE_ORIGINAL_DATA:
    X_orig = preprocess(X_orig)

# Model Training

In [10]:
from autogluon.tabular import TabularDataset, TabularPredictor

# Configure an AutoGluon TabularPredictor (not a single CatBoost model) for a
# regression task on the "Premium Amount" column, scored with RMSE.
# No `path` is given, so AutoGluon picks its own output directory; the
# captured output of this cell shows a timestamped folder under AutogluonModels.
predictor = TabularPredictor(
    problem_type='regression',
    eval_metric='rmse',
    label="Premium Amount",
    #groups='fold',
    verbosity=2
)

No path specified. Models will be saved in: "AutogluonModels\ag-20241212_033419"


In [None]:
# Train on the label-augmented, preprocessed training frame.
# - time_limit: overall budget taken from AUTOGLUON_TIME (the fit log reports
#   it in seconds).
# - presets='best_quality': per the captured log this enables bagging and
#   stacking (num_bag_folds=8, num_stack_levels=1).
# - excluded_model_types: the KNN, NN_TORCH, FASTAI and RF families are left out.
# - ag_args_fit: asks for one GPU for model fitting (assumes a GPU is
#   available on this machine -- TODO confirm).
predictor.fit(
    train_data=X_train,
    time_limit=AUTOGLUON_TIME,
    presets='best_quality',
    excluded_model_types=['KNN', 'NN_TORCH', 'FASTAI', 'RF'],
    ag_args_fit={'num_gpus': 1}
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.16
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          12
Memory Avail:       4.52 GB / 15.82 GB (28.6%)
Disk Space Avail:   223.20 GB / 930.65 GB (24.0%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 9900s of the 39600s of remaining time (25%).
		Con

In [3]:
from autogluon.tabular import TabularDataset, TabularPredictor

# Reload a previously fitted predictor from disk instead of refitting.
# NOTE(review): the directory is hard-coded to one specific timestamped run
# (the one reported when the predictor was constructed); a fresh fit writes
# to a different folder, so this path must be updated by hand.
predictor = TabularPredictor.load("AutogluonModels/ag-20241212_033419")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Show the model leaderboard, colour-grading the score_val column
# (red -> yellow -> green). silent=True is passed so the table is only
# rendered through the styled cell output.
predictor.leaderboard(silent=True).style.background_gradient(subset=['score_val'], cmap='RdYlGn')

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBM_r131_BAG_L2,-1.044636,root_mean_squared_error,1579.042057,26067.802836,32.618942,339.805474,2,True,34
1,LightGBM_BAG_L2,-1.04474,root_mean_squared_error,1550.652359,25809.377939,4.229245,81.380576,2,True,28
2,CatBoost_BAG_L2,-1.044752,root_mean_squared_error,1546.741123,26068.520046,0.318008,340.522683,2,True,29
3,LightGBMLarge_BAG_L2,-1.044763,root_mean_squared_error,1552.845029,25861.189059,6.421915,133.191697,2,True,32
4,CatBoost_r9_BAG_L2,-1.044784,root_mean_squared_error,1547.145721,25868.023745,0.722606,140.026383,2,True,35
5,XGBoost_BAG_L2,-1.04479,root_mean_squared_error,1550.789508,25852.221941,4.366394,124.224578,2,True,31
6,CatBoost_r177_BAG_L2,-1.044804,root_mean_squared_error,1546.715791,25822.085183,0.292677,94.087821,2,True,33
7,LightGBMXT_BAG_L2,-1.044808,root_mean_squared_error,1552.702361,25822.746146,6.279246,94.748784,2,True,27
8,LightGBM_r96_BAG_L2,-1.044826,root_mean_squared_error,1626.663992,26306.824973,80.240877,578.827611,2,True,36
9,XGBoost_r33_BAG_L2,-1.044846,root_mean_squared_error,1550.936574,25940.808794,4.513459,212.811431,2,True,37


In [None]:
def get_ensemble_weights(predictor):
    """Collect the member weights of every ensemble model in a predictor.

    Looks at ``predictor.info()["model_info"]`` and, for each model whose name
    contains ``"Ensemble"``, reads ``"model_weights"`` from the first entry of
    that model's ``"children_info"``.

    Parameters
    ----------
    predictor
        Any object exposing ``info()`` that returns a dict with a
        ``"model_info"`` mapping of model name -> info dict.

    Returns
    -------
    dict
        ``{ensemble_model_name: model_weights}``. Ensembles that report no
        children, or whose first child has no ``"model_weights"`` entry, are
        skipped instead of raising ``KeyError`` / ``IndexError``.
    """
    ensemble_weights = {}
    for model_name, model_info in predictor.info()["model_info"].items():
        if "Ensemble" not in model_name:
            continue

        # Tolerate a missing or empty children_info mapping.
        children_info = model_info.get("children_info") or {}
        # Only the first child is inspected; avoid materialising the key list.
        first_child = next(iter(children_info.values()), None)
        if not first_child or "model_weights" not in first_child:
            continue

        ensemble_weights[model_name] = first_child["model_weights"]
    return ensemble_weights

In [None]:
import matplotlib.pyplot as plt

ensemble_weights = get_ensemble_weights(predictor)

# Use matplotlib's built-in qualitative "Set2" palette. The previous version
# referenced `sns`, which is never imported in this notebook and therefore
# raised NameError on a fresh kernel.
set2_colors = plt.get_cmap("Set2").colors

# One pie chart per ensemble, showing the share of each member model.
for ensemble_name, weights in ensemble_weights.items():
    # Cycle through the palette in case an ensemble has more members than colours.
    slice_colors = [set2_colors[i % len(set2_colors)] for i in range(len(weights))]

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.pie(
        list(weights.values()),
        labels=list(weights.keys()),
        autopct='%1.1f%%',
        colors=slice_colors,
    )
    ax.set_title(ensemble_name)
    fig.tight_layout()
    plt.show()

In [None]:
# mean_score = np.mean(results["cv_scores"])
# std_score = np.std(results["cv_scores"])

# # Prepare the data
# data = {
#     "mean_score": mean_score,
#     "std_score": std_score
# }

# # Save to a JSON file
# with open("score.json", "w") as json_file:
#     json.dump(data, json_file, indent=4)  # Use `indent` for readability

In [None]:
# Persist out-of-fold (OOF) and test predictions.
#
# No earlier cell of this notebook defines `results` (it is a leftover from a
# different training template), so it is built here from the AutoGluon
# predictor. Predictions are kept on the scale the predictor was trained on.
results = {
    # OOF predictions for the training rows (requires a bagged fit, which the
    # 'best_quality' preset provides).
    "oof_preds": predictor.predict_oof().to_numpy(),
    # Predictions for the preprocessed test frame.
    "test_preds": predictor.predict(X_test).to_numpy(),
}

# Save OOF predictions as a CSV file. The file name now carries the ".csv"
# extension that the log message has always announced.
oof_preds_df = pd.DataFrame({"oof_preds": results["oof_preds"]})
oof_preds_df.to_csv("oof_preds.csv", index=False)
print("OOF predictions saved to oof_preds.csv.")

test_preds_df = pd.DataFrame({"test_preds": results["test_preds"]})
test_preds_df.to_csv("test_preds.csv", index=False)
print("Test predictions saved to test_preds.csv.")

# The fitted models are already persisted by AutoGluon in the predictor's own
# directory, so there is no separate model list to pickle here.
print(f"Fitted models are stored by AutoGluon under {predictor.path}")

# Submission

In [None]:
# Undo a log1p transform on the stored test predictions (expm1 is its
# inverse). Presumably the target was log1p-transformed upstream -- confirm in
# extract_data/preprocess.
# NOTE(review): relies on `results['test_preds']` having been populated by an
# earlier cell.
y_pred = np.expm1(results['test_preds'])

# Submission frame: the test index is used as the id column.
submission = pd.DataFrame({
    'id': X_test.index,  
    'Premium Amount': y_pred
})

# Written to the current working directory; picked up by the Kaggle submit cell.
submission.to_csv('submission.csv', index=False)

In [None]:
import subprocess

# Competition slug and the file produced by the submission cell.
COMP_NAME = "playground-series-s4e12"
FILE_PATH = "submission.csv"

# Build the submission message from the predictor itself. The previous
# message interpolated `mean_score` / `std_score`, which only exist in
# commented-out code, so the cell raised NameError even with submission
# disabled. The leaderboard reports RMSE in higher-is-better (negated) form,
# hence the sign flip.
best_val_rmse = -predictor.leaderboard(silent=True)["score_val"].max()
SUBMIT_MESSAGE = f"AutoGluon best_quality: best validation RMSE: {best_val_rmse:.4f}"

# Submit to Kaggle
if SUBMIT_TO_KAGGLE:
    # Pass arguments as a list so the message is never re-parsed by a shell.
    completed = subprocess.run(
        ["kaggle", "competitions", "submit", "-c", COMP_NAME, "-f", FILE_PATH, "-m", SUBMIT_MESSAGE],
        check=False,
    )
    # Surface a failed submission instead of silently ignoring the exit code.
    if completed.returncode != 0:
        print(f"Kaggle submission failed with exit code {completed.returncode}.")

In [None]:
# import os
# from IPython.display import display, Javascript

# # Define your message and file paths
# COMP_NAME = "playground-series-s4e12"
# FILE_PATH = "submission.csv"

# SUBMIT_MESSAGE = f"Clean XGB: Mean score: {mean_score:.4f} +/- {std_score:.4f}"

# # Submit to Kaggle
# if SUBMIT_TO_KAGGLE: 
#     os.system(f'kaggle competitions submit -c {COMP_NAME} -f {FILE_PATH} -m "{SUBMIT_MESSAGE}"')

# # Git commit and push
# GIT_COMMIT_MESSAGE = f"Submission: {SUBMIT_MESSAGE}"

# # save notebook
# # display(Javascript('IPython.notebook.save_checkpoint()'))

# # Commands for Git
# os.system("git add .")  # Stage all changes (adjust if you only want specific files)
# os.system(f'git commit -m "{GIT_COMMIT_MESSAGE}"')  # Commit changes with a message
# os.system("git push origin main")  # Push to the main branch (change branch if needed)