In [1]:
import numpy as np
import os
import pandas as pd
import sys

# Config

In [2]:
# Run the CatBoost hyperparameter search instead of using the fixed parameter set.
HYPER_OPT = False
# Budget handed to the tuner: 3 * 60 * 60 = 10800 (presumably seconds — confirm against CatBoostTuner).
HYPER_OPT_TIME = 3 * 60 * 60
# Forwarded to extract_data; also gates preprocessing of X_orig.
USE_ORIGINAL_DATA = False
# Switch for submitting predictions to Kaggle.
SUBMIT_TO_KAGGLE = False

In [3]:
# Project root: two directory levels above the current working directory.
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Sub-directories of the project root used for data and models.
data_dir = os.path.join(base_dir, "data")
model_dir = os.path.join(base_dir, "models")

# Make packages that live in the project root importable from this notebook.
sys.path.append(base_dir)

# Data Extraction

In [4]:
from axyom_utilities.data_extraction import extract_data

# Load the train/test features and targets, plus the "original" dataset pair.
# log_transform=False: presumably the target stays on its raw scale — confirm in extract_data.
(X_train, y_train, X_test, X_orig, y_orig) = extract_data(
    data_dir,
    USE_ORIGINAL_DATA,
    log_transform=False,
)

# Data Cleaning

In [5]:
# Import only the helpers this notebook uses: a wildcard import hides where
# names come from and can silently shadow names already defined above.
from axyom_utilities.preprocessing import all_to_string, preprocess_dates
from toolz import pipe


def preprocessor(df):
    """Apply the shared cleaning steps to one feature frame.

    Threads ``df`` through ``preprocess_dates`` and then ``all_to_string``
    (in that order, via ``toolz.pipe``) and returns the result.

    Parameters
    ----------
    df : feature frame to clean (presumably a pandas DataFrame — confirm
        against the helpers in ``axyom_utilities.preprocessing``).

    Returns
    -------
    Whatever ``all_to_string`` returns for the date-processed frame.
    """
    return pipe(df, preprocess_dates, all_to_string)


# NOTE: these assignments overwrite the raw frames, so re-running this cell
# feeds already-preprocessed data through the pipeline again.
X_train = preprocessor(X_train)
X_test = preprocessor(X_test)
if USE_ORIGINAL_DATA:
    X_orig = preprocessor(X_orig)

# HyperOpt

In [6]:
from axyom_utilities.hyperparameter_tuning import CatBoostTuner

# Choose the CatBoost parameters: either the fixed default set below, or the
# result of a tuning run limited by HYPER_OPT_TIME.
if not HYPER_OPT:
    best_params = {
        "iterations": 1000,
        "learning_rate": 0.1,
        "depth": 6,
        "eval_metric": "RMSE",
        "random_seed": 42,
        "verbose": 200,
        "task_type": 'GPU',
        "l2_leaf_reg": 0.7,
    }
else:
    #tune_lgbm(X_train, y_train, HYPER_OPT_TIME)
    catboost_tuner = CatBoostTuner(X_train, y_train, HYPER_OPT_TIME)
    best_params = catboost_tuner.tune()

In [7]:
# Display the selected hyperparameters for inspection.
best_params

{'iterations': 1000,
 'learning_rate': 0.1,
 'depth': 6,
 'eval_metric': 'RMSE',
 'random_seed': 42,
 'verbose': 200,
 'task_type': 'GPU',
 'l2_leaf_reg': 0.7}

In [8]:
# Raise the boosting-round cap for the final fit. The training call below passes
# early_stopping_rounds=100, so this looks like an upper bound rather than a
# fixed round count. Applied unconditionally, i.e. also to tuned parameters.
best_params["iterations"] = 10000

# Model Training

In [9]:
from axyom_utilities.training import train_model_cv
from axyom_utilities.wrappers import CatBoostRegressorWrapper
import json

# Build the CatBoost wrapper from the selected hyperparameters.
model = CatBoostRegressorWrapper(**best_params)

# Run train_model_cv with 5 CV splits and early stopping after 100 rounds.
# Later cells read results["oof_preds"] and results["test_preds"].
results = train_model_cv(
    model=model,
    X_train=X_train, y_train=y_train,
    X_test=X_test, X_orig=X_orig,
    early_stopping_rounds=100, cv_splits=5,
)

Training fold 1...
0:	learn: 863.2677193	test: 860.9104663	best: 860.9104663 (0)	total: 411ms	remaining: 1h 8m 26s
200:	learn: 844.8718757	test: 838.9274270	best: 838.9273049 (199)	total: 26.7s	remaining: 21m 42s
400:	learn: 843.5997939	test: 838.4290682	best: 838.4290682 (400)	total: 54.5s	remaining: 21m 43s
600:	learn: 842.4894869	test: 838.1354494	best: 838.1354494 (600)	total: 1m 22s	remaining: 21m 28s
800:	learn: 841.5527084	test: 838.0537504	best: 838.0537504 (800)	total: 1m 50s	remaining: 21m 9s
1000:	learn: 840.6650320	test: 837.9982753	best: 837.9974199 (993)	total: 2m 19s	remaining: 20m 50s
1200:	learn: 839.8060881	test: 837.9463404	best: 837.9445888 (1187)	total: 2m 47s	remaining: 20m 29s
bestTest = 837.9161963
bestIteration = 1276
Shrink model to first 1277 iterations.
Fold 1 RMSE: 837.9160
Training fold 2...
0:	learn: 862.4022758	test: 863.8295536	best: 863.8295536 (0)	total: 419ms	remaining: 1h 9m 45s
200:	learn: 843.8669190	test: 841.7237323	best: 841.7237323 (200)	total

In [14]:
# Clamp negative predictions to zero. The RMSLE computed below takes
# logarithms, and scikit-learn's mean_squared_log_error rejects negative
# values. The test predictions get the same clamp so that the saved OOF and
# test predictions share one post-processing step.
results["oof_preds"] = np.maximum(0, results["oof_preds"])
results["test_preds"] = np.maximum(0, results["test_preds"])

In [15]:
from sklearn.metrics import mean_squared_log_error


def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` against ``y_true``.

    Computed as the square root of scikit-learn's ``mean_squared_log_error``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)


# Score the out-of-fold predictions against the training targets.
rmsle_score = rmsle(y_train, results["oof_preds"])

print(f"RMSLE score = {rmsle_score}")

RMSLE score = 1.1419423458662257


In [16]:
# mean_score = np.mean(results["cv_scores"])
# std_score = np.std(results["cv_scores"])

# # Prepare the data
# data = {
#     "mean_score": mean_score,
#     "std_score": std_score
# }

# # Save to a JSON file
# with open("score.json", "w") as json_file:
#     json.dump(data, json_file, indent=4)  # Use `indent` for readability

In [None]:
import joblib

# Persist the out-of-fold and test predictions as single-column CSV files in
# the current working directory.
oof_preds_df = pd.DataFrame(
    {"oof_preds_nonlog": results["oof_preds"]}
)
test_preds_df = pd.DataFrame(
    {"test_preds_nonlog": results["test_preds"]}
)

oof_preds_df.to_csv("oof_preds_nonlog.csv", index=False)
test_preds_df.to_csv("test_preds_nonlog.csv", index=False)

# Saving the fitted models is currently disabled.
#joblib.dump(results["models"], "models.pkl")

['models.pkl']

# Submission

In [12]:
# y_pred = np.expm1(results['test_preds'])

# submission = pd.DataFrame({
#     'id': X_test.index,  
#     'Premium Amount': y_pred
# })

# FILE_PATH = f"nonlog_preds_RMSE_{rmsle_score:.4f}.csv"

# submission.to_csv(FILE_PATH, index=False)

In [13]:
# Submission is driven by the SUBMIT_TO_KAGGLE flag from the config cell.
if SUBMIT_TO_KAGGLE:
    import subprocess

    COMP_NAME = "playground-series-s4e12"

    # Build the submission file here so this cell does not depend on names
    # that only exist in commented-out code. The model was trained with
    # log_transform=False, so predictions are written as-is (no expm1), only
    # clamped at zero.
    FILE_PATH = f"nonlog_preds_RMSLE_{rmsle_score:.4f}.csv"
    submission = pd.DataFrame({
        # assumes the test index carries the competition ids — TODO confirm
        "id": X_test.index,
        "Premium Amount": np.maximum(0, results["test_preds"]),
    })
    submission.to_csv(FILE_PATH, index=False)

    SUBMIT_MESSAGE = f"CatBoost (non-log target): OOF RMSLE {rmsle_score:.4f}"

    # An argument list avoids shell quoting problems in the message or path;
    # check=True raises if the Kaggle CLI reports a failure.
    subprocess.run(
        [
            "kaggle", "competitions", "submit",
            "-c", COMP_NAME,
            "-f", FILE_PATH,
            "-m", SUBMIT_MESSAGE,
        ],
        check=True,
    )