In [1]:
import os
import subprocess
import sys

import numpy as np
import pandas as pd

# Config

In [2]:
# When True, run the hyperparameter search instead of using the fixed parameter set.
HYPER_OPT = False

# Budget handed to the tuner: 3 * 60 * 60 = 10800 (presumably seconds, i.e. 3 hours — confirm with the tuner).
HYPER_OPT_TIME = 3 * 60 * 60

# Forwarded to extract_data(); also decides whether X_orig gets preprocessed.
USE_ORIGINAL_DATA = False

# Intended switch for uploading the submission file to Kaggle.
SUBMIT_TO_KAGGLE = False

In [3]:
# Project root: two directory levels above the notebook's current working directory.
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Standard sub-directories of the project root.
data_dir = os.path.join(base_dir, "data")
model_dir = os.path.join(base_dir, "models")
notebooks_dir = os.path.join(base_dir, "notebooks")

# Make packages located in the project root importable (presumably `axyom_utilities` — confirm).
# Guarded so that re-running this cell does not keep appending duplicate entries.
if base_dir not in sys.path:
    sys.path.append(base_dir)

# Data Extraction

In [4]:
from axyom_utilities.data_extraction import extract_data

X_train, y_train, X_test, X_orig, y_orig = extract_data(data_dir, USE_ORIGINAL_DATA, log_transform=True)

# Folder holding the first-level CatBoost predictions that are added as stacking features.
# Path components are joined individually so the notebook also runs on non-Windows systems.
# NOTE(review): both the "log" and the "nonlog" files are read from "Catboost_nonlog" — confirm this is intended.
stack_dir = os.path.join(notebooks_dir, "Catboost_nonlog")

oof_log = pd.read_csv(os.path.join(stack_dir, "oof_preds_log.csv"))
oof_nonlog = pd.read_csv(os.path.join(stack_dir, "oof_preds_nonlog.csv"))

test_log = pd.read_csv(os.path.join(stack_dir, "test_preds_log.csv"))
test_nonlog = pd.read_csv(os.path.join(stack_dir, "test_preds_nonlog.csv"))


def _first_column(preds):
    """Return the only column of a single-column prediction frame as a 1-D NumPy array.

    Raises:
        ValueError: if `preds` does not have exactly one column.
    """
    if preds.shape[1] != 1:
        raise ValueError(f"expected exactly one prediction column, got {preds.shape[1]}")
    return preds.iloc[:, 0].to_numpy()


# Assign by position, not by index label: the CSVs carry a fresh 0..n-1 index, so a
# label-aligned assignment would silently yield NaN wherever the target frame is
# indexed differently (e.g. by id). A length mismatch now fails loudly instead.
X_train["oof_nonlog"] = _first_column(oof_nonlog)
X_train["oof_log"] = _first_column(oof_log)

X_test["oof_nonlog"] = _first_column(test_nonlog)
X_test["oof_log"] = _first_column(test_log)

# Data Cleaning

In [5]:
# Explicit imports instead of `import *`, so it is clear where each step comes from
# and the notebook namespace is not polluted.
from axyom_utilities.preprocessing import all_to_string, preprocess_dates
from toolz import pipe


def preprocessor(df):
    """Run `df` through `preprocess_dates` and then `all_to_string`, returning the result."""
    return pipe(df, preprocess_dates, all_to_string)


X_train = preprocessor(X_train)
X_test = preprocessor(X_test)
# X_orig is only cleaned when the original dataset is actually in use.
if USE_ORIGINAL_DATA:
    X_orig = preprocessor(X_orig)

# HyperOpt

In [6]:
from axyom_utilities.hyperparameter_tuning import CatBoostTuner

if not HYPER_OPT:
    # Fixed CatBoost configuration used when the search is skipped.
    best_params = dict(
        iterations=1000,
        learning_rate=0.1,
        depth=6,
        eval_metric="RMSE",
        random_seed=42,
        verbose=200,
        task_type="GPU",
        l2_leaf_reg=0.7,
    )
else:
    # Search for parameters on the training data within the HYPER_OPT_TIME budget.
    catboost_tuner = CatBoostTuner(X_train, y_train, HYPER_OPT_TIME)
    best_params = catboost_tuner.tune()

In [7]:
# Echo the selected parameter set (tuned or fixed) as the cell output.
best_params

{'iterations': 1000,
 'learning_rate': 0.1,
 'depth': 6,
 'eval_metric': 'RMSE',
 'random_seed': 42,
 'verbose': 200,
 'task_type': 'GPU',
 'l2_leaf_reg': 0.7}

In [8]:
# Raise the boosting-round ceiling to 10000 for the final fit; the training call
# further down passes early_stopping_rounds, so fewer rounds may actually be used.
best_params["iterations"] = 10000

# Model Training

In [None]:
from axyom_utilities.training import train_model_cv
from axyom_utilities.wrappers import CatBoostRegressorWrapper
import json

# Wrap CatBoost with the selected hyperparameters.
model = CatBoostRegressorWrapper(**best_params)

# Cross-validation settings kept together so they are easy to spot and tweak.
cv_settings = {"early_stopping_rounds": 100, "cv_splits": 5}

# Cross-validated fit. Later cells read "oof_preds", "test_preds" and "models" from `results`.
results = train_model_cv(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    X_orig=X_orig,
    **cv_settings,
)

Training fold 1...
0:	learn: 1.0917708	test: 1.0924541	best: 1.0924541 (0)	total: 239ms	remaining: 39m 49s
200:	learn: 1.0644263	test: 1.0596339	best: 1.0596339 (200)	total: 26.3s	remaining: 21m 22s
400:	learn: 1.0629169	test: 1.0590497	best: 1.0590497 (400)	total: 54.4s	remaining: 21m 41s
600:	learn: 1.0617068	test: 1.0588237	best: 1.0588167 (584)	total: 1m 22s	remaining: 21m 32s
800:	learn: 1.0604916	test: 1.0586746	best: 1.0586746 (800)	total: 1m 50s	remaining: 21m 11s
1000:	learn: 1.0592999	test: 1.0585330	best: 1.0585330 (1000)	total: 2m 19s	remaining: 20m 49s
1200:	learn: 1.0582167	test: 1.0585285	best: 1.0585164 (1124)	total: 2m 47s	remaining: 20m 27s
bestTest = 1.058516429
bestIteration = 1124
Shrink model to first 1125 iterations.
Fold 1 RMSE: 1.0585
Training fold 2...
0:	learn: 1.0921429	test: 1.0905254	best: 1.0905254 (0)	total: 121ms	remaining: 20m 14s
200:	learn: 1.0642371	test: 1.0579962	best: 1.0579962 (200)	total: 26s	remaining: 21m 9s
400:	learn: 1.0626659	test: 1.0574

In [None]:
# mean_squared_log_error lives in sklearn.metrics; it is not exported by the top-level package.
from sklearn.metrics import mean_squared_log_error


def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error between `y_true` and `y_pred`.

    Both inputs must be on the original (non-log) scale, because
    `mean_squared_log_error` applies the logarithm itself.
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


# extract_data() was called with log_transform=True, so the targets and the
# out-of-fold predictions are presumably in log1p space — TODO confirm. They are
# mapped back with expm1 first; otherwise the logarithm would be applied twice.
rmsle_score = rmsle(np.expm1(y_train), np.expm1(results["oof_preds"]))

print(f"RMSLE score = {rmsle_score}")

In [10]:
# mean_score = np.mean(results["cv_scores"])
# std_score = np.std(results["cv_scores"])

# # Prepare the data
# data = {
#     "mean_score": mean_score,
#     "std_score": std_score
# }

# # Save to a JSON file
# with open("score.json", "w") as json_file:
#     json.dump(data, json_file, indent=4)  # Use `indent` for readability

In [None]:
import joblib

# Column names double as file stems for the exported prediction files.
oof_column, test_column = "oof_preds_nonlog", "test_preds_nonlog"

# Out-of-fold predictions -> single-column CSV without an index column.
oof_preds_df = pd.DataFrame({oof_column: results["oof_preds"]})
oof_preds_df.to_csv(f"{oof_column}.csv", index=False)

# Test-set predictions -> single-column CSV without an index column.
test_preds_df = pd.DataFrame({test_column: results["test_preds"]})
test_preds_df.to_csv(f"{test_column}.csv", index=False)

# Serialize the models returned by the cross-validation run.
joblib.dump(results["models"], "models.pkl")

# Submission

In [12]:
# y_pred = np.expm1(results['test_preds'])

# submission = pd.DataFrame({
#     'id': X_test.index,  
#     'Premium Amount': y_pred
# })

# FILE_PATH = f"nonlog_preds_RMSE_{rmsle_score:.4f}.csv"

# submission.to_csv(FILE_PATH, index=False)

In [13]:
# Controlled by the SUBMIT_TO_KAGGLE flag from the config cell instead of a hard-coded False.
if SUBMIT_TO_KAGGLE:
    COMP_NAME = "playground-series-s4e12"

    # Cross-validation summary used in the file name and the submission message.
    # NOTE(review): assumes train_model_cv returns a "cv_scores" entry — confirm.
    mean_score = np.mean(results["cv_scores"])
    std_score = np.std(results["cv_scores"])

    # Build the submission file inside this cell so it does not rely on names from other cells.
    # Predictions are presumably in log1p space (log_transform=True) — TODO confirm.
    # NOTE(review): assumes X_test is indexed by the competition id — confirm.
    y_pred = np.expm1(results["test_preds"])
    submission = pd.DataFrame({
        "id": X_test.index,
        "Premium Amount": y_pred,
    })
    FILE_PATH = f"nonlog_preds_RMSE_{mean_score:.4f}.csv"
    submission.to_csv(FILE_PATH, index=False)

    SUBMIT_MESSAGE = f"CatBoost: Mean score: {mean_score:.4f} +/- {std_score:.4f}"

    # Argument list instead of a shell string: no quoting problems, and a failed
    # upload raises CalledProcessError rather than passing unnoticed.
    subprocess.run(
        [
            "kaggle", "competitions", "submit",
            "-c", COMP_NAME,
            "-f", FILE_PATH,
            "-m", SUBMIT_MESSAGE,
        ],
        check=True,
    )