In [1]:
import numpy as np
import os
import pandas as pd
import sys

# Config

In [2]:
# Run the hyperparameter search instead of using the fixed parameter set.
HYPER_OPT = False
# Budget handed to the tuner (presumably seconds, i.e. 3 hours — confirm in CatBoostTuner).
HYPER_OPT_TIME = 3 * 60 * 60
# Forwarded to extract_data; also decides whether X_orig gets preprocessed.
USE_ORIGINAL_DATA = False
# Intended switch for uploading the final submission file to Kaggle.
SUBMIT_TO_KAGGLE = False

In [3]:
# Project root: two directory levels above the current working directory.
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Sub-directories of the project root used throughout the notebook.
data_dir, model_dir, notebooks_dir = (
    os.path.join(base_dir, sub_dir) for sub_dir in ("data", "models", "notebooks")
)

# Put the project root on the import path so later cells can import from it.
sys.path.append(base_dir)

# Data Extraction

In [4]:
from axyom_utilities.data_extraction import extract_data

# Load the train/test splits (plus the original dataset outputs) via the project helper.
# log_transform=True is forwarded to extract_data; presumably it log-transforms the
# target, which is why predictions are passed through np.expm1 before submission — confirm.
X_train, y_train, X_test, X_orig, y_orig = extract_data(data_dir, USE_ORIGINAL_DATA, log_transform=True)

# Predictions previously written by the "Catboost_nonlog" notebook.
# The path is built from separate components so it resolves on every OS
# (a literal "\\" separator only works on Windows).
catboost_nonlog_dir = os.path.join(notebooks_dir, "Catboost_nonlog")

oof_log = pd.read_csv(os.path.join(catboost_nonlog_dir, "oof_preds_log.csv"))
oof_nonlog = pd.read_csv(os.path.join(catboost_nonlog_dir, "oof_preds_nonlog.csv"))

test_log = pd.read_csv(os.path.join(catboost_nonlog_dir, "test_preds_log.csv"))
test_nonlog = pd.read_csv(os.path.join(catboost_nonlog_dir, "test_preds_nonlog.csv"))

# Stacking features are currently disabled; the loaded frames above are unused.
# X_train["oof_nonlog"] = oof_nonlog
# X_test["oof_nonlog"] = test_nonlog

# X_test["oof_log"] = test_log
# X_train["oof_log"] = oof_log

# Data Cleaning

In [5]:
# Explicit imports instead of `import *` so it is clear where each step comes from.
from axyom_utilities.preprocessing import (
    clean_categorical,
    health_score_eng,
    preprocess_dates,
)
from toolz import pipe


def preprocessor(df):
    """Run the cleaning pipeline on a feature frame and return the result.

    The frame is threaded through ``preprocess_dates``, ``health_score_eng``
    and ``clean_categorical``, in that order.
    """
    return pipe(
        df,
        preprocess_dates,
        health_score_eng,
        clean_categorical,
    )


# Apply the same pipeline to every feature frame used downstream.
X_train = preprocessor(X_train)
X_test = preprocessor(X_test)
if USE_ORIGINAL_DATA:
    X_orig = preprocessor(X_orig)
    


In [6]:
# Inspect column dtypes and non-null counts of the preprocessed training frame.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1200000 entries, 0 to 1199999
Data columns (total 28 columns):
 #   Column                Non-Null Count    Dtype   
---  ------                --------------    -----   
 0   Age                   1181295 non-null  float64 
 1   Gender                1200000 non-null  category
 2   Annual Income         1155051 non-null  float64 
 3   Marital Status        1200000 non-null  category
 4   Number of Dependents  1090328 non-null  float64 
 5   Education Level       1200000 non-null  category
 6   Occupation            1200000 non-null  category
 7   Health Score          1200000 non-null  string  
 8   Location              1200000 non-null  category
 9   Policy Type           1200000 non-null  category
 10  Previous Claims       835971 non-null   float64 
 11  Vehicle Age           1199994 non-null  float64 
 12  Credit Score          1062118 non-null  float64 
 13  Insurance Duration    1199999 non-null  float64 
 14  Customer Feedback     1

# HyperOpt

In [7]:
from axyom_utilities.hyperparameter_tuning import CatBoostTuner

# Choose the CatBoost parameters: a fixed hand-picked set by default,
# or the result of a time-boxed search when HYPER_OPT is enabled.
if not HYPER_OPT:
    best_params = dict(
        iterations=3000,
        learning_rate=0.05,
        depth=6,
        eval_metric="RMSE",
        random_seed=42,
        verbose=200,
        task_type="GPU",
        l2_leaf_reg=0.7,
    )
else:
    catboost_tuner = CatBoostTuner(X_train, y_train, HYPER_OPT_TIME)
    best_params = catboost_tuner.tune()

In [8]:
# Display the selected hyperparameters.
best_params

{'iterations': 3000,
 'learning_rate': 0.05,
 'depth': 6,
 'eval_metric': 'RMSE',
 'random_seed': 42,
 'verbose': 200,
 'task_type': 'GPU',
 'l2_leaf_reg': 0.7}

In [9]:
# Raise the iteration cap for the final training run. Training is invoked with
# early_stopping_rounds=100, so presumably it stops before reaching the cap — confirm.
best_params["iterations"] = 10000

# Model Training

In [10]:
import json

from axyom_utilities.training import train_model_cv
from axyom_utilities.wrappers import CatBoostRegressorWrapper

# Wrap CatBoost with the chosen parameters.
model = CatBoostRegressorWrapper(**best_params)

# 5-fold cross-validated training with early stopping after 100 rounds.
# `results` is later read for "cv_scores", "oof_preds" and "test_preds".
results = train_model_cv(
    model=model,
    X_train=X_train, y_train=y_train,
    X_test=X_test,
    X_orig=X_orig,
    cv_splits=5,
    early_stopping_rounds=100,
)

Training fold 1...
0:	learn: 1.0925977	test: 1.0939887	best: 1.0939887 (0)	total: 120ms	remaining: 19m 57s
200:	learn: 1.0414795	test: 1.0394267	best: 1.0394267 (200)	total: 15s	remaining: 12m 12s
400:	learn: 1.0390510	test: 1.0375115	best: 1.0375115 (400)	total: 30.7s	remaining: 12m 13s
600:	learn: 1.0376895	test: 1.0368053	best: 1.0368053 (600)	total: 46.6s	remaining: 12m 8s
800:	learn: 1.0366531	test: 1.0363968	best: 1.0363968 (800)	total: 1m 2s	remaining: 11m 58s
1000:	learn: 1.0356444	test: 1.0360839	best: 1.0360837 (999)	total: 1m 18s	remaining: 11m 45s
1200:	learn: 1.0347522	test: 1.0358386	best: 1.0358383 (1198)	total: 1m 34s	remaining: 11m 33s
1400:	learn: 1.0339338	test: 1.0356986	best: 1.0356945 (1384)	total: 1m 50s	remaining: 11m 19s
1600:	learn: 1.0331089	test: 1.0355555	best: 1.0355502 (1557)	total: 2m 6s	remaining: 11m 5s
1800:	learn: 1.0323866	test: 1.0354303	best: 1.0354303 (1800)	total: 2m 23s	remaining: 10m 52s
2000:	learn: 1.0316067	test: 1.0353132	best: 1.0353098 (

In [None]:
# from sklearn.metrics import mean_squared_log_error

# def rmsle(y_true, y_pred):
#     return np.sqrt(mean_squared_log_error(y_true, y_pred))

# rmsle_score = rmsle(y_train, results["oof_preds"])

# print(f"RMSLE score = {rmsle_score}")

In [12]:
# Summarise the per-fold cross-validation scores.
mean_score = np.mean(results["cv_scores"])
std_score = np.std(results["cv_scores"])

# Cast to built-in floats for serialisation: json.dump raises TypeError on NumPy
# scalars that are not float subclasses (e.g. float32). The module-level
# mean_score / std_score values are left untouched for downstream cells.
data = {
    "mean_score": float(mean_score),
    "std_score": float(std_score),
}

# Persist the summary next to the notebook (relative to the current working directory).
with open("score.json", "w") as json_file:
    json.dump(data, json_file, indent=4)  # `indent` keeps the file human-readable

In [13]:
import joblib

# Run identifier embedding the mean CV score; reused for all output file names.
name = f"catboost_nonlog_feature_rmsle_{mean_score}"
pred_column = f"preds_{name}"

# Out-of-fold predictions -> CSV (single column, no index).
oof_preds_df = pd.DataFrame({pred_column: results["oof_preds"]})
oof_preds_df.to_csv(f"oof_preds_{name}.csv", index=False)

# Test-set predictions -> CSV with the same column name.
test_preds_df = pd.DataFrame({pred_column: results["test_preds"]})
test_preds_df.to_csv(f"test_preds_{name}.csv", index=False)

# Model persistence is currently disabled.
#joblib.dump(results["models"], "models.pkl")

# Submission

In [14]:
# Output file for the submission, named after the run identifier.
FILE_PATH = f"{name}.csv"

# Invert the log1p scale of the test predictions
# (presumably introduced by log_transform=True at extraction — confirm).
y_pred = np.expm1(results['test_preds'])

# One row per test id with its predicted premium.
submission = pd.DataFrame({'id': X_test.index, 'Premium Amount': y_pred})
submission.to_csv(FILE_PATH, index=False)

In [15]:
# Only upload when explicitly enabled in the config cell; the previous `if True:`
# submitted on every run regardless of SUBMIT_TO_KAGGLE.
if SUBMIT_TO_KAGGLE:
    import subprocess

    # Competition slug and the message attached to the submission.
    COMP_NAME = "playground-series-s4e12"

    SUBMIT_MESSAGE = name

    # Pass the arguments as a list (no shell) so file names or messages containing
    # spaces or quotes cannot break or alter the command.
    completed = subprocess.run(
        [
            "kaggle", "competitions", "submit",
            "-c", COMP_NAME,
            "-f", FILE_PATH,
            "-m", SUBMIT_MESSAGE,
        ]
    )

    # os.system ignored failures silently; surface them without aborting the notebook.
    if completed.returncode != 0:
        print(f"Kaggle submission failed with exit code {completed.returncode}")