In [1]:
import numpy as np
import os
import pandas as pd
import sys

# Config

In [2]:
HYPER_OPT = False 
HYPER_OPT_TIME = 1*60*60
USE_ORIGINAL_DATA = False 
SUBMIT_TO_KAGGLE = False
COMPUTE_NONLOG = False
USE_NONLOG = True
REGULARIZE_NONLOG = True
NOISE_INTENSITY = 0.5

In [3]:
# Define the base directory (where the notebook is running)
base_dir = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Define the data directory
data_dir = os.path.join(base_dir, "data")

model_dir = os.path.join(base_dir, "models")

notebooks_dir = os.path.join(base_dir, "notebooks")

sys.path.append(base_dir)

# Data Extraction

In [4]:
from axyom_utilities.data_extraction import extract_data
from axyom_utilities.training import train_model_cv
from axyom_utilities.wrappers import CatBoostRegressorWrapper
from axyom_utilities.preprocessing import * 
from toolz import pipe
import joblib

def get_oof(X_train, y_train, X_test):
    
    #X_train, y_train, X_test, X_orig, y_orig = extract_data(data_dir, USE_ORIGINAL_DATA, log_transform=False) # IMPORTANT: no log transform

    preprocessor = lambda df: pipe(df,\
        preprocess_dates,
        all_to_string    
    )

    X_train = preprocessor(X_train)
    X_test = preprocessor(X_test)
    
    params = {
        "iterations":500,
        "learning_rate":0.1,
        "depth":6,
        "eval_metric":"RMSE",
        "random_seed":42,
        "verbose":200,
        "task_type":'GPU',
        "l2_leaf_reg" :0.7,
    }
    
    results = train_model_cv(
        model=CatBoostRegressorWrapper(**params),
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        early_stopping_rounds=100,
        cv_splits=5
    )
    
    oof, test = results["oof_preds"], results['test_preds']
    
    joblib.dump((oof, test), "nonlog_feature/nonlog.pkl")
    
    return oof, test



# oof_log = pd.read_csv(os.path.join(notebooks_dir, "Catboost_nonlog\\oof_preds_log.csv"))
# oof_nonlog = pd.read_csv(os.path.join(notebooks_dir, "Catboost_nonlog\\oof_preds_nonlog.csv"))

# test_log = pd.read_csv(os.path.join(notebooks_dir, "Catboost_nonlog\\test_preds_log.csv"))
# test_nonlog = pd.read_csv(os.path.join(notebooks_dir, "Catboost_nonlog\\test_preds_nonlog.csv"))

# X_train["oof_nonlog"] = oof_nonlog + np.random.normal(0, 0.01, oof_log.shape) # noise
# X_test["oof_nonlog"] = test_nonlog + np.random.normal(0, 0.01, oof_log.shape)

# X_test["oof_log"] = test_log
# X_train["oof_log"] = oof_log

In [5]:
X_train, y_train, X_test, X_orig, y_orig = extract_data(data_dir, USE_ORIGINAL_DATA, log_transform=True)
y_train_nonlog = np.expm1(y_train)

if USE_NONLOG:
    if COMPUTE_NONLOG:
        nonlog_oof_preds, nonlog_test_preds = get_oof(X_train, y_train_nonlog, X_test)
    else:
        nonlog_oof_preds, nonlog_test_preds = joblib.load('nonlog_feature/cat_non_loged.pkl')
    
    if REGULARIZE_NONLOG:
        # Parameters for noise
        noise_mean = 0
        noise_std = nonlog_oof_preds.std() * NOISE_INTENSITY # Scale noise to 10% of the standard deviation
        nonlog_oof_preds += np.random.normal(noise_mean, noise_std, size=nonlog_oof_preds.shape)
        #nonlog_test_preds DONT ADD NOISE IN TEST
    
    X_train["nonlog"] = nonlog_oof_preds
    X_test["nonlog"] = nonlog_test_preds

# Data Cleaning

In [6]:
from axyom_utilities.preprocessing import freq_encode, preprocess_dates, clean_categorical, frequency_encode
from toolz import pipe

def feature_engineering(df):
    
    # df = freq_encode(df)
    
    df['MissingHealth'] = df['Health Score'].isna().astype(int)
    df['Claims v Duration'] = df['Previous Claims'] / df['Insurance Duration']
    df['Health vs Claims'] = df['Health Score'] / df['Previous Claims']
    
    def duplicate_int(col, df):
        df[col + ' Integer'] = df[col].apply(lambda x: int(x) if pd.notna(x) else x)
        df[col] = df[col].fillna('None').astype('string')
        
        return df
    
    df = duplicate_int('Health Score', df)
    df = duplicate_int('Credit Score', df)
    df = duplicate_int('Previous Claims', df)
    df = duplicate_int('Annual Income', df)
    
    df['MissingValuesCount'] = df.isna().sum(axis=1)
    
    # df = freq_encode(df, 'Customer Feedback')
    # df = freq_encode(df, 'Marital Status')
    
    return df

preprocessor = lambda df: pipe(df,\
    preprocess_dates,
    clean_categorical,
    feature_engineering,  
)

X_train = preprocessor(X_train)
X_test = preprocessor(X_test)
if USE_ORIGINAL_DATA:
    X_orig = preprocessor(X_orig)
    
frequency_encode(X_train, X_test, drop_org=False) # data leaking but acceptable in this context


(          Age  Gender Annual Income Marital Status  Number of Dependents  \
 id                                                                         
 0        19.0  Female       10049.0        Married                   1.0   
 1        39.0  Female       31678.0       Divorced                   3.0   
 2        23.0    Male       25602.0       Divorced                   3.0   
 3        21.0    Male      141855.0        Married                   2.0   
 4        21.0    Male       39651.0         Single                   1.0   
 ...       ...     ...           ...            ...                   ...   
 1199995  36.0  Female       27316.0        Married                   0.0   
 1199996  54.0    Male       35786.0       Divorced                   NaN   
 1199997  19.0    Male       51884.0       Divorced                   0.0   
 1199998  55.0    Male          None         Single                   1.0   
 1199999  21.0  Female          None       Divorced                   0.0   

# HyperOpt

In [7]:
from axyom_utilities.hyperparameter_tuning import CatBoostTuner

if HYPER_OPT:
    catboost_tuner = CatBoostTuner(X_train, y_train, HYPER_OPT_TIME, study_name=None)
    best_params = catboost_tuner.tune()
    catboost_tuner.plot()
else:
    best_params = {
        'iterations': 7248,
        'task_type': 'GPU',
        'verbose': 200,
        'learning_rate': 0.0024025303850357584,
        'depth': 10,
        'l2_leaf_reg': 0.009054048982222585,
        'bagging_temperature': 1.0816358150014032,
        'border_count': 157,
        'random_strength': 4.069317345142579,
        'min_data_in_leaf': 86
    }

In [8]:
best_params

{'iterations': 7248,
 'task_type': 'GPU',
 'verbose': 200,
 'learning_rate': 0.0024025303850357584,
 'depth': 10,
 'l2_leaf_reg': 0.009054048982222585,
 'bagging_temperature': 1.0816358150014032,
 'border_count': 157,
 'random_strength': 4.069317345142579,
 'min_data_in_leaf': 86}

In [9]:
best_params["iterations"] = 3000

# Model Training

In [None]:
import json

model = CatBoostRegressorWrapper(**best_params)

# Use the train_model function to train and evaluate the model
results = train_model_cv(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    X_orig=X_orig,
    early_stopping_rounds=100,
    cv_splits=5
)

Training fold 1...
0:	learn: 1.0953129	test: 1.0967277	best: 1.0967277 (0)	total: 341ms	remaining: 17m 4s
200:	learn: 1.0639296	test: 1.0652232	best: 1.0652232 (200)	total: 40s	remaining: 9m 17s


In [11]:
# from sklearn import mean_squared_log_error

# def rmsle(y_true, y_pred):
#     return np.sqrt(mean_squared_log_error(y_true, y_pred))

# rmsle_score = rmsle(y_train, results["oof_preds"])

# print(f"RMSLE score = {rmsle_score}")

In [12]:
models = results['models']

# Get feature importance
feature_importances = models[3].catboost_model_.get_feature_importance(type='PredictionValuesChange')

# Map to feature names
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)

                    Feature  Importance
34    Annual Income Integer   30.314589
18                   nonlog   22.519283
32     Credit Score Integer   13.936556
31     Health Score Integer    6.409310
7              Health Score    4.504901
41        Health Score_freq    4.494220
10          Previous Claims    2.583169
12             Credit Score    2.281206
37       Annual Income_freq    2.115085
14        Customer Feedback    1.500919
45        Credit Score_freq    1.360078
27           DaysSinceStart    1.293221
2             Annual Income    0.860813
44     Previous Claims_freq    0.715672
3            Marital Status    0.683005
33  Previous Claims Integer    0.611628
28            MissingHealth    0.607912
46   Customer Feedback_freq    0.447678
6                Occupation    0.253305
0                       Age    0.169836
17            Property Type    0.154133
16       Exercise Frequency    0.129615
29        Claims v Duration    0.129028
20                      Day    0.121894


In [13]:
mean_score = np.mean(results["cv_scores"])
std_score = np.std(results["cv_scores"])

# Saving

In [14]:
import joblib
import json
from pathlib import Path

results_dir = Path(f"{mean_score:.4f}+-{std_score:.4f}")
os.makedirs(results_dir, exist_ok=True)

# Save OOF predictions as a CSV file
oof_preds_df = pd.DataFrame({"oof_preds": results["oof_preds"]})
oof_preds_df.to_csv(results_dir/"oof_preds.csv", index=False)

test_preds_df = pd.DataFrame({"test_preds": results["test_preds"]})
test_preds_df.to_csv(results_dir/"test_preds.csv", index=False)

with open(results_dir/"best_params", "w") as f:
    json.dump(best_params, f, indent=4)

#joblib.dump(results["models"], "models.pkl")

# Submission

In [15]:
y_pred = np.expm1(results['test_preds'])

submission = pd.DataFrame({
    'id': X_test.index,  
    'Premium Amount': y_pred
})

FILE_PATH = os.path.join(results_dir, f"Catboost_{mean_score:.4f}+-{std_score:.4f}.csv")

submission.to_csv(FILE_PATH, index=False)

In [16]:
if False: 
    # Define your message and file paths
    COMP_NAME = "playground-series-s4e12"

    SUBMIT_MESSAGE = "Catboost_{mean_score:.4f}+-{std_score:.4f}"

    # Submit to Kaggle
    os.system(f'kaggle competitions submit -c {COMP_NAME} -f {FILE_PATH} -m "{SUBMIT_MESSAGE}"')