In [1]:
import os
import subprocess
import sys

import numpy as np
import pandas as pd

# Config

In [2]:
HYPER_OPT = False  # True: tune with CatBoostTuner; False: use the fixed parameter set
HYPER_OPT_TIME = 3 * 60 * 60  # passed to CatBoostTuner; presumably a budget in seconds (3 h) - confirm
USE_ORIGINAL_DATA = False  # forwarded to extract_data; also gates preprocessing of X_orig
SUBMIT_TO_KAGGLE = False  # intended switch for the Kaggle upload step
COMPUTE_NONLOG = True  # with USE_NONLOG: recompute the non-log OOF feature instead of loading a pickle
USE_NONLOG = False  # add the "nonlog" prediction column to the train/test frames

In [3]:
# Project base directory: two levels above the current working directory.
two_levels_up = os.path.join(os.getcwd(), "..", "..")
base_dir = os.path.abspath(two_levels_up)

# Sub-folders of the base directory used by this notebook.
data_dir, model_dir, notebooks_dir = (
    os.path.join(base_dir, folder) for folder in ("data", "models", "notebooks")
)

# Put the base directory on the import path so packages located there can be imported.
sys.path.append(base_dir)

# Data Extraction

In [4]:
from axyom_utilities.data_extraction import extract_data
from axyom_utilities.training import train_model_cv
from axyom_utilities.wrappers import CatBoostRegressorWrapper
from axyom_utilities.preprocessing import * 
from toolz import pipe
import joblib

def get_oof(X_train, y_train, X_test, cache_path="nonlog_feature/nonlog.pkl"):
    """Train a small cross-validated CatBoost model and return its predictions.

    Both frames go through ``preprocess_dates`` and ``all_to_string`` before a
    200-iteration CatBoost regressor is trained with 5-fold CV via
    ``train_model_cv``. The caller in this notebook passes the target on the
    non-log scale (``np.expm1`` of the log target).

    Parameters
    ----------
    X_train : training features.
    y_train : training target.
    X_test : test features.
    cache_path : str or None
        File the ``(oof, test)`` tuple is dumped to with joblib. Missing parent
        folders are created. Pass ``None`` (or an empty string) to skip saving.

    Returns
    -------
    tuple
        ``(oof, test)``: the ``"oof_preds"`` and ``"test_preds"`` entries of
        the ``train_model_cv`` result.
    """
    def to_string_features(df):
        # Date preprocessing followed by casting every column to string.
        return pipe(df, preprocess_dates, all_to_string)

    X_train = to_string_features(X_train)
    X_test = to_string_features(X_test)

    params = {
        "iterations": 200,
        "learning_rate": 0.1,
        "depth": 6,
        "eval_metric": "RMSE",
        "random_seed": 42,
        "verbose": 200,
        "task_type": 'GPU',
        "l2_leaf_reg": 0.7,
    }

    results = train_model_cv(
        model=CatBoostRegressorWrapper(**params),
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        early_stopping_rounds=100,
        cv_splits=5
    )

    oof, test = results["oof_preds"], results['test_preds']

    if cache_path:
        # joblib.dump does not create missing folders; without this the whole
        # CV run would finish and then fail on the save.
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        joblib.dump((oof, test), cache_path)

    return oof, test



# oof_log = pd.read_csv(os.path.join(notebooks_dir, "Catboost_nonlog\\oof_preds_log.csv"))
# oof_nonlog = pd.read_csv(os.path.join(notebooks_dir, "Catboost_nonlog\\oof_preds_nonlog.csv"))

# test_log = pd.read_csv(os.path.join(notebooks_dir, "Catboost_nonlog\\test_preds_log.csv"))
# test_nonlog = pd.read_csv(os.path.join(notebooks_dir, "Catboost_nonlog\\test_preds_nonlog.csv"))

# X_train["oof_nonlog"] = oof_nonlog + np.random.normal(0, 0.01, oof_log.shape) # noise
# X_test["oof_nonlog"] = test_nonlog + np.random.normal(0, 0.01, oof_log.shape)

# X_test["oof_log"] = test_log
# X_train["oof_log"] = oof_log

In [5]:
# Load the data with log_transform=True, then apply expm1 to obtain the non-log target.
X_train, y_train, X_test, X_orig, y_orig = extract_data(data_dir, USE_ORIGINAL_DATA, log_transform=True)
y_train_nonlog = np.expm1(y_train)

if USE_NONLOG:
    if COMPUTE_NONLOG:
        # get_oof requires the test frame as its third argument; it was missing,
        # which raised a TypeError as soon as this branch ran.
        nonlog_oof_preds, nonlog_test_preds = get_oof(X_train, y_train_nonlog, X_test)
    else:
        # NOTE(review): get_oof saves to "nonlog_feature/nonlog.pkl" by default, which is
        # not the file loaded here - confirm which cache file is the intended one.
        nonlog_oof_preds, nonlog_test_preds = joblib.load('nonlog_feature/cat_non_loged.pkl')

    # Attach the out-of-fold / test predictions as an extra feature column.
    X_train["nonlog"] = nonlog_oof_preds
    X_test["nonlog"] = nonlog_test_preds

# Data Cleaning

In [7]:
from axyom_utilities.preprocessing import freq_encode, preprocess_dates, clean_categorical
from toolz import pipe

def feature_engineering(df):
    """Add missing-value, ratio, integer-copy and frequency features to ``df``.

    New columns are written onto the frame that is passed in. The return value
    is whatever ``freq_encode`` returns for that frame.
    """
    # Columns that get an "<name> Integer" copy while the original becomes a string column.
    duplicated_columns = ('Health Score', 'Credit Score', 'Previous Claims', 'Annual Income')

    def to_int_keep_missing(value):
        # Missing entries pass through unchanged; everything else goes through int().
        if pd.isna(value):
            return value
        return int(value)

    # Missing-health indicator and two ratio features, computed before any column is stringified.
    df['MissingHealth'] = df['Health Score'].isna().astype(int)
    df['Claims v Duration'] = df['Previous Claims'] / df['Insurance Duration']
    df['Health vs Claims'] = df['Health Score'] / df['Previous Claims']

    for column in duplicated_columns:
        df[column + ' Integer'] = df[column].apply(to_int_keep_missing)
        df[column] = df[column].fillna('None').astype('string')

    # Counted after the fill above, so the four stringified columns no longer contribute NaNs themselves.
    df['MissingValuesCount'] = df.isna().sum(axis=1)

    return freq_encode(df)

def preprocessor(df):
    """Run date preprocessing, feature engineering and categorical cleaning in that order."""
    return pipe(df, preprocess_dates, feature_engineering, clean_categorical)


# Apply the same pipeline to every frame that is used for training or prediction.
X_train = preprocessor(X_train)
X_test = preprocessor(X_test)
if USE_ORIGINAL_DATA:
    X_orig = preprocessor(X_orig)
    
#frequency_encode(X_train, X_test, drop_org=False) # data leaking but acceptable in this context


# HyperOpt

In [8]:
from axyom_utilities.hyperparameter_tuning import CatBoostTuner

if not HYPER_OPT:
    # Fixed CatBoost configuration used when tuning is switched off.
    best_params = {
        "iterations": 3000,
        "learning_rate": 0.05,
        "depth": 6,
        "eval_metric": "RMSE",
        "random_seed": 42,
        "verbose": 200,
        "task_type": "GPU",
        "l2_leaf_reg": 0.7,
    }
else:
    # Let the tuner pick the parameters; HYPER_OPT_TIME is handed to it as its third argument.
    catboost_tuner = CatBoostTuner(X_train, y_train, HYPER_OPT_TIME)
    best_params = catboost_tuner.tune()

In [9]:
# Display the parameter dictionary selected in the previous cell.
best_params

{'iterations': 3000,
 'learning_rate': 0.05,
 'depth': 6,
 'eval_metric': 'RMSE',
 'random_seed': 42,
 'verbose': 200,
 'task_type': 'GPU',
 'l2_leaf_reg': 0.7}

In [10]:
# Override the iteration count with 10000, whichever way best_params was obtained;
# the training call in this notebook passes early_stopping_rounds=100.
best_params["iterations"] = 10000

# Model Training

In [11]:
import json

# Regressor configured with the parameter set chosen above.
model = CatBoostRegressorWrapper(**best_params)

# 5-fold cross-validated training. The returned mapping is read later through its
# "models", "cv_scores", "oof_preds" and "test_preds" entries.
results = train_model_cv(
    model=model,
    X_train=X_train, y_train=y_train,
    X_test=X_test, X_orig=X_orig,
    cv_splits=5,
    early_stopping_rounds=100,
)

Training fold 1...
0:	learn: 1.0928053	test: 1.0941692	best: 1.0941692 (0)	total: 159ms	remaining: 26m 31s
200:	learn: 1.0395612	test: 1.0366854	best: 1.0366854 (200)	total: 20.3s	remaining: 16m 27s
400:	learn: 1.0369598	test: 1.0346196	best: 1.0346196 (399)	total: 40.5s	remaining: 16m 10s
600:	learn: 1.0357016	test: 1.0339480	best: 1.0339475 (599)	total: 1m 1s	remaining: 15m 58s
800:	learn: 1.0346371	test: 1.0335163	best: 1.0335163 (800)	total: 1m 22s	remaining: 15m 42s
1000:	learn: 1.0337808	test: 1.0332840	best: 1.0332840 (1000)	total: 1m 42s	remaining: 15m 24s
1200:	learn: 1.0328883	test: 1.0330447	best: 1.0330419 (1186)	total: 2m 3s	remaining: 15m 6s
1400:	learn: 1.0321932	test: 1.0329778	best: 1.0329778 (1400)	total: 2m 24s	remaining: 14m 47s
1600:	learn: 1.0314528	test: 1.0328312	best: 1.0328312 (1600)	total: 2m 45s	remaining: 14m 27s
1800:	learn: 1.0306908	test: 1.0327254	best: 1.0327254 (1800)	total: 3m 5s	remaining: 14m 6s
2000:	learn: 1.0299585	test: 1.0326114	best: 1.032611

In [12]:
# from sklearn.metrics import mean_squared_log_error

# def rmsle(y_true, y_pred):
#     return np.sqrt(mean_squared_log_error(y_true, y_pred))

# rmsle_score = rmsle(y_train, results["oof_preds"])

# print(f"RMSLE score = {rmsle_score}")

In [13]:
models = results['models']

# Importances come from a single fold's fitted model (index 3), not an average over folds.
fold_model = models[3].catboost_model_
feature_importances = fold_model.get_feature_importance(type='PredictionValuesChange')

# Pair each training column with its importance and rank from most to least important.
importance_columns = {'Feature': X_train.columns, 'Importance': feature_importances}
feature_importance_df = pd.DataFrame(importance_columns).sort_values(
    by='Importance', ascending=False
)

print(feature_importance_df)

                    Feature  Importance
33    Annual Income Integer   26.231217
31     Credit Score Integer   14.285616
7              Health Score   11.040982
30     Health Score Integer    5.976213
12             Credit Score    5.398785
36       Annual Income_freq    4.749591
40        Health Score_freq    4.552173
26           DaysSinceStart    3.458021
10          Previous Claims    2.834553
14        Customer Feedback    2.673632
43     Previous Claims_freq    2.549359
2             Annual Income    2.245502
44        Credit Score_freq    1.706550
32  Previous Claims Integer    1.523217
45   Customer Feedback_freq    0.925649
3            Marital Status    0.917333
0                       Age    0.732742
28        Claims v Duration    0.644114
20                     Week    0.621302
11              Vehicle Age    0.565660
4      Number of Dependents    0.485746
19                      Day    0.439066
37      Marital Status_freq    0.416890
6                Occupation    0.412954


In [14]:
# Mean and standard deviation (NumPy default, ddof=0) of the per-fold CV scores.
mean_score, std_score = (stat(results["cv_scores"]) for stat in (np.mean, np.std))

# Saving

In [15]:
import joblib

# Results folder named "<mean>+-<std>" after the CV score, four decimals each.
results_dir = f"{mean_score:.4f}+-{std_score:.4f}"
os.makedirs(results_dir, exist_ok=True)

# Out-of-fold predictions -> oof_preds.csv
oof_preds_path = os.path.join(results_dir, "oof_preds.csv")
oof_preds_df = pd.DataFrame({"oof_preds": results["oof_preds"]})
oof_preds_df.to_csv(oof_preds_path, index=False)

# Test-set predictions -> test_preds.csv
test_preds_path = os.path.join(results_dir, "test_preds.csv")
test_preds_df = pd.DataFrame({"test_preds": results["test_preds"]})
test_preds_df.to_csv(test_preds_path, index=False)

#joblib.dump(results["models"], "models.pkl")

# Submission

In [16]:
# Map the predictions back with expm1, the same inverse applied to y_train earlier in this notebook.
y_pred = np.expm1(results['test_preds'])

# The submission file is stored inside the score-named results folder.
submission_name = f"Catboost_nonlog_feature_{mean_score:.4f}+-{std_score:.4f}.csv"
FILE_PATH = os.path.join(results_dir, submission_name)

# The test frame's index supplies the "id" column.
submission = pd.DataFrame({'id': X_test.index, 'Premium Amount': y_pred})
submission.to_csv(FILE_PATH, index=False)

In [18]:
# Upload only when enabled in the config cell; the previous `if True:` ignored
# SUBMIT_TO_KAGGLE and submitted on every run.
if SUBMIT_TO_KAGGLE:
    COMP_NAME = "playground-series-s4e12"

    # f-string, so the actual scores are sent and not the literal "{mean_score:.4f}" placeholders.
    SUBMIT_MESSAGE = f"Catboost_nonlog_feature_{mean_score:.4f}+-{std_score:.4f}"

    # An argument list needs no shell, so the path and message cannot be mis-split or mis-quoted.
    submit_command = [
        "kaggle", "competitions", "submit",
        "-c", COMP_NAME,
        "-f", FILE_PATH,
        "-m", SUBMIT_MESSAGE,
    ]
    try:
        submit_result = subprocess.run(submit_command, check=False)
    except FileNotFoundError:
        # Stay best-effort, as os.system was, when the CLI is not installed.
        print("Kaggle CLI not found on PATH; submission skipped.")
    else:
        if submit_result.returncode != 0:
            print(f"Kaggle submission failed with exit code {submit_result.returncode}")