In [1]:
import numpy as np
import os
import pandas as pd
import sys

# Config

In [2]:
# Run the YggdrasilTuner search when True; otherwise a hard-coded parameter dict is used.
HYPER_OPT = True 
# Passed to YggdrasilTuner as its third argument. 1*60*60 = 3600, presumably a time
# budget in seconds — TODO confirm against the tuner.
HYPER_OPT_TIME = 1*60*60
# Forwarded to extract_data; when True, X_orig is run through the preprocessor as well.
USE_ORIGINAL_DATA = False 
# Presumably meant to gate the Kaggle submission step — verify it is actually consulted there.
SUBMIT_TO_KAGGLE = False
# NOTE(review): no use of this flag appears in the cells reviewed — confirm it is still needed.
COMPUTE_NONLOG = False
# When True, pre-computed predictions are loaded with joblib and added as a "nonlog" feature column.
USE_NONLOG = True

In [3]:
from pathlib import Path
from datetime import datetime
import sys

# Project root: two directory levels above the current working directory.
# NOTE: this depends on where the kernel was started; launching the notebook
# from a different folder changes every path derived below.
base_dir = Path.cwd().resolve().parents[1]

# Sub-directories of the project root.
data_dir = base_dir / "data"
model_dir = base_dir / "models"
notebooks_dir = base_dir / "notebooks"

# Make packages that live in the project root importable.
# Guarded so that re-running this cell does not append a duplicate entry each time.
if str(base_dir) not in sys.path:
    sys.path.append(str(base_dir))

# Data Extraction

In [4]:
from axyom_utilities.data_extraction import extract_data
from axyom_utilities.training import train_model_cv
from axyom_utilities.preprocessing import * 
from toolz import pipe
import joblib

# oof_log = pd.read_csv(os.path.join(notebooks_dir, "Catboost_nonlog\\oof_preds_log.csv"))
# oof_nonlog = pd.read_csv(os.path.join(notebooks_dir, "Catboost_nonlog\\oof_preds_nonlog.csv"))

# test_log = pd.read_csv(os.path.join(notebooks_dir, "Catboost_nonlog\\test_preds_log.csv"))
# test_nonlog = pd.read_csv(os.path.join(notebooks_dir, "Catboost_nonlog\\test_preds_nonlog.csv"))

# X_train["oof_nonlog"] = oof_nonlog + np.random.normal(0, 0.01, oof_log.shape) # noise
# X_test["oof_nonlog"] = test_nonlog + np.random.normal(0, 0.01, oof_log.shape)

# X_test["oof_log"] = test_log
# X_train["oof_log"] = oof_log

In [5]:
# Load the train/test (and original) frames through the project helper.
# log_transform=True is forwarded to extract_data; the expm1 on the next line suggests
# y_train comes back log1p-transformed — TODO confirm in extract_data.
X_train, y_train, X_test, X_orig, y_orig = extract_data(data_dir, USE_ORIGINAL_DATA, log_transform=True)
# expm1 of the training target (the inverse of log1p).
y_train_nonlog = np.expm1(y_train)

if USE_NONLOG:
    # The pickle is unpacked into two objects. The path is relative to the current
    # working directory, so the cell only works when the kernel runs from the right folder.
    # NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
    nonlog_oof_preds, nonlog_test_preds = joblib.load('nonlog_feature/cat_non_loged.pkl')
    
    # Attach the loaded predictions as an extra feature column on both frames.
    # NOTE(review): assumes their lengths/order match X_train and X_test — confirm.
    X_train["nonlog"] = nonlog_oof_preds
    X_test["nonlog"] = nonlog_test_preds

# Data Cleaning

In [6]:

from axyom_utilities.preprocessing import freq_encode, preprocess_dates, clean_categorical, frequency_encode, reduce_memory_usage
from toolz import pipe

def feature_engineering(df):
    """Add the hand-crafted feature columns to ``df`` and return it.

    The frame is modified in place; the returned object is the same frame.
    Columns added, in this order:

    * ``MissingHealth`` - 1 where ``Health Score`` is NaN, else 0.
    * ``Claims v Duration`` - ``Previous Claims / Insurance Duration``.
    * ``Health vs Claims`` - ``Health Score / Previous Claims``.
      Infinite ratios in either column are replaced by NaN.
    * ``<col> Integer`` - ``int()`` of each non-missing value of ``Health Score``,
      ``Credit Score``, ``Previous Claims`` and ``Annual Income``.
    * ``contract length`` - ``Insurance Duration`` bucketed into 0 (<= 1),
      1 (<= 3) or 2 (everything above, including missing values).
    * ``MissingValuesCount`` - number of NaNs per row, counted over every
      column present at that point (the derived columns above included).
    """
    infinities = [np.inf, -np.inf]

    def _to_int_keep_missing(value):
        # Missing entries are passed through untouched so they stay missing.
        return int(value) if pd.notna(value) else value

    df['MissingHealth'] = df['Health Score'].isna().astype(int)

    claims_per_duration = df['Previous Claims'] / df['Insurance Duration']
    df['Claims v Duration'] = claims_per_duration.replace(infinities, np.nan)

    health_per_claim = df['Health Score'] / df['Previous Claims']
    df['Health vs Claims'] = health_per_claim.replace(infinities, np.nan)

    # Integer-truncated copies of selected numeric columns.
    for source_col in ('Health Score', 'Credit Score', 'Previous Claims', 'Annual Income'):
        df[source_col + ' Integer'] = df[source_col].apply(_to_int_keep_missing)

    # Missing durations are filled with 99 so they fall into the last bucket.
    duration_filled = df["Insurance Duration"].fillna(99)
    duration_bucket = pd.cut(
        duration_filled,
        bins=[-float('inf'), 1, 3, float('inf')],
        labels=[0, 1, 2],
    )
    df['contract length'] = duration_bucket.astype(int)

    df['MissingValuesCount'] = df.isna().sum(axis=1)

    return df

#X_train = reduce_memory_usage(X_train)


def preprocessor(df):
    """Run one feature frame through the preprocessing steps, in order."""
    return pipe(
        df,
        preprocess_dates,
        clean_categorical,
        feature_engineering,
        reduce_memory_usage,
    )


X_train = preprocessor(X_train)
X_test = preprocessor(X_test)
if USE_ORIGINAL_DATA:
    X_orig = preprocessor(X_orig)

# Frequency-encode using both the train and the test frame. This leaks test
# information into the features but is accepted in this context.
# NOTE(review): the return value is not assigned. The recorded cell output shows a
# tuple of frames being returned — confirm the encoding is also applied in place.
frequency_encode(X_train, X_test, drop_org=False)


(          Age  Gender Annual Income Marital Status  Number of Dependents  \
 id                                                                         
 0        19.0  Female       10049.0        Married                   1.0   
 1        39.0  Female       31678.0       Divorced                   3.0   
 2        23.0    Male       25602.0       Divorced                   3.0   
 3        21.0    Male      141855.0        Married                   2.0   
 4        21.0    Male       39651.0         Single                   1.0   
 ...       ...     ...           ...            ...                   ...   
 1199995  36.0  Female       27316.0        Married                   0.0   
 1199996  54.0    Male       35786.0       Divorced                   NaN   
 1199997  19.0    Male       51884.0       Divorced                   0.0   
 1199998  55.0    Male          None         Single                   1.0   
 1199999  21.0  Female          None       Divorced                   0.0   

# HyperOpt

In [7]:
from axyom_utilities.hyperparameter_tuning import YggdrasilTuner

# Choose the hyper-parameters: either search for them or fall back to fixed values.
if HYPER_OPT:
    # Search on the training data; HYPER_OPT_TIME is handed to the tuner as its third argument.
    tuner = YggdrasilTuner(X_train, y_train, HYPER_OPT_TIME)
    best_params = tuner.tune()
    tuner.plot()
else:
    # Fixed fallback used when tuning is switched off.
    # NOTE(review): these keys (iterations, depth, eval_metric, task_type, l2_leaf_reg, ...)
    # differ from the parameter names in the tuner's recorded trial output
    # (learning_rate, max_depth, min_examples, l2_regularization, shrinkage) — confirm
    # that YggdrasilRegressorWrapper accepts them before relying on this branch.
    best_params = {
        "iterations":3000,
        "learning_rate":0.05,
        "depth":6,
        "eval_metric":"RMSE",
        "random_seed":42,
        "verbose":200,
        "task_type":'GPU',
        "l2_leaf_reg" :0.7,
    }

[I 2024-12-26 21:33:51,883] A new study created in RDB with name: yggdrasil
[W 2024-12-26 21:33:52,136] Trial 0 failed with parameters: {'learning_rate': 0.05433350003437882, 'max_depth': 18, 'min_examples': 69, 'l2_regularization': 0.815515517219875, 'shrinkage': 0.9563121617017054} because of the following error: TypeError("YggdrasilRegressorWrapper.fit() got an unexpected keyword argument 'early_stopping_rounds'").
Traceback (most recent call last):
  File "c:\Users\Axyom\anaconda3\envs\kaggle_env\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Axyom\Kaggle\PG S4e12 - Regression Insurance\axyom_utilities\hyperparameter_tuning.py", line 27, in <lambda>
    objective = lambda trial: self.generic_objective(trial, model_generator)
  File "C:\Users\Axyom\Kaggle\PG S4e12 - Regression Insurance\axyom_utilities\hyperparameter_tuning.py", line 42, in generic_objective
    results = train_model_cv(
  File "C:\Users\Axyom

Training fold 1...


TypeError: YggdrasilRegressorWrapper.fit() got an unexpected keyword argument 'early_stopping_rounds'

In [None]:
#tuner.plot()


In [None]:
# Display the hyper-parameters selected in the HyperOpt step.
best_params

In [10]:
# Override/add the number of trees on top of the selected parameters.
# NOTE(review): requires best_params from the HyperOpt cell; the recorded run of that cell
# ended in a TypeError, so on a fresh kernel this name may be undefined — confirm.
best_params["num_trees"] = 1000

# Model Training

In [None]:
import json
from axyom_utilities.wrappers import YggdrasilRegressorWrapper

# Build the wrapper from the selected hyper-parameters (including any later overrides).
model = YggdrasilRegressorWrapper(**best_params)

# Cross-validated training through the project helper. Later cells read
# "cv_scores", "oof_preds" and "test_preds" from the returned mapping.
# NOTE(review): the tuner output recorded in this notebook shows
# YggdrasilRegressorWrapper.fit() rejecting an `early_stopping_rounds` keyword when
# reached through train_model_cv — confirm this call does not hit the same TypeError.
results = train_model_cv(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    X_orig=X_orig,
    early_stopping_rounds=100,
    cv_splits=5
)

In [None]:
# models = results['models']

# # Get feature importance
# feature_importances = models[3].catboost_model_.get_feature_importance(type='PredictionValuesChange')

# # Map to feature names
# feature_importance_df = pd.DataFrame({
#     'Feature': X_train.columns,
#     'Importance': feature_importances
# }).sort_values(by='Importance', ascending=False)

# print(feature_importance_df)

In [14]:
# Mean and standard deviation of the per-fold CV scores; both are used to
# name the output folder and the submission file.
mean_score, std_score = (
    summarise(results["cv_scores"]) for summarise in (np.mean, np.std)
)

# Saving

In [15]:
import json
from pathlib import Path

# All artefacts of this run go into a folder named "<mean>+-<std>" of the CV score,
# created relative to the current working directory.
results_dir = Path(f"{mean_score:.4f}+-{std_score:.4f}")
results_dir.mkdir(parents=True, exist_ok=True)

# Out-of-fold predictions, one column, no index.
oof_preds_df = pd.DataFrame({"oof_preds": results["oof_preds"]})
oof_preds_df.to_csv(results_dir.joinpath("oof_preds.csv"), index=False)

# Test-set predictions, same layout.
test_preds_df = pd.DataFrame({"test_preds": results["test_preds"]})
test_preds_df.to_csv(results_dir.joinpath("test_preds.csv"), index=False)

# Hyper-parameters used for this run, stored as indented JSON.
with results_dir.joinpath("best_params").open("w") as params_file:
    json.dump(best_params, params_file, indent=4)

#joblib.dump(results["models"], "models.pkl")

# Submission

In [16]:
# Apply expm1 to the test predictions and clamp anything below zero to zero.
y_pred = np.maximum(np.expm1(results['test_preds']), 0)

# Two-column submission frame: the test index as id plus the predicted premium.
submission_columns = {
    'id': X_test.index,
    'Premium Amount': y_pred,
}
submission = pd.DataFrame(submission_columns)

# The file is stored next to the other artefacts of this run.
FILE_PATH = str(results_dir / f"YDF_{mean_score:.4f}+-{std_score:.4f}.csv")

submission.to_csv(FILE_PATH, index=False)

In [17]:
# Upload the submission file with the Kaggle CLI, controlled by the config flag
# instead of a hard-coded `if False:`.
if SUBMIT_TO_KAGGLE:
    # Imported here because only this optional step needs it.
    import subprocess

    COMP_NAME = "playground-series-s4e12"

    # f-string: the message must carry the actual CV score, not the literal template.
    SUBMIT_MESSAGE = f"{mean_score:.4f}+-{std_score:.4f}"

    # Argument list instead of a shell string: no quoting problems with the path or message.
    submit_cmd = [
        "kaggle", "competitions", "submit",
        "-c", COMP_NAME,
        "-f", str(FILE_PATH),
        "-m", SUBMIT_MESSAGE,
    ]
    completed = subprocess.run(submit_cmd, check=False)

    # Surface a failed upload without aborting the notebook.
    if completed.returncode != 0:
        print(f"kaggle submit exited with status {completed.returncode}")