In [1]:
import pandas as pd
import numpy as np

import os

import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMRegressor as LGB
from catboost import CatBoostRegressor as CB

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor

from sklearn.metrics import mean_squared_log_error, mean_squared_error

import functools

# Root-mean-squared(-log) error helpers, called as `metric(y_true, y_pred)`.
# The `squared=False` keyword of mean_squared_error / mean_squared_log_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so prefer the dedicated
# root_* metrics and only fall back to the legacy partials on older installs
# where those functions do not exist yet.
try:
    from sklearn.metrics import root_mean_squared_error as rmse
    from sklearn.metrics import root_mean_squared_log_error as rmsle
except ImportError:  # scikit-learn < 1.4
    rmse = functools.partial(mean_squared_error, squared=False)
    rmsle = functools.partial(mean_squared_log_error, squared=False)

from src.styles import set_styles, TXT_ACC, TXT_RESET

import warnings

# ---- WARNINGS -------------------------------------------------------
# Blanket filter: every warning category is silenced from here on.
warnings.filterwarnings(action='ignore')


# ---- REPRODUCIBILITY ------------------------------------------------
# Single seed shared by NumPy's global RNG and the seeded objects created later.
SEED = 42
np.random.seed(SEED)


# ---- PANDAS ---------------------------------------------------------
# Show every column and render floats with thousands separators and 4 decimals.
pd.options.display.max_columns = None
pd.set_option('display.float_format', '{:,.4f}'.format)


# ---- STYLES ---------------------------------------------------------
# Project helper from src.styles (implementation not shown in this notebook).
set_styles()

In [2]:
class CFG:
    """Static notebook configuration, read as plain class attributes."""

    # Experiment tag (not referenced by the cells visible in this notebook).
    project = 'PGs04e04'

    # Name of the target column in the train / original frames.
    target = 'Rings'
    # Number of cross-validation folds iterated by the training loop.
    num_folds = 5

    # CSV inputs, given as paths relative to the working directory.
    path_train = 'data/train.csv'
    path_test = 'data/test.csv'
    path_original = 'data/abalone.csv'

# Out-of-fold (OOF) predictions

In [3]:
# Integer codes for the categorical 'Sex' column, shared by all three frames.
mapper_sex = {'I': 0, 'F': 1, 'M': 2}


def _encode_sex(df):
    """Return a copy of `df` whose 'Sex' column is replaced by its `mapper_sex` code.

    Values that are not keys of `mapper_sex` become NaN (standard `Series.map`
    behaviour); no validation is performed here.
    """
    return df.assign(Sex=df['Sex'].map(mapper_sex))


# Competition train / test data; the 'id' column is dropped right after loading.
df_train = _encode_sex(pd.read_csv(CFG.path_train).drop(columns='id'))
df_test = _encode_sex(pd.read_csv(CFG.path_test).drop(columns='id'))

# Original dataset: two weight columns are renamed
# (presumably to match the train/test column names -- verify against the CSV headers).
df_original = _encode_sex(pd.read_csv(CFG.path_original)).rename(
    columns={'Shucked weight': 'Whole weight.1', 'Viscera weight': 'Whole weight.2'})

# The fold count is taken from CFG so the splitter always agrees with the
# `range(CFG.num_folds)` training loop instead of relying on a separate literal.
cv = StratifiedKFold(n_splits=CFG.num_folds, shuffle=True, random_state=SEED)
# Materialise the (train_idx, valid_idx) pairs once so later cells reuse the same splits.
cv_idx = list(cv.split(df_train, df_train[CFG.target]))

## Out-of-fold (OOF) predictions: tree models

In [4]:
# Per-fold hyperparameters: each list holds one dict per CV fold, and the
# training loop unpacks entry `fold` into the corresponding model constructor.

# Settings that are identical across all CatBoost folds.
_cb_shared = {'verbose': 0, 'random_state': 42, 'loss_function': 'RMSE'}

params_catboost_with_original = [
    dict(_cb_shared, depth=7, subsample=0.7999999999999999, iterations=974,
         l2_leaf_reg=20.62778627087155, learning_rate=0.1444264894797429,
         min_data_in_leaf=103),
    dict(_cb_shared, depth=7, subsample=0.5, iterations=994,
         l2_leaf_reg=78.05729168782912, learning_rate=0.1892106979772809,
         min_data_in_leaf=24),
    dict(_cb_shared, depth=7, subsample=0.5, iterations=995,
         l2_leaf_reg=35.753302241413856, learning_rate=0.1530667105689769,
         min_data_in_leaf=294),
    dict(_cb_shared, depth=7, subsample=0.7, iterations=951,
         l2_leaf_reg=16.21107573604743, learning_rate=0.1465196325434651,
         min_data_in_leaf=185),
    dict(_cb_shared, depth=6, subsample=0.7, iterations=879,
         l2_leaf_reg=42.145665562690425, learning_rate=0.1932933415779507,
         min_data_in_leaf=268),
]

# Settings that are identical across all LightGBM folds.
_lgb_shared = {'device': 'cpu', 'boosting_type': 'gbdt', 'max_depth': 5,
               'objective': 'regression', 'random_state': 42}

# NOTE(review): the entries for folds 2 and 3 are identical -- possibly a
# copy-paste slip rather than two separate tuning results; confirm.
params_lgb_with_original = [
    dict(_lgb_shared, reg_alpha=2.790271401831723, subsample=0.8999999999999999,
         reg_lambda=98.86202121297616, n_estimators=640,
         learning_rate=0.1765129798011447),
    dict(_lgb_shared, reg_alpha=2.2056343316599225, subsample=0.8999999999999999,
         reg_lambda=93.71192401252894, n_estimators=722,
         learning_rate=0.1797135568956882),
    dict(_lgb_shared, reg_alpha=0.9230695630977448, subsample=1.0,
         reg_lambda=99.21650396828215, n_estimators=579,
         learning_rate=0.1453865026227929),
    dict(_lgb_shared, reg_alpha=0.9230695630977448, subsample=1.0,
         reg_lambda=99.21650396828215, n_estimators=579,
         learning_rate=0.1453865026227929),
    dict(_lgb_shared, reg_alpha=1.2229327339862928, subsample=1.0,
         reg_lambda=93.65955291434508, n_estimators=596,
         learning_rate=0.1329652767998142),
]

In [6]:
# Feature / target split for the competition data and for the original dataset.
X = df_train.drop(CFG.target, axis=1)
Y = df_train[CFG.target]
X_orig = df_original.copy()
Y_orig = X_orig.pop(CFG.target)


def _fit_log_ensemble(member_preds, gt, labels):
    """Fit, display and score a non-negative linear blend in log1p space.

    Parameters
    ----------
    member_preds : sequence of 1-D arrays
        One prediction vector per model, all of the same length as `gt`.
    gt : pandas.Series or array-like
        Ground-truth targets for the same rows.
    labels : list of str
        Index labels for the displayed weight table, one per model.

    Returns
    -------
    tuple
        `(ensembler, df_ens_data, ens_oof)`: the fitted LinearRegression, the
        log1p feature matrix it was fitted on, and its predictions mapped back
        with expm1.

    Notes
    -----
    The blend is scored on the very rows it was fitted on, so the printed
    "Score ensemble" is an in-sample figure, not a held-out estimate.
    """
    df_ens_data = np.log1p(member_preds).T  # shape: (n_rows, n_models)
    ensembler = LinearRegression(positive=True)
    ensembler.fit(df_ens_data, np.log1p(gt))
    ens_oof = np.expm1(ensembler.predict(df_ens_data))

    display(pd.DataFrame(ensembler.coef_, index=labels, columns=['LR weights']))
    print(f'{TXT_ACC} Score ensemble {TXT_RESET}    {rmsle(gt, ens_oof):.4f}')
    return ensembler, df_ens_data, ens_oof


for fold in range(CFG.num_folds):

    print('-'*100)
    print('Fold', fold)
    print('-'*100)

    # cv.split yields positional row numbers, so select with .iloc; label-based
    # .loc only gives the same rows while the frame keeps a default RangeIndex.
    train_idx, valid_idx = cv_idx[fold]
    X_valid = X.iloc[valid_idx]
    gt = Y.iloc[valid_idx]

    # Externally produced predictions for this fold.
    # NOTE(review): assumes the file rows are in the same order as this fold's
    # validation rows -- only the length can be checked here; confirm upstream.
    preds_ft = pd.read_csv(f'OOF_ft_8_4_fold{fold}.csv')
    if len(preds_ft) != len(valid_idx):
        raise ValueError(
            f'OOF_ft_8_4_fold{fold}.csv has {len(preds_ft)} rows, '
            f'expected {len(valid_idx)} for fold {fold}')

    # Both models are trained on log1p(target) and predict back through expm1.
    lgb_params = params_lgb_with_original[fold]
    models = (
        ('catboost_with_orig', TransformedTargetRegressor(
            CB(**params_catboost_with_original[fold]),
            func=np.log1p,
            inverse_func=np.expm1)),
        ('lgb_with_orig', TransformedTargetRegressor(
            # num_leaves is derived from max_depth as 2**max_depth - 1
            LGB(**lgb_params, num_leaves=2**lgb_params['max_depth'] - 1),
            func=np.log1p,
            inverse_func=np.expm1)),
    )

    preds_tree_models = []
    for label, model in models:
        if 'with_orig' in label:
            # Append the full original dataset to this fold's training rows.
            X_fit = pd.concat([X.iloc[train_idx], X_orig], axis=0)
            Y_fit = pd.concat([Y.iloc[train_idx], Y_orig], axis=0)
        else:
            X_fit, Y_fit = X.iloc[train_idx], Y.iloc[train_idx]
        model.fit(X_fit, Y_fit)
        preds_tree_models.append(model.predict(X_valid))

    # Individual validation scores.
    model_labels = [label for label, _ in models]
    for label, preds in zip(model_labels, preds_tree_models):
        print(f'{TXT_ACC} Score {label} {TXT_RESET}    {rmsle(gt, preds):.4f}')

    print(f'{TXT_ACC} Score ft_8_4 {TXT_RESET}    {rmsle(gt, preds_ft[CFG.target].values):.4f}')

    # Blend 1: tree models only.
    ensembler, df_ens_data, ens_oof = _fit_log_ensemble(
        preds_tree_models, gt, model_labels)

    # Blend 2: tree models plus the external ft_8_4 predictions.
    preds_tree_models.append(preds_ft[CFG.target].values)
    ensembler, df_ens_data, ens_oof = _fit_log_ensemble(
        preds_tree_models, gt, [*model_labels, 'ft_8_4'])

----------------------------------------------------------------------------------------------------
Fold 0
----------------------------------------------------------------------------------------------------
[1m[38;5;254m[48;5;240m Score catboost_with_orig [0m    0.1482
[1m[38;5;254m[48;5;240m Score lgb_with_orig [0m    0.1477
[1m[38;5;254m[48;5;240m Score ft_8_4 [0m    0.1505


Unnamed: 0,LR weights
catboost_with_orig,0.3704
lgb_with_orig,0.6285


[1m[38;5;254m[48;5;240m Score ensemble [0m    0.1475


Unnamed: 0,LR weights
catboost_with_orig,0.2571
lgb_with_orig,0.5959
ft_8_4,0.149


[1m[38;5;254m[48;5;240m Score ensemble [0m    0.1474
----------------------------------------------------------------------------------------------------
Fold 1
----------------------------------------------------------------------------------------------------
[1m[38;5;254m[48;5;240m Score catboost_with_orig [0m    0.1489
[1m[38;5;254m[48;5;240m Score lgb_with_orig [0m    0.1483
[1m[38;5;254m[48;5;240m Score ft_8_4 [0m    0.1514


Unnamed: 0,LR weights
catboost_with_orig,0.3414
lgb_with_orig,0.6515


[1m[38;5;254m[48;5;240m Score ensemble [0m    0.1480


Unnamed: 0,LR weights
catboost_with_orig,0.232
lgb_with_orig,0.6425
ft_8_4,0.1215


[1m[38;5;254m[48;5;240m Score ensemble [0m    0.1480
----------------------------------------------------------------------------------------------------
Fold 2
----------------------------------------------------------------------------------------------------
[1m[38;5;254m[48;5;240m Score catboost_with_orig [0m    0.1493
[1m[38;5;254m[48;5;240m Score lgb_with_orig [0m    0.1487
[1m[38;5;254m[48;5;240m Score ft_8_4 [0m    0.1520


Unnamed: 0,LR weights
catboost_with_orig,0.3379
lgb_with_orig,0.664


[1m[38;5;254m[48;5;240m Score ensemble [0m    0.1485


Unnamed: 0,LR weights
catboost_with_orig,0.2524
lgb_with_orig,0.6418
ft_8_4,0.1084


[1m[38;5;254m[48;5;240m Score ensemble [0m    0.1485
----------------------------------------------------------------------------------------------------
Fold 3
----------------------------------------------------------------------------------------------------
[1m[38;5;254m[48;5;240m Score catboost_with_orig [0m    0.1491
[1m[38;5;254m[48;5;240m Score lgb_with_orig [0m    0.1488
[1m[38;5;254m[48;5;240m Score ft_8_4 [0m    0.1523


Unnamed: 0,LR weights
catboost_with_orig,0.4144
lgb_with_orig,0.5885


[1m[38;5;254m[48;5;240m Score ensemble [0m    0.1485


Unnamed: 0,LR weights
catboost_with_orig,0.3903
lgb_with_orig,0.5771
ft_8_4,0.0358


[1m[38;5;254m[48;5;240m Score ensemble [0m    0.1485
----------------------------------------------------------------------------------------------------
Fold 4
----------------------------------------------------------------------------------------------------
[1m[38;5;254m[48;5;240m Score catboost_with_orig [0m    0.1471
[1m[38;5;254m[48;5;240m Score lgb_with_orig [0m    0.1467
[1m[38;5;254m[48;5;240m Score ft_8_4 [0m    0.1496


Unnamed: 0,LR weights
catboost_with_orig,0.402
lgb_with_orig,0.6066


[1m[38;5;254m[48;5;240m Score ensemble [0m    0.1465


Unnamed: 0,LR weights
catboost_with_orig,0.3224
lgb_with_orig,0.5776
ft_8_4,0.11


[1m[38;5;254m[48;5;240m Score ensemble [0m    0.1464
