In [8]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib.colors import LinearSegmentedColormap

from xgboost import XGBRegressor as XGB
import lightgbm
from lightgbm import LGBMRegressor as LGB
from catboost import CatBoostRegressor as CB

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, StandardScaler, OneHotEncoder

from sklearn.metrics import mean_squared_log_error, mean_squared_error

import functools
# Root-mean-squared(-log) error helpers: `squared=False` asks the sklearn
# metric for the square root of the mean squared (log) error.
# NOTE(review): the `squared` keyword was deprecated in scikit-learn 1.4 and
# removed in 1.6 — on newer versions switch to `root_mean_squared_error` /
# `root_mean_squared_log_error`; confirm against the pinned sklearn version.
rmse = functools.partial(mean_squared_error, squared=False)
rmsle = functools.partial(mean_squared_log_error, squared=False)


# Single seed reused for NumPy, the CV splitter and every model in this notebook.
SEED=42

# Project-local helpers; TXT_ACC / TXT_RESET wrap the printed score labels.
# NOTE(review): set_styles is defined in src.styles (not visible here) —
# presumably it configures plot styling; confirm.
from src.styles import set_styles, TXT_ACC, TXT_RESET

import warnings
# Silences *all* warnings for the session, including library deprecation
# warnings — remove temporarily when debugging unexpected behaviour.
warnings.filterwarnings('ignore')


# ---- REPRODUCIBILITY ------------------------------------------------
# Seeds NumPy's legacy global RNG.
np.random.seed(SEED)

# ---- PANDAS ---------------------------------------------------------
# Show every column of wide frames and print floats with thousands
# separators and four decimals.
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,.4f}'.format


set_styles()

In [9]:
class CFG:
    """Notebook-wide configuration: input file locations and the target column."""

    # Column every model in this notebook is trained to predict.
    target = "Rings"

    # CSV inputs, relative to the notebook's working directory.
    path_train = "data/train.csv"
    path_test = "data/test.csv"
    path_original = "data/abalone.csv"

In [10]:
# Load the training data, drop the row identifier and encode `Sex` as an
# ordinal integer (I=0, F=1, M=2).
df_train = (
    pd.read_csv(CFG.path_train)
    .drop(columns='id')
    .assign(Sex=lambda frame: frame['Sex'].map({'I': 0, 'F': 1, 'M': 2}))
)
df_train

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,1,0.5500,0.4300,0.1500,0.7715,0.3285,0.1465,0.2400,11
1,1,0.6300,0.4900,0.1450,1.1300,0.4580,0.2765,0.3200,11
2,0,0.1600,0.1100,0.0250,0.0210,0.0055,0.0030,0.0050,6
3,2,0.5950,0.4750,0.1500,0.9145,0.3755,0.2055,0.2500,10
4,0,0.5550,0.4250,0.1300,0.7820,0.3695,0.1600,0.1975,9
...,...,...,...,...,...,...,...,...,...
90610,2,0.3350,0.2350,0.0750,0.1585,0.0685,0.0370,0.0450,6
90611,2,0.5550,0.4250,0.1500,0.8790,0.3865,0.1815,0.2400,9
90612,0,0.4350,0.3300,0.0950,0.3215,0.1510,0.0785,0.0815,6
90613,0,0.3450,0.2700,0.0750,0.2000,0.0980,0.0490,0.0700,6


In [11]:
# Load the original abalone data with the same `Sex` encoding as the training
# frame, then rename its two weight columns to the names used in the training
# data so both frames share one schema.
df_original = (
    pd.read_csv(CFG.path_original)
    .assign(Sex=lambda frame: frame['Sex'].map({'I': 0, 'F': 1, 'M': 2}))
    .rename(columns={'Shucked weight': 'Whole weight.1', 'Viscera weight': 'Whole weight.2'})
)
df_original

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,2,0.4550,0.3650,0.0950,0.5140,0.2245,0.1010,0.1500,15
1,2,0.3500,0.2650,0.0900,0.2255,0.0995,0.0485,0.0700,7
2,1,0.5300,0.4200,0.1350,0.6770,0.2565,0.1415,0.2100,9
3,2,0.4400,0.3650,0.1250,0.5160,0.2155,0.1140,0.1550,10
4,0,0.3300,0.2550,0.0800,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,1,0.5650,0.4500,0.1650,0.8870,0.3700,0.2390,0.2490,11
4173,2,0.5900,0.4400,0.1350,0.9660,0.4390,0.2145,0.2605,10
4174,2,0.6000,0.4750,0.2050,1.1760,0.5255,0.2875,0.3080,9
4175,1,0.6250,0.4850,0.1500,1.0945,0.5310,0.2610,0.2960,10


In [12]:
# Stack both sources on a fresh 0..n-1 index and count rows that are exact
# duplicates of an earlier row.
df_full = pd.concat([df_train, df_original], ignore_index=True)
df_full.duplicated().sum()

0

In [13]:
def get_val_oof(X_train, Y_train, cv, target=None):
    """Collect the ground-truth target in the order the CV splitter visits rows.

    Out-of-fold predictions are gathered fold by fold, so they are ordered by
    fold rather than by original row position. This helper replays the same
    splits and returns the matching ground truth plus the positional row index
    of every validation sample, so predictions and targets can be aligned.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame that still contains the target column.
    Y_train : array-like
        Labels handed to ``cv.split`` (used by stratified splitters).
    cv : object with ``split(X, y)``
        Must yield ``(train_positions, val_positions)`` pairs. It has to
        produce the same splits as the run whose predictions are being
        aligned (e.g. a splitter with a fixed ``random_state``).
    target : str, optional
        Name of the target column; defaults to ``CFG.target``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ground_truth`` and ``index`` (positional row number), one
        row per validation sample, in fold order.
    """
    if target is None:
        target = CFG.target

    truths = []
    positions = []
    for _, idx_val in cv.split(X_train, Y_train):
        # cv.split yields *positions*, so index with .iloc; .loc would look the
        # numbers up as labels and break on any non-default index.
        truths.extend(X_train[target].iloc[idx_val].values)
        positions.extend(idx_val)
    return pd.DataFrame(np.array([truths, positions]).T, columns=['ground_truth', 'index'])


def score_cv(X_train, Y_train, model, cv, append_original=False, df_original=None,
             target=None, metric=None):
    """Cross-validate ``model`` and return per-fold scores and out-of-fold predictions.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training frame that still contains the target column.
    Y_train : array-like
        Labels handed to ``cv.split`` (only used for splitting/stratification;
        the fitted target is read from ``X_train[target]``).
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is re-fitted in place
        on every fold.
    cv : object with ``split(X, y)``
        Yields ``(train_positions, val_positions)`` pairs.
    append_original : bool, default False
        When true, ``df_original`` is appended to the training part of every
        fold; validation rows always come from ``X_train`` only.
    df_original : pandas.DataFrame, optional
        Extra training rows with the same columns as ``X_train``.
    target : str, optional
        Name of the target column; defaults to ``CFG.target``.
    metric : callable, optional
        ``metric(y_true, y_pred) -> float``; defaults to the notebook's ``rmsle``.

    Returns
    -------
    (list, list)
        ``scores`` with one value per fold, and ``oofs`` with the validation
        predictions concatenated in fold order (not original row order).

    Raises
    ------
    ValueError
        If ``append_original`` is true but ``df_original`` is missing.
    """
    if target is None:
        target = CFG.target
    if metric is None:
        metric = rmsle
    if append_original and df_original is None:
        # pd.concat silently drops None, so without this guard the run would
        # quietly train without the extra data.
        raise ValueError('append_original=True requires df_original to be provided')

    scores = []
    oofs = []
    for idx_train, idx_val in cv.split(X_train, Y_train):
        # cv.split yields positions, so .iloc is the correct indexer here.
        fold_train = X_train.iloc[idx_train]
        if append_original:
            fold_train = pd.concat([fold_train, df_original], axis=0)
        fold_val = X_train.iloc[idx_val]

        model.fit(fold_train.drop(columns=target), fold_train[target])
        preds = model.predict(fold_val.drop(columns=target))

        scores.append(metric(fold_val[target], preds))
        oofs.extend(preds)

    return scores, oofs

In [14]:
# notebook for reference
# https://www.kaggle.com/code/ambrosm/pss4e4-eda-which-makes-sense

# score_cv reads the target from X_train itself, so the frame keeps the target
# column; Y_train is only handed to the splitter for stratification.
X_train = df_train
Y_train = X_train[CFG.target]

# Fixed seed => the splitter yields the same folds every time it is iterated,
# which later cells rely on to line predictions up with the ground truth.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

models = (
    # XGBoost's scikit-learn wrapper uses `verbosity` (0 = silent); `verbose`
    # is a LightGBM parameter and is not a valid XGBRegressor argument.
    ('xgb_100_depth5', XGB(random_state=SEED, n_estimators=100, max_depth=5, verbosity=0)),
    ('lgb_300_depth5', LGB(random_state=SEED, n_estimators=300, max_depth=5, num_leaves=31, verbose=-1)),
    ('catboost', CB(random_state=SEED, n_estimators=1000, verbose=0)),
    ('HistGradientBoosting', HistGradientBoostingRegressor(random_state=SEED, max_iter=200)),

    # ~5min, ~0.150
    ('random_forest', RandomForestRegressor(n_estimators=200, random_state=SEED, min_samples_leaf=8, max_features=5)),

    # ~2min, ~0.150
    ('extra_trees', ExtraTreesRegressor(n_estimators=200, random_state=SEED, min_samples_leaf=7)),

    # score ~0.1547
    # KNN is distance based: one-hot encode `Sex` and standardise all features.
    ('knn_50', Pipeline([
        ('ohe', ColumnTransformer([('ohe', OneHotEncoder(drop='first'), ['Sex'])], remainder='passthrough')),
        ('scaler', StandardScaler()),
        ('model', KNeighborsRegressor(n_neighbors=50)),
    ])),
)

results = {}
oof_preds = {}
for label, m in models:
    # Fit on log1p(target) and map predictions back with expm1, so the
    # regressors' squared-error objective matches the RMSLE metric.
    model = TransformedTargetRegressor(m, func=np.log1p, inverse_func=np.expm1)
    scores, oofs = score_cv(X_train, Y_train, model, cv)
    results[label] = scores
    oof_preds[label] = oofs

# One row per model, one column per fold, plus the mean as the first column.
df_scores = pd.DataFrame(results).T
df_scores.columns = [f'Fold {i}' for i in range(df_scores.shape[1])]
df_scores.insert(0, 'mean', df_scores.mean(axis=1))

display(df_scores.sort_values('mean'))

Unnamed: 0,mean,Fold 0,Fold 1,Fold 2,Fold 3,Fold 4
lgb_300_depth5,0.1489,0.1486,0.1492,0.1497,0.1494,0.1474
catboost,0.149,0.1488,0.1494,0.1497,0.1496,0.1474
HistGradientBoosting,0.1495,0.1493,0.1499,0.1499,0.1502,0.1482
random_forest,0.1498,0.1495,0.1503,0.1505,0.1504,0.1483
xgb_100_depth5,0.15,0.1498,0.1504,0.1507,0.1504,0.1489
extra_trees,0.1506,0.1502,0.151,0.1512,0.1512,0.1493
knn_50,0.1547,0.1545,0.1555,0.1547,0.1554,0.1534


In [15]:
# Rows of both frames are in fold order (fold 0's validation rows first, then
# fold 1's, ...), not in original row order. Replaying the same seeded splitter
# gives the ground truth in exactly that order, so the two frames line up
# row by row.
df_gt = get_val_oof(X_train, Y_train, cv)
df_oof = pd.DataFrame.from_dict(oof_preds)

df_oof

Unnamed: 0,xgb_100_depth5,lgb_300_depth5,catboost,HistGradientBoosting,random_forest,extra_trees,knn_50
0,3.9099,3.9773,3.9727,3.9591,3.9146,3.8993,4.0441
1,9.9758,9.9948,9.9544,10.1450,10.3074,10.3766,10.0626
2,10.4895,10.1585,9.9359,10.2000,10.1869,9.8608,10.0181
3,9.6610,9.4194,9.1872,8.9734,10.2824,10.0817,8.3161
4,10.8524,10.8762,10.8285,10.6698,10.6711,10.7227,10.4367
...,...,...,...,...,...,...,...
90610,9.7040,9.8276,10.0763,9.7895,9.4569,9.4890,9.9740
90611,9.1890,9.8379,10.0771,9.9066,9.5961,9.6905,10.1312
90612,7.8700,7.8822,7.7621,7.7766,7.8640,7.8240,8.0703
90613,14.0368,13.5959,13.3088,13.3355,13.7244,13.7416,12.7552


In [16]:
# Blend all models with a non-negative linear regression fitted in log1p space.
# The score below is computed on the same out-of-fold rows the weights were
# fitted on, so it is an in-sample estimate of the blend's quality.
log_oof = np.log1p(df_oof)
log_gt = np.log1p(df_gt['ground_truth'])

ensembler = LinearRegression(positive=True).fit(log_oof, log_gt)
ens_oof = np.expm1(ensembler.predict(log_oof))

lr_weights = pd.DataFrame(ensembler.coef_, index=df_oof.columns, columns=['LR weights'])
display(lr_weights)
print(f'{TXT_ACC} Score ensemble {TXT_RESET}    {rmsle(df_gt["ground_truth"], ens_oof):.4f}')

Unnamed: 0,LR weights
xgb_100_depth5,0.0283
lgb_300_depth5,0.3808
catboost,0.3246
HistGradientBoosting,0.0352
random_forest,0.2354
extra_trees,0.0
knn_50,0.0


[1m[38;5;254m[48;5;240m Score ensemble [0m    0.1484


In [17]:
# Same non-negative blend in log1p space, restricted to LightGBM and CatBoost.
# As before, the score is evaluated on the rows the weights were fitted on.
df_ens_data = df_oof[['lgb_300_depth5', 'catboost']]
log_preds = np.log1p(df_ens_data)
log_truth = np.log1p(df_gt['ground_truth'])

ensembler = LinearRegression(positive=True)
ensembler.fit(log_preds, log_truth)
ens_oof = np.expm1(ensembler.predict(log_preds))

display(pd.DataFrame(ensembler.coef_, index=df_ens_data.columns, columns=['LR weights']))
print(f'{TXT_ACC} Score ensemble {TXT_RESET}    {rmsle(df_gt["ground_truth"], ens_oof):.4f}')

Unnamed: 0,LR weights
lgb_300_depth5,0.5372
catboost,0.4655


[1m[38;5;254m[48;5;240m Score ensemble [0m    0.1486
