In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib.colors import LinearSegmentedColormap

from xgboost import XGBRegressor as XGB
import lightgbm
from lightgbm import LGBMRegressor as LGB

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import VotingClassifier, StackingClassifier

from sklearn.compose import TransformedTargetRegressor

from sklearn.metrics import mean_squared_log_error, mean_squared_error

from sklearn.base import BaseEstimator, TransformerMixin

import functools


def rmse(y_true, y_pred, **kwargs):
    """Root mean squared error.

    Takes the square root of ``mean_squared_error`` explicitly instead of
    passing ``squared=False``, which newer scikit-learn releases no longer
    accept. Extra keyword arguments are forwarded to ``mean_squared_error``.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred, **kwargs))


def rmsle(y_true, y_pred, **kwargs):
    """Root mean squared logarithmic error.

    Takes the square root of ``mean_squared_log_error`` explicitly instead of
    passing ``squared=False``, which newer scikit-learn releases no longer
    accept. Extra keyword arguments are forwarded to ``mean_squared_log_error``.
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred, **kwargs))


# Seed shared by NumPy, the CV splitter and both models in this notebook.
SEED = 42

from src.styles import set_styles, TXT_ACC, TXT_RESET

import warnings
warnings.filterwarnings('ignore')

In [2]:
# ---- REPRODUCIBILITY ------------------------------------------------
# Seed NumPy's global random generator with the notebook-wide seed.
np.random.seed(SEED)

# ---- PANDAS ---------------------------------------------------------
# Show every column of wide frames and print floats with thousands separators.
pd.options.display.max_columns = None
pd.set_option('display.float_format', '{:,}'.format)


# Apply the project's style settings via the helper imported from src.styles.
set_styles()

In [3]:
class CFG:
    """Static settings for the notebook: data file locations and target column."""

    # Relative paths of the CSV files with the training and test data.
    path_train = "data/train.csv"
    path_test = "data/test.csv"

    # Name of the column that the models are trained to predict.
    target = "Rings"

In [4]:
# Read the training CSV, discard the 'id' column and replace the 'Sex'
# letters with integer codes (I -> 0, F -> 1, M -> 2).
df_train = (
    pd.read_csv(CFG.path_train)
    .drop(columns='id')
    .assign(Sex=lambda frame: frame['Sex'].map({'I': 0, 'F': 1, 'M': 2}))
)
df_train

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,1,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,0,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,2,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,0,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9
...,...,...,...,...,...,...,...,...,...
90610,2,0.335,0.235,0.075,0.1585,0.0685,0.037,0.045,6
90611,2,0.555,0.425,0.15,0.879,0.3865,0.1815,0.24,9
90612,0,0.435,0.33,0.095,0.3215,0.151,0.0785,0.0815,6
90613,0,0.345,0.27,0.075,0.2,0.098,0.049,0.07,6


In [5]:
def score_xgb(model, metric, Xt, Yt, Xv, Yv):
    """Fit `model` on the training split and evaluate it on the validation split.

    The validation split is also handed to `fit` as `eval_set`, with verbose
    output switched off.

    Args:
        model: estimator exposing `fit`, `predict` and `best_iteration`.
        metric: callable invoked as `metric(Yv, predictions)`.
        Xt, Yt: training features and target.
        Xv, Yv: validation features and target.

    Returns:
        Tuple `(score, best_iteration, predictions)`.
    """
    validation_set = [(Xv, Yv)]
    model.fit(Xt, Yt, eval_set=validation_set, verbose=False)
    predictions = model.predict(Xv)
    return metric(Yv, predictions), model.best_iteration, predictions

def score_lgb(model, metric, Xt, Yt, Xv, Yv):
    """Fit a LightGBM `model` on the training split and evaluate it on validation.

    The validation split is passed to `fit` as `eval_set`. Training uses an
    early-stopping callback with 10 rounds and a log-evaluation callback
    configured with period 0, both created through the `lightgbm` module.

    Args:
        model: LightGBM scikit-learn estimator.
        metric: callable invoked as `metric(Yv, preds)`.
        Xt, Yt: training features and target.
        Xv, Yv: validation features and target.

    Returns:
        Tuple `(score, best_iteration, preds)`.
    """
    callbacks = [
        lightgbm.early_stopping(10, verbose=False),
        lightgbm.log_evaluation(0, False),
    ]
    model.fit(Xt, Yt, eval_set=[(Xv, Yv)], callbacks=callbacks)
    preds = model.predict(Xv)
    score = metric(Yv, preds)
    # Read the public fitted attribute rather than the private `_best_iteration`,
    # which is an implementation detail of the wrapper.
    return score, model.best_iteration_, preds

In [6]:
def init_dictionaty_keys(solutions, conditions_split):
    """Build the list of keys used for the score and best-iteration dictionaries.

    Args:
        solutions: sequence of tuples whose first element is the model label.
        conditions_split: sequence of condition groups; each group is an
            iterable of condition strings.

    Returns:
        List of keys in this order: the bare labels, then `<label>_log` and
        `<label>_round` per label, then for each group number `i` the
        `<label>_<i>_<condition>` keys followed by the `<label>_<i>` keys.
    """
    labels = [entry[0] for entry in solutions]

    keys = list(labels)
    for label in labels:
        for suffix in ('log', 'round'):
            keys.append(f'{label}_{suffix}')

    for group_no, conditions in enumerate(conditions_split):
        keys += [f'{label}_{group_no}_{cond}' for label in labels for cond in conditions]
        keys += [f'{label}_{group_no}' for label in labels]

    return keys

In [7]:
# Both boosters get a budget of 1000 trees; early stopping decides how many are used.
# XGBoost's scikit-learn wrapper names these parameters `n_estimators` and
# `verbosity`; the previous `num_estimators` / `verbose` spellings are not
# parameters it defines, so the intended tree budget was never applied.
model_xgb = XGB(random_state=SEED, n_estimators=1000, early_stopping_rounds=10, verbosity=0)
model_lgb = LGB(random_state=SEED, n_estimators=1000, verbose=-1)

# X_train still contains the target column; each fold pops it from its own copy.
X_train = df_train
Y_train = X_train[CFG.target]

# Folds are stratified on the target values.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# (label, estimator, scoring helper) for every model under comparison.
solutions = (   ('xgb', model_xgb, score_xgb),
                ('lgb', model_lgb, score_lgb))
# Alternative ways of splitting the data by 'Sex'; one model is fit per condition.
conditions_split = (('Sex==0', 'Sex!=0'), 
                    ('Sex==0', 'Sex==1', 'Sex==2'))
dict_keys = init_dictionaty_keys(solutions, conditions_split)
# Per-key lists collecting one entry per CV fold.
scores = {key: [] for key in dict_keys}
best_iterations = {key: [] for key in dict_keys}


# baseline scores for models with target transforms -----------------------------------------------------------
# Per fold, every model is scored three ways:
#   <label>        - fit on the raw target, RMSLE of the predictions
#   <label>_round  - the same predictions rounded to the nearest integer
#   <label>_log    - fit on log1p(target), RMSE measured in log space

for fold, (idx_train, idx_val) in enumerate(cv.split(X_train, Y_train)):
    # cv.split yields positional indices, so rows are selected by position
    # (.iloc) rather than by index label (.loc).
    Xt = X_train.iloc[idx_train]
    Yt = Xt.pop(CFG.target)
    Xv = X_train.iloc[idx_val]
    Yv = Xv.pop(CFG.target)

    for label, model, score_func in solutions:
        sc, bestit, preds = score_func(model, rmsle, Xt, Yt, Xv, Yv)
        scores[label].append(sc)
        scores[f'{label}_round'].append(rmsle(Yv, np.round(preds)))
        best_iterations[label].append(bestit)
        # Rounding reuses the fitted model, so there is no separate iteration
        # count; None keeps the per-key lists the same length.
        best_iterations[f'{label}_round'].append(None)

        sc_log, bestit_log, _ = score_func(model, rmse, Xt, np.log1p(Yt), Xv, np.log1p(Yv))
        scores[f'{label}_log'].append(sc_log)
        best_iterations[f'{label}_log'].append(bestit_log)



# baseline scores for models on data split by 'Sex' feature -----------------------------------------------------
# For each split scheme and fold, a separate model is fit per condition on
# log1p(target). Each condition gets its own RMSE in log space, and the
# predictions of all conditions are pooled into one RMSLE for the fold.

for num, conditions in enumerate(conditions_split):

    for fold, (idx_train, idx_val) in enumerate(cv.split(X_train, Y_train)):
        
        for label, model, score_func in solutions:
            preds = []
            vals = []

            for condition in conditions:
                # cv.split yields positional indices, so rows are selected by
                # position (.iloc) before filtering on the condition.
                Xt = X_train.iloc[idx_train].query(condition)
                Yt = Xt.pop(CFG.target)
                Xv = X_train.iloc[idx_val].query(condition)
                Yv = Xv.pop(CFG.target)

                sc, bestit, pred = score_func(model, rmse, Xt, np.log1p(Yt), Xv, np.log1p(Yv))
                scores[f'{label}_{num}_{condition}'].append(sc)
                best_iterations[f'{label}_{num}_{condition}'].append(bestit)

                # Predictions are in log space; targets are kept on the original scale.
                preds.extend(pred)
                vals.extend(Yv)

            # Undo the log1p transform before computing the pooled RMSLE.
            scores[f'{label}_{num}'].append(rmsle(vals, np.expm1(preds)))
            # The pooled score spans several models; None keeps list lengths aligned.
            best_iterations[f'{label}_{num}'].append(None)

In [8]:
def highlight(row):
    '''
    Row-wise styling function for the score table.

    Rows whose name contains exactly two underscores get a light background
    in every cell. All other rows get a darker background, with the first
    cell additionally rendered in bold.
    '''
    n_cells = len(row)

    if row.name.count('_') != 2:
        first_cell = 'background-color: #d3d3d0; font-weight: bold'
        other_cells = ['background-color: #d3d3d0'] * (n_cells - 1)
        return (first_cell, *other_cells)

    return ['background-color: #f1f1f1'] * n_cells

# One row per score key, one column per CV fold. The 'Sex==0' entries of the
# second split scheme are dropped, presumably because the same condition is
# already reported under the first scheme.
df_scores = pd.DataFrame(scores).drop(columns=['lgb_1_Sex==0', 'xgb_1_Sex==0']).T
df_scores.columns = [f'Fold {fold_no}' for fold_no in range(len(df_scores.columns))]
# Put the across-fold mean in the first column.
df_scores.insert(0, 'mean', df_scores.mean(axis=1))

print('Highlighted with darker color are scores on full dataset')
display(
    df_scores.sort_values('mean')
    .style
    .format(precision=4)
    .apply(highlight, axis=1)
)

Highlighted with darker color are scores on full dataset


Unnamed: 0,mean,Fold 0,Fold 1,Fold 2,Fold 3,Fold 4
lgb_0_Sex==0,0.1349,0.1394,0.1327,0.1337,0.1351,0.1334
xgb_0_Sex==0,0.136,0.141,0.1334,0.1348,0.1361,0.1346
lgb_log,0.1492,0.149,0.1493,0.1499,0.1497,0.148
lgb_0,0.1495,0.1493,0.1497,0.1501,0.1507,0.1477
lgb,0.1499,0.1496,0.1504,0.1504,0.1506,0.1483
xgb_log,0.1503,0.1502,0.1505,0.151,0.1505,0.1494
lgb_1,0.1506,0.1502,0.1508,0.1513,0.1514,0.1493
xgb_0,0.1508,0.1507,0.1509,0.1516,0.1519,0.149
xgb,0.1511,0.1508,0.1517,0.1517,0.1518,0.1492
xgb_1,0.1518,0.1516,0.1518,0.1525,0.1528,0.1503


In [9]:
# One row per key, one column per CV fold; rows containing missing values
# (keys that were recorded as None in the loops) are dropped.
pd.DataFrame(best_iterations).T.dropna()

Unnamed: 0,0,1,2,3,4
xgb,36,43,34,43,47
lgb,185,166,192,198,152
xgb_log,40,56,38,70,50
lgb_log,162,187,187,241,138
xgb_0_Sex==0,20,17,14,11,20
xgb_0_Sex!=0,49,46,29,46,73
lgb_0_Sex==0,44,64,75,65,64
lgb_0_Sex!=0,195,176,148,132,260
xgb_1_Sex==0,20,17,14,11,20
xgb_1_Sex==1,24,18,17,19,17
