In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib.colors import LinearSegmentedColormap

from xgboost import XGBRegressor as XGB
import lightgbm
from lightgbm import LGBMRegressor as LGB

from boruta import BorutaPy

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, StandardScaler
from sklearn.decomposition import PCA

from sklearn.feature_selection import SelectKBest, f_regression, RFECV

from sklearn.metrics import mean_squared_log_error, mean_squared_error

from sklearn.base import BaseEstimator, TransformerMixin

import functools


def rmse(y_true, y_pred, **kwargs):
    """Root mean squared error.

    Computed as the square root of ``mean_squared_error`` instead of passing
    ``squared=False``: that keyword was deprecated in scikit-learn 1.4 and
    removed in 1.6, so the former ``functools.partial`` fails on current
    releases. For a single target column the value is the same as before.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values.
    **kwargs
        Forwarded unchanged to ``mean_squared_error`` (e.g. ``sample_weight``).

    Returns
    -------
    float or ndarray
        Square root of whatever ``mean_squared_error`` returns.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred, **kwargs))


def rmsle(y_true, y_pred, **kwargs):
    """Root mean squared logarithmic error.

    Square root of ``mean_squared_log_error``; see ``rmse`` for why the
    ``squared=False`` keyword is no longer used.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values.
    **kwargs
        Forwarded unchanged to ``mean_squared_log_error``.

    Returns
    -------
    float or ndarray
        Square root of whatever ``mean_squared_log_error`` returns.
    """
    return np.sqrt(mean_squared_log_error(y_true, y_pred, **kwargs))


# Single seed reused by every seeded step in this notebook.
SEED = 42

from src.styles import set_styles, TXT_ACC, TXT_RESET

import warnings
# Install a blanket "ignore" filter so no warning is shown in cell outputs.
warnings.simplefilter('ignore')


# ---- REPRODUCIBILITY ------------------------------------------------
# Seed NumPy's global random number generator.
np.random.seed(SEED)

# ---- PANDAS ---------------------------------------------------------
# Show every column of wide frames and print numbers with thousands separators.
pd.options.display.max_columns = None
pd.set_option('display.float_format', '{:,}'.format)


# Project helper from src.styles (not visible here) -- presumably applies the
# notebook's plotting style; confirm in src/styles.py.
set_styles()

In [2]:
class CFG:
    """Notebook-wide constants: the target column name and the data file paths."""

    # Name of the column that is predicted.
    target = 'Rings'

    # CSV locations, relative to the working directory.
    path_train = 'data/train.csv'
    path_test = 'data/test.csv'

In [3]:
# Integer codes for the categorical 'Sex' column.
sex_codes = {'I': 0, 'F': 1, 'M': 2}

# Read the training CSV, drop the 'id' column and encode 'Sex' in one chain.
df_train = (
    pd.read_csv(CFG.path_train)
    .drop(columns='id')
    .assign(Sex=lambda frame: frame['Sex'].map(sex_codes))
)
df_train

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,1,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,0,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,2,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,0,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9
...,...,...,...,...,...,...,...,...,...
90610,2,0.335,0.235,0.075,0.1585,0.0685,0.037,0.045,6
90611,2,0.555,0.425,0.15,0.879,0.3865,0.1815,0.24,9
90612,0,0.435,0.33,0.095,0.3215,0.151,0.0785,0.0815,6
90613,0,0.345,0.27,0.075,0.2,0.098,0.049,0.07,6


In [4]:
class NewFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio, sum and difference features.

    The input frame must contain the columns 'Length', 'Diameter', 'Height',
    'Whole weight', 'Whole weight.1', 'Whole weight.2' and 'Shell weight'.
    Any other columns are passed through untouched.
    """

    def __init__(self):
        # No hyper-parameters.
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, df_input):
        """Return a copy of ``df_input`` with 17 engineered columns appended.

        Parameters
        ----------
        df_input : pandas.DataFrame
            Frame holding the columns listed in the class docstring.

        Returns
        -------
        pandas.DataFrame
            A new frame; ``df_input`` itself is not modified.
        """
        out = df_input.copy()

        # Short aliases for the source columns.
        length = out['Length']
        diameter = out['Diameter']
        height = out['Height']
        whole = out['Whole weight']
        part1 = out['Whole weight.1']
        part2 = out['Whole weight.2']
        shell = out['Shell weight']

        # Only the divisions by Height carry an epsilon, which keeps them
        # finite when Height is exactly 0.
        height_eps = height + 1e-15

        # Intermediate results that feed more than one new column.
        ratio_weight1 = part1 / whole
        ratio_weight2 = part2 / whole
        ratio_shell = shell / whole
        weights_sum = part1 + part2 + shell

        # Insertion order of this dict is the order of the appended columns.
        engineered = {
            'ratio_weight1': ratio_weight1,
            'ratio_weight2': ratio_weight2,
            'ratio_shell': ratio_shell,
            'weight1/weight2': part1 / part2,
            'weight2/weight1': part2 / part1,
            'ratio_sum': ratio_weight1 + ratio_weight2 + ratio_shell,
            'weights_sum': weights_sum,
            'weight_dif': whole - weights_sum,
            'Length/Diameter': length / diameter,
            'Height/Diameter': height / diameter,
            'Diameter/Length': diameter / length,
            'Height/Length': height / length,
            'Diameter/Height': diameter / height_eps,
            'Length/Height': length / height_eps,
            'Height/WholeWeight': height / whole,
            'Diameter/WholeWeight': diameter / whole,
            'Length/WholeWeight': length / whole,
        }
        for column, values in engineered.items():
            out[column] = values

        return out

# Correlations

In [5]:
# Add the engineered features to the full frame, then split off the target.
X_train = NewFeatures().transform(df_train)
y_train = X_train.pop(CFG.target)

# Absolute correlation (pandas default method) of every feature with the target.
corrs = [
    (name, abs(X_train[name].corr(y_train)))
    for name in X_train.columns
]
df_corrs = pd.DataFrame(corrs, columns=['feature', 'corr'])

# Strongest relationships first.
display(df_corrs.sort_values('corr', ascending=False))

Unnamed: 0,feature,corr
7,Shell weight,0.6947655842875777
3,Height,0.6657718018300474
2,Diameter,0.6368323390197846
1,Length,0.6237856646237635
4,Whole weight,0.6172738962824822
14,weights_sum,0.5993978391476139
6,Whole weight.2,0.5889543539062496
5,Whole weight.1,0.5150668067459404
24,Length/WholeWeight,0.4766472346691895
22,Height/WholeWeight,0.476313237670749


# Feature selection

In [6]:
# Split the target from the predictors, then add the engineered features.
Y_train = df_train[CFG.target].copy()
X_train = NewFeatures().transform(df_train.drop(columns=CFG.target))

# Univariate selection: keep the 10 features with the highest f_regression score.
selector = SelectKBest(score_func=f_regression, k=10)
X_best = selector.fit_transform(X_train, Y_train)

print(f'{TXT_ACC} SelectKBest features {TXT_RESET}')
kept_names = selector.get_feature_names_out(X_train.columns)
for position, name in enumerate(kept_names, start=1):
    # Break the line after every sixth name.
    print(f"'{name}', ", end='' if position % 6 else '\n')

[1m[38;5;254m[48;5;240m SelectKBest features [0m
'Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 
'Shell weight', 'weights_sum', 'Height/WholeWeight', 'Length/WholeWeight', 

In [7]:
# Split the target from the predictors, then add the engineered features.
Y_train = df_train[CFG.target].copy()
X_train = NewFeatures().transform(df_train.drop(columns=CFG.target))

# Recursive feature elimination with cross-validation around a LightGBM
# regressor; at least 8 features are always kept.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
rfe_estimator = LGB(random_state=SEED, n_estimators=300, verbose=-1)
selector = RFECV(estimator=rfe_estimator, min_features_to_select=8, cv=cv)
X_best = selector.fit_transform(X_train, Y_train)

print(f'{TXT_ACC} RFECV features {TXT_RESET}')
kept_names = selector.get_feature_names_out(X_train.columns)
for position, name in enumerate(kept_names, start=1):
    # Break the line after every sixth name.
    print(f"'{name}', ", end='' if position % 6 else '\n')

[1m[38;5;254m[48;5;240m RFECV features [0m
'Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 
'Whole weight.2', 'Shell weight', 'ratio_weight1', 'ratio_weight2', 'ratio_shell', 'weight1/weight2', 
'weight2/weight1', 'ratio_sum', 'weights_sum', 'weight_dif', 'Length/Diameter', 'Height/Diameter', 
'Height/Length', 'Height/WholeWeight', 'Diameter/WholeWeight', 'Length/WholeWeight', 

In [8]:
# Split the target from the predictors, then add the engineered features.
X_train = df_train.copy()
Y_train = X_train.pop(CFG.target)

X_train = NewFeatures().transform(X_train)

# BorutaPy is outdated and still refers to the `np.int` / `np.float` /
# `np.bool` aliases that newer NumPy releases removed. Restore only the
# aliases that are actually missing, so a NumPy that still provides them is
# left untouched.
for alias, dtype in (('int', np.int32), ('float', np.float64), ('bool', np.bool_)):
    if not hasattr(np, alias):
        setattr(np, alias, dtype)

# `random_state=SEED` makes the selection independent of the global NumPy RNG
# state, i.e. of which cells were executed before this one.
# NOTE(review): BorutaPy's own `n_estimators=100` may take precedence over the
# estimator's `n_estimators=300` -- confirm against the BorutaPy documentation.
selector = BorutaPy(
    verbose=-1,
    estimator=LGB(random_state=SEED, n_estimators=300, verbose=-1),
    n_estimators=100,
    max_iter=100,
    random_state=SEED,
)
# BorutaPy is fitted on plain NumPy arrays, so column names are re-attached below.
selector.fit(X_train.values, Y_train.values)

# `support_` is a boolean mask aligned with the columns of X_train.
features_out = [f for f, is_selected in zip(X_train.columns, selector.support_) if is_selected]

print(f'{TXT_ACC} Boruta features {TXT_RESET}')
for i, f in enumerate(features_out):
    print(f"'{f}', ", end='')
    # Break the line after every sixth name.
    if i % 6 == 5:
        print()

[1m[38;5;254m[48;5;240m Boruta features [0m
'Length', 'Diameter', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight', 
'ratio_weight1', 'ratio_weight2', 'ratio_shell', 'weight_dif', 'Length/Diameter', 'Height/Length', 


# Score LGB

*Summary*: the original features give the best (lowest) RMSLE in every fold; none of the selected feature subsets improves on them.

In [9]:
# LightGBM regressor trained on log1p(target); predictions are mapped back
# with expm1 before scoring.
model = TransformedTargetRegressor(
    Pipeline([
        # ('data_preprocessing', NewFeatures()),
        # ('feature_selection', SelectKBest(score_func=f_regression, k=10)),
        ('model', LGB(random_state=SEED, n_estimators=300, verbose=-1)),
    ]),
    func=np.log1p,
    inverse_func=np.expm1,
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# X_train deliberately keeps the target column: every feature list below
# contains 'Rings', which is popped off per fold.
X_train = NewFeatures().transform(df_train)
Y_train = X_train[CFG.target]

# NOTE(review): these lists are hard-coded. Compared with the selector outputs
# saved in this notebook, `features_rfe` has 'Diameter/Length' but lacks
# 'Diameter/WholeWeight' and 'Length/WholeWeight', and `features_kbest` has
# 'Sex' but lacks 'Length/WholeWeight'. Confirm which version is intended.
features_rfe = ['Rings', 'Sex',  'Length',  'Diameter',  'Height',  'Whole weight',  'Whole weight.1',  'Whole weight.2',
                'Shell weight',  'ratio_weight1',  'ratio_weight2',  'ratio_shell',  'weight1/weight2',  'weight2/weight1',
                'ratio_sum',  'weights_sum',  'weight_dif',  'Length/Diameter',  'Diameter/Length',  'Height/Diameter',
                'Height/Length',  'Height/WholeWeight',]

features_kbest = ['Rings', 'Sex',  'Length',  'Diameter',  'Height',  'Whole weight',  'Whole weight.1',  'Whole weight.2',
                  'Shell weight', 'weights_sum',  'Height/WholeWeight',]

features_boruta = ['Rings', 'Length', 'Diameter', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight',
                   'ratio_weight1', 'ratio_weight2', 'ratio_shell', 'weight_dif', 'Length/Diameter', 'Height/Length', ]


# (label, columns) pairs to evaluate; every entry includes the target column.
selected_features = (('original',    df_train.columns),
                     ('RFECV',       features_rfe),
                     ('SelectKBest', features_kbest),
                     ('Boruta',      features_boruta),)


for label, features in selected_features:
    scores = []
    for idx_train, idx_val in cv.split(X_train, Y_train):
        # cv.split yields positional row indices, so rows are taken with
        # .iloc; label-based .loc is only equivalent on a default RangeIndex.
        Xt = X_train.iloc[idx_train][features]
        Yt = Xt.pop(CFG.target)
        Xv = X_train.iloc[idx_val][features]
        Yv = Xv.pop(CFG.target)

        model.fit(Xt, Yt)
        preds = model.predict(Xv)
        scores.append(rmsle(Yv, preds))

    # One line of per-fold RMSLE values for each feature set.
    print(f'{TXT_ACC} Scores for {label} features {TXT_RESET}')
    for s in scores:
        print(f'{s:.4f}', end='  ')
    print()

[1m[38;5;254m[48;5;240m Scores for original features [0m
0.1488  0.1491  0.1497  0.1497  0.1474  
[1m[38;5;254m[48;5;240m Scores for RFECV features [0m
0.1494  0.1504  0.1506  0.1508  0.1484  
[1m[38;5;254m[48;5;240m Scores for SelectKBest features [0m
0.1490  0.1496  0.1501  0.1498  0.1475  
[1m[38;5;254m[48;5;240m Scores for Boruta features [0m
0.1500  0.1512  0.1511  0.1514  0.1489  
