In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib.colors import LinearSegmentedColormap

from xgboost import XGBRegressor as XGB
import lightgbm
from lightgbm import LGBMRegressor as LGB

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, StandardScaler
from sklearn.decomposition import PCA

from sklearn.metrics import mean_squared_log_error, mean_squared_error

from sklearn.base import BaseEstimator, TransformerMixin

import functools


rmse = functools.partial(mean_squared_error, squared=False)
rmsle = functools.partial(mean_squared_log_error, squared=False)


SEED=42

from src.styles import set_styles, TXT_ACC, TXT_RESET

import warnings
warnings.filterwarnings('ignore')


# ---- REPRODICIBILITY ------------------------------------------------
np.random.seed(SEED)

# ---- PANDAS ---------------------------------------------------------
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,}'.format


set_styles()

In [2]:
class CFG:
    path_train = 'data/train.csv'
    path_test = 'data/test.csv'
    target = 'Rings'

In [3]:
df_train = pd.read_csv(CFG.path_train).drop('id', axis=1)
df_train['Sex'] = df_train['Sex'].map({'I':0, 'F':1, 'M':2})
df_train

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,1,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,0,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,2,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,0,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9
...,...,...,...,...,...,...,...,...,...
90610,2,0.335,0.235,0.075,0.1585,0.0685,0.037,0.045,6
90611,2,0.555,0.425,0.15,0.879,0.3865,0.1815,0.24,9
90612,0,0.435,0.33,0.095,0.3215,0.151,0.0785,0.0815,6
90613,0,0.345,0.27,0.075,0.2,0.098,0.049,0.07,6


In [4]:
linear_features = ['Length', 'Diameter', 'Height']
not_linear_features = [f for f in df_train.columns if f not in linear_features]

squared_features = ['Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']
not_squared_features = [f for f in df_train.columns if f not in squared_features]

square_transformer = ColumnTransformer([
    ('square', FunctionTransformer(func=lambda x: x*x, validate=False), linear_features)
], remainder='passthrough')

square_root_transformer = ColumnTransformer([
    ('square_root', FunctionTransformer(func=lambda x: x*x, validate=False), squared_features)
], remainder='passthrough')

In [5]:
def get_pca_transformer(transformer_stage0=None):
    if transformer_stage0 is None:
        return Pipeline([
                    ('scaler', StandardScaler()),
                    ('pca', PCA()),
                    ])
    else:
        return Pipeline([
                ('power_transformer', transformer_stage0),
                ('scaler', StandardScaler()),
                ('pca', PCA()),
                        ])



def concat_to_original_features(transformer):
    return FeatureUnion(transformer_list=[
                    ('original_features', FunctionTransformer(func=lambda x: x)),  # Identity transform
                    ('new_features',      transformer),
                ])

In [6]:
data_preprocessing = (
    ('original features',       FunctionTransformer(func=lambda x: x)),
    ('square linear features',  square_transformer),
    ('sqrt squared features',   square_root_transformer),
    ('pca',                     get_pca_transformer()),
    ('pca after square',        get_pca_transformer(square_transformer)),
    ('pca after sqrt',          get_pca_transformer(square_root_transformer)),
    ('concat pca',              concat_to_original_features( get_pca_transformer() )),
    ('concat pca after square', concat_to_original_features( get_pca_transformer(square_transformer) )),
    ('concat pca after sqrt',   concat_to_original_features( get_pca_transformer(square_root_transformer) )),
)


models = []
for label, data_preproc in data_preprocessing:
    models.append(
            (label, TransformedTargetRegressor( Pipeline([
                                ('data_preprocessing', data_preproc),
                                ('model', LGB(random_state=SEED, n_estimators=300, verbose=-1))
                                ]),
                func=np.log1p, 
                inverse_func=np.expm1)
            )
    )

In [7]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

X_train = df_train
Y_train = X_train[CFG.target]

scores = {label: [] for label, _ in data_preprocessing}

for fold, (idx_train, idx_val) in enumerate(cv.split(X_train, Y_train)):
    Xt = X_train.loc[idx_train]
    Yt = Xt.pop(CFG.target)
    Xv = X_train.loc[idx_val]
    Yv = Xv.pop(CFG.target)

    for label, model in models:
        model.fit(Xt, Yt)
        preds = model.predict(Xv)
        score = rmsle(Yv, preds) 
        scores[label].append(score)

In [8]:
df_scores = pd.DataFrame(scores).T
df_scores.columns = [f'Fold {i}' for i in range(df_scores.shape[1])]
df_scores.insert(0, 'mean', df_scores.mean(axis=1))
display(df_scores.sort_values('mean').style \
                 .format(precision=4) )

Unnamed: 0,mean,Fold 0,Fold 1,Fold 2,Fold 3,Fold 4
original features,0.1489,0.1488,0.1491,0.1497,0.1497,0.1474
square linear features,0.1489,0.1488,0.1491,0.1497,0.1497,0.1474
sqrt squared features,0.1489,0.1488,0.1491,0.1497,0.1497,0.1474
concat pca after sqrt,0.15,0.1497,0.1507,0.1507,0.1505,0.1486
concat pca after square,0.1501,0.1498,0.1505,0.1504,0.1512,0.1484
concat pca,0.1501,0.1499,0.1505,0.1504,0.1509,0.1486
pca after square,0.1521,0.1521,0.1524,0.1525,0.1531,0.1504
pca,0.1524,0.1523,0.1527,0.1526,0.1533,0.151
pca after sqrt,0.1526,0.152,0.1535,0.1531,0.1535,0.1507
