In [1]:
import pandas as pd
import numpy as np

import os

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib.colors import LinearSegmentedColormap

from xgboost import XGBRegressor as XGB
import lightgbm
from lightgbm import LGBMRegressor as LGB
from catboost import CatBoostRegressor as CB

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, StandardScaler, OneHotEncoder

from sklearn.metrics import mean_squared_log_error, mean_squared_error

from sklearn.base import BaseEstimator, TransformerMixin

import functools
# Root-mean-squared variants of the sklearn metrics: `squared=False` makes
# them return the square root of the mean squared (log) error.
# NOTE(review): the `squared` argument was deprecated in scikit-learn 1.4 and
# removed in 1.6 (replaced by `root_mean_squared_error` /
# `root_mean_squared_log_error`) — confirm against the installed version.
rmse = functools.partial(mean_squared_error, squared=False)
rmsle = functools.partial(mean_squared_log_error, squared=False)

from autogluon.tabular import TabularPredictor
from src.autogluon_wrapper import AutoGluonTabular

# Single seed shared by the NumPy RNG and the CV splitter below.
SEED=42

from src.styles import set_styles, TXT_ACC, TXT_RESET

import warnings
# Silences every warning for the rest of the session (including deprecation
# warnings from the libraries above).
warnings.filterwarnings('ignore')



# ---- REPRODUCIBILITY ------------------------------------------------
# Seeds NumPy's global RNG only; other libraries are not seeded here.
np.random.seed(SEED)



# ---- PANDAS ---------------------------------------------------------
# Show every column when a frame is displayed, and render floats with a
# thousands separator and four decimals.
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,.4f}'.format



# Project helper from `src.styles`; presumably applies the plotting/display
# theme used in this notebook — TODO confirm.
set_styles()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CFG:
    """Notebook-wide constants: the target column and the input CSV paths."""

    # Column the models are trained to predict.
    target = 'Rings'

    # Input files, relative to the notebook's working directory.
    path_original = 'data/abalone.csv'
    path_test = 'data/test.csv'
    path_train = 'data/train.csv'

In [3]:
# Integer codes for the categorical `Sex` column; the same mapping is applied
# to every frame loaded below.
mapper_sex = {'I':0, 'F':1, 'M':2}


def _encode_sex(frame):
    """Replace the `Sex` labels of `frame` with their `mapper_sex` codes and return the frame."""
    frame['Sex'] = frame['Sex'].map(mapper_sex)
    return frame


# The train and test files carry an `id` column, which is dropped on load.
df_train = _encode_sex(pd.read_csv(CFG.path_train).drop('id', axis=1))
df_test = _encode_sex(pd.read_csv(CFG.path_test).drop('id', axis=1))

# The original file is read as is; two of its weight columns are renamed to
# `Whole weight.1` / `Whole weight.2` (presumably to match the train/test
# column names — TODO confirm).
df_original = _encode_sex(pd.read_csv(CFG.path_original)).rename(
    columns={'Shucked weight': 'Whole weight.1', 'Viscera weight': 'Whole weight.2'}
)

In [4]:
class TargetLog1pTransformer(BaseEstimator, TransformerMixin):
    """Apply ``np.log1p`` to selected columns of a DataFrame.

    Parameters
    ----------
    columns : iterable of column labels, optional
        Columns to transform. When ``None`` (the default), every column of
        the frame passed to :meth:`transform` is transformed.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transform is stateless.

        ``y`` is accepted (and ignored) so the transformer can be used where
        scikit-learn passes a target, e.g. ``fit_transform(X, y)`` or inside
        a ``Pipeline``.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``log1p`` applied to the selected columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; it is not modified.

        Returns
        -------
        pandas.DataFrame
            A new frame with the same columns as ``X``.
        """
        X_transformed = X.copy()

        # Resolve the column list locally instead of writing it back to
        # `self.columns`: the constructor parameter must stay untouched so
        # that a transformer created with `columns=None` keeps meaning
        # "all columns" for every frame it is applied to.
        columns = X.columns if self.columns is None else self.columns

        for col in columns:
            X_transformed[col] = np.log1p(X_transformed[col])

        return pd.DataFrame(X_transformed, columns=X.columns)
    

def log_clip(y, lower=1, upper=29):
    """Undo a ``log1p`` transform and clip the result to ``[lower, upper]``.

    Parameters
    ----------
    y : array-like
        Values on the ``log1p`` scale. Anything ``np.expm1`` accepts and
        whose result has a ``.clip`` method (NumPy arrays, pandas objects).
    lower, upper : scalar, optional
        Clip bounds. The defaults (1 and 29) are the values that used to be
        hard-coded — presumably the valid range of the `Rings` target;
        TODO confirm.

    Returns
    -------
    Same container type that ``np.expm1(y)`` produces, with every value
    limited to ``[lower, upper]``.
    """
    return np.expm1(y).clip(lower, upper)

In [5]:
# Five shuffled, seeded folds stratified on the target, computed once on the
# full training frame and materialised so they can be reused.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
cv_idx = list(cv.split(df_train, df_train[CFG.target]))

model = AutoGluonTabular(
    problem_type='regression',
    eval_metric='root_mean_squared_error',
    time_limit=9000,
)

# Each experiment is (row filter for `DataFrame.query`, name used for both
# the model directory and the OOF file).
experiments = (
    ('index>=0', 'Autogluon_all_models'),
    # ('Sex==0', 'Autogluon_I'),
    # ('Sex!=0', 'Autogluon_notI')
)

# NOTE(review): `cv_idx` is built from the full `df_train`, while `df` may be
# a subset when one of the commented-out conditions is enabled — confirm how
# `score_cv` uses the indices before re-running those experiments.
for condition, dir_models in experiments:
    df = df_train.query(condition)

    # The target is passed through log1p for training and mapped back
    # (expm1 + clip) by `log_clip`; the OOF predictions are then joined with
    # the ground truth so the file holds `Rings_oof` and `Rings_gt`.
    df_oof = model.score_cv(
        df,
        CFG.target,
        cv_idx,
        f'{dir_models}/{dir_models}',
        transformer=TargetLog1pTransformer([CFG.target]),
        postprocessing=FunctionTransformer(log_clip),
        df_original=df_original.query(condition),
    ).join(df[[CFG.target]], rsuffix='_gt', lsuffix='_oof')

    df_oof.to_csv(f'oof_{dir_models}.csv', index_label='Index')

		Import fastai failed. A quick tip is to install via `pip install autogluon.tabular[fastai]==1.1.0`. 
		Import fastai failed. A quick tip is to install via `pip install autogluon.tabular[fastai]==1.1.0`. 
		Import fastai failed. A quick tip is to install via `pip install autogluon.tabular[fastai]==1.1.0`. 
		Import fastai failed. A quick tip is to install via `pip install autogluon.tabular[fastai]==1.1.0`. 
		Import fastai failed. A quick tip is to install via `pip install autogluon.tabular[fastai]==1.1.0`. 


In [6]:
def _oof_rmsle(frame):
    """RMSLE between the `Rings_gt` and `Rings_oof` columns of `frame`."""
    return rmsle(frame['Rings_gt'], frame['Rings_oof'])


# NOTE: the training cell as shown only writes 'oof_Autogluon_all_models.csv';
# the three files read here have to exist from earlier runs.

# ---- One model fitted on all rows, scored separately on each Sex group ----
df_oof = pd.read_csv('oof_Autogluon_with_original.csv').set_index('Index')
is_infant = df_train['Sex'] == 0  # 'I' is encoded as 0 by `mapper_sex`
df1, df2 = df_oof.loc[is_infant], df_oof.loc[~is_infant]
score1, score2 = _oof_rmsle(df1), _oof_rmsle(df2)

print(f'{TXT_ACC} Autogluon (with original) {TXT_RESET}')
print(f'Sex == I:   {score1: .4f},     Sex != I:   {score2: .4f}')

# ---- Separate models, each fitted on one Sex group only -------------------
df1 = pd.read_csv('oof_Autogluon_I.csv').set_index('Index')
df2 = pd.read_csv('oof_Autogluon_notI.csv').set_index('Index')
score1, score2 = _oof_rmsle(df1), _oof_rmsle(df2)

print(f'{TXT_ACC} Autogluon on partial data {TXT_RESET}')
print(f'Sex == I:   {score1: .4f},     Sex != I:   {score2: .4f}')

[1m[38;5;254m[48;5;240m Autogluon (with original) [0m
Sex == I:    0.1336,     Sex != I:    0.1555
[1m[38;5;254m[48;5;240m Autogluon on partial data [0m
Sex == I:    0.1341,     Sex != I:    0.1557


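Splitting the data by `Sex` and fitting a separate model per group does not look promising: the per-group RMSLE is no better than that of a single model trained on all rows (0.1341 vs 0.1336 for Sex == I, and 0.1557 vs 0.1555 for Sex != I).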

# Summary of autogluon models

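CatBoost and LightGBM (LGB) are the most useful models for this data: in every fold they have the best validation scores among the base models, while RandomForestMSE and NeuralNetTorch consistently trail them.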

In [12]:
# Directories holding the saved predictors of each experiment.
dir_models = ['Autogluon_with_original', 'Autogluon_all_models',
# 'Autogluon_I', 'Autogluon_notI'
]
for directory in dir_models:
    # `os.listdir` returns entries in arbitrary order, so sort them to make
    # the "Fold N" label below deterministic across runs and machines.
    # Assumes every entry of `directory` is one fold's saved predictor —
    # TODO confirm against the wrapper that wrote them.
    for fold, dir_model in enumerate(sorted(os.listdir(directory))):
        model = TabularPredictor.load(os.path.join(directory, dir_model))

        print(f'{TXT_ACC} {directory}       Fold {fold} {TXT_RESET}')
        display(model.leaderboard())

[1m[38;5;254m[48;5;240m Autogluon_with_original       Fold 0 [0m


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.1475,root_mean_squared_error,17.2017,220.2627,0.0,0.0432,2,True,6
1,CatBoost,-0.148,root_mean_squared_error,0.0355,70.1115,0.0355,70.1115,1,True,4
2,LightGBMLarge,-0.1482,root_mean_squared_error,0.3166,3.9446,0.3166,3.9446,1,True,5
3,LightGBM,-0.1482,root_mean_squared_error,0.5978,4.6504,0.5978,4.6504,1,True,2
4,LightGBMXT,-0.1488,root_mean_squared_error,15.4473,43.6831,15.4473,43.6831,1,True,1
5,RandomForestMSE,-0.1517,root_mean_squared_error,0.8044,97.8298,0.8044,97.8298,1,True,3


[1m[38;5;254m[48;5;240m Autogluon_with_original       Fold 1 [0m


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.1482,root_mean_squared_error,1.6781,171.4906,0.001,0.0484,2,True,5
1,CatBoost,-0.1488,root_mean_squared_error,0.0333,65.5126,0.0333,65.5126,1,True,3
2,LightGBM,-0.1488,root_mean_squared_error,0.4176,3.6901,0.4176,3.6901,1,True,1
3,LightGBMLarge,-0.1491,root_mean_squared_error,0.3894,5.0593,0.3894,5.0593,1,True,4
4,RandomForestMSE,-0.1522,root_mean_squared_error,0.8367,97.1802,0.8367,97.1802,1,True,2


[1m[38;5;254m[48;5;240m Autogluon_with_original       Fold 2 [0m


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.1487,root_mean_squared_error,11.7736,201.7746,0.0,0.0485,2,True,6
1,LightGBM,-0.1493,root_mean_squared_error,0.4041,3.798,0.4041,3.798,1,True,2
2,LightGBMLarge,-0.1493,root_mean_squared_error,0.2531,3.4972,0.2531,3.4972,1,True,5
3,CatBoost,-0.1494,root_mean_squared_error,0.0347,51.6665,0.0347,51.6665,1,True,4
4,LightGBMXT,-0.1497,root_mean_squared_error,10.2633,45.6871,10.2633,45.6871,1,True,1
5,RandomForestMSE,-0.1525,root_mean_squared_error,0.8184,97.0772,0.8184,97.0772,1,True,3


[1m[38;5;254m[48;5;240m Autogluon_with_original       Fold 3 [0m


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.1483,root_mean_squared_error,1.668,206.9035,0.002,0.043,2,True,5
1,CatBoost,-0.1487,root_mean_squared_error,0.0491,100.7125,0.0491,100.7125,1,True,3
2,LightGBM,-0.1491,root_mean_squared_error,0.5153,4.6516,0.5153,4.6516,1,True,1
3,LightGBMLarge,-0.1493,root_mean_squared_error,0.3045,4.5136,0.3045,4.5136,1,True,4
4,RandomForestMSE,-0.1527,root_mean_squared_error,0.797,96.9828,0.797,96.9828,1,True,2


[1m[38;5;254m[48;5;240m Autogluon_with_original       Fold 4 [0m


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.1465,root_mean_squared_error,1.6287,161.1352,0.0051,0.0431,2,True,5
1,CatBoost,-0.1469,root_mean_squared_error,0.0318,58.0436,0.0318,58.0436,1,True,3
2,LightGBM,-0.1472,root_mean_squared_error,0.4553,4.196,0.4553,4.196,1,True,1
3,LightGBMLarge,-0.1474,root_mean_squared_error,0.3534,4.0936,0.3534,4.0936,1,True,4
4,RandomForestMSE,-0.1505,root_mean_squared_error,0.7831,94.7589,0.7831,94.7589,1,True,2


[1m[38;5;254m[48;5;240m Autogluon_all_models       Fold 0 [0m


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.1475,root_mean_squared_error,19.1979,372.1959,0.0,0.0504,2,True,7
1,CatBoost,-0.148,root_mean_squared_error,0.0324,69.6055,0.0324,69.6055,1,True,4
2,LightGBMLarge,-0.1482,root_mean_squared_error,0.3138,3.8199,0.3138,3.8199,1,True,6
3,LightGBM,-0.1482,root_mean_squared_error,0.5347,4.443,0.5347,4.443,1,True,2
4,LightGBMXT,-0.1488,root_mean_squared_error,17.4577,42.3552,17.4577,42.3552,1,True,1
5,RandomForestMSE,-0.1517,root_mean_squared_error,0.8012,107.4708,0.8012,107.4708,1,True,3
6,NeuralNetTorch,-0.1527,root_mean_squared_error,0.0581,144.4511,0.0581,144.4511,1,True,5


[1m[38;5;254m[48;5;240m Autogluon_all_models       Fold 1 [0m


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.1482,root_mean_squared_error,1.6522,183.4075,0.0,0.0431,2,True,5
1,CatBoost,-0.1488,root_mean_squared_error,0.0308,64.255,0.0308,64.255,1,True,3
2,LightGBM,-0.1488,root_mean_squared_error,0.4091,3.5897,0.4091,3.5897,1,True,1
3,LightGBMLarge,-0.1491,root_mean_squared_error,0.3954,4.4738,0.3954,4.4738,1,True,4
4,RandomForestMSE,-0.1522,root_mean_squared_error,0.8168,111.0459,0.8168,111.0459,1,True,2


[1m[38;5;254m[48;5;240m Autogluon_all_models       Fold 2 [0m


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.1487,root_mean_squared_error,13.7346,327.157,0.0,0.0549,2,True,7
1,LightGBM,-0.1493,root_mean_squared_error,0.3903,3.3988,0.3903,3.3988,1,True,2
2,LightGBMLarge,-0.1493,root_mean_squared_error,0.3044,3.5018,0.3044,3.5018,1,True,6
3,CatBoost,-0.1494,root_mean_squared_error,0.0281,50.7276,0.0281,50.7276,1,True,4
4,LightGBMXT,-0.1497,root_mean_squared_error,12.1519,44.6011,12.1519,44.6011,1,True,1
5,RandomForestMSE,-0.1525,root_mean_squared_error,0.7997,109.0928,0.7997,109.0928,1,True,3
6,NeuralNetTorch,-0.1544,root_mean_squared_error,0.0602,115.78,0.0602,115.78,1,True,5


[1m[38;5;254m[48;5;240m Autogluon_all_models       Fold 3 [0m


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.1483,root_mean_squared_error,1.8543,343.0652,0.009,0.051,2,True,6
1,CatBoost,-0.1487,root_mean_squared_error,0.0434,99.6895,0.0434,99.6895,1,True,3
2,LightGBM,-0.1491,root_mean_squared_error,0.5788,4.6392,0.5788,4.6392,1,True,1
3,LightGBMLarge,-0.1493,root_mean_squared_error,0.3636,4.1178,0.3636,4.1178,1,True,5
4,RandomForestMSE,-0.1527,root_mean_squared_error,0.8069,108.101,0.8069,108.101,1,True,2
5,NeuralNetTorch,-0.1536,root_mean_squared_error,0.0526,126.4668,0.0526,126.4668,1,True,4


[1m[38;5;254m[48;5;240m Autogluon_all_models       Fold 4 [0m


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.1464,root_mean_squared_error,1.7404,352.0804,0.0,0.0487,2,True,6
1,CatBoost,-0.1469,root_mean_squared_error,0.0303,55.7386,0.0303,55.7386,1,True,3
2,LightGBM,-0.1472,root_mean_squared_error,0.508,4.1122,0.508,4.1122,1,True,1
3,LightGBMLarge,-0.1474,root_mean_squared_error,0.3433,3.9719,0.3433,3.9719,1,True,5
4,RandomForestMSE,-0.1505,root_mean_squared_error,0.8001,105.2307,0.8001,105.2307,1,True,2
5,NeuralNetTorch,-0.1519,root_mean_squared_error,0.0586,182.9784,0.0586,182.9784,1,True,4


In [8]:
def create_submission_df(model_dir):
    """Build, save and display the submission for the models under `model_dir`.

    Predictions for `df_test` are obtained from `AutoGluonTabular.predict_cv`
    (post-processed with `log_clip`), averaged over axis 0 and written next
    to the test ids as ``submission_{model_dir}.csv``.

    Parameters
    ----------
    model_dir : str
        Directory passed to `predict_cv`; also used in the output file name.

    Returns
    -------
    None
        The submission frame is written to disk and shown with `display`.
    """
    predictor = AutoGluonTabular(
        problem_type='regression',
        eval_metric='root_mean_squared_error',
        time_limit=9000,
        excluded_model_types=["NN_TORCH", "FASTAI", "NN"],
    )
    # Presumably one prediction vector per saved fold model — TODO confirm.
    fold_preds = predictor.predict_cv(
        model_dir,
        df_test,
        postprocessing=FunctionTransformer(log_clip),
    )

    # `df_test` had its `id` column dropped on load, so re-read the ids.
    submission = pd.read_csv(CFG.path_test)[['id']]
    submission[CFG.target] = np.mean(fold_preds, axis=0)
    submission.to_csv(f'submission_{model_dir}.csv', index=False)
    display(submission)

In [9]:
# Writes 'submission_Autogluon_with_original.csv' and displays the frame.
create_submission_df('Autogluon_with_original')

Unnamed: 0,id,Rings
0,90615,9.7857
1,90616,9.6828
2,90617,9.9455
3,90618,10.5187
4,90619,7.5923
...,...,...
60406,151021,6.2993
60407,151022,9.2948
60408,151023,12.3380
60409,151024,13.3647


In [10]:
# get oofs -------------------------------------------------------------------------

# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
# cv_idx = [idx for idx in cv.split(df_train, df_train[CFG.target])]

# model = AutoGluonTabular(problem_type='regression', 
#                          eval_metric='root_mean_squared_error', 
#                          time_limit=9000, 
#                          excluded_model_types=["NN_TORCH", "FASTAI", "NN"])

# dir_models = 'Autogluon_with_original'
# df_oof = pd.DataFrame(index=df_train.index, columns=[CFG.target])
# for fold, ((idx_train, idx_val), path) in enumerate(zip(cv_idx, os.listdir(dir_models))):
#     p = model.predict(os.path.join(dir_models, path), 
#                     df_train.loc[idx_val].drop(CFG.target, axis=1), 
#                     postprocessing=FunctionTransformer(log_clip))
#     df_oof.loc[idx_val, CFG.target] = p

# df_oof = df_oof.join(df_train[[CFG.target]], rsuffix='_gt', lsuffix='_oof') 
# df_oof.to_csv(f'oof_{dir_models}.csv', index_label='Index')