# Imports

In [1]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', None)

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
import matplotlib.colors as plt_colors

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

from xgboost import XGBRegressor, plot_importance
SEED=53

# Data Loading

In [2]:
# Directory holding the competition CSVs. The original absolute path is kept
# as the default; set PLAYGROUND_DATA_DIR to run the notebook on another machine.
filepath = os.environ.get(
    'PLAYGROUND_DATA_DIR',
    r'C:\Users\Emincan\Desktop\Playgrounds\Playgrounds3-16(Regression)',
)

# train/test use their first CSV column as the index; the other two files keep
# a default RangeIndex.
train = pd.read_csv(os.path.join(filepath, 'train.csv'), index_col=[0])
test = pd.read_csv(os.path.join(filepath, 'test.csv'), index_col=[0])
original = pd.read_csv(os.path.join(filepath, 'CrabAgePrediction.csv'))
synthetic = pd.read_csv(os.path.join(filepath, 'synthetic_data_200k_07(1).csv'))

# train['is_generated'] = 1
# test['is_generated'] = 1
# original['is_generated'] = 0

target = 'Age'
# Column groups are derived from `test` so the target can never end up in them.
num_cols = test.select_dtypes(include=['float64']).columns.tolist()
cat_cols = test.select_dtypes(include=['object']).columns.tolist()

print(f"train shape :{train.shape}, ", f"test shape :{test.shape}")
print(f"original shape :{original.shape}")
# Previously mislabelled as "original shape".
print(f"synthetic shape :{synthetic.shape}")

train shape :(74051, 9),  test shape :(49368, 8)
original shape :(3893, 9)
original shape :(200000, 10)


# EDA

In [3]:
# Preview the first rows of the training frame.
display(train.head())
# Every column except the target is a feature; this list is recomputed after
# one-hot encoding in the preprocessing cell.
features = train.columns.drop([target]).to_list()

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


In [4]:
# Row/column counts of the two competition frames.
print(f"Train's shape: {train.shape}")
print(f"Test's shape: {test.shape}")

Train's shape: (74051, 9)
Test's shape: (49368, 8)


In [5]:
# Report the target's dtype and the smallest/largest value seen in train.
print(f"{target}'s type: {train[target].dtype}")
print(f"Interval: [{train[target].min()}, {train[target].max()}]")

Age's type: int64
Interval: [1, 29]


# Data Preprocessing

In [6]:
# One-hot encode the categorical columns. The encoder is fitted on train only
# and reused unchanged on test so both frames get identical dummy columns.
enc = OneHotEncoder()

# The encoded frames must carry the SAME index as the frame they are joined to:
# pd.concat(axis=1) aligns on index labels, so a default RangeIndex would only
# line up if the ids happened to be 0..n-1. That is not guaranteed for `test`,
# and a mismatch silently produces NaN-filled, duplicated rows.
ohe_train = pd.DataFrame(
    enc.fit_transform(train[cat_cols]).toarray(),
    columns=enc.get_feature_names_out(),
    index=train.index,
)
n_train_rows = len(train)
train = pd.concat([train.drop(columns=cat_cols), ohe_train], axis=1)

ohe_test = pd.DataFrame(
    enc.transform(test[cat_cols]).toarray(),
    columns=enc.get_feature_names_out(),
    index=test.index,
)
n_test_rows = len(test)
test = pd.concat([test.drop(columns=cat_cols), ohe_test], axis=1)

# Guard against any future misalignment: encoding must never add or drop rows.
assert len(train) == n_train_rows, "one-hot join changed the number of train rows"
assert len(test) == n_test_rows, "one-hot join changed the number of test rows"

features = train.columns.drop([target]).to_list()

# Split the Dataset for training

In [7]:
# Separate the target from the predictors, then restrict both frames to the
# same ordered list of feature columns.
y_train = train[target]
# NOTE: `train` is overwritten without its target column, so this cell cannot
# be re-run on its own (train[target] would no longer exist) — reload first.
train = train[features]
test = test[features]

# Hyperparameter tuning (Optuna)

In [8]:
def create_model(param):
    """Build an unfitted XGBRegressor for MAE regression.

    Parameters
    ----------
    param : dict
        Extra keyword arguments forwarded verbatim to ``XGBRegressor``
        (the tuned hyperparameters). Must not repeat any of the fixed
        settings below, otherwise the call raises ``TypeError``.

    Returns
    -------
    XGBRegressor
        Model configured with the notebook's seed, the ``gpu_hist`` tree
        method, MAE as both objective and evaluation metric, and early
        stopping after 100 rounds without improvement.
    """
    fixed_settings = dict(
        random_state=SEED,
        tree_method='gpu_hist',
        eval_metric='mae',
        objective='reg:absoluteerror',
        early_stopping_rounds=100,
    )
    return XGBRegressor(**fixed_settings, **param)

In [9]:
def train_model(param, X, y, X_test, n_splits=5, trial=None):
    """Cross-validate one hyperparameter set and collect per-fold artefacts.

    Parameters
    ----------
    param : dict
        Hyperparameters forwarded to ``create_model``.
    X : pandas.DataFrame
        Training features.
    y : pandas.Series
        Training target, row-aligned with ``X``.
    X_test : pandas.DataFrame
        Test features; every fold's model predicts on them.
    n_splits : int, default 5
        Number of shuffled K-fold splits (seeded with ``SEED``).
    trial : optuna.trial.Trial or None, default None
        When given, each fold's score is reported to the trial and the
        trial's pruner is consulted after every fold.

    Returns
    -------
    validation : pandas.Series
        Out-of-fold predictions (float64), indexed like ``y``.
    y_test : list of numpy.ndarray
        One array of test predictions per fold.
    validation_scores : list of float
        Mean absolute error on each fold's validation part.
    models : list
        The fitted model of each fold.

    Raises
    ------
    optuna.TrialPruned
        If ``trial`` is given and its pruner asks to stop after a fold.
    """
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    y_test = []
    # Dedicated float buffer for the out-of-fold predictions. Starting from a
    # copy of the (integer) target would force pandas to upcast on the first
    # write, which is deprecated in recent pandas versions.
    validation = pd.Series(np.nan, index=y.index, name=y.name, dtype='float64')
    validation_scores = []
    models = []

    for fold, (idx_train, idx_val) in enumerate(kfold.split(X, y)):
        print(f'Fold: {fold+1}/{n_splits}')

        X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
        X_val, y_val = X.iloc[idx_val], y.iloc[idx_val]

        model = create_model(param)

        # The validation part is listed last in eval_set; its metric is
        # logged alongside the training metric every `verbose` rounds.
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  verbose=1000,
                  )

        y_hat_val = model.predict(X_val)
        # Write by POSITION: idx_val holds positional indices from KFold.
        # Using y_val.index (labels) with .iloc is only correct when y has a
        # 0..n-1 index and would otherwise hit the wrong rows or raise.
        validation.iloc[idx_val] = y_hat_val
        score = mean_absolute_error(y_val.values, y_hat_val)
        validation_scores.append(score)
        print(f'Fold: {fold+1}/{n_splits} score = {score:.5f}')

        y_test.append(model.predict(X_test))
        models.append(model)

        # Explicit None check so the decision never depends on the
        # truthiness of the trial object.
        if trial is not None:
            trial.report(score, fold)

            if trial.should_prune():
                raise optuna.TrialPruned()

    return validation, y_test, validation_scores, models

In [10]:
def objective_xgb(trial):
    """Optuna objective: sample XGBoost hyperparameters and return the CV MAE.

    The suggestions are drawn in a fixed order (shared parameters, then tree
    parameters, then DART-only parameters when the DART booster is sampled).
    The candidate is evaluated with 8-fold cross-validation via
    ``train_model`` on the notebook-level ``train`` / ``y_train`` / ``test``.

    Returns
    -------
    float
        Mean absolute error between ``y_train`` and the out-of-fold
        predictions.
    """
    # Parameters shared by every booster.
    param = {
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
    }

    # Tree-growing parameters. Both selectable boosters are tree-based, so
    # these are always sampled.
    param.update({
        # maximum depth of the tree, signifies complexity of the tree.
        'max_depth': trial.suggest_int('max_depth', 3, 9, step=2),
        # minimum child weight, larger the term more conservative the tree.
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 10),
        'eta': trial.suggest_float('eta', 1e-8, 1.0, log=True),
        # defines how selective algorithm is.
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
    })

    # Dropout-specific parameters only exist for the DART booster.
    if param['booster'] == 'dart':
        param.update({
            'sample_type': trial.suggest_categorical('sample_type', ['uniform', 'weighted']),
            'normalize_type': trial.suggest_categorical('normalize_type', ['tree', 'forest']),
            'rate_drop': trial.suggest_float('rate_drop', 1e-8, 1.0, log=True),
            'skip_drop': trial.suggest_float('skip_drop', 1e-8, 1.0, log=True),
        })

    # 8 folds: matches the max_resource of the study's pruner.
    oof_predictions, _, _, _ = train_model(param, train, y_train, test, 8, trial)

    return mean_absolute_error(y_train.values, oof_predictions)
    
    
# In-memory study that minimises the cross-validated MAE returned by
# objective_xgb. The TPE sampler is seeded with SEED. The Hyperband pruner acts
# on the per-fold scores that train_model reports (one report per fold, 8 folds
# per trial, hence max_resource=8).
study = optuna.create_study(
    sampler=TPESampler(seed=SEED),
    direction='minimize',
    study_name='xgb_tuning',
    pruner=HyperbandPruner(
        min_resource=1, max_resource=8, reduction_factor=3
    ),
)
# Stop after 1000 trials or 7 hours (timeout is given in seconds), whichever
# comes first.
study.optimize(objective_xgb, n_trials=1000, timeout=7*60*60)

[I 2023-07-14 03:01:18,576] A new study created in memory with name: xgb_tuning


Fold: 1/8
[0]	validation_0-mae:1.49030	validation_1-mae:1.49924
[118]	validation_0-mae:1.33902	validation_1-mae:1.45738
Fold: 1/8 score = 1.40893
Fold: 2/8
[0]	validation_0-mae:1.49336	validation_1-mae:1.50227
[114]	validation_0-mae:1.34937	validation_1-mae:1.43991
Fold: 2/8 score = 1.39092
Fold: 3/8
[0]	validation_0-mae:1.49557	validation_1-mae:1.47051
[107]	validation_0-mae:1.35999	validation_1-mae:1.41022
Fold: 3/8 score = 1.37955
Fold: 4/8
[0]	validation_0-mae:1.50178	validation_1-mae:1.49752
[134]	validation_0-mae:1.33893	validation_1-mae:1.43472
Fold: 4/8 score = 1.39199
Fold: 5/8
[0]	validation_0-mae:1.47928	validation_1-mae:1.47083
[108]	validation_0-mae:1.34537	validation_1-mae:1.44147
Fold: 5/8 score = 1.39142
Fold: 6/8
[0]	validation_0-mae:1.49897	validation_1-mae:1.54948
[111]	validation_0-mae:1.34153	validation_1-mae:1.49120
Fold: 6/8 score = 1.43893
Fold: 7/8
[0]	validation_0-mae:1.49041	validation_1-mae:1.46683
[118]	validation_0-mae:1.34960	validation_1-mae:1.42993
Fold

[I 2023-07-14 03:01:22,225] Trial 0 finished with value: 1.4014682448582734 and parameters: {'booster': 'gbtree', 'lambda': 4.3551510196757946e-05, 'alpha': 6.567488326823929e-06, 'subsample': 0.6686811073078698, 'colsample_bytree': 0.6285997954184606, 'n_estimators': 900, 'max_depth': 5, 'min_child_weight': 6, 'eta': 6.057315331886634e-06, 'gamma': 1.1907664188208343e-07, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 1.4014682448582734.


Fold: 8/8 score = 1.42609
Fold: 1/8
[0]	validation_0-mae:1.44290	validation_1-mae:1.45031
[103]	validation_0-mae:1.51108	validation_1-mae:1.74527
Fold: 1/8 score = 1.41812
Fold: 2/8
[0]	validation_0-mae:1.45211	validation_1-mae:1.45058
[102]	validation_0-mae:1.51585	validation_1-mae:1.78253
Fold: 2/8 score = 1.42209
Fold: 3/8
[0]	validation_0-mae:1.45686	validation_1-mae:1.45031
[101]	validation_0-mae:1.51953	validation_1-mae:1.78513
Fold: 3/8 score = 1.40526
Fold: 4/8
[0]	validation_0-mae:1.46234	validation_1-mae:1.45905
[102]	validation_0-mae:1.52862	validation_1-mae:1.80007
Fold: 4/8 score = 1.41122
Fold: 5/8
[0]	validation_0-mae:1.45565	validation_1-mae:1.45743
[101]	validation_0-mae:1.52511	validation_1-mae:1.77467
Fold: 5/8 score = 1.40314
Fold: 6/8
[0]	validation_0-mae:1.44834	validation_1-mae:1.48579
[101]	validation_0-mae:1.51336	validation_1-mae:1.79510
Fold: 6/8 score = 1.45125
Fold: 7/8
[0]	validation_0-mae:1.46125	validation_1-mae:1.46013
[103]	validation_0-mae:1.50700	val

[I 2023-07-14 03:01:32,423] Trial 1 finished with value: 1.4226073922026712 and parameters: {'booster': 'gbtree', 'lambda': 0.00028435664810945937, 'alpha': 0.15134094273094537, 'subsample': 0.3338767096051072, 'colsample_bytree': 0.5257651358638613, 'n_estimators': 700, 'max_depth': 7, 'min_child_weight': 7, 'eta': 0.00015661768375919782, 'gamma': 0.0007255046398582566, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 1.4014682448582734.


Fold: 8/8 score = 1.45306
Fold: 1/8
[0]	validation_0-mae:1.48170	validation_1-mae:1.52387
[101]	validation_0-mae:1.36224	validation_1-mae:1.79697
Fold: 1/8 score = 1.47355
Fold: 2/8
[0]	validation_0-mae:1.49660	validation_1-mae:1.52420
[103]	validation_0-mae:1.29201	validation_1-mae:1.74882
Fold: 2/8 score = 1.46669
Fold: 3/8
[0]	validation_0-mae:1.48667	validation_1-mae:1.49535
[100]	validation_0-mae:1.46384	validation_1-mae:1.86504
Fold: 3/8 score = 1.45815
Fold: 4/8
[0]	validation_0-mae:1.48345	validation_1-mae:1.49703
[103]	validation_0-mae:1.29693	validation_1-mae:1.71601


[I 2023-07-14 03:02:07,357] Trial 2 pruned. 


Fold: 4/8 score = 1.44972
Fold: 1/8
[0]	validation_0-mae:1.41711	validation_1-mae:1.44988
[101]	validation_0-mae:1.37270	validation_1-mae:1.93822
Fold: 1/8 score = 1.43219
Fold: 2/8
[0]	validation_0-mae:1.41438	validation_1-mae:1.45036
[101]	validation_0-mae:1.36822	validation_1-mae:1.94345


[I 2023-07-14 03:02:13,475] Trial 3 pruned. 


Fold: 2/8 score = 1.43778
Fold: 1/8
[0]	validation_0-mae:1.70647	validation_1-mae:1.72118
[399]	validation_0-mae:1.42529	validation_1-mae:1.42795
Fold: 1/8 score = 1.42708
Fold: 2/8
[0]	validation_0-mae:1.70423	validation_1-mae:1.71427
[209]	validation_0-mae:1.42490	validation_1-mae:1.43027
Fold: 2/8 score = 1.42827
Fold: 3/8
[0]	validation_0-mae:1.71135	validation_1-mae:1.69115
[357]	validation_0-mae:1.40928	validation_1-mae:1.39970
Fold: 3/8 score = 1.39894
Fold: 4/8
[0]	validation_0-mae:1.71118	validation_1-mae:1.70689
[258]	validation_0-mae:1.42249	validation_1-mae:1.42772


[I 2023-07-14 03:02:17,017] Trial 4 pruned. 


Fold: 4/8 score = 1.42718
Fold: 1/8
[0]	validation_0-mae:1.69102	validation_1-mae:1.70714
[311]	validation_0-mae:1.37958	validation_1-mae:1.39438
Fold: 1/8 score = 1.39297
Fold: 2/8
[0]	validation_0-mae:1.69045	validation_1-mae:1.71114
[135]	validation_0-mae:1.39993	validation_1-mae:1.39602


[I 2023-07-14 03:02:18,406] Trial 5 pruned. 


Fold: 2/8 score = 1.39376
Fold: 1/8
[0]	validation_0-mae:1.43079	validation_1-mae:1.43767
[104]	validation_0-mae:1.21755	validation_1-mae:1.48623
Fold: 1/8 score = 1.38247
Fold: 2/8
[0]	validation_0-mae:1.42894	validation_1-mae:1.43994
[105]	validation_0-mae:1.21312	validation_1-mae:1.49314
Fold: 2/8 score = 1.38312
Fold: 3/8
[0]	validation_0-mae:1.42560	validation_1-mae:1.42724
[106]	validation_0-mae:1.20716	validation_1-mae:1.49793
Fold: 3/8 score = 1.38873
Fold: 4/8
[0]	validation_0-mae:1.42175	validation_1-mae:1.42745
[104]	validation_0-mae:1.21408	validation_1-mae:1.47270
Fold: 4/8 score = 1.39094
Fold: 5/8
[0]	validation_0-mae:1.42520	validation_1-mae:1.43231
[103]	validation_0-mae:1.23082	validation_1-mae:1.49141
Fold: 5/8 score = 1.38964
Fold: 6/8
[0]	validation_0-mae:1.42219	validation_1-mae:1.46910
[102]	validation_0-mae:1.19939	validation_1-mae:1.54071
Fold: 6/8 score = 1.42878
Fold: 7/8
[0]	validation_0-mae:1.42492	validation_1-mae:1.41357
[103]	validation_0-mae:1.20659	val

[I 2023-07-14 03:02:22,491] Trial 6 finished with value: 1.3976617466340766 and parameters: {'booster': 'gbtree', 'lambda': 3.503102820160236e-07, 'alpha': 0.00021161314992090908, 'subsample': 0.8998366717589052, 'colsample_bytree': 0.6619057226379685, 'n_estimators': 400, 'max_depth': 7, 'min_child_weight': 3, 'eta': 0.002326158156495611, 'gamma': 0.08832868179076325, 'grow_policy': 'depthwise'}. Best is trial 6 with value: 1.3976617466340766.


Fold: 8/8 score = 1.43364


In [11]:
# Summarise the study. The count covers every trial in the study, pruned ones
# included (the recorded run shows 7, of which 4 were pruned).
print(f'Number of finished trials: {len(study.trials)}')
print('Best trial:')
# Best trial = lowest objective value, since the study minimises.
trial = study.best_trial
print(f'  Score: {trial.value:.5f}')
print(f'  Params: {trial.params}')

Number of finished trials: 7
Best trial:
  Score: 1.39766
  Params: {'booster': 'gbtree', 'lambda': 3.503102820160236e-07, 'alpha': 0.00021161314992090908, 'subsample': 0.8998366717589052, 'colsample_bytree': 0.6619057226379685, 'n_estimators': 400, 'max_depth': 7, 'min_child_weight': 3, 'eta': 0.002326158156495611, 'gamma': 0.08832868179076325, 'grow_policy': 'depthwise'}


# Best parameters

In [12]:
# Hand-pinned hyperparameters, one entry per line for easier diffing.
# NOTE(review): these differ from the best trial printed by the study above
# (that one used 'gbtree'); presumably copied from a separate run — confirm.
best_params = dict(
    [
        ('booster', 'dart'),
        ('lambda', 1.3564242299917567e-06),
        ('alpha', 6.657736811017855e-08),
        ('subsample', 0.9987885488425374),
        ('colsample_bytree', 0.9748449420972439),
        ('n_estimators', 200),
        ('max_depth', 7),
        ('min_child_weight', 10),
        ('eta', 6.015791017774553e-07),
        ('gamma', 0.5513468186878823),
        ('grow_policy', 'depthwise'),
        ('sample_type', 'uniform'),
        ('normalize_type', 'forest'),
        ('rate_drop', 2.765721277981557e-05),
        ('skip_drop', 3.569660132300492e-05),
    ]
)

In [13]:
# Done.