In [None]:
pip install lightgbm xgboost optuna sklego catboost

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import optuna
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, train_test_split 
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklego.linear_model import LADRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression

In [4]:
## Competition files: the `id` column is dropped right after reading
train = pd.read_csv('train.csv').drop(columns = 'id')
test = pd.read_csv('test.csv').drop(columns = 'id')
sub = pd.read_csv('sample_submission.csv')

## Extra rows from original.csv; the `generated` flag is 1 for the competition
## frames and 0 for the rows coming from original.csv
original = pd.read_csv('original.csv').assign(generated = 0)
train = train.assign(generated = 1)
test = test.assign(generated = 1)

## Stack both sources into one training frame with a fresh 0..n-1 index
train = pd.concat([train, original], ignore_index = True)

## Exploratory Data Analysis:

In [None]:
## First rows of the combined training frame (competition rows + original.csv rows)
train.head()

In [None]:
## Summary statistics of the numeric columns of the training frame
train.describe()

In [None]:
## (rows, columns) of the training frame, then of the test frame
for frame in (train, test):
    print(frame.shape)

In [None]:
## Number of missing values per column, for the training and the test frame
train_missing = train.isna().sum()
test_missing = test.isna().sum()

print(train_missing)
print('\n', test_missing)

In [None]:
## Pairwise correlations of every column except `Sex`
train.drop(columns = 'Sex').corr()

In [None]:
## One kernel-density panel per column, laid out on a 2 x 4 grid (filled row by row)
fig, axes = plt.subplots(nrows = 2, ncols = 4, figsize = (20, 10))
plt.tight_layout(pad = 5)

kde_columns = ['Length', 'Diameter', 'Height', 'Weight',
               'Shucked Weight', 'Viscera Weight', 'Shell Weight', 'Age']

for ax, column in zip(axes.flat, kde_columns):
    sns.kdeplot(data = train, x = column, fill = True, ax = ax).set_title(column)

plt.show()

## Feature Engineering and Cleaning:

In [5]:
## Integer-encode the `Sex` column: the encoder learns its classes from the
## training frame and the same mapping is then applied to the test frame
le = LabelEncoder().fit(train['Sex'])

train['Sex'] = le.transform(train['Sex'])
test['Sex'] = le.transform(test['Sex'])

In [None]:
## Weight
## The same steps are applied to the training frame and then to the test frame.
weight_parts = (('Shucked Weight', 0.424150),
                ('Viscera Weight', 0.213569),
                ('Shell Weight', 0.288712))

for frame in (train, test):

    ## Sum of the three component weights and its gap to the total weight
    frame['Accounted Weight'] = frame['Shucked Weight'] + frame['Viscera Weight'] + frame['Shell Weight']
    frame['Weight Diff.'] = frame['Weight'] - frame['Accounted Weight']

    ## Rows whose components add up to more than the total weight
    too_heavy = frame['Accounted Weight'] > frame['Weight']
    frame['Too Heavy'] = np.where(too_heavy, 1, 0).astype(int)

    ## For those rows each component is replaced by a fixed fraction of the total weight
    ## ('Accounted Weight' and 'Weight Diff.' are deliberately left as computed above)
    for part, fraction in weight_parts:
        frame[part] = np.where(too_heavy, fraction * frame['Weight'], frame[part])

    ## Share of the total weight taken by each (possibly corrected) component
    for part, _ in weight_parts:
        frame[part + ' Perc.'] = frame[part] / frame['Weight']

In [None]:
## Dimensions
## Each frame has its own (slope, offset) pair for rebuilding a zero Height from Length.
## NOTE(review): the test frame uses constants and a mean that differ from the training
## ones -- confirm that this asymmetry is intended.
height_repairs = ((train, 0.29337, 0.03826729),
                  (test, 0.29400666, 0.03933592))

for frame, slope, offset in height_repairs:

    height = frame['Height']

    ## Height > 2 is replaced by the frame's mean height, Height == 0 by a linear function of Length
    frame['Height'] = np.where(height > 2, np.mean(height),
                               np.where(height == 0, slope*frame['Length']-offset, height))

    ## Size and shape features derived from the repaired dimensions
    frame['Volume'] = frame['Length'] * frame['Diameter'] * frame['Height']
    frame['Density'] = frame['Weight'] / frame['Volume']
    frame['Ratio'] = frame['Weight'] / (frame['Diameter'] + 1e-8)  ## small constant avoids division by zero
    frame['Area'] = frame['Length'] * frame['Diameter']
    frame['BMI'] = frame['Weight'] / (frame['Height']**2)

In [None]:
## Gender
## `Sex` has already been integer-encoded by `le`, so comparing it with the strings
## 'M' / 'F' never matches and would leave both indicator columns at 0 for every row.
## The codes are therefore decoded back to their original labels before comparing.
for frame in (train, test):
    sex_labels = le.inverse_transform(frame['Sex'])
    frame['Male'] = np.where(sex_labels == 'M', 1, 0)
    frame['Female'] = np.where(sex_labels == 'F', 1, 0)

In [None]:
## PCA
numeric_features = ['Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight',
                    'Accounted Weight', 'Weight Diff.', 'Shucked Weight Perc.', 'Viscera Weight Perc.',
                    'Shell Weight Perc.', 'Volume', 'Density']

## Standardise both frames with the statistics of the training frame
scaler = StandardScaler()
scaler.fit(train[numeric_features])
X_train = scaler.transform(train[numeric_features])
X_test = scaler.transform(test[numeric_features])

## Four principal components, fitted on the scaled training data only
pca = PCA(n_components = 4)
pca.fit(X_train)

pc_names = ['PC_1', 'PC_2', 'PC_3', 'PC_4']
X_train_pca = pd.DataFrame(pca.transform(X_train), columns = pc_names)
X_test_pca = pd.DataFrame(pca.transform(X_test), columns = pc_names)

## Append the components as new columns (rows are matched on the index)
train = pd.concat([train, X_train_pca], axis = 1)
test = pd.concat([test, X_test_pca], axis = 1)

In [None]:
## Remove the encoded `Sex` column and the `Too Heavy` flag from both frames
unused_columns = ['Sex', 'Too Heavy']

train = train.drop(columns = unused_columns)
test = test.drop(columns = unused_columns)

In [None]:
## Transformations
## log(sqrt(.)) and sqrt(.) versions of selected columns, added to both frames
log_sqrt_columns = (('log_sqrt_weight', 'Weight'),
                    ('log_sqrt_shucked_weight', 'Shucked Weight'),
                    ('log_sqrt_viscera_weight', 'Viscera Weight'))

for frame in (train, test):

    for new_column, source_column in log_sqrt_columns:
        frame[new_column] = np.log(np.sqrt(frame[source_column]))

    frame['sqrt_shell_weight'] = np.sqrt(frame['Shell Weight'])
    frame['sqrt_area'] = np.sqrt(frame['Area'])
    frame['log_sqrt_density'] = np.log(np.sqrt(frame['Density']))

In [None]:
## Mutual information between every candidate feature and the target;
## the target itself and the two gender indicators are left out of the candidates
y = train['Age']
mutual_df = train.drop(columns = ['Age', 'Male', 'Female'])

mutual_info = pd.Series(mutual_info_regression(mutual_df, y, random_state = 1),
                        index = mutual_df.columns)

## Highest scores first, shown as a colour-graded one-column table
mutual_info.sort_values(ascending = False).to_frame('MI_score').style.background_gradient("cool")

In [None]:
## Feature subset kept for modelling; only the training frame also keeps the target
selected_features = ['sqrt_shell_weight', 'Volume', 'Weight', 'Ratio', 'Area', 'log_sqrt_viscera_weight',
                     'PC_1', 'Height', 'Diameter', 'Accounted Weight', 'Length', 'log_sqrt_shucked_weight',
                     'BMI', 'Female', 'Male']

train = train[selected_features + ['Age']]
test = test[selected_features]

## Hyper-parameter Tuning:

In [None]:
## Target and feature matrix read by every Optuna objective
Y = train['Age']
X = train.drop(columns = 'Age')

## Cross-validation settings shared by the objectives
SPLITS = 2
SEED = 42

## Defining Optuna objective functions
def RF_objective(trial):
    """Optuna objective for a RandomForestRegressor.

    Samples one hyper-parameter set from `trial`, fits the model on each training
    part of a shuffled K-fold split of the notebook-level `X` / `Y` (`SPLITS`
    folds, seeded with `SEED`) and scores it on the held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean absolute error averaged over the validation folds (lower is better).
    """

    ## Defining the hyper-parameter grid
    ## (`step` is passed by keyword: the positional form is deprecated in Optuna)
    param_grid = {'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step = 50),
                  'max_depth': trial.suggest_int('max_depth', 3, 12),
                  'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                  'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
                  'random_state': trial.suggest_int('random_state', 1, 500),
                  'max_features': trial.suggest_categorical('max_features', ['sqrt', None])
                 }
    scores = list()
    kf = KFold(n_splits = SPLITS, shuffle = True, random_state = SEED)

    for train_idx, valid_idx in kf.split(X, Y):

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]

        ## Building the model
        model = RandomForestRegressor(**param_grid, n_jobs = -1, criterion = 'absolute_error').fit(X_train, Y_train)

        ## Predicting on the validation fold
        preds = model.predict(X_valid)

        ## Evaluating model performance on the validation fold
        scores.append(mean_absolute_error(Y_valid, preds))

    return np.mean(scores)

def HIST_objective(trial):
    """Optuna objective for a HistGradientBoostingRegressor (absolute-error loss).

    Samples one hyper-parameter set from `trial`, fits the model with early
    stopping on each training part of a shuffled K-fold split of the
    notebook-level `X` / `Y` (`SPLITS` folds, seeded with `SEED`) and scores it
    on the held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean absolute error averaged over the validation folds (lower is better).
    """

    ## Defining the hyper-parameter grid
    ## (`step` is passed by keyword: the positional form is deprecated in Optuna)
    param_grid = {'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step = 0.01),
                  'max_iter': trial.suggest_int('max_iter', 100, 1000, step = 50),
                  'max_depth': trial.suggest_int('max_depth', 3, 12),
                  'l2_regularization': trial.suggest_float('l2_regularization', 0, 0.1, step = 0.002),
                  'random_state': trial.suggest_int('random_state', 1, 500),
                 }
    scores = list()
    kf = KFold(n_splits = SPLITS, shuffle = True, random_state = SEED)

    for train_idx, valid_idx in kf.split(X, Y):

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]

        ## Building the model
        model = HistGradientBoostingRegressor(**param_grid, loss = 'absolute_error', early_stopping = True).fit(X_train, Y_train)

        ## Predicting on the validation fold
        preds = model.predict(X_valid)

        ## Evaluating model performance on the validation fold
        scores.append(mean_absolute_error(Y_valid, preds))

    return np.mean(scores)

def XGB_objective(trial):
    """Optuna objective for an XGBRegressor.

    Samples one hyper-parameter set from `trial`, fits the model on each training
    part of a shuffled K-fold split of the notebook-level `X` / `Y` (`SPLITS`
    folds, seeded with `SEED`) and scores it on the held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean absolute error averaged over the validation folds (lower is better).
    """

    ## Defining the hyper-parameter grid
    ## (`step` is passed by keyword: the positional form is deprecated in Optuna)
    param_grid = {'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step = 50),
                  'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step = 0.01),
                  'max_depth': trial.suggest_int('max_depth', 3, 12),
                  'gamma': trial.suggest_float('gamma', 0, 0.3, step = 0.05),
                  'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
                  'subsample': trial.suggest_float('subsample', 0.6, 1, step = 0.05),
                  'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1, step = 0.05),
                  'seed': trial.suggest_int('seed', 1, 1000)
                 }
    scores = list()
    kf = KFold(n_splits = SPLITS, shuffle = True, random_state = SEED)

    for train_idx, valid_idx in kf.split(X, Y):

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]

        ## Building the model
        model = XGBRegressor(**param_grid, n_jobs = -1).fit(X_train, Y_train)

        ## Predicting on the validation fold
        preds = model.predict(X_valid)

        ## Evaluating model performance on the validation fold
        scores.append(mean_absolute_error(Y_valid, preds))

    return np.mean(scores)

def LGBM_objective(trial):
    """Optuna objective for an LGBMRegressor.

    Samples one hyper-parameter set from `trial`, fits the model on each training
    part of a shuffled K-fold split of the notebook-level `X` / `Y` (`SPLITS`
    folds, seeded with `SEED`) and scores it on the held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean absolute error averaged over the validation folds (lower is better).
    """

    ## Defining the hyper-parameter grid
    ## (`step` is passed by keyword: the positional form is deprecated in Optuna)
    param_grid = {'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step = 50),
                  'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step = 0.01),
                  'num_leaves': trial.suggest_int('num_leaves', 5, 40, step = 1),
                  'max_depth': trial.suggest_int('max_depth', 3, 12),
                  'subsample': trial.suggest_float('subsample', 0.6, 1, step = 0.05),
                  'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1, step = 0.05),
                  'random_state': trial.suggest_int('random_state', 1, 1000),
                 }
    scores = list()
    kf = KFold(n_splits = SPLITS, shuffle = True, random_state = SEED)

    for train_idx, valid_idx in kf.split(X, Y):

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]

        ## Building the model
        model = LGBMRegressor(**param_grid, n_jobs = -1, verbosity = -1).fit(X_train, Y_train)

        ## Predicting on the validation fold
        preds = model.predict(X_valid)

        ## Evaluating model performance on the validation fold
        scores.append(mean_absolute_error(Y_valid, preds))

    return np.mean(scores)



## The RandomForest study is currently disabled; to enable it, create a study named
## 'RandomForest' the same way as below, optimize RF_objective and print its best trial.

## HistGradientBoosting: one minimisation study, a single trial
study_hist = optuna.create_study(study_name = 'HistGradientBoosting', direction = 'minimize')
study_hist.optimize(HIST_objective, n_trials = 1)

## XGBoost: one minimisation study, a single trial
study_xgb = optuna.create_study(study_name = 'XGBoost', direction = 'minimize')
study_xgb.optimize(XGB_objective, n_trials = 1)

## LightGBM: one minimisation study, a single trial
study_lgbm = optuna.create_study(study_name = 'LightGBM', direction = 'minimize')
study_lgbm.optimize(LGBM_objective, n_trials = 1)

## Best hyper-parameter set and its cross-validated MAE, one study after the other
finished_studies = (('HistGB: \n', study_hist),
                    ('\nXGBoost: \n', study_xgb),
                    ('\nLightGBM: \n', study_lgbm))

for heading, study in finished_studies:
    print(heading, study.best_trial.params)
    print(study.best_trial.value)

## Linear Model:

In [None]:
import os

from sklearn.linear_model import LinearRegression

## Single-feature baseline: ordinary least squares of Age on sqrt_shell_weight

## Defining input and target variables (scikit-learn expects a 2-D feature array)
X_train = np.array(train['sqrt_shell_weight']).reshape(-1, 1)
Y_train = train['Age']
X_test = np.array(test['sqrt_shell_weight']).reshape(-1, 1)

## Building the model
lm_md = LinearRegression().fit(X_train, Y_train)

## Predicting on the test data and rounding
preds = np.round(lm_md.predict(X_test))

## Saving predictions as csv
## (the output folder is created first so that to_csv cannot fail on a missing directory)
os.makedirs('submissions', exist_ok = True)
sub['Age'] = preds
sub.to_csv('submissions/linear_regression.csv', index = False)

## Modelling:

In [6]:
## NOTE(review): this cell reads the raw columns ('Sex', 'Shucked Weight', 'Viscera Weight',
## 'Shell Weight', 'generated'); the feature-selection cell earlier in the notebook removes
## them from `train` / `test` -- confirm the intended execution order.

## Defining the input and target variables
X = train.drop(columns = ['Age'])
Y = train['Age']

## Work on a copy of the test frame: a plain `test_baseline = test` only aliases it, so the
## ratio columns added below would also appear in `test` and later cells calling
## `predict(test)` would receive columns their models were never trained on.
test_baseline = test.copy()

## Ratio features, added in the same order to the training inputs and to the test copy
for frame in (X, test_baseline):
    frame['Meat Yield'] = frame['Shucked Weight'] / (frame['Weight'] + frame['Shell Weight'])
    frame['Shell Ratio'] = frame['Shell Weight'] / frame['Weight']
    frame['Weight_to_Shucked_Weight'] = frame['Weight'] / frame['Shucked Weight']
    frame['Viscera Ratio'] = frame['Viscera Weight'] / frame['Weight']

## Per-fold validation scores and per-fold test predictions of every model
gb_cv_scores, gb_preds = list(), list()
hist_cv_scores, hist_preds = list(), list()
lgb_cv_scores, lgb_preds = list(), list()
xgb_cv_scores, xgb_preds = list(), list()
cat_cv_scores, cat_preds = list(), list()
ens_cv_scores_1, ens_preds_1 = list(), list()
ens_cv_scores_2, ens_preds_2 = list(), list()
ens_cv_scores_3, ens_preds_3 = list(), list()
ens_cv_scores_4, ens_preds_4 = list(), list()

kf = KFold(n_splits = 10, random_state = 42, shuffle = True)
    
## Column subsets are identical for every fold, so they are built once up front.
## GradientBoosting and XGBoost share the first list; CatBoost adds three ratio features.
gb_features = ['Sex', 'Length', 'Diameter', 'Height', 'Weight',
               'Shucked Weight', 'Viscera Weight', 'Shell Weight', 'generated']
cat_features = gb_features + ['Meat Yield', 'Shell Ratio', 'Weight_to_Shucked_Weight']

test_baseline_gb = test_baseline[gb_features]
test_baseline_cat = test_baseline[cat_features]

for i, (train_ix, test_ix) in enumerate(kf.split(X, Y)):

    X_train, Y_train = X.iloc[train_ix], Y.iloc[train_ix]
    X_test, Y_test = X.iloc[test_ix], Y.iloc[test_ix]

    ## Every fold score is computed on the held-out rows flagged generated == 1 only
    is_generated = X_test['generated'] == 1
    y = Y_test[is_generated]

    X_train_gb, X_test_gb = X_train[gb_features], X_test[gb_features]
    X_train_cat, X_test_cat = X_train[cat_features], X_test[cat_features]

    print('---------------------------------------------------------------')

    ## ---- GradientBoosting (reduced feature set) ----
    gb_md = GradientBoostingRegressor(loss = 'absolute_error', n_estimators = 1000, max_depth = 8,
                                      learning_rate = 0.01, min_samples_split = 10,
                                      min_samples_leaf = 20, random_state = 42)
    gb_md.fit(X_train_gb, Y_train)

    gb_pred_1 = gb_md.predict(X_test_gb[is_generated])
    gb_pred_2 = gb_md.predict(test_baseline_gb)

    gb_score_fold = mean_absolute_error(y, gb_pred_1)
    gb_cv_scores.append(gb_score_fold)
    gb_preds.append(gb_pred_2)

    print('Fold', i, '==> GradientBoositng oof MAE is ==>', gb_score_fold)

    ## ---- HistGradientBoosting (all features) ----
    hist_md = HistGradientBoostingRegressor(loss = 'absolute_error', l2_regularization = 0.01,
                                            early_stopping = False, learning_rate = 0.01,
                                            max_iter = 1000, max_depth = 15, max_bins = 255,
                                            min_samples_leaf = 70, max_leaf_nodes = 115,
                                            random_state = 42)
    hist_md.fit(X_train, Y_train)

    hist_pred_1 = hist_md.predict(X_test[is_generated])
    hist_pred_2 = hist_md.predict(test_baseline)

    hist_score_fold = mean_absolute_error(y, hist_pred_1)
    hist_cv_scores.append(hist_score_fold)
    hist_preds.append(hist_pred_2)

    print('Fold', i, '==> HistGradient oof MAE is ==>', hist_score_fold)

    ## ---- LightGBM (all features) ----
    lgb_md = LGBMRegressor(objective = 'mae', n_estimators = 1000, max_depth = 15,
                           learning_rate = 0.01, num_leaves = 105, reg_alpha = 8, reg_lambda = 3,
                           subsample = 0.6, colsample_bytree = 0.8, random_state = 42)
    lgb_md.fit(X_train, Y_train)

    lgb_pred_1 = lgb_md.predict(X_test[is_generated])
    lgb_pred_2 = lgb_md.predict(test_baseline)

    lgb_score_fold = mean_absolute_error(y, lgb_pred_1)
    lgb_cv_scores.append(lgb_score_fold)
    lgb_preds.append(lgb_pred_2)

    print('Fold', i, '==> LightGBM oof MAE is ==>', lgb_score_fold)

    ## ---- XGBoost (reduced feature set) ----
    xgb_md = XGBRegressor(objective = 'reg:pseudohubererror', tree_method = 'hist',
                          colsample_bytree = 0.9, gamma = 0.65, learning_rate = 0.01,
                          max_depth = 7, min_child_weight = 20, n_estimators = 1500,
                          subsample = 0.7, random_state = 42)
    xgb_md.fit(X_train_gb, Y_train)

    xgb_pred_1 = xgb_md.predict(X_test_gb[is_generated])
    xgb_pred_2 = xgb_md.predict(test_baseline_gb)

    xgb_score_fold = mean_absolute_error(y, xgb_pred_1)
    xgb_cv_scores.append(xgb_score_fold)
    xgb_preds.append(xgb_pred_2)

    print('Fold', i, '==> XGBoost oof MAE is ==>', xgb_score_fold)

    ## ---- CatBoost (reduced feature set + ratio features) ----
    cat_md = CatBoostRegressor(loss_function = 'MAE', iterations = 1000, learning_rate = 0.08,
                               depth = 10, random_strength = 0.2, bagging_temperature = 0.7,
                               border_count = 254, l2_leaf_reg = 0.001, verbose = False,
                               grow_policy = 'Lossguide', task_type = 'CPU', random_state = 42)
    cat_md.fit(X_train_cat, Y_train)

    cat_pred_1 = cat_md.predict(X_test_cat[is_generated])
    cat_pred_2 = cat_md.predict(test_baseline_cat)

    cat_score_fold = mean_absolute_error(y, cat_pred_1)
    cat_cv_scores.append(cat_score_fold)
    cat_preds.append(cat_pred_2)

    print('Fold', i, '==> CatBoost oof MAE is ==>', cat_score_fold)

    ## ---- LAD ensembles on the rounded base-model predictions ----
    fold_level_preds = {'GBC': gb_pred_1, 'hist': hist_pred_1, 'lgb': lgb_pred_1,
                        'xgb': xgb_pred_1, 'cat': cat_pred_1}
    test_level_preds = {'GBC': gb_pred_2, 'hist': hist_pred_2, 'lgb': lgb_pred_2,
                        'xgb': xgb_pred_2, 'cat': cat_pred_2}

    x = pd.DataFrame({name: np.round(pred.tolist()) for name, pred in fold_level_preds.items()})
    x_test = pd.DataFrame({name: np.round(pred.tolist()) for name, pred in test_level_preds.items()})

    ## Four variants: every combination of (fit_intercept, positive) in the order of models 1..4
    lad_settings = ((True, False), (True, True), (False, True), (False, False))
    lad_md_1, lad_md_2, lad_md_3, lad_md_4 = [
        LADRegression(fit_intercept = with_intercept, positive = positive_only).fit(x, y)
        for with_intercept, positive_only in lad_settings
    ]

    ## NOTE(review): each LAD model is scored on the very rows it was fitted on, so the
    ## ensemble numbers printed as "oof" below are in-sample -- confirm this is intended.
    lad_pred_1, lad_pred_test_1 = lad_md_1.predict(x), lad_md_1.predict(x_test)
    lad_pred_2, lad_pred_test_2 = lad_md_2.predict(x), lad_md_2.predict(x_test)
    lad_pred_3, lad_pred_test_3 = lad_md_3.predict(x), lad_md_3.predict(x_test)
    lad_pred_4, lad_pred_test_4 = lad_md_4.predict(x), lad_md_4.predict(x_test)

    ens_score_1 = mean_absolute_error(y, lad_pred_1)
    ens_score_2 = mean_absolute_error(y, lad_pred_2)
    ens_score_3 = mean_absolute_error(y, lad_pred_3)
    ens_score_4 = mean_absolute_error(y, lad_pred_4)

    ens_cv_scores_1.append(ens_score_1)
    ens_cv_scores_2.append(ens_score_2)
    ens_cv_scores_3.append(ens_score_3)
    ens_cv_scores_4.append(ens_score_4)

    ens_preds_1.append(lad_pred_test_1)
    ens_preds_2.append(lad_pred_test_2)
    ens_preds_3.append(lad_pred_test_3)
    ens_preds_4.append(lad_pred_test_4)

    print('Fold', i, '==> LAD Model 1 ensemble oof MAE is ==>', ens_score_1)
    print('Fold', i, '==> LAD Model 2 ensemble oof MAE is ==>', ens_score_2)
    print('Fold', i, '==> LAD Model 3 ensemble oof MAE is ==>', ens_score_3)
    print('Fold', i, '==> LAD Model 4 ensemble oof MAE is ==>', ens_score_4)

---------------------------------------------------------------
Fold 0 ==> GradientBoositng oof MAE is ==> 1.3600831902284967
Fold 0 ==> HistGradient oof MAE is ==> 1.358525406691329
Fold 0 ==> LightGBM oof MAE is ==> 1.3553085796410644
Fold 0 ==> XGBoost oof MAE is ==> 1.3660042263320271
Fold 0 ==> CatBoost oof MAE is ==> 1.360316135172595
Fold 0 ==> LAD Model 1 ensemble oof MAE is ==> 1.340758293972472
Fold 0 ==> LAD Model 2 ensemble oof MAE is ==> 1.3407582938388867
Fold 0 ==> LAD Model 3 ensemble oof MAE is ==> 1.340758293838867
Fold 0 ==> LAD Model 4 ensemble oof MAE is ==> 1.3407582938421498
---------------------------------------------------------------
Fold 1 ==> GradientBoositng oof MAE is ==> 1.3536029355660653
Fold 1 ==> HistGradient oof MAE is ==> 1.3558243763216642
Fold 1 ==> LightGBM oof MAE is ==> 1.3522249811316982
Fold 1 ==> XGBoost oof MAE is ==> 1.3575149287694488
Fold 1 ==> CatBoost oof MAE is ==> 1.3504308905158011
Fold 1 ==> LAD Model 1 ensemble oof MAE is ==> 1.3

In [None]:
## Storing optimal HP sets
hist_params = study_hist.best_trial.params
xgb_params = study_xgb.best_trial.params
lgbm_params = study_lgbm.best_trial.params

## Defining the input and target variables
X = train.drop(columns = ['Age'])
Y = train['Age']

## Defining lists to store results
## (per-fold scores and per-fold test predictions, raw and rounded to integers)
hist_cv_scores, hist_preds = list(), list()
hist_cv_scores_round, hist_preds_round = list(), list()

lgb_cv_scores, lgb_preds = list(), list()
lgb_cv_scores_round, lgb_preds_round = list(), list()

xgb_cv_scores, xgb_preds = list(), list()
xgb_cv_scores_round, xgb_preds_round = list(), list()

ens_cv_scores, ens_preds = list(), list()
ens_cv_scores_round, ens_preds_round = list(), list()

ens_cv_scores2, ens_preds2 = list(), list()
ens_cv_scores_round2, ens_preds_round2 = list(), list()


## Performing KFold cross-validation
## (the shuffle is seeded so that the folds, and therefore the scores, are reproducible)
kf = KFold(n_splits = 2, shuffle = True, random_state = 42)
    
## Score the test frame on exactly the columns (and column order) the models are fitted on,
## so columns added to `test` elsewhere in the notebook cannot break `predict`.
test_features = test[X.columns]

for i, (train_ix, test_ix) in enumerate(kf.split(X, Y)):

    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]

    print('---------------------------------------------------------------')

    ## ---- HistGradientBoosting with the tuned hyper-parameters ----
    hist_md = HistGradientBoostingRegressor(**hist_params, loss = 'absolute_error', early_stopping = True).fit(X_train, Y_train)

    ## np.round replaces np.round_, which no longer exists in NumPy 2.0
    hist_pred_1 = hist_md.predict(X_test)
    hist_pred_1_round = np.round(hist_pred_1).astype(int)
    hist_pred_2 = hist_md.predict(test_features)
    hist_pred_2_round = np.round(hist_pred_2).astype(int)

    hist_score_fold = mean_absolute_error(Y_test, hist_pred_1)
    hist_score_fold_round = mean_absolute_error(Y_test, hist_pred_1_round)
    hist_cv_scores.append(hist_score_fold)
    hist_cv_scores_round.append(hist_score_fold_round)
    hist_preds.append(hist_pred_2)
    hist_preds_round.append(hist_pred_2_round)

    ## First line: raw predictions; second line: predictions rounded to integers
    print('Fold', i+1, '==> HistGradient oof MAE is ==>', hist_score_fold)
    print('Fold', i+1, '==> HistGradient oof MAE is ==>', hist_score_fold_round)

    ## ---- LightGBM with the tuned hyper-parameters ----
    lgb_md = LGBMRegressor(**lgbm_params, n_jobs = -1, verbosity = -1).fit(X_train, Y_train)

    lgb_pred_1 = lgb_md.predict(X_test)
    lgb_pred_1_round = np.round(lgb_pred_1).astype(int)
    lgb_pred_2 = lgb_md.predict(test_features)
    lgb_pred_2_round = np.round(lgb_pred_2).astype(int)

    lgb_score_fold = mean_absolute_error(Y_test, lgb_pred_1)
    lgb_score_fold_round = mean_absolute_error(Y_test, lgb_pred_1_round)
    lgb_cv_scores.append(lgb_score_fold)
    lgb_cv_scores_round.append(lgb_score_fold_round)
    lgb_preds.append(lgb_pred_2)
    lgb_preds_round.append(lgb_pred_2_round)

    print('Fold', i+1, '==> LightGBM oof MAE is ==>', lgb_score_fold)
    print('Fold', i+1, '==> LightGBM oof MAE is ==>', lgb_score_fold_round)

    ## ---- XGBoost with the tuned hyper-parameters ----
    xgb_md = XGBRegressor(**xgb_params, n_jobs = -1).fit(X_train, Y_train)

    xgb_pred_1 = xgb_md.predict(X_test)
    xgb_pred_1_round = np.round(xgb_pred_1).astype(int)
    xgb_pred_2 = xgb_md.predict(test_features)
    xgb_pred_2_round = np.round(xgb_pred_2).astype(int)

    xgb_score_fold = mean_absolute_error(Y_test, xgb_pred_1)
    xgb_score_fold_round = mean_absolute_error(Y_test, xgb_pred_1_round)
    xgb_cv_scores.append(xgb_score_fold)
    xgb_cv_scores_round.append(xgb_score_fold_round)
    xgb_preds.append(xgb_pred_2)
    xgb_preds_round.append(xgb_pred_2_round)

    print('Fold', i+1, '==> XGBoost oof MAE is ==>', xgb_score_fold)
    print('Fold', i+1, '==> XGBoost oof MAE is ==>', xgb_score_fold_round)

    ## ---- LAD ensemble on the raw base-model predictions ----
    ## NOTE(review): the LAD model is scored on the same rows it is fitted on, so the
    ## ensemble "oof" numbers are in-sample -- confirm this is intended.
    x = pd.DataFrame({'HIST': hist_pred_1, 'LGB': lgb_pred_1, 'XGB': xgb_pred_1})
    y = Y_test

    lad_md = LADRegression().fit(x, y)
    lad_pred = lad_md.predict(x)
    lad_pred_round = np.round(lad_pred).astype(int)

    x_test = pd.DataFrame({'HIST': hist_pred_2, 'LGB': lgb_pred_2, 'XGB': xgb_pred_2})
    lad_pred_test = lad_md.predict(x_test)
    lad_pred_test_round = np.round(lad_pred_test).astype(int)

    ens_score = mean_absolute_error(y, lad_pred)
    ens_score_round = mean_absolute_error(y, lad_pred_round)
    ens_cv_scores.append(ens_score)
    ens_cv_scores_round.append(ens_score_round)
    ens_preds.append(lad_pred_test)
    ens_preds_round.append(lad_pred_test_round)

    print('Fold', i+1, '==> LAD ensemble oof MAE is ==>', ens_score)
    print('Fold', i+1, '==> LAD ensemble oof MAE is ==>', ens_score_round)

    ## ---- LAD ensemble on the rounded base-model predictions ----
    x = pd.DataFrame({'HIST': hist_pred_1_round, 'LGB': lgb_pred_1_round, 'XGB': xgb_pred_1_round})
    y = Y_test

    lad_md = LADRegression().fit(x, y)
    lad_pred = lad_md.predict(x)
    lad_pred_round = np.round(lad_pred).astype(int)

    x_test = pd.DataFrame({'HIST': hist_pred_2_round, 'LGB': lgb_pred_2_round, 'XGB': xgb_pred_2_round})
    lad_pred_test = lad_md.predict(x_test)
    lad_pred_test_round = np.round(lad_pred_test).astype(int)

    ens_score = mean_absolute_error(y, lad_pred)
    ens_score_round = mean_absolute_error(y, lad_pred_round)
    ens_cv_scores2.append(ens_score)
    ens_cv_scores_round2.append(ens_score_round)
    ens_preds2.append(lad_pred_test)
    ens_preds_round2.append(lad_pred_test_round)

    print('Fold', i+1, '==> LAD Rounded ensemble oof MAE is ==>', ens_score)
    print('Fold', i+1, '==> LAD Rounded ensemble oof MAE is ==>', ens_score_round)

In [None]:
## Defining the input and target variables
X = train.drop(columns = ['Age'])
Y = train['Age']

## Defining lists to store results
## (per-fold scores and per-fold test predictions, raw and rounded to integers)
hist_cv_scores, hist_preds = list(), list()
hist_cv_scores_round, hist_preds_round = list(), list()

lgb_cv_scores, lgb_preds = list(), list()
lgb_cv_scores_round, lgb_preds_round = list(), list()

xgb_cv_scores, xgb_preds = list(), list()
xgb_cv_scores_round, xgb_preds_round = list(), list()

ens_cv_scores, ens_preds = list(), list()
ens_cv_scores_round, ens_preds_round = list(), list()

ens_cv_scores2, ens_preds2 = list(), list()
ens_cv_scores_round2, ens_preds_round2 = list(), list()


## Performing KFold cross-validation
## (the shuffle is seeded so that the folds, and therefore the scores, are reproducible)
kf = KFold(n_splits = 10, shuffle = True, random_state = 42)
    
for i, (train_ix, test_ix) in enumerate(kf.split(X, Y)):
        
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
    
    print('---------------------------------------------------------------')    
    
    hist_md = HistGradientBoostingRegressor(loss = 'absolute_error', l2_regularization = 0.01, early_stopping = False, 
                                            learning_rate = 0.01, max_iter = 1000, max_depth = 15, max_bins = 255, 
                                            min_samples_leaf = 70, max_leaf_nodes = 115).fit(X_train, Y_train)
    
    hist_pred_1 = hist_md.predict(X_test); hist_pred_1_round = np.round_(hist_pred_1).astype(int)
    hist_pred_2 = hist_md.predict(test); hist_pred_2_round = np.round_(hist_pred_2).astype(int)
    
    hist_score_fold = mean_absolute_error(Y_test, hist_pred_1); hist_score_fold_round = mean_absolute_error(Y_test, hist_pred_1_round)
    hist_cv_scores.append(hist_score_fold); hist_cv_scores_round.append(hist_score_fold_round)
    hist_preds.append(hist_pred_2); hist_preds_round.append(hist_pred_2_round)
    
    print('Fold', i+1, '==> HistGradient oof MAE is ==>', hist_score_fold)
    print('Fold', i+1, '==> HistGradient oof MAE is ==>', hist_score_fold_round)
    
    
    lgb_md = LGBMRegressor(objective = 'mae', n_estimators = 1000, max_depth = 15, learning_rate = 0.01, num_leaves = 105,
                           reg_alpha = 8, reg_lambda = 3, subsample = 0.6, colsample_bytree = 0.8, verbosity = -1).fit(X_train, Y_train)

    lgb_pred_1 = lgb_md.predict(X_test); lgb_pred_1_round = np.round_(lgb_pred_1).astype(int)
    lgb_pred_2 = lgb_md.predict(test); lgb_pred_2_round = np.round_(lgb_pred_2).astype(int)
    
    lgb_score_fold = mean_absolute_error(Y_test, lgb_pred_1); lgb_score_fold_round = mean_absolute_error(Y_test, lgb_pred_1_round)
    lgb_cv_scores.append(lgb_score_fold); lgb_cv_scores_round.append(lgb_score_fold_round)
    lgb_preds.append(lgb_pred_2); lgb_preds_round.append(lgb_pred_2_round)
    
    print('Fold', i+1, '==> LightGBM oof MAE is ==>', lgb_score_fold)
    print('Fold', i+1, '==> LightGBM oof MAE is ==>', lgb_score_fold_round)
    
    
    xgb_md = XGBRegressor(objective = 'reg:pseudohubererror', tree_method = 'hist', colsample_bytree = 0.9, gamma = 0.65, 
                          learning_rate = 0.01, max_depth = 7, min_child_weight = 20, n_estimators = 1000, subsample = 0.7).fit(X_train, Y_train)
    
    xgb_pred_1 = xgb_md.predict(X_test); xgb_pred_1_round = np.round_(xgb_pred_1).astype(int)
    xgb_pred_2 = xgb_md.predict(test); xgb_pred_2_round = np.round_(xgb_pred_2).astype(int)
    
    xgb_score_fold = mean_absolute_error(Y_test, xgb_pred_1); xgb_score_fold_round = mean_absolute_error(Y_test, xgb_pred_1_round)
    xgb_cv_scores.append(xgb_score_fold); xgb_cv_scores_round.append(xgb_score_fold_round)
    xgb_preds.append(xgb_pred_2); xgb_preds_round.append(xgb_pred_2_round)
    
    print('Fold', i+1, '==> XGBoost oof MAE is ==>', xgb_score_fold)
    print('Fold', i+1, '==> XGBoost oof MAE is ==>', xgb_score_fold_round)
    
    
    x = pd.DataFrame({'HIST': hist_pred_1, 'LGB': lgb_pred_1, 'XGB': xgb_pred_1})
    y = Y_test
    
    lad_md = LADRegression().fit(x, y)
    lad_pred = lad_md.predict(x); lad_pred_round = np.round_(lad_pred).astype(int)
    
    x_test = pd.DataFrame({'HIST': hist_pred_2, 'LGB': lgb_pred_2, 'XGB': xgb_pred_2})
    lad_pred_test = lad_md.predict(x_test); lad_pred_test_round = np.round_(lad_pred_test).astype(int)
        
    ens_score = mean_absolute_error(y, lad_pred); ens_score_round = mean_absolute_error(y, lad_pred_round)
    ens_cv_scores.append(ens_score); ens_cv_scores_round.append(ens_score_round)
    ens_preds.append(lad_pred_test); ens_preds_round.append(lad_pred_test_round)
    
    print('Fold', i+1, '==> LAD ensemble oof MAE is ==>', ens_score)
    print('Fold', i+1, '==> LAD ensemble oof MAE is ==>', ens_score_round)
    
    
    x = pd.DataFrame({'HIST': hist_pred_1_round, 'LGB': lgb_pred_1_round, 'XGB': xgb_pred_1_round})
    y = Y_test
    
    lad_md = LADRegression().fit(x, y)
    lad_pred = lad_md.predict(x); lad_pred_round = np.round_(lad_pred).astype(int)
    
    x_test = pd.DataFrame({'HIST': hist_pred_2_round, 'LGB': lgb_pred_2_round, 'XGB': xgb_pred_2_round})
    lad_pred_test = lad_md.predict(x_test); lad_pred_test_round = np.round_(lad_pred_test).astype(int)
    
    ens_score = mean_absolute_error(y, lad_pred); ens_score_round = mean_absolute_error(y, lad_pred_round)
    ens_cv_scores2.append(ens_score); ens_cv_scores_round2.append(ens_score_round)
    ens_preds2.append(lad_pred_test); ens_preds_round2.append(lad_pred_test_round)
    
    print('Fold', i+1, '==> LAD Rounded ensemble oof MAE is ==>', ens_score)
    print('Fold', i+1, '==> LAD Rounded ensemble oof MAE is ==>', ens_score_round)

In [7]:
## Print the mean of each per-fold score list, one value per line.
## NOTE(review): gb_cv_scores, cat_cv_scores and ens_cv_scores_1..4 are not created by
## the CV cell directly above; presumably an earlier cell defines them -- confirm
## before relying on Restart & Run All.
for fold_scores in (gb_cv_scores, hist_cv_scores, lgb_cv_scores, xgb_cv_scores, cat_cv_scores,
                    ens_cv_scores_1, ens_cv_scores_2, ens_cv_scores_3, ens_cv_scores_4):
    print(np.mean(fold_scores))

1.354658483940648
1.3544178267669753
1.3511264786298782
1.3623775251135168
1.3544690180700283
1.332926089796411
1.3329249614445433
1.332924958854301
1.3329258653116136


In [9]:
## Column-wise mean of each ensemble's stacked test predictions
## (presumably one row per CV fold -- the lists are filled outside this cell)
ens_preds_test_1 = pd.DataFrame(ens_preds_1).apply(np.mean, axis = 0)
ens_preds_test_2 = pd.DataFrame(ens_preds_2).apply(np.mean, axis = 0)
ens_preds_test_3 = pd.DataFrame(ens_preds_3).apply(np.mean, axis = 0)
ens_preds_test_4 = pd.DataFrame(ens_preds_4).apply(np.mean, axis = 0)

## Round each averaged prediction to an integer Age and write one file per ensemble;
## `sub` is reused, so it ends up holding the last ensemble's values
ensemble_outputs = [
    (ens_preds_test_1, 'submissions/LAD_Ensemble_1.csv'),
    (ens_preds_test_2, 'submissions/LAD_Ensemble_2.csv'),
    (ens_preds_test_3, 'submissions/LAD_Ensemble_3.csv'),
    (ens_preds_test_4, 'submissions/LAD_Ensemble_4.csv'),
]

for fold_mean, csv_path in ensemble_outputs:
    sub['Age'] = np.round(fold_mean).astype(int)
    sub.to_csv(csv_path, index = False)

In [None]:
## Average every model's test predictions across the CV folds (each list holds one
## prediction array per fold, so the column-wise mean is the per-row fold average).
## The `_round` variants average the predictions that were already rounded per fold.
hist = pd.DataFrame(hist_preds).apply(np.mean, axis = 0)
hist_round = pd.DataFrame(hist_preds_round).apply(np.mean, axis = 0)

lgb = pd.DataFrame(lgb_preds).apply(np.mean, axis = 0)
lgb_round = pd.DataFrame(lgb_preds_round).apply(np.mean, axis = 0)

xgb = pd.DataFrame(xgb_preds).apply(np.mean, axis = 0)
xgb_round = pd.DataFrame(xgb_preds_round).apply(np.mean, axis = 0)

ens = pd.DataFrame(ens_preds).apply(np.mean, axis = 0)
ens_round = pd.DataFrame(ens_preds_round).apply(np.mean, axis = 0)

ens2 = pd.DataFrame(ens_preds2).apply(np.mean, axis = 0)
ens_round2 = pd.DataFrame(ens_preds_round2).apply(np.mean, axis = 0)

## (output path, Age values), in the order the files are written.
## The mean of per-fold integers is generally fractional, so the `_round` files are
## rounded once more and cast to int -- the same `np.round(...).astype(int)` form used
## for the LAD_Ensemble_* submissions -- instead of being written as floats.
submission_files = [
    ('submissions/Hist_sub.csv', hist),
    ('submissions/Hist_sub_round.csv', np.round(hist_round).astype(int)),
    ('submissions/LGBM_sub.csv', lgb),
    ('submissions/LGBM_sub_round.csv', np.round(lgb_round).astype(int)),
    ('submissions/XGB_sub.csv', xgb),
    ('submissions/XGB_sub_round.csv', np.round(xgb_round).astype(int)),
    ('submissions/Ens_sub.csv', ens),
    ('submissions/Ens_sub_round.csv', np.round(ens_round).astype(int)),
    ('submissions/Ens_sub2.csv', ens2),
    ('submissions/Ens_sub_round2.csv', np.round(ens_round2).astype(int)),
]

## `sub` is reused for every file, so it ends up holding the last entry's values
for csv_path, age_values in submission_files:
    sub['Age'] = age_values
    sub.to_csv(csv_path, index = False)