In [None]:
pip install xgboost lightgbm catboost

In [50]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [51]:
## Reading the three input files from the working directory:
## labelled training rows, unlabelled test rows and the submission template
train, test, sub = (pd.read_csv(csv_name)
                    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv'))

In [52]:
## Previewing the first five training rows
train.head(n = 5)

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


In [53]:
## Summary statistics of the numeric training columns (quartiles spelled out explicitly)
train.describe(percentiles = [0.25, 0.5, 0.75])

Unnamed: 0,id,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
count,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0,74051.0
mean,37025.0,1.31746,1.024496,0.348089,23.385217,10.10427,5.058386,6.72387,9.967806
std,21376.826729,0.287757,0.237396,0.092034,12.648153,5.618025,2.792729,3.584372,3.175189
min,0.0,0.1875,0.1375,0.0,0.056699,0.028349,0.042524,0.042524,1.0
25%,18512.5,1.15,0.8875,0.3,13.437663,5.712424,2.8633,3.96893,8.0
50%,37025.0,1.375,1.075,0.3625,23.799405,9.90815,4.989512,6.931453,10.0
75%,55537.5,1.5375,1.2,0.4125,32.162508,14.033003,6.988152,9.07184,11.0
max,74050.0,2.012815,1.6125,2.825,80.101512,42.184056,21.54562,28.491248,29.0


In [54]:
## (rows, columns) of the training frame, then of the test frame
for frame in (train, test):
    print(frame.shape)

(74051, 10)
(49368, 9)


In [55]:
## Number of missing values per column, training frame first and test frame second
train_na_counts = train.isna().sum()
test_na_counts = test.isna().sum()

print(train_na_counts)
print('\n', test_na_counts)

id                0
Sex               0
Length            0
Diameter          0
Height            0
Weight            0
Shucked Weight    0
Viscera Weight    0
Shell Weight      0
Age               0
dtype: int64

 id                0
Sex               0
Length            0
Diameter          0
Height            0
Weight            0
Shucked Weight    0
Viscera Weight    0
Shell Weight      0
dtype: int64


In [56]:
## Dropping 'id' variable
## (re-assigning with errors = 'ignore' instead of inplace = True keeps this cell re-runnable:
##  a second execution no longer raises a KeyError because 'id' has already been removed)
train = train.drop(columns = ['id'], errors = 'ignore')
test = test.drop(columns = ['id'], errors = 'ignore')

## Label encoding 'sex' variable (the encoder is fit on train only and re-used for test)
le = LabelEncoder()
train['Sex'] = le.fit_transform(train['Sex'])
test['Sex'] = le.transform(test['Sex'])

## Defining the input and target variables
X = train.drop(columns = ['Age'])
Y = train['Age']

## Defining lists to store results: one MAE per fold and one test-prediction array per fold
hist_cv_scores, hist_preds = list(), list()
lgb_cv_scores, lgb_preds = list(), list()
xgb_cv_scores, xgb_preds = list(), list()
ens_cv_scores, ens_preds = list(), list()

## Mapping each model's display name (used in the log lines) to the two lists it fills.
## The insertion order (HistGradient, LightGBM, XGBoost) is also the order of fitting and blending.
model_outputs = {'HistGradient': (hist_cv_scores, hist_preds),
                 'LightGBM': (lgb_cv_scores, lgb_preds),
                 'XGBoost': (xgb_cv_scores, xgb_preds)}


def build_models():
    """Return fresh, unfitted base regressors keyed by their display name.

    A new set is created for every fold so that no fitted state is shared between folds.
    """
    return {
        'HistGradient': HistGradientBoostingRegressor(loss = 'absolute_error', l2_regularization = 0.01,
                                                      early_stopping = False, learning_rate = 0.01,
                                                      max_iter = 1000, max_depth = 15, max_bins = 255,
                                                      min_samples_leaf = 30, max_leaf_nodes = 30),
        ## NOTE(review): LightGBM appears to apply `subsample` only when `subsample_freq` > 0
        ## (it defaults to 0), so `subsample = 0.7` is likely a no-op here -- confirm before relying on it.
        'LightGBM': LGBMRegressor(objective = 'mae', n_estimators = 1000, max_depth = 10, learning_rate = 0.01,
                                  num_leaves = 70, reg_alpha = 3, reg_lambda = 3, subsample = 0.7,
                                  colsample_bytree = 0.7),
        'XGBoost': XGBRegressor(objective = 'reg:pseudohubererror', colsample_bytree = 0.7, gamma = 0.8,
                                learning_rate = 0.01, max_depth = 8, min_child_weight = 20,
                                n_estimators = 1000, subsample = 0.7),
    }


## Performing KFold cross-validation (plain shuffled KFold, despite the `skf` name)
skf = KFold(n_splits = 10, random_state = 42, shuffle = True)

for i, (train_ix, valid_ix) in enumerate(skf.split(X, Y)):

    ## The held-out fold is called "valid" to keep it apart from the real `test` frame
    X_train, X_valid = X.iloc[train_ix], X.iloc[valid_ix]
    Y_train, Y_valid = Y.iloc[train_ix], Y.iloc[valid_ix]

    print('---------------------------------------------------------------')

    #################
    ## Base models ##
    #################

    fold_scores, fold_valid_preds, fold_test_preds = dict(), dict(), dict()

    for model_name, model in build_models().items():

        model.fit(X_train, Y_train)

        fold_valid_preds[model_name] = model.predict(X_valid)
        fold_test_preds[model_name] = model.predict(test)
        fold_scores[model_name] = mean_absolute_error(Y_valid, fold_valid_preds[model_name])

        score_list, pred_list = model_outputs[model_name]
        score_list.append(fold_scores[model_name])
        pred_list.append(fold_test_preds[model_name])

        print('Fold', i, '==>', model_name, 'oof MAE is ==>', fold_scores[model_name])

    ##############
    ## Ensemble ##
    ##############

    ## Weights are proportional to 1 / MAE and normalised to sum to one.
    ## NOTE(review): the weights are derived from the same held-out fold the ensemble is scored on
    ## just below, so the ensemble fold MAE is slightly optimistic compared with the base models'.
    inverse_mae = {model_name: 1 / score for model_name, score in fold_scores.items()}
    wtot = sum(inverse_mae.values())
    weights = {model_name: inv / wtot for model_name, inv in inverse_mae.items()}

    ens_pred = sum(weights[model_name] * fold_valid_preds[model_name] for model_name in weights)
    ens_pred_test = sum(weights[model_name] * fold_test_preds[model_name] for model_name in weights)

    ens_score = mean_absolute_error(Y_valid, ens_pred)
    ens_cv_scores.append(ens_score)
    ens_preds.append(ens_pred_test)

    print('Fold', i, '==> Ensemble oof MAE is ==>', ens_score)

---------------------------------------------------------------
Fold 0 ==> HistGradient oof MAE is ==> 1.382240903513237
Fold 0 ==> LightGBM oof MAE is ==> 1.363506052814721
Fold 0 ==> XGBoost oof MAE is ==> 1.3700580186789788
Fold 0 ==> Ensemble oof MAE is ==> 1.367251328466619
---------------------------------------------------------------
Fold 1 ==> HistGradient oof MAE is ==> 1.3713189707695528
Fold 1 ==> LightGBM oof MAE is ==> 1.350385360293788
Fold 1 ==> XGBoost oof MAE is ==> 1.3530716908935916
Fold 1 ==> Ensemble oof MAE is ==> 1.3535922669506184
---------------------------------------------------------------
Fold 2 ==> HistGradient oof MAE is ==> 1.374850122363534
Fold 2 ==> LightGBM oof MAE is ==> 1.3619814735813953
Fold 2 ==> XGBoost oof MAE is ==> 1.3655776447898869
Fold 2 ==> Ensemble oof MAE is ==> 1.3629781437159887
---------------------------------------------------------------
Fold 3 ==> HistGradient oof MAE is ==> 1.3694161191999614
Fold 3 ==> LightGBM oof MAE is ==>

In [57]:
## Mean MAE over the folds, printed in the order: HistGradient, LightGBM, XGBoost, ensemble
for fold_maes in (hist_cv_scores, lgb_cv_scores, xgb_cv_scores, ens_cv_scores):
    print(np.mean(fold_maes))

1.372569681108215
1.3563241279761984
1.3640010884616987
1.3594161705103267


In [59]:
## Averaging each model's test predictions over the folds.
## Every list holds one prediction array per fold, so the frame is (folds x test rows) and the
## column-wise mean gives one value per test row. The vectorised .mean() replaces
## .apply(np.mean, axis = 0), which made one Python-level call per test row.
hist_preds_test = pd.DataFrame(hist_preds).mean(axis = 0)
lgb_preds_test = pd.DataFrame(lgb_preds).mean(axis = 0)
xgb_preds_test = pd.DataFrame(xgb_preds).mean(axis = 0)
ens_preds_test = pd.DataFrame(ens_preds).mean(axis = 0)

## Making sure the output folder exists; to_csv does not create missing directories
os.makedirs('submissions', exist_ok = True)

## Writing one submission file per model by overwriting the 'Age' column of the template.
## The ensemble is written last, so `sub` ends up holding the ensemble predictions.
submission_files = {'submissions/Hist_Baseline_submission.csv': hist_preds_test,
                    'submissions/LightGBM_Baseline_submission.csv': lgb_preds_test,
                    'submissions/XGBoost_Baseline_submission.csv': xgb_preds_test,
                    'submissions/Ensemble_Baseline_submission.csv': ens_preds_test}

for submission_path, averaged_preds in submission_files.items():
    sub['Age'] = averaged_preds
    sub.to_csv(submission_path, index = False)