In [1]:
# %pip installs into the environment of the running kernel; versions are pinned to
# the ones recorded in this cell's output so a fresh run resolves the same packages.
%pip install xgboost==1.7.4 lightgbm==3.3.5 catboost==1.1.1 optuna==3.1.0

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Using cached xgboost-1.7.4-py3-none-manylinux2014_x86_64.whl (193.6 MB)
Collecting lightgbm
  Using cached lightgbm-3.3.5-py3-none-manylinux1_x86_64.whl (2.0 MB)
Collecting catboost
  Using cached catboost-1.1.1-cp310-none-manylinux1_x86_64.whl (76.6 MB)
Collecting optuna
  Using cached optuna-3.1.0-py3-none-any.whl (365 kB)
Collecting graphviz
  Using cached graphviz-0.20.1-py3-none-any.whl (47 kB)
Collecting colorlog
  Using cached colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Using cached alembic-1.10.2-py3-none-any.whl (212 kB)
Collecting cmaes>=0.9.1
  Using cached cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting Mako
  Using cached Mako-1.2.4-py3-none-any.whl (78 kB)
Installing collected packages: Mako, graphviz, colorlog, cmaes, xgboost, alembic, optuna, lightgbm, catboost
Successfully installed Mako-1.2.4 alembic-1.10.2 catboost-1.1.1 cmaes-0.9.1 col

In [17]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from tqdm import tqdm

from functools import partial
import scipy as sp

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor 
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

import optuna 

## Connecting to the S3 bucket that stores the competition files
bucket_name = 'analytics-data-science-competitions'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Object keys: train, test, sample submission and ConcreteStrengthData.csv
file_key_1 = 'Tabular-Playground-Series/PS-S3/Ep9/train.csv'
file_key_2 = 'Tabular-Playground-Series/PS-S3/Ep9/test.csv'
file_key_3 = 'Tabular-Playground-Series/PS-S3/Ep9/sample_submission.csv'
file_key_4 = 'Tabular-Playground-Series/PS-S3/Ep9/ConcreteStrengthData.csv'

## One object handle per key, in the same order as the keys
bucket_object_1, bucket_object_2, bucket_object_3, bucket_object_4 = (
    bucket.Object(key) for key in (file_key_1, file_key_2, file_key_3, file_key_4)
)

## Issuing the GET request for each object
file_object_1, file_object_2, file_object_3, file_object_4 = (
    handle.get() for handle in (bucket_object_1, bucket_object_2, bucket_object_3, bucket_object_4)
)

## Pulling the streaming body out of each response
file_content_stream_1, file_content_stream_2, file_content_stream_3, file_content_stream_4 = (
    response.get('Body') for response in (file_object_1, file_object_2, file_object_3, file_object_4)
)

## Reading data files
## (file_content_stream_4 is fetched above but is not read in this cell)
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

# Best Models

In [18]:
def _add_water_cement_ratio(frame):
    """Return a copy of `frame` with a `WaterComponent_to_Cement_ratio` column appended.

    The 1e-6 added to the denominator keeps the ratio finite when CementComponent is 0.
    """
    ratio = frame['WaterComponent'] / (frame['CementComponent'] + 1e-6)
    return frame.assign(WaterComponent_to_Cement_ratio = ratio)


## Target, training features and test features (same engineered column on both)
Y = train['Strength']
X = _add_water_cement_ratio(train.drop(columns = ['id', 'Strength']))
test_baseline = _add_water_cement_ratio(test.drop(columns = ['id']))

## Each model below is fit on the full training set and then used to predict
## both the training rows and the test rows.

## HistGradientBoosting
hist_params = {'l2_regularization': 0.01,
               'early_stopping': False,
               'learning_rate': 0.01,
               'max_iter': 1000,
               'max_depth': 2,
               'max_bins': 255,
               'min_samples_leaf': 10,
               'max_leaf_nodes': 10}
hist_md = HistGradientBoostingRegressor(**hist_params).fit(X, Y)
hist_pred_train, hist_pred_test = hist_md.predict(X), hist_md.predict(test_baseline)

## XGBoost
xgb_params = {'tree_method': 'hist',
              'colsample_bytree': 0.7,
              'gamma': 0.8,
              'learning_rate': 0.01,
              'max_depth': 2,
              'min_child_weight': 10,
              'n_estimators': 1000,
              'subsample': 0.7}
XGB_md = XGBRegressor(**xgb_params).fit(X, Y)
xgb_pred_train, xgb_pred_test = XGB_md.predict(X), XGB_md.predict(test_baseline)

## CatBoost
cat_params = {'loss_function': 'RMSE',
              'iterations': 1000,
              'learning_rate': 0.01,
              'depth': 3,
              'random_strength': 0.5,
              'bagging_temperature': 0.7,
              'border_count': 30,
              'l2_leaf_reg': 5,
              'verbose': False}
cat_md = CatBoostRegressor(**cat_params).fit(X, Y)
cat_pred_train, cat_pred_test = cat_md.predict(X), cat_md.predict(test_baseline)

In [19]:
## Side-by-side table: each base model's prediction on the training rows plus the target
train_preds = pd.DataFrame(
    dict(Hist = hist_pred_train,
         XGBoost = xgb_pred_train,
         CatBoost = cat_pred_train,
         target = Y)
)
train_preds.head()

Unnamed: 0,Hist,XGBoost,CatBoost,target
0,20.707976,20.263592,20.080438,10.38
1,34.615818,34.988472,35.085194,23.52
2,38.092231,37.325882,37.591441,36.96
3,43.247601,43.899803,44.12041,39.05
4,44.365252,44.196915,44.762457,74.19


# Optimization

In [20]:
class OptimizedEnsemble(object):
    """Linear blend of base-model predictions with weights that minimise RMSE.

    Each column of the input matrix holds the predictions of one base model.
    `fit` searches, with Nelder-Mead starting from equal weights, for the
    weight vector whose weighted sum of columns has the lowest root mean
    squared error against the target. No bounds or constraints are passed to
    the optimiser, so the weights may be negative and need not sum to one.
    Any number of base models (columns) is supported.
    """

    def __init__(self):
        # scipy OptimizeResult produced by `fit`; None until the blend is fitted.
        self.coef_ = None

    @staticmethod
    def _blend(X, coef):
        """Return the weighted sum of the columns of `X` using weights `coef`."""
        return np.asarray(X, dtype = float) @ np.asarray(coef, dtype = float)

    def _rmse_loss(self, coef, X, y):
        """Root mean squared error of the blend defined by `coef` against `y`."""
        # ravel() so a column-vector target cannot broadcast against the blend.
        residual = np.asarray(y, dtype = float).ravel() - self._blend(X, coef)
        return float(np.sqrt(np.mean(residual ** 2)))

    def fit(self, X, y):
        """Find the blend weights that minimise RMSE on (`X`, `y`).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_models)
            Base-model predictions, one column per model.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If `X` is not a non-empty 2-D matrix or its row count differs from `y`.
        """
        X = np.asarray(X, dtype = float)
        y = np.asarray(y, dtype = float).ravel()

        if X.ndim != 2 or X.shape[0] == 0 or X.shape[1] == 0:
            raise ValueError('X must be a non-empty 2-D array of shape (n_samples, n_models)')
        if X.shape[0] != y.shape[0]:
            raise ValueError('X and y must have the same number of rows')

        # Start from the plain average of the base models.
        n_models = X.shape[1]
        initial_coef = np.full(n_models, 1.0 / n_models)

        self.coef_ = sp.optimize.minimize(self._rmse_loss, initial_coef,
                                          args = (X, y), method = 'nelder-mead')
        return self

    def predict(self, X, coef = None):
        """Blend the columns of `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_models)
            Base-model predictions, one column per model.
        coef : array-like of shape (n_models,), optional
            Weights to apply. When omitted, the fitted weights are used.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
        """
        if coef is None:
            coef = self.coefficients()
        return self._blend(X, coef)

    def coefficients(self):
        """Return the fitted weight vector.

        Raises
        ------
        RuntimeError
            If `fit` has not been called yet.
        """
        if self.coef_ is None:
            raise RuntimeError('OptimizedEnsemble is not fitted yet; call fit(X, y) first')
        return self.coef_['x']

In [21]:
## One row per sample, one column per base model (Hist, XGBoost, CatBoost)
models_pred_train = np.array([hist_pred_train, xgb_pred_train, cat_pred_train]).T
models_pred_test = np.array([hist_pred_test, xgb_pred_test, cat_pred_test]).T

## Learning the blend weights from the predictions on the training rows
opt_ens = OptimizedEnsemble()
opt_ens.fit(models_pred_train, Y)
coef = opt_ens.coefficients()

## Applying the learned weights to the test predictions
ens_pred = opt_ens.predict(models_pred_test, coef)
submission['Strength'] = ens_pred

submission.head(10)

Unnamed: 0,id,Strength
0,5407,48.114968
1,5408,19.513224
2,5409,33.788238
3,5410,46.632225
4,5411,32.503814
5,5412,39.441962
6,5413,33.603838
7,5414,22.729529
8,5415,45.914104
9,5416,40.118052


In [22]:
## Writing the blended predictions to disk without the DataFrame index
submission_file = 'catboost_xgb_hist_opt_full_submission.csv'
submission.to_csv(submission_file, index = False)

# Optuna

In [10]:
## Second-level inputs: the base-model predictions become the features.
## NOTE: this rebinds the names X and Y that earlier cells used for the raw features.
Y = train_preds['target']
X = train_preds.drop(columns = ['target'])

class Objective:
    """Optuna objective: mean 5-fold cross-validated RMSE of a random forest.

    The forest is trained on the module-level `X` (features) and `Y` (target),
    which must be defined before the study is run. Each trial samples a set of
    forest hyper-parameters, and the value returned to Optuna is the average
    validation RMSE over the folds (lower is better).
    """

    def __init__(self, seed):
        # Hold this implementation specific arguments as the fields of the class.
        # The seed drives both the fold shuffling and the forest itself.
        self.seed = seed

    def __call__(self, trial):
        """Evaluate one trial and return its mean validation RMSE."""

        ## Parameters to be evaluated
        param = dict(n_estimators = trial.suggest_int('n_estimators', 100, 5000),
                     max_depth = trial.suggest_int('max_depth', 3, 10),
                     min_samples_split = trial.suggest_int('min_samples_split', 3, 50),
                     min_samples_leaf = trial.suggest_int('min_samples_leaf', 3, 50)
                    )

        scores = []

        ## Same shuffled folds for every trial, so trials are compared on equal splits
        kf = KFold(n_splits = 5, shuffle = True, random_state = self.seed)

        for train_idx, valid_idx in kf.split(X, Y):

            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train, Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]

            ## n_jobs = -1 uses all cores; random_state makes the forest repeatable
            model = RandomForestRegressor(**param,
                                          n_jobs = -1,
                                          random_state = self.seed).fit(X_train, Y_train)

            preds_valid = model.predict(X_valid)

            ## RMSE taken as sqrt(MSE) rather than through the deprecated `squared = False`
            score = np.sqrt(mean_squared_error(Y_valid, preds_valid))
            scores.append(score)

        return np.mean(scores)
    
## Defining SEED and Trials
SEED = 42
N_TRIALS = 50

# Execute an optimization
## The TPE sampler is seeded with SEED so that the sequence of suggested
## hyper-parameters is the same on every run of this cell.
study = optuna.create_study(direction = 'minimize',
                            sampler = optuna.samplers.TPESampler(seed = SEED))
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-09 18:42:47,707][0m A new study created in memory with name: no-name-ecfd4fb9-8d36-424b-b89d-f3f028ce3b76[0m
[32m[I 2023-03-09 18:43:25,036][0m Trial 0 finished with value: 11.922499513927482 and parameters: {'n_estimators': 3677, 'max_depth': 8, 'min_samples_split': 24, 'min_samples_leaf': 46}. Best is trial 0 with value: 11.922499513927482.[0m
[32m[I 2023-03-09 18:44:06,167][0m Trial 1 finished with value: 11.868901156367722 and parameters: {'n_estimators': 4430, 'max_depth': 4, 'min_samples_split': 15, 'min_samples_leaf': 46}. Best is trial 1 with value: 11.868901156367722.[0m
[33m[W 2023-03-09 18:44:40,706][0m Trial 2 failed with parameters: {'n_estimators': 4853, 'max_depth': 3, 'min_samples_split': 35, 'min_samples_leaf': 32} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/tensorflow2_p310/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    

KeyboardInterrupt: 