In [1]:
pip install xgboost lightgbm catboost optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.4-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting catboost
  Downloading catboost-1.1.1-cp310-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
Collecting graphviz
  Down

In [36]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from tqdm import tqdm

from functools import partial
import scipy as sp

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor 
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

import optuna 

def read_s3_csv(s3_bucket, key):
    """Read one CSV object from an S3 bucket into a DataFrame.

    Parameters
    ----------
    s3_bucket : boto3 ``Bucket`` resource that holds the object.
    key : str
        Object key of the CSV file inside the bucket.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the object's streaming body.
    """
    return pd.read_csv(s3_bucket.Object(key).get().get('Body'))


s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/PS-S3/Ep9/train.csv'
file_key_2 = 'Tabular-Playground-Series/PS-S3/Ep9/test.csv'
file_key_3 = 'Tabular-Playground-Series/PS-S3/Ep9/sample_submission.csv'
file_key_4 = 'Tabular-Playground-Series/PS-S3/Ep9/ConcreteStrengthData.csv'

## Reading data files
# `is_generated` flags where a row came from: 1 for rows read from the
# train/test files, 0 for rows read from ConcreteStrengthData.csv.
train = read_s3_csv(bucket, file_key_1).drop(columns = 'id')
train['is_generated'] = 1

test = read_s3_csv(bucket, file_key_2)
test['is_generated'] = 1

original = read_s3_csv(bucket, file_key_4)
original['is_generated'] = 0
# Positional rename onto the train schema.
# NOTE(review): assumes the original file lists its columns in the same order
# as train (minus `id`) -- confirm; a length mismatch raises ValueError.
original.columns = train.columns.tolist()

submission = read_s3_csv(bucket, file_key_3)

# ignore_index gives the stacked frame a clean 0..n-1 index instead of
# repeating row labels from both inputs.
train = pd.concat([train, original], axis = 0, ignore_index = True)

# Models

In [62]:
# scikit-learn GradientBoostingRegressor, fit on every row of `train`.
# Target and feature matrix; the water/cement ratio is an engineered feature
# (the 1e-6 in the denominator presumably guards against division by zero).
Y = train['Strength']
X = (
    train
    .drop(columns = ['Strength'])
    .assign(WaterComponent_to_Cement_ratio = lambda df: df['WaterComponent'] / (df['CementComponent'] + 1e-6))
)

# Same feature layout for the test rows (without the `id` column).
test_baseline = (
    test
    .drop(columns = ['id'])
    .assign(WaterComponent_to_Cement_ratio = lambda df: df['WaterComponent'] / (df['CementComponent'] + 1e-6))
)

GBR_md = GradientBoostingRegressor(
    n_estimators = 500,
    learning_rate = 0.01,
    max_depth = 2,
    min_samples_split = 2,
    min_samples_leaf = 2,
)
GBR_md.fit(X, Y)

# Predict the test rows and preview them in the submission frame.
GBR_pred = GBR_md.predict(test_baseline)
submission['Strength'] = GBR_pred
submission.head(10)

Unnamed: 0,id,Strength
0,5407,46.655199
1,5408,20.734286
2,5409,33.784127
3,5410,44.606526
4,5411,30.298132
5,5412,39.234069
6,5413,32.900146
7,5414,23.303146
8,5415,45.657342
9,5416,40.385776


In [63]:
# scikit-learn HistGradientBoostingRegressor, fit on every row of `train`.
# Target and feature matrix, including the engineered water/cement ratio
# (the 1e-6 in the denominator presumably guards against division by zero).
Y = train['Strength']
X = (
    train
    .drop(columns = ['Strength'])
    .assign(WaterComponent_to_Cement_ratio = lambda df: df['WaterComponent'] / (df['CementComponent'] + 1e-6))
)

# Same feature layout for the test rows (without the `id` column).
test_baseline = (
    test
    .drop(columns = ['id'])
    .assign(WaterComponent_to_Cement_ratio = lambda df: df['WaterComponent'] / (df['CementComponent'] + 1e-6))
)

hist_md = HistGradientBoostingRegressor(
    l2_regularization = 0.01,
    early_stopping = False,   # always run all `max_iter` boosting iterations
    learning_rate = 0.01,
    max_iter = 500,
    max_depth = 2,
    max_bins = 255,
    min_samples_leaf = 10,
    max_leaf_nodes = 10,
)
hist_md.fit(X, Y)

# Predict the test rows and preview them in the submission frame.
hist_pred = hist_md.predict(test_baseline)
submission['Strength'] = hist_pred
submission.head(10)

Unnamed: 0,id,Strength
0,5407,46.583556
1,5408,20.764205
2,5409,33.774408
3,5410,44.636735
4,5411,30.174076
5,5412,39.553087
6,5413,33.05379
7,5414,23.860628
8,5415,45.516656
9,5416,40.339184


In [64]:
# LightGBM regressor, fit on every row of `train`.
# Unlike the other model cells, no WaterComponent_to_Cement_ratio feature is
# added here: the model sees the raw columns plus `is_generated` only.
Y = train['Strength']
X = train.drop(columns = ['Strength'])

test_baseline = test.drop(columns = ['id'])

# NOTE(review): per the LightGBM parameter docs, `bagging_fraction` only takes
# effect when `bagging_freq` is non-zero; `bagging_freq` is not set here, so
# row subsampling is presumably inactive -- confirm this is intended.
lgb_md = LGBMRegressor(
    n_estimators = 450,
    max_depth = 3,
    learning_rate = 0.01,
    num_leaves = 20,
    lambda_l1 = 3,
    lambda_l2 = 3,
    bagging_fraction = 0.5,
    feature_fraction = 0.5,
)
lgb_md.fit(X, Y)

# Predict the test rows and preview them in the submission frame.
lgb_pred = lgb_md.predict(test_baseline)
submission['Strength'] = lgb_pred
submission.head(10)



Unnamed: 0,id,Strength
0,5407,46.774149
1,5408,20.716929
2,5409,33.913433
3,5410,45.382779
4,5411,30.063766
5,5412,39.990596
6,5413,33.190147
7,5414,23.107188
8,5415,43.854499
9,5416,39.119804


In [65]:
# XGBoost regressor (histogram tree method), fit on every row of `train`.
# Target and feature matrix, including the engineered water/cement ratio
# (the 1e-6 in the denominator presumably guards against division by zero).
Y = train['Strength']
X = (
    train
    .drop(columns = ['Strength'])
    .assign(WaterComponent_to_Cement_ratio = lambda df: df['WaterComponent'] / (df['CementComponent'] + 1e-6))
)

# Same feature layout for the test rows (without the `id` column).
test_baseline = (
    test
    .drop(columns = ['id'])
    .assign(WaterComponent_to_Cement_ratio = lambda df: df['WaterComponent'] / (df['CementComponent'] + 1e-6))
)

XGB_md = XGBRegressor(
    tree_method = 'hist',
    colsample_bytree = 0.7,
    gamma = 0.8,
    learning_rate = 0.01,
    max_depth = 2,
    min_child_weight = 10,
    n_estimators = 1000,
    subsample = 0.7,
)
XGB_md.fit(X, Y)

# Predict the test rows and preview them in the submission frame.
xgb_pred = XGB_md.predict(test_baseline)
submission['Strength'] = xgb_pred
submission.head(10)

Unnamed: 0,id,Strength
0,5407,47.19146
1,5408,20.102165
2,5409,32.507866
3,5410,45.355045
4,5411,27.872448
5,5412,40.168808
6,5413,31.98064
7,5414,23.539715
8,5415,46.883781
9,5416,38.75211


In [66]:
# CatBoost regressor with RMSE loss, fit on every row of `train`.
# Target and feature matrix, including the engineered water/cement ratio
# (the 1e-6 in the denominator presumably guards against division by zero).
Y = train['Strength']
X = (
    train
    .drop(columns = ['Strength'])
    .assign(WaterComponent_to_Cement_ratio = lambda df: df['WaterComponent'] / (df['CementComponent'] + 1e-6))
)

# Same feature layout for the test rows (without the `id` column).
test_baseline = (
    test
    .drop(columns = ['id'])
    .assign(WaterComponent_to_Cement_ratio = lambda df: df['WaterComponent'] / (df['CementComponent'] + 1e-6))
)

cat_md = CatBoostRegressor(
    loss_function = 'RMSE',
    iterations = 1000,
    learning_rate = 0.01,
    depth = 3,
    random_strength = 0.5,
    bagging_temperature = 0.7,
    border_count = 30,
    l2_leaf_reg = 5,
    verbose = False,   # silence the per-iteration training log
)
cat_md.fit(X, Y)

# Predict the test rows and preview them in the submission frame.
cat_pred = cat_md.predict(test_baseline)
submission['Strength'] = cat_pred
submission.head(10)

Unnamed: 0,id,Strength
0,5407,46.878972
1,5408,19.732393
2,5409,32.571737
3,5410,46.562442
4,5411,31.740334
5,5412,39.683684
6,5413,32.204177
7,5414,22.992113
8,5415,46.355513
9,5416,39.635956


In [67]:
# Final prediction: unweighted mean of the five models' test predictions.
model_preds = [cat_pred, xgb_pred, lgb_pred, hist_pred, GBR_pred]

# Accumulate left to right so the floating-point addition order is explicit.
blend_total = model_preds[0]
for pred in model_preds[1:]:
    blend_total = blend_total + pred

submission['Strength'] = blend_total / len(model_preds)
submission.head()

Unnamed: 0,id,Strength
0,5407,46.816667
1,5408,20.409996
2,5409,33.310314
3,5410,45.308705
4,5411,30.029751


In [68]:
# Write the blended predictions to disk; the row index is not part of the file.
submission_path = 'catboost_xgb_hist_GBR_lgb_full_original_submission_1.csv'
submission.to_csv(submission_path, index = False)