In [1]:
pip install xgboost lightgbm catboost optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.4-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting catboost
  Downloading catboost-1.1.1-cp310-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
Collecting graphviz
  Down

In [2]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor 
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/PS-S3/Ep9/train.csv'
file_key_2 = 'Tabular-Playground-Series/PS-S3/Ep9/test.csv'
file_key_3 = 'Tabular-Playground-Series/PS-S3/Ep9/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

# Baseline Modeling

In [7]:
X = train.drop(columns = ['id', 'Strength'], axis = 1)
Y = train['Strength']

test_baseline = test.drop(columns = ['id'], axis = 1)

XGB_cv_scores, XGB_imp = list(), list()
XGB_preds = list()

lgb_cv_scores, lgb_imp = list(), list()
lgb_preds = list()

cat_cv_scores, cat_imp = list(), list()
cat_preds = list()

for i in range(5):

    skf = KFold(n_splits = 5, random_state = 42, shuffle = True)
    
    for train_ix, test_ix in skf.split(X, Y):
        
        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
                
        #############    
        ## XGBoost ##
        #############
        
        XGB_md = XGBRegressor(tree_method = 'hist',
                              colsample_bytree = 0.7, 
                              gamma = 0.8, 
                              learning_rate = 0.01, 
                              max_depth = 7, 
                              min_child_weight = 10, 
                              n_estimators = 1000, 
                              subsample = 0.7).fit(X_train, Y_train)
        XGB_imp.append(XGB_md.feature_importances_)
        
        ## Predicting on X_test and test
        XGB_pred_1 = XGB_md.predict(X_test)
        XGB_pred_2 = XGB_md.predict(test_baseline)
        
        ## Computing rmse
        XGB_cv_scores.append(mean_squared_error(Y_test, XGB_pred_1, squared = False))
        XGB_preds.append(XGB_pred_2)
        
        ##############
        ## LightGBM ##
        ##############
        
        lgb_md = LGBMRegressor(n_estimators = 1000,
                               max_depth = 7,
                               learning_rate = 0.01,
                               num_leaves = 20,
                               lambda_l1 = 3,
                               lambda_l2 = 3,
                               bagging_fraction = 0.7,
                               feature_fraction = 0.7).fit(X_train, Y_train)
        lgb_imp.append(lgb_md.feature_importances_)
        
        ## Predicting on X_test and test
        lgb_pred_1 = lgb_md.predict(X_test)
        lgb_pred_2 = lgb_md.predict(test_baseline)
        
        ## Computing rmse
        lgb_cv_scores.append(mean_squared_error(Y_test, lgb_pred_1, squared = False))
        lgb_preds.append(lgb_pred_2)
        
        ##############
        ## CatBoost ##
        ##############
        
        cat_md = CatBoostRegressor(loss_function = 'RMSE',
                                   iterations = 1000,
                                   learning_rate = 0.01,
                                   depth = 7,
                                   random_strength = 0.5,
                                   bagging_temperature = 0.7,
                                   border_count = 30,
                                   l2_leaf_reg = 5,
                                   verbose = False).fit(X_train, Y_train)
        cat_imp.append(cat_md.feature_importances_)
        
        ## Predicting on X_test and test
        cat_pred_1 = cat_md.predict(X_test)
        cat_pred_2 = cat_md.predict(test_baseline)
        
        ## Computing rmse
        cat_cv_scores.append(mean_squared_error(Y_test, cat_pred_1, squared = False))
        cat_preds.append(cat_pred_2)



In [9]:
XGB_cv_score = np.mean(XGB_cv_scores)    
print('The average oof rmse score over 5-folds (run 5 times) of the XGBoost model is:', XGB_cv_score)

lgb_cv_score = np.mean(lgb_cv_scores)    
print('The average oof rmse score over 5-folds (run 5 times) of the LightGBM model is:', lgb_cv_score)

cat_cv_score = np.mean(cat_cv_scores)    
print('The average oof rmse score over 5-folds (run 5 times) of the CatBoost model is:', cat_cv_score)

The average oof rmse score over 5-folds (run 5 times) of the XGBoost model is: 12.329474679521704
The average oof rmse score over 5-folds (run 5 times) of the LightGBM model is: 12.200119849572816
The average oof rmse score over 5-folds (run 5 times) of the CatBoost model is: 12.109350678391197


In [12]:
cat_preds_test = pd.DataFrame(cat_preds).apply(np.mean, axis = 0)

submission['Strength'] = cat_preds_test
submission.head()

Unnamed: 0,id,Strength
0,5407,49.818871
1,5408,19.212196
2,5409,33.756955
3,5410,47.744129
4,5411,30.605016


In [13]:
submission.to_csv('catboost_submission.csv', index = False)