In [8]:
pip install xgboost optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.3-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.3
Note: you may need to restart the kernel to use updated packages.


In [9]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.cluster import KMeans
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score, cohen_kappa_score, davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVR
# from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor
# from catboost import CatBoostClassifier

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/TS-S3-Ep5/train.csv'
file_key_2 = 'Tabular-Playground-Series/TS-S3-Ep5/test.csv'
file_key_3 = 'Tabular-Playground-Series/TS-S3-Ep5/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

## Enginering features
train['alcohol_density'] = train['alcohol'] * train['density']
train['sulphate/density'] = train['sulphates']  / train['density']
train['alcohol_sulphate'] = train['alcohol'] * train['sulphates']

test['alcohol_density'] = test['alcohol']  * test['density']
test['sulphate/density'] = test['sulphates']  / test['density']
test['alcohol_sulphate'] = test['alcohol'] * test['sulphates']

test_md = test.copy()

X = train[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates', 'fixed acidity']]
Y = train['quality'] 

test_md = test_md[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates', 'fixed acidity']]

In [6]:
train.head()

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,alcohol_density,sulphate/density,alcohol_sulphate
0,0,8.0,0.5,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1,6,12.048212,0.77331,9.317
1,1,9.3,0.3,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8,6,12.781312,0.67098,8.576
2,2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.9966,3.52,0.73,11.3,7,11.26158,0.73249,8.249
3,3,8.1,0.87,0.22,2.6,0.084,11.0,65.0,0.9973,3.2,0.53,9.8,5,9.77354,0.531435,5.194
4,4,8.5,0.36,0.3,2.3,0.079,10.0,45.0,0.99444,3.2,1.36,9.5,6,9.44718,1.367604,12.92


# Data Exploration

# Optimal Rounder

In [10]:
from functools import partial
import numpy as np
import scipy as sp

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8

        ll = cohen_kappa_score(y, X_p, weights = 'quadratic')
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X = X, y = y)
        initial_coef = [3.5, 4.5, 5.5, 6.5, 7.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method = 'nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8
        return X_p

    def coefficients(self):
        return self.coef_['x']

# Optuna Optimization

In [21]:
def objective(trial):
    
    ## Parameters to be evaluated
    param = dict(objective = 'reg:absoluteerror',
                 eval_metric = 'mae',
                 tree_method = 'hist', 
                 random_state = 42,
                 max_depth = trial.suggest_int('max_depth', 2, 10),
                 learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log = True),
                 n_estimators = trial.suggest_int('n_estimators', 30, 10000),
                 gamma = trial.suggest_float('gamma', 0, 10),
                 min_child_weight = trial.suggest_int('min_child_weight', 1, 100),
                 colsample_bytree = trial.suggest_float('colsample_bytree', 0.2, 0.9),
                 subsample = trial.suggest_float('subsample', 0.2, 0.9)
                )

    scores = []

#     skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    skf = KFold(n_splits = 5, shuffle = True, random_state = 42)
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):
        
        print(fold, end = ' ')
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train , y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]

        model = XGBRegressor(**param)
        model.fit(X_train, y_train)

        preds_valid = model.predict(X_valid)
        
        optR = OptimizedRounder()
        optR.fit(preds_valid, y_valid)
        coef = optR.coefficients()
        preds_valid = optR.predict(preds_valid, coef).astype(int)
        
        score = cohen_kappa_score(y_valid,  preds_valid, weights = "quadratic")
        scores.append(score)
        
    return np.mean(scores)

In [22]:
study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials = 50, timeout = 3600)

[32m[I 2023-02-07 16:47:53,761][0m A new study created in memory with name: no-name-183465bd-ea78-44b9-9023-dad24db79d33[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:48:06,096][0m Trial 0 finished with value: 0.5121107592334092 and parameters: {'max_depth': 10, 'learning_rate': 0.0014931154415614145, 'n_estimators': 1888, 'gamma': 9.723708752958828, 'min_child_weight': 53, 'colsample_bytree': 0.7563893562329784, 'subsample': 0.8562427545910245}. Best is trial 0 with value: 0.5121107592334092.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:48:25,942][0m Trial 1 finished with value: 0.5015860107445176 and parameters: {'max_depth': 6, 'learning_rate': 0.00020506262358495038, 'n_estimators': 1485, 'gamma': 6.460133586139943, 'min_child_weight': 21, 'colsample_bytree': 0.41711466553151944, 'subsample': 0.6392608168791015}. Best is trial 0 with value: 0.5121107592334092.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:48:34,905][0m Trial 2 finished with value: 0.4639539451496715 and parameters: {'max_depth': 8, 'learning_rate': 0.0020595742611230817, 'n_estimators': 547, 'gamma': 9.539964988139623, 'min_child_weight': 90, 'colsample_bytree': 0.5353245869334184, 'subsample': 0.6832752090339398}. Best is trial 0 with value: 0.5121107592334092.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:48:56,205][0m Trial 3 finished with value: 0.516656800357771 and parameters: {'max_depth': 9, 'learning_rate': 0.006241968929300682, 'n_estimators': 6849, 'gamma': 7.770023052690174, 'min_child_weight': 85, 'colsample_bytree': 0.6326063960062369, 'subsample': 0.41867693570118414}. Best is trial 3 with value: 0.516656800357771.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:49:31,061][0m Trial 4 finished with value: 0.5194442651441314 and parameters: {'max_depth': 10, 'learning_rate': 0.0876210716095894, 'n_estimators': 9830, 'gamma': 3.2267680309899327, 'min_child_weight': 61, 'colsample_bytree': 0.3252218050116584, 'subsample': 0.7553465589089341}. Best is trial 4 with value: 0.5194442651441314.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:49:59,517][0m Trial 5 finished with value: 0.44219868143618896 and parameters: {'max_depth': 6, 'learning_rate': 0.021324838207349518, 'n_estimators': 7656, 'gamma': 0.8052332647924287, 'min_child_weight': 46, 'colsample_bytree': 0.2675966726289176, 'subsample': 0.5099047844853182}. Best is trial 4 with value: 0.5194442651441314.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:50:26,448][0m Trial 6 finished with value: 0.4945273462070867 and parameters: {'max_depth': 10, 'learning_rate': 0.00013828786075565668, 'n_estimators': 9903, 'gamma': 3.558519651895012, 'min_child_weight': 81, 'colsample_bytree': 0.2500844133563544, 'subsample': 0.27269007526431877}. Best is trial 4 with value: 0.5194442651441314.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:50:46,431][0m Trial 7 finished with value: 0.4969717265529635 and parameters: {'max_depth': 4, 'learning_rate': 0.0002849451384103373, 'n_estimators': 3882, 'gamma': 8.16626830998223, 'min_child_weight': 48, 'colsample_bytree': 0.2109712175156624, 'subsample': 0.7838838761664326}. Best is trial 4 with value: 0.5194442651441314.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:50:56,622][0m Trial 8 finished with value: 0.48851364381650236 and parameters: {'max_depth': 2, 'learning_rate': 0.00907217555459998, 'n_estimators': 1725, 'gamma': 8.988476451962812, 'min_child_weight': 49, 'colsample_bytree': 0.529525487756008, 'subsample': 0.5547237132236849}. Best is trial 4 with value: 0.5194442651441314.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:51:22,200][0m Trial 9 finished with value: 0.21654137447273397 and parameters: {'max_depth': 8, 'learning_rate': 0.003199752017475202, 'n_estimators': 4425, 'gamma': 0.5342110243304665, 'min_child_weight': 27, 'colsample_bytree': 0.5057302139144422, 'subsample': 0.3808260248886456}. Best is trial 4 with value: 0.5194442651441314.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:51:57,255][0m Trial 10 finished with value: 0.5059663090463498 and parameters: {'max_depth': 4, 'learning_rate': 0.04517405461396429, 'n_estimators': 9824, 'gamma': 3.8204955105362406, 'min_child_weight': 5, 'colsample_bytree': 0.8971711649123537, 'subsample': 0.8983942008149163}. Best is trial 4 with value: 0.5194442651441314.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:52:25,583][0m Trial 11 finished with value: 0.491331383521314 and parameters: {'max_depth': 8, 'learning_rate': 0.03872633213387987, 'n_estimators': 7147, 'gamma': 5.974137156416225, 'min_child_weight': 75, 'colsample_bytree': 0.6763746529476702, 'subsample': 0.4143352794531003}. Best is trial 4 with value: 0.5194442651441314.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:52:46,899][0m Trial 12 finished with value: 0.5151801185821627 and parameters: {'max_depth': 10, 'learning_rate': 0.09536005531498556, 'n_estimators': 7507, 'gamma': 7.447355884786189, 'min_child_weight': 68, 'colsample_bytree': 0.380162377616411, 'subsample': 0.216464316328503}. Best is trial 4 with value: 0.5194442651441314.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:53:07,582][0m Trial 13 finished with value: 0.4992042563850803 and parameters: {'max_depth': 9, 'learning_rate': 0.00849239992114759, 'n_estimators': 5951, 'gamma': 4.303539890137994, 'min_child_weight': 95, 'colsample_bytree': 0.6402887463124647, 'subsample': 0.7358211253594871}. Best is trial 4 with value: 0.5194442651441314.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:53:36,096][0m Trial 14 finished with value: 0.5024725741220355 and parameters: {'max_depth': 7, 'learning_rate': 0.08888509360599608, 'n_estimators': 8853, 'gamma': 2.486373973135122, 'min_child_weight': 68, 'colsample_bytree': 0.368396679970067, 'subsample': 0.6055424850583614}. Best is trial 4 with value: 0.5194442651441314.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:53:56,205][0m Trial 15 finished with value: 0.5061373494916828 and parameters: {'max_depth': 9, 'learning_rate': 0.014646032692346524, 'n_estimators': 5746, 'gamma': 5.390658988827036, 'min_child_weight': 58, 'colsample_bytree': 0.4629732186836588, 'subsample': 0.48041695947241225}. Best is trial 4 with value: 0.5194442651441314.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:54:28,008][0m Trial 16 finished with value: 0.5075778141289569 and parameters: {'max_depth': 9, 'learning_rate': 0.0050982000857114395, 'n_estimators': 8642, 'gamma': 7.109839973698492, 'min_child_weight': 98, 'colsample_bytree': 0.6146279774548568, 'subsample': 0.720883150867635}. Best is trial 4 with value: 0.5194442651441314.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:54:49,669][0m Trial 17 finished with value: 0.5364199197720635 and parameters: {'max_depth': 7, 'learning_rate': 0.025138063664479862, 'n_estimators': 6379, 'gamma': 5.0815176722328355, 'min_child_weight': 84, 'colsample_bytree': 0.31735926891248817, 'subsample': 0.5943129608752584}. Best is trial 17 with value: 0.5364199197720635.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:55:13,641][0m Trial 18 finished with value: 0.5438879730475265 and parameters: {'max_depth': 4, 'learning_rate': 0.030683303521501528, 'n_estimators': 3233, 'gamma': 5.057497917011283, 'min_child_weight': 34, 'colsample_bytree': 0.34282071506694545, 'subsample': 0.6108646069098083}. Best is trial 18 with value: 0.5438879730475265.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:55:34,326][0m Trial 19 finished with value: 0.5363798832313104 and parameters: {'max_depth': 4, 'learning_rate': 0.019735348112388553, 'n_estimators': 3270, 'gamma': 5.158478371430271, 'min_child_weight': 27, 'colsample_bytree': 0.3084097395146341, 'subsample': 0.6014382739680705}. Best is trial 18 with value: 0.5438879730475265.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:55:55,145][0m Trial 20 finished with value: 0.5320288007334204 and parameters: {'max_depth': 3, 'learning_rate': 0.03702851125647513, 'n_estimators': 2965, 'gamma': 4.663247694719957, 'min_child_weight': 36, 'colsample_bytree': 0.2001236516412475, 'subsample': 0.6603671444038143}. Best is trial 18 with value: 0.5438879730475265.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:56:16,222][0m Trial 21 finished with value: 0.5486524253216831 and parameters: {'max_depth': 5, 'learning_rate': 0.020043729199262176, 'n_estimators': 3116, 'gamma': 5.342735345197131, 'min_child_weight': 21, 'colsample_bytree': 0.31594524397205576, 'subsample': 0.5932747491109975}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:56:39,539][0m Trial 22 finished with value: 0.5376133650787734 and parameters: {'max_depth': 5, 'learning_rate': 0.014692622886628993, 'n_estimators': 5371, 'gamma': 5.9585051771394015, 'min_child_weight': 7, 'colsample_bytree': 0.326167057661258, 'subsample': 0.5655831078368192}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:56:59,350][0m Trial 23 finished with value: 0.5007271772196414 and parameters: {'max_depth': 5, 'learning_rate': 0.013545066184878631, 'n_estimators': 4866, 'gamma': 6.2053196673821684, 'min_child_weight': 3, 'colsample_bytree': 0.4393564317646145, 'subsample': 0.5164554135463706}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:57:17,048][0m Trial 24 finished with value: 0.5327941804669669 and parameters: {'max_depth': 5, 'learning_rate': 0.011685409866466871, 'n_estimators': 2835, 'gamma': 5.786496270823675, 'min_child_weight': 16, 'colsample_bytree': 0.37532838455559786, 'subsample': 0.5680861180460801}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:57:42,133][0m Trial 25 finished with value: 0.5215179898201132 and parameters: {'max_depth': 5, 'learning_rate': 0.030268299515562546, 'n_estimators': 3748, 'gamma': 6.3504177047911785, 'min_child_weight': 37, 'colsample_bytree': 0.28200477711981375, 'subsample': 0.6580978787688104}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:57:58,213][0m Trial 26 finished with value: 0.5301349716146889 and parameters: {'max_depth': 3, 'learning_rate': 0.05108654222547333, 'n_estimators': 2365, 'gamma': 6.935866509967084, 'min_child_weight': 12, 'colsample_bytree': 0.3489192156254848, 'subsample': 0.5357045377198715}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:58:22,737][0m Trial 27 finished with value: 0.5361865783424922 and parameters: {'max_depth': 3, 'learning_rate': 0.018937209927609812, 'n_estimators': 5048, 'gamma': 4.464425136771472, 'min_child_weight': 39, 'colsample_bytree': 0.25921974725954783, 'subsample': 0.6244501663901957}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:58:46,615][0m Trial 28 finished with value: 0.5153174661537487 and parameters: {'max_depth': 2, 'learning_rate': 0.04918455737084692, 'n_estimators': 5376, 'gamma': 5.390027938309796, 'min_child_weight': 11, 'colsample_bytree': 0.40466391143664693, 'subsample': 0.694149361504753}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:58:56,232][0m Trial 29 finished with value: 0.4765030216373402 and parameters: {'max_depth': 5, 'learning_rate': 0.005396022342842441, 'n_estimators': 457, 'gamma': 6.712246864788344, 'min_child_weight': 27, 'colsample_bytree': 0.34631294480757324, 'subsample': 0.8232847600043096}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:59:15,590][0m Trial 30 finished with value: 0.48681081997409786 and parameters: {'max_depth': 4, 'learning_rate': 0.012407263385737164, 'n_estimators': 4208, 'gamma': 5.8502099728966215, 'min_child_weight': 20, 'colsample_bytree': 0.48398861140611305, 'subsample': 0.5750908530940744}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:59:39,349][0m Trial 31 finished with value: 0.5439498384648906 and parameters: {'max_depth': 7, 'learning_rate': 0.026015306277686345, 'n_estimators': 6381, 'gamma': 5.099893085953497, 'min_child_weight': 32, 'colsample_bytree': 0.2969889316417133, 'subsample': 0.6132446282205011}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 16:59:55,015][0m Trial 32 finished with value: 0.5369714730457001 and parameters: {'max_depth': 6, 'learning_rate': 0.031441065350585405, 'n_estimators': 2375, 'gamma': 4.782164953413178, 'min_child_weight': 33, 'colsample_bytree': 0.4255644299130633, 'subsample': 0.6465230917690873}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:00:07,447][0m Trial 33 finished with value: 0.5252838128992433 and parameters: {'max_depth': 7, 'learning_rate': 0.02305736208204179, 'n_estimators': 1091, 'gamma': 5.630922065430855, 'min_child_weight': 23, 'colsample_bytree': 0.30223167400090956, 'subsample': 0.6192202401588498}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:00:27,560][0m Trial 34 finished with value: 0.5090425454537224 and parameters: {'max_depth': 5, 'learning_rate': 0.059236624439674576, 'n_estimators': 4641, 'gamma': 6.479813685320035, 'min_child_weight': 14, 'colsample_bytree': 0.3943507839939038, 'subsample': 0.6876945675796429}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:00:49,542][0m Trial 35 finished with value: 0.517544688898532 and parameters: {'max_depth': 6, 'learning_rate': 0.017045359658521487, 'n_estimators': 6464, 'gamma': 4.382661838284211, 'min_child_weight': 32, 'colsample_bytree': 0.2372913908983853, 'subsample': 0.4805366372365494}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:01:13,674][0m Trial 36 finished with value: 0.5135626684669083 and parameters: {'max_depth': 6, 'learning_rate': 0.028285518229341458, 'n_estimators': 3605, 'gamma': 5.151351597177103, 'min_child_weight': 41, 'colsample_bytree': 0.29159053035203775, 'subsample': 0.5540722355800246}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:01:36,498][0m Trial 37 finished with value: 0.5439711761280477 and parameters: {'max_depth': 6, 'learning_rate': 0.0687525801119576, 'n_estimators': 5863, 'gamma': 3.9568674300768816, 'min_child_weight': 22, 'colsample_bytree': 0.3477811465878755, 'subsample': 0.6550154262583892}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:01:48,404][0m Trial 38 finished with value: 0.545434137728529 and parameters: {'max_depth': 7, 'learning_rate': 0.0650868010697295, 'n_estimators': 1070, 'gamma': 3.2447067680769055, 'min_child_weight': 20, 'colsample_bytree': 0.2476772086344024, 'subsample': 0.7017299011486737}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:02:00,512][0m Trial 39 finished with value: 0.5276442042499566 and parameters: {'max_depth': 7, 'learning_rate': 0.06498765296997051, 'n_estimators': 1030, 'gamma': 2.5651563982041354, 'min_child_weight': 21, 'colsample_bytree': 0.23618616081613836, 'subsample': 0.7640465265332332}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:02:27,536][0m Trial 40 finished with value: 0.5323001155103265 and parameters: {'max_depth': 8, 'learning_rate': 0.07342756717495472, 'n_estimators': 7978, 'gamma': 3.8611937366703266, 'min_child_weight': 44, 'colsample_bytree': 0.2608826900736553, 'subsample': 0.716297907242584}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:02:43,771][0m Trial 41 finished with value: 0.4941136998696921 and parameters: {'max_depth': 7, 'learning_rate': 0.0687735315633693, 'n_estimators': 85, 'gamma': 3.1323678132802626, 'min_child_weight': 30, 'colsample_bytree': 0.3483078320619492, 'subsample': 0.6663339649539709}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:02:59,390][0m Trial 42 finished with value: 0.5229410583671547 and parameters: {'max_depth': 6, 'learning_rate': 0.042851453198762976, 'n_estimators': 2353, 'gamma': 4.140967677514484, 'min_child_weight': 18, 'colsample_bytree': 0.29042931534220295, 'subsample': 0.6969393690370962}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:03:15,027][0m Trial 43 finished with value: 0.5177311099620512 and parameters: {'max_depth': 6, 'learning_rate': 0.05755032335259449, 'n_estimators': 1399, 'gamma': 3.4154675231523077, 'min_child_weight': 52, 'colsample_bytree': 0.22526946041415288, 'subsample': 0.6392308828471379}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:03:39,612][0m Trial 44 finished with value: 0.5480037084819901 and parameters: {'max_depth': 8, 'learning_rate': 0.03844179407453206, 'n_estimators': 6500, 'gamma': 4.832046930169287, 'min_child_weight': 25, 'colsample_bytree': 0.2677153020210608, 'subsample': 0.6288217054250207}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:04:04,121][0m Trial 45 finished with value: 0.5133433449697695 and parameters: {'max_depth': 8, 'learning_rate': 0.04056874879699121, 'n_estimators': 6615, 'gamma': 3.934097350369622, 'min_child_weight': 24, 'colsample_bytree': 0.25716246889961814, 'subsample': 0.7539451597755757}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:04:32,937][0m Trial 46 finished with value: 0.5111939861788694 and parameters: {'max_depth': 7, 'learning_rate': 0.0794501614985975, 'n_estimators': 5649, 'gamma': 4.652457722448549, 'min_child_weight': 11, 'colsample_bytree': 0.21217673397091497, 'subsample': 0.6701004902549321}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:05:02,334][0m Trial 47 finished with value: 0.5270085947757588 and parameters: {'max_depth': 8, 'learning_rate': 0.09311174573754427, 'n_estimators': 6983, 'gamma': 3.0657341579890067, 'min_child_weight': 8, 'colsample_bytree': 0.2709228379539188, 'subsample': 0.7920217267219991}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:05:31,044][0m Trial 48 finished with value: 0.5120106637718292 and parameters: {'max_depth': 7, 'learning_rate': 0.05550239617118901, 'n_estimators': 7377, 'gamma': 3.5807205952593963, 'min_child_weight': 17, 'colsample_bytree': 0.3148674986558879, 'subsample': 0.6381555116707787}. Best is trial 21 with value: 0.5486524253216831.[0m


0 1 2 3 4 

[32m[I 2023-02-07 17:05:52,367][0m Trial 49 finished with value: 0.542442663551145 and parameters: {'max_depth': 9, 'learning_rate': 0.022345754723921567, 'n_estimators': 6027, 'gamma': 4.2253070281506, 'min_child_weight': 25, 'colsample_bytree': 0.23586560031242343, 'subsample': 0.7224512705994064}. Best is trial 21 with value: 0.5486524253216831.[0m


In [23]:
study.best_trial.value

0.5486524253216831

In [24]:
study.best_trial.params

{'max_depth': 5,
 'learning_rate': 0.020043729199262176,
 'n_estimators': 3116,
 'gamma': 5.342735345197131,
 'min_child_weight': 21,
 'colsample_bytree': 0.31594524397205576,
 'subsample': 0.5932747491109975}

# XGBoost Modeling

In [25]:
XGB_cv_scores, XGB_imp = list(), list()
preds = list()

## Running 5 times CV
# for i in range(5):
    
#     skf = StratifiedKFold(n_splits = 5, random_state = 40, shuffle = True)
skf = KFold(n_splits = 5, random_state = 42, shuffle = True)
    
for train_ix, test_ix in skf.split(X, Y):
        
    ## Splitting the data 
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
                
    ## Building RF model
    XGB_md = XGBRegressor(tree_method = 'hist',
                              colsample_bytree = 0.3159, 
                              gamma = 5.3427, 
                              learning_rate = 0.02, 
                              max_depth = 5, 
                              min_child_weight = 21, 
                              n_estimators = 3116, 
                              subsample = 0.59327, 
                              random_state = 42).fit(X_train, Y_train)
    XGB_imp.append(XGB_md.feature_importances_)
        
    ## Predicting on X_test and test
    XGB_pred_1 = XGB_md.predict(X_test)
    XGB_pred_2 = XGB_md.predict(test_md)
        
    ## Applying Optimal Rounder (using abhishek approach)
    optR = OptimizedRounder()
    optR.fit(XGB_pred_1, Y_test)
    coef = optR.coefficients()
    XGB_pred_1 = optR.predict(XGB_pred_1, coef).astype(int)
    XGB_pred_2 = optR.predict(XGB_pred_2, coef).astype(int)
        
    ## Computing roc-auc score
    XGB_cv_scores.append(cohen_kappa_score(Y_test, XGB_pred_1, weights = 'quadratic'))
    preds.append(XGB_pred_2)

XGB_cv_score = np.mean(XGB_cv_scores)    
print('The average roc-auc score over 5-folds (run 5 times) is:', XGB_cv_score)

The average roc-auc score over 5-folds (run 5 times) is: 0.5826067687016041


In [26]:
XGB_preds_test = pd.DataFrame(preds).mode(axis = 0).loc[0, ]

submission['quality'] = XGB_preds_test.astype(int)
submission.head()

Unnamed: 0,Id,quality
0,2056,5
1,2057,6
2,2058,5
3,2059,6
4,2060,6


In [27]:
submission['quality'].value_counts()

5    603
6    442
7    327
Name: quality, dtype: int64

In [28]:
submission.to_csv('XGB_Reg_FE_4.csv', index = False)