In [1]:
pip install xgboost optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [16]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.cluster import KMeans
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score, cohen_kappa_score, davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
# from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, XGBRegressor
# from catboost import CatBoostClassifier

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/TS-S3-Ep5/train.csv'
file_key_2 = 'Tabular-Playground-Series/TS-S3-Ep5/test.csv'
file_key_3 = 'Tabular-Playground-Series/TS-S3-Ep5/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

## Enginering features
train['alcohol_density'] = train['alcohol'] * train['density']
train['sulphate/density'] = train['sulphates']  / train['density']
train['alcohol_sulphate'] = train['alcohol'] * train['sulphates']

test['alcohol_density'] = test['alcohol']  * test['density']
test['sulphate/density'] = test['sulphates']  / test['density']
test['alcohol_sulphate'] = test['alcohol'] * test['sulphates']

test_md = test.copy()

X = train[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates', 'fixed acidity']]
Y = train['quality'] 

test_md = test_md[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates', 'fixed acidity']]

# Optimal Rounder

In [17]:
from functools import partial
import numpy as np
import scipy as sp

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8

        ll = cohen_kappa_score(y, X_p, weights = 'quadratic')
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X = X, y = y)
        initial_coef = [3.5, 4.5, 5.5, 6.5, 7.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method = 'nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8
        return X_p

    def coefficients(self):
        return self.coef_['x']

# Optuna Optimzation

In [18]:
def objective(trial):
    
    ## Parameters to be evaluated
    param = dict(objective = 'reg:absoluteerror',
                 eval_metric = 'mae',
                 tree_method = 'hist', 
                 random_state = 42,
                 max_depth = trial.suggest_int('max_depth', 2, 10),
                 learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log = True),
                 n_estimators = trial.suggest_int('n_estimators', 30, 10000),
                 gamma = trial.suggest_float('gamma', 0, 10),
                 min_child_weight = trial.suggest_int('min_child_weight', 1, 100),
                 colsample_bytree = trial.suggest_float('colsample_bytree', 0.2, 0.9),
                 subsample = trial.suggest_float('subsample', 0.2, 0.9)
                )

    scores = []

#     skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    skf = KFold(n_splits = 5, shuffle = True, random_state = 42)
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):
        
        print(fold, end = ' ')
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        Y_train , Y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]

        model = XGBRegressor(**param).fit(X_train, Y_train)

        preds_train = model.predict(X_train)
        preds_valid = model.predict(X_valid)
        
        optR = OptimizedRounder()
        optR.fit(preds_train, Y_train)
        coef = optR.coefficients()
        preds_valid = optR.predict(preds_valid, coef).astype(int)
        
        score = cohen_kappa_score(Y_valid,  preds_valid, weights = "quadratic")
        scores.append(score)
        
    return np.mean(scores)

In [20]:
study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials = 50, timeout = 3600)

[32m[I 2023-02-08 17:22:18,222][0m A new study created in memory with name: no-name-f5172b81-eb87-42dd-9f69-53969e58d581[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:22:40,268][0m Trial 0 finished with value: 0.450832755964794 and parameters: {'max_depth': 5, 'learning_rate': 0.0020921514749607075, 'n_estimators': 410, 'gamma': 1.1407010403628104, 'min_child_weight': 42, 'colsample_bytree': 0.7415141958333946, 'subsample': 0.7049778584246975}. Best is trial 0 with value: 0.450832755964794.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:23:04,569][0m Trial 1 finished with value: 0.3017280606794285 and parameters: {'max_depth': 3, 'learning_rate': 0.0005144795239126138, 'n_estimators': 2295, 'gamma': 4.350703448429015, 'min_child_weight': 35, 'colsample_bytree': 0.7129661958116993, 'subsample': 0.26497537726195786}. Best is trial 0 with value: 0.450832755964794.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:23:36,922][0m Trial 2 finished with value: 0.4744726597277193 and parameters: {'max_depth': 2, 'learning_rate': 0.0038415170102228165, 'n_estimators': 6476, 'gamma': 9.825049990740547, 'min_child_weight': 33, 'colsample_bytree': 0.34115362991042497, 'subsample': 0.3704665424244431}. Best is trial 2 with value: 0.4744726597277193.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:24:06,151][0m Trial 3 finished with value: 0.4798543844047929 and parameters: {'max_depth': 3, 'learning_rate': 0.00010230159778281178, 'n_estimators': 5750, 'gamma': 1.0518985181078844, 'min_child_weight': 94, 'colsample_bytree': 0.2106702729884959, 'subsample': 0.5269928260390382}. Best is trial 3 with value: 0.4798543844047929.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:24:39,822][0m Trial 4 finished with value: 0.45832725196592 and parameters: {'max_depth': 6, 'learning_rate': 0.008913636218615406, 'n_estimators': 5147, 'gamma': 3.3417360409116057, 'min_child_weight': 52, 'colsample_bytree': 0.471232299336489, 'subsample': 0.6145436936831994}. Best is trial 3 with value: 0.4798543844047929.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:25:24,169][0m Trial 5 finished with value: 0.4750797953615235 and parameters: {'max_depth': 3, 'learning_rate': 0.041995732133792735, 'n_estimators': 8391, 'gamma': 1.9065875921671627, 'min_child_weight': 45, 'colsample_bytree': 0.8615214950156076, 'subsample': 0.8831949027682522}. Best is trial 3 with value: 0.4798543844047929.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:26:06,132][0m Trial 6 finished with value: 0.36044736201380784 and parameters: {'max_depth': 5, 'learning_rate': 0.0012979120241776649, 'n_estimators': 8299, 'gamma': 2.0006975470550623, 'min_child_weight': 10, 'colsample_bytree': 0.6400149972139679, 'subsample': 0.6806673646221141}. Best is trial 3 with value: 0.4798543844047929.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:26:27,374][0m Trial 7 finished with value: 0.49091136232430943 and parameters: {'max_depth': 3, 'learning_rate': 0.032324042927035855, 'n_estimators': 444, 'gamma': 7.246999772008447, 'min_child_weight': 25, 'colsample_bytree': 0.35061888888765486, 'subsample': 0.2697643578177739}. Best is trial 7 with value: 0.49091136232430943.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:26:56,200][0m Trial 8 finished with value: 0.4934755955145579 and parameters: {'max_depth': 3, 'learning_rate': 0.033504329254752135, 'n_estimators': 915, 'gamma': 7.212743144399761, 'min_child_weight': 80, 'colsample_bytree': 0.560696059333315, 'subsample': 0.38617439909081797}. Best is trial 8 with value: 0.4934755955145579.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:27:28,321][0m Trial 9 finished with value: 0.45151610814072224 and parameters: {'max_depth': 3, 'learning_rate': 0.01813371760496451, 'n_estimators': 5692, 'gamma': 9.791479960250008, 'min_child_weight': 40, 'colsample_bytree': 0.3422423174501848, 'subsample': 0.7269468988928018}. Best is trial 8 with value: 0.4934755955145579.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:27:51,254][0m Trial 10 finished with value: 0.4742920121751656 and parameters: {'max_depth': 9, 'learning_rate': 0.057999842327121215, 'n_estimators': 2841, 'gamma': 6.268170868880379, 'min_child_weight': 87, 'colsample_bytree': 0.5496131868585707, 'subsample': 0.44049988206609286}. Best is trial 8 with value: 0.4934755955145579.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:28:16,583][0m Trial 11 finished with value: 0.49556752371783885 and parameters: {'max_depth': 9, 'learning_rate': 0.08589212905455078, 'n_estimators': 578, 'gamma': 6.85269242184514, 'min_child_weight': 71, 'colsample_bytree': 0.4777599034387396, 'subsample': 0.21932693604299852}. Best is trial 11 with value: 0.49556752371783885.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:28:39,179][0m Trial 12 finished with value: 0.45885413442611467 and parameters: {'max_depth': 10, 'learning_rate': 0.06325621086223668, 'n_estimators': 2522, 'gamma': 7.218231255351239, 'min_child_weight': 71, 'colsample_bytree': 0.5063556857254877, 'subsample': 0.20929626015844638}. Best is trial 11 with value: 0.49556752371783885.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:29:01,675][0m Trial 13 finished with value: 0.46884727698305395 and parameters: {'max_depth': 8, 'learning_rate': 0.09638118293549625, 'n_estimators': 1528, 'gamma': 5.958157522047816, 'min_child_weight': 69, 'colsample_bytree': 0.5535784215366115, 'subsample': 0.350911468967703}. Best is trial 11 with value: 0.49556752371783885.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:29:29,070][0m Trial 14 finished with value: 0.487346810202353 and parameters: {'max_depth': 8, 'learning_rate': 0.01419121409961752, 'n_estimators': 3830, 'gamma': 8.057389183749272, 'min_child_weight': 73, 'colsample_bytree': 0.43719952570462545, 'subsample': 0.21618189411087183}. Best is trial 11 with value: 0.49556752371783885.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:29:55,445][0m Trial 15 finished with value: 0.40484211067316905 and parameters: {'max_depth': 6, 'learning_rate': 0.028833618444940822, 'n_estimators': 3765, 'gamma': 5.002600798896322, 'min_child_weight': 58, 'colsample_bytree': 0.638892769029174, 'subsample': 0.35829703741910335}. Best is trial 11 with value: 0.49556752371783885.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:30:30,189][0m Trial 16 finished with value: 0.46273663419792543 and parameters: {'max_depth': 8, 'learning_rate': 0.09623416871787518, 'n_estimators': 1283, 'gamma': 8.394203277007103, 'min_child_weight': 87, 'colsample_bytree': 0.4243010841932658, 'subsample': 0.46800631565333933}. Best is trial 11 with value: 0.49556752371783885.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:31:16,549][0m Trial 17 finished with value: 0.3743446094503093 and parameters: {'max_depth': 10, 'learning_rate': 0.009258381397186209, 'n_estimators': 9783, 'gamma': 5.394618279889082, 'min_child_weight': 62, 'colsample_bytree': 0.5903526709962194, 'subsample': 0.3077102501501386}. Best is trial 11 with value: 0.49556752371783885.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:31:31,858][0m Trial 18 finished with value: 0.5105339303130914 and parameters: {'max_depth': 7, 'learning_rate': 0.023361609711842982, 'n_estimators': 200, 'gamma': 6.502290737225899, 'min_child_weight': 79, 'colsample_bytree': 0.5094901258868261, 'subsample': 0.42070733800701776}. Best is trial 18 with value: 0.5105339303130914.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:31:46,236][0m Trial 19 finished with value: 0.5086717589296603 and parameters: {'max_depth': 7, 'learning_rate': 0.020022879660930255, 'n_estimators': 132, 'gamma': 4.273795362967081, 'min_child_weight': 97, 'colsample_bytree': 0.4822841895546369, 'subsample': 0.4695234389405247}. Best is trial 18 with value: 0.5105339303130914.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:32:14,516][0m Trial 20 finished with value: 0.5245713802187029 and parameters: {'max_depth': 7, 'learning_rate': 0.006502907169593187, 'n_estimators': 3756, 'gamma': 3.805700830178726, 'min_child_weight': 100, 'colsample_bytree': 0.3948929002152362, 'subsample': 0.5034661520953532}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:32:44,338][0m Trial 21 finished with value: 0.455168372953829 and parameters: {'max_depth': 7, 'learning_rate': 0.006494633448441481, 'n_estimators': 3697, 'gamma': 3.438940452655095, 'min_child_weight': 97, 'colsample_bytree': 0.401576623268467, 'subsample': 0.49319515187588897}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:33:10,175][0m Trial 22 finished with value: 0.48297103666920904 and parameters: {'max_depth': 7, 'learning_rate': 0.01790730865880313, 'n_estimators': 1722, 'gamma': 4.165267559854787, 'min_child_weight': 100, 'colsample_bytree': 0.49706918311119036, 'subsample': 0.43077243086972716}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:33:37,020][0m Trial 23 finished with value: 0.5017546004188811 and parameters: {'max_depth': 6, 'learning_rate': 0.004836529816095079, 'n_estimators': 3168, 'gamma': 5.5428176199077015, 'min_child_weight': 87, 'colsample_bytree': 0.3832388902002366, 'subsample': 0.5399096060873659}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:34:08,141][0m Trial 24 finished with value: 0.4713103023905605 and parameters: {'max_depth': 7, 'learning_rate': 0.01236600195945739, 'n_estimators': 4588, 'gamma': 4.517798944927689, 'min_child_weight': 91, 'colsample_bytree': 0.44331766539239303, 'subsample': 0.4322659996877859}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:34:47,544][0m Trial 25 finished with value: 0.4885389260951219 and parameters: {'max_depth': 5, 'learning_rate': 0.024834756343990738, 'n_estimators': 6746, 'gamma': 3.3981975525251435, 'min_child_weight': 80, 'colsample_bytree': 0.2631969197007449, 'subsample': 0.4912431010742174}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:35:16,169][0m Trial 26 finished with value: 0.49478360246495495 and parameters: {'max_depth': 7, 'learning_rate': 0.006516129300336893, 'n_estimators': 172, 'gamma': 5.003883594335357, 'min_child_weight': 80, 'colsample_bytree': 0.5097335617534192, 'subsample': 0.5285063682014691}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:35:43,963][0m Trial 27 finished with value: 0.4763926116647217 and parameters: {'max_depth': 9, 'learning_rate': 0.014079740957654524, 'n_estimators': 1947, 'gamma': 0.1573143624903972, 'min_child_weight': 100, 'colsample_bytree': 0.3848277208640081, 'subsample': 0.5803410019450492}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:36:10,696][0m Trial 28 finished with value: 0.4954882977319371 and parameters: {'max_depth': 6, 'learning_rate': 0.019926472858325345, 'n_estimators': 88, 'gamma': 6.139088687871946, 'min_child_weight': 90, 'colsample_bytree': 0.3230543614347852, 'subsample': 0.4091472199785961}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:36:33,053][0m Trial 29 finished with value: 0.47104951093554936 and parameters: {'max_depth': 8, 'learning_rate': 0.002385492276655553, 'n_estimators': 946, 'gamma': 3.803121616911306, 'min_child_weight': 79, 'colsample_bytree': 0.44815161472739623, 'subsample': 0.46798146113518113}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:37:27,547][0m Trial 30 finished with value: 0.48380941760726887 and parameters: {'max_depth': 4, 'learning_rate': 0.0099435106252981, 'n_estimators': 7474, 'gamma': 2.8862026259577984, 'min_child_weight': 94, 'colsample_bytree': 0.3025125211136712, 'subsample': 0.32888211364913966}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:38:00,266][0m Trial 31 finished with value: 0.5003000416639029 and parameters: {'max_depth': 6, 'learning_rate': 0.004368655432346946, 'n_estimators': 3227, 'gamma': 5.324183239705399, 'min_child_weight': 87, 'colsample_bytree': 0.4003066777739178, 'subsample': 0.5528956502522221}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:38:31,631][0m Trial 32 finished with value: 0.46914051476807195 and parameters: {'max_depth': 7, 'learning_rate': 0.006081752307538387, 'n_estimators': 2225, 'gamma': 4.5753021071718685, 'min_child_weight': 85, 'colsample_bytree': 0.4007737512731959, 'subsample': 0.3940524377870057}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:38:59,880][0m Trial 33 finished with value: 0.5029196220147811 and parameters: {'max_depth': 5, 'learning_rate': 0.0033089729706505745, 'n_estimators': 4364, 'gamma': 5.615245589960155, 'min_child_weight': 94, 'colsample_bytree': 0.37240103044396305, 'subsample': 0.49333398782703897}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:39:50,495][0m Trial 34 finished with value: 0.4862533858166138 and parameters: {'max_depth': 4, 'learning_rate': 0.0028203804920572716, 'n_estimators': 4493, 'gamma': 4.33761816148601, 'min_child_weight': 95, 'colsample_bytree': 0.27767069824054963, 'subsample': 0.5033822394649634}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:40:27,340][0m Trial 35 finished with value: 0.4539703236334714 and parameters: {'max_depth': 5, 'learning_rate': 0.0017371378687436004, 'n_estimators': 5567, 'gamma': 4.909627138155196, 'min_child_weight': 65, 'colsample_bytree': 0.44446478555882596, 'subsample': 0.45786208957511704}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:41:00,433][0m Trial 36 finished with value: 0.49651897538898326 and parameters: {'max_depth': 4, 'learning_rate': 0.0031840881657271463, 'n_estimators': 4655, 'gamma': 5.8559295525368285, 'min_child_weight': 94, 'colsample_bytree': 0.36026284591828495, 'subsample': 0.4037418135854002}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:41:22,479][0m Trial 37 finished with value: 0.47972938696943307 and parameters: {'max_depth': 6, 'learning_rate': 0.0010145636382380375, 'n_estimators': 1149, 'gamma': 3.939850388925783, 'min_child_weight': 76, 'colsample_bytree': 0.4833668201350535, 'subsample': 0.5051624979372861}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:41:58,022][0m Trial 38 finished with value: 0.48830458651488506 and parameters: {'max_depth': 5, 'learning_rate': 0.007661536521683182, 'n_estimators': 6384, 'gamma': 4.700142988081277, 'min_child_weight': 5, 'colsample_bytree': 0.2090516297551661, 'subsample': 0.590628109985925}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:42:26,260][0m Trial 39 finished with value: 0.48036096828658736 and parameters: {'max_depth': 7, 'learning_rate': 0.0046938533436299635, 'n_estimators': 5100, 'gamma': 6.647162093677937, 'min_child_weight': 29, 'colsample_bytree': 0.36538815779300493, 'subsample': 0.3740841731829427}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:42:53,863][0m Trial 40 finished with value: 0.4608284737775074 and parameters: {'max_depth': 8, 'learning_rate': 0.010806618888476609, 'n_estimators': 2481, 'gamma': 2.782415802566236, 'min_child_weight': 54, 'colsample_bytree': 0.5142834987102537, 'subsample': 0.629194990145633}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:43:20,828][0m Trial 41 finished with value: 0.506480235201097 and parameters: {'max_depth': 6, 'learning_rate': 0.0043522094006558446, 'n_estimators': 3253, 'gamma': 5.520486180228829, 'min_child_weight': 85, 'colsample_bytree': 0.37873663354656617, 'subsample': 0.5381207845675502}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:43:54,522][0m Trial 42 finished with value: 0.507521082481235 and parameters: {'max_depth': 5, 'learning_rate': 0.0035619722062159498, 'n_estimators': 3989, 'gamma': 5.6539400220277996, 'min_child_weight': 100, 'colsample_bytree': 0.324542559387779, 'subsample': 0.5581313244674397}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:44:22,807][0m Trial 43 finished with value: 0.5093908311103272 and parameters: {'max_depth': 6, 'learning_rate': 0.036989609911895045, 'n_estimators': 3245, 'gamma': 5.1861456115461495, 'min_child_weight': 100, 'colsample_bytree': 0.31071095072521304, 'subsample': 0.5547975674827467}. Best is trial 20 with value: 0.5245713802187029.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:44:46,630][0m Trial 44 finished with value: 0.530432165008472 and parameters: {'max_depth': 7, 'learning_rate': 0.037306935544411675, 'n_estimators': 711, 'gamma': 6.424280577329913, 'min_child_weight': 100, 'colsample_bytree': 0.24827297782231844, 'subsample': 0.619874598507635}. Best is trial 44 with value: 0.530432165008472.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:45:03,515][0m Trial 45 finished with value: 0.4715245332422356 and parameters: {'max_depth': 7, 'learning_rate': 0.04036629684045467, 'n_estimators': 643, 'gamma': 6.549194111878676, 'min_child_weight': 19, 'colsample_bytree': 0.23992777525055897, 'subsample': 0.6344656241390978}. Best is trial 44 with value: 0.530432165008472.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:45:20,219][0m Trial 46 finished with value: 0.4958032395482125 and parameters: {'max_depth': 8, 'learning_rate': 0.04433950577859374, 'n_estimators': 648, 'gamma': 6.224315122292967, 'min_child_weight': 91, 'colsample_bytree': 0.298177303668393, 'subsample': 0.5976402253992847}. Best is trial 44 with value: 0.530432165008472.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:45:42,361][0m Trial 47 finished with value: 0.504642070466378 and parameters: {'max_depth': 7, 'learning_rate': 0.02574549384071337, 'n_estimators': 1517, 'gamma': 5.039489439751967, 'min_child_weight': 97, 'colsample_bytree': 0.235468754032218, 'subsample': 0.6611316409402814}. Best is trial 44 with value: 0.530432165008472.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:46:19,190][0m Trial 48 finished with value: 0.49495306333120104 and parameters: {'max_depth': 6, 'learning_rate': 0.06552301605033681, 'n_estimators': 2108, 'gamma': 4.137852017727796, 'min_child_weight': 45, 'colsample_bytree': 0.3307307522583229, 'subsample': 0.5758303625100999}. Best is trial 44 with value: 0.530432165008472.[0m


0 1 2 3 4 

[32m[I 2023-02-08 17:46:39,814][0m Trial 49 finished with value: 0.46689890820811986 and parameters: {'max_depth': 9, 'learning_rate': 0.03594590358675309, 'n_estimators': 523, 'gamma': 6.104786108515054, 'min_child_weight': 82, 'colsample_bytree': 0.2828770677063024, 'subsample': 0.7088671264589825}. Best is trial 44 with value: 0.530432165008472.[0m


In [21]:
study.best_trial.value

0.530432165008472

In [22]:
study.best_trial.params

{'max_depth': 7,
 'learning_rate': 0.037306935544411675,
 'n_estimators': 711,
 'gamma': 6.424280577329913,
 'min_child_weight': 100,
 'colsample_bytree': 0.24827297782231844,
 'subsample': 0.619874598507635}

# XGBoost Modeling

In [87]:
XGB_cv_scores, XGB_imp = list(), list()
preds = list()

## Running 5 times CV
# for i in range(5):
    
#     skf = StratifiedKFold(n_splits = 5, random_state = 40, shuffle = True)
skf = KFold(n_splits = 10, random_state = 42, shuffle = True)
    
for train_ix, test_ix in skf.split(X, Y):
        
    ## Splitting the data 
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
                
    ## Building RF model
    XGB_md = XGBRegressor(**study.best_trial.params, random_state = 0).fit(X_train, Y_train)
#     XGB_md = XGBRegressor(tree_method = 'hist',
#                           colsample_bytree = 0.2715039879238372, 
#                           gamma = 8.977910227926444, 
#                           learning_rate = 0.048945719202738824, 
#                           max_depth = 6, 
#                           min_child_weight = 71, 
#                           n_estimators = 1189, 
#                           subsample = 0.739476345973539, 
#                           random_state = 42).fit(X_train, Y_train)
    XGB_imp.append(XGB_md.feature_importances_)
        
    ## Predicting on X_test and test
    XGB_pred_1 = XGB_md.predict(X_test)
    XGB_pred_2 = XGB_md.predict(test_md)
        
    ## Applying Optimal Rounder (using abhishek approach)
    optR = OptimizedRounder()
    optR.fit(XGB_md.predict(X_train), Y_train)
    coef = optR.coefficients()
    XGB_pred_1 = optR.predict(XGB_pred_1, coef).astype(int)
    XGB_pred_2 = optR.predict(XGB_pred_2, coef).astype(int)
        
    ## Computing weighted quadratic kappa
    XGB_cv_scores.append(cohen_kappa_score(Y_test, XGB_pred_1, weights = 'quadratic'))
    preds.append(XGB_pred_2)

XGB_cv_score = np.mean(XGB_cv_scores)    
print('The average roc-auc score over 5-folds (run 5 times) is:', XGB_cv_score)

The average roc-auc score over 5-folds (run 5 times) is: 0.5415626356860228


In [80]:
a = [0, 1, 2, 3]
np.argmax(a)

3

In [81]:
a[3]

3

In [85]:
x = [i for i in range(5)]
np.argmax(x)

4

In [86]:
x

[0, 1, 2, 3, 4]

In [78]:
XGB_preds_test = pd.DataFrame(preds).mode(axis = 0).loc[0, ]

submission['quality'] = XGB_preds_test.astype(int)
submission.head()

Unnamed: 0,Id,quality
0,2056,5
1,2057,6
2,2058,5
3,2059,6
4,2060,6


In [79]:
submission['quality'].value_counts()

5    602
6    443
7    327
Name: quality, dtype: int64

In [75]:
submission['quality'].value_counts()

5    634
6    410
7    328
Name: quality, dtype: int64

In [76]:
submission.to_csv('XGB_Reg_FE_6.csv', index = False)

In [10]:
submission['quality'].value_counts()

5    622
6    417
7    333
Name: quality, dtype: int64

In [11]:
submission.to_csv('XGB_Reg_FE_5.csv', index = False)

In [40]:
XGB_md = XGBRegressor(**study.best_trial.params, 
                      random_state = 4).fit(X, Y)
# XGB_md = XGBRegressor(tree_method = 'hist',
#                       colsample_bytree = 0.2715039879238372, 
#                       gamma = 8.977910227926444, 
#                       learning_rate = 0.048945719202738824, 
#                       max_depth = 6, 
#                       min_child_weight = 71, 
#                       n_estimators = 1189, 
#                       subsample = 0.739476345973539, 
#                       random_state = 4).fit(X, Y)

optR = OptimizedRounder()
optR.fit(XGB_md.predict(X), Y)
coef = optR.coefficients()
XGB_pred = XGB_md.predict(test_md)
XGB_pred = optR.predict(XGB_pred, coef).astype(int)

In [41]:
submission['quality'] = XGB_pred
submission.head()

Unnamed: 0,Id,quality
0,2056,5
1,2057,6
2,2058,5
3,2059,6
4,2060,6


In [88]:
file_name = 'XGB_Reg_FUll_Seed_' + str(1) + '.csv'
file_name

'XGB_Reg_FUll_Seed_1.csv'

In [39]:
## seed = 42
submission['quality'].value_counts()

5    594
6    454
7    324
Name: quality, dtype: int64

In [42]:
## seed 4
submission['quality'].value_counts()

5    671
6    380
7    321
Name: quality, dtype: int64

In [15]:
submission.to_csv('XGB_Reg_full_FE_6.csv', index = False)

# Modeling XGBoost Like Crazy

In [48]:
XGB_cv_scores, XGB_imp = list(), list()
preds = list()

## Running 5 times CV
for i in range(100):
    
    print(i)
    ## Building RF model
    XGB_md = XGBRegressor(**study.best_trial.params, 
                          random_state = i).fit(X, Y)
    
#     XGB_imp.append(XGB_md.feature_importances_)
        
    ## Predicting on X_test and test
    XGB_pred_1 = XGB_md.predict(X)
    XGB_pred_2 = XGB_md.predict(test_md)
    
    optR = OptimizedRounder()
    optR.fit(XGB_pred_1, Y)
    coef = optR.coefficients()
    XGB_pred_1 = optR.predict(XGB_pred_1, coef).astype(int)
    XGB_pred_2 = optR.predict(XGB_pred_2, coef).astype(int)
    
    ## Computing roc-auc score
    XGB_cv_scores.append(cohen_kappa_score(Y, XGB_pred_1, weights = 'quadratic'))
    preds.append(XGB_pred_2)

XGB_cv_score = np.mean(XGB_cv_scores)    
print('The average roc-auc score over 5-folds (run 5 times) is:', XGB_cv_score)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
The average roc-auc score over 5-folds (run 5 times) is: 0.5650031992255752


In [49]:
 pd.DataFrame(preds)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,1322,1323,1324,1325,1326,1327,1328,1329,1330,1331,1332,1333,1334,1335,1336,1337,1338,1339,1340,1341,1342,1343,1344,1345,1346,1347,1348,1349,1350,1351,1352,1353,1354,1355,1356,1357,1358,1359,1360,1361,1362,1363,1364,1365,1366,1367,1368,1369,1370,1371
0,5,6,5,6,6,6,6,6,6,5,5,6,7,6,6,5,7,5,5,7,7,6,5,7,5,5,7,5,7,5,5,7,5,5,6,5,7,6,5,5,5,5,5,7,7,6,5,5,6,5,...,6,7,6,5,5,5,5,5,5,5,5,7,7,5,6,6,5,6,6,5,7,5,5,6,7,7,5,6,6,5,5,6,5,5,5,5,5,7,7,7,6,5,5,7,7,5,7,5,5,5
1,5,6,5,6,6,6,5,6,6,5,5,6,7,6,6,5,7,5,5,7,7,6,5,7,5,5,7,5,7,5,5,7,5,5,6,5,7,6,5,5,5,5,5,7,7,6,5,5,6,5,...,6,7,6,5,5,5,5,5,5,5,5,7,7,5,6,6,5,6,6,5,7,5,5,5,7,7,5,6,6,5,5,6,5,5,5,5,5,7,7,7,6,5,5,7,7,5,7,5,5,5
2,5,6,5,7,6,6,5,6,6,5,5,6,7,6,6,5,7,5,5,7,7,6,5,7,5,5,7,5,7,5,5,7,5,5,6,5,7,6,5,5,5,5,5,7,7,6,5,5,6,5,...,6,7,6,5,5,5,5,5,5,5,5,7,7,5,6,6,5,6,6,5,7,5,5,5,7,7,5,6,6,5,5,6,5,5,5,5,5,7,7,7,6,5,5,7,7,5,7,5,5,5
3,5,6,5,7,6,6,6,6,6,6,5,6,7,6,6,5,7,5,5,7,7,6,6,7,5,5,7,6,7,6,5,7,5,5,6,5,7,6,5,5,5,5,5,7,7,6,5,5,6,5,...,6,7,6,5,5,5,5,5,5,5,5,7,7,5,6,6,5,6,6,5,7,5,5,6,7,7,5,6,6,5,5,6,5,5,5,5,5,7,7,7,6,5,5,7,7,6,7,5,5,5
4,5,6,5,6,6,6,5,6,6,5,5,6,6,6,6,5,7,5,5,7,7,6,5,7,5,5,7,5,7,5,5,6,5,5,6,5,7,6,5,5,5,5,5,7,7,6,5,5,6,5,...,6,7,6,5,5,5,5,5,5,5,5,7,7,5,6,6,5,6,6,5,7,5,5,5,7,7,5,6,6,5,5,6,5,5,5,5,5,7,7,7,6,5,5,7,7,5,7,5,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,5,6,5,7,6,6,5,6,6,6,5,6,7,6,6,5,7,5,5,7,7,6,6,7,5,5,7,5,7,5,5,7,5,5,6,5,7,6,5,5,5,5,5,7,7,6,5,5,6,5,...,6,7,6,5,5,5,5,5,5,5,5,7,7,5,6,6,5,6,6,5,7,5,5,6,7,7,5,6,6,5,5,6,5,5,5,5,5,7,7,7,6,5,5,7,7,5,7,5,5,5
96,5,6,5,6,6,6,6,6,6,6,5,6,7,6,6,5,7,5,5,7,7,6,6,7,5,5,7,6,7,5,5,7,5,5,6,6,7,6,5,5,5,5,5,7,7,6,5,5,6,5,...,6,7,6,5,5,5,5,5,5,5,5,7,7,5,6,6,5,6,6,5,7,5,5,6,7,7,5,6,6,5,5,6,5,5,5,5,5,7,7,7,6,5,5,7,7,6,7,5,5,5
97,5,6,5,6,6,6,5,6,6,5,5,6,6,6,6,5,7,5,5,7,7,6,5,7,5,5,7,5,7,5,5,6,5,5,6,5,7,6,5,5,5,5,5,7,7,6,5,5,6,5,...,6,7,6,5,5,5,5,5,5,5,5,7,7,5,6,6,5,6,6,5,7,5,5,5,7,7,5,6,6,5,5,6,5,5,5,5,5,7,7,7,6,5,5,7,7,5,7,5,5,5
98,5,6,5,6,6,6,6,6,6,6,5,6,6,6,6,5,7,5,5,7,7,6,6,7,5,5,7,5,7,6,5,7,5,5,6,6,7,6,5,5,5,5,5,7,7,6,5,5,6,5,...,6,7,6,5,5,5,5,5,5,5,5,7,7,5,6,6,5,6,6,5,7,5,5,6,7,7,5,6,6,5,5,6,5,5,5,5,5,7,7,7,6,5,5,7,7,6,7,5,5,5


In [50]:
XGB_preds_test = pd.DataFrame(preds).mode(axis = 0).loc[0, ]

submission['quality'] = XGB_preds_test.astype(int)
submission.head()

Unnamed: 0,Id,quality
0,2056,5
1,2057,6
2,2058,5
3,2059,6
4,2060,6


In [51]:
submission['quality'].value_counts()

5    652
6    392
7    328
Name: quality, dtype: int64

In [52]:
submission.to_csv('XGB_Reg_full_100_FE_7.csv', index = False)