In [1]:
pip install xgboost lightgbm catboost optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [74]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.cluster import KMeans
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score, cohen_kappa_score, davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/TS-S3-Ep5/train.csv'
file_key_2 = 'Tabular-Playground-Series/TS-S3-Ep5/test.csv'
file_key_3 = 'Tabular-Playground-Series/TS-S3-Ep5/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

In [62]:
train = train[np.isin(train['quality'], [5, 6, 7])].reset_index(drop = True)
train['quality'].value_counts()

5    839
6    778
7    333
Name: quality, dtype: int64

In [23]:
train['quality'].value_counts() 
# / train.shape[0]

5    839
6    778
7    333
4     55
8     39
3     12
Name: quality, dtype: int64

In [11]:
train['quality'] = np.where(((train['quality'] < 5) | (train['quality'] > 7)), 4, train['quality'])
train['quality'].value_counts()

5    839
6    778
7    333
4    106
Name: quality, dtype: int64

In [12]:
train['quality'].value_counts() / train.shape[0]

5    0.408074
6    0.378405
7    0.161965
4    0.051556
Name: quality, dtype: float64

# Feature Engineering 

In [75]:
train['alcohol_density'] = train['alcohol'] * train['density']
train['sulphate/density'] = train['sulphates']  / train['density']
train['alcohol_sulphate'] = train['alcohol'] * train['sulphates']

test['alcohol_density'] = test['alcohol']  * test['density']
test['sulphate/density'] = test['sulphates']  / test['density']
test['alcohol_sulphate'] = test['alcohol'] * test['sulphates']

In [76]:
test_md = test.copy()

X = train[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates']]
# X = train.drop(columns = ['Id', 'quality'], axis = 1)
Y = train['quality'] - 3
# test_md = test_md.drop(columns = 'Id', axis = 1)
test_md = test_md[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates']]

# XGBoost Modeling

In [92]:
def objective(trial):
    
    ## Parameters to be evaluated
    param = dict(objective = 'multi:softmax',
                 eval_metric = 'mlogloss',
                 tree_method ='hist', 
                 random_state = 42,
                 max_depth = trial.suggest_int('max_depth', 2, 10),
                 learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log = True),
                 n_estimators = trial.suggest_int('n_estimators', 30, 10000),
                 gamma = trial.suggest_float('gamma', 0, 10),
                 min_child_weight = trial.suggest_int('min_child_weight', 1, 100),
                 colsample_bytree = trial.suggest_float('colsample_bytree', 0.2, 0.9),
                 subsample = trial.suggest_float('subsample', 0.2, 0.9)
                )

    scores = []

    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state=42)
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):
        print(fold, end = ' ')
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train , y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]

        model = XGBClassifier(**param)
        model.fit(X_train, y_train)

        preds_valid = model.predict(X_valid)
        score = cohen_kappa_score(y_valid,  preds_valid, weights = "quadratic")
        scores.append(score)
    return np.mean(scores)

In [93]:
study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials = 50, timeout = 3600)

[32m[I 2023-02-03 00:22:54,818][0m A new study created in memory with name: no-name-7f2c1fc0-ff7b-4608-9cc5-5ac1bdb2b089[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:23:17,987][0m Trial 0 finished with value: 0.5085029653995343 and parameters: {'max_depth': 7, 'learning_rate': 0.07893059761375568, 'n_estimators': 6239, 'gamma': 6.996574362263747, 'min_child_weight': 3, 'colsample_bytree': 0.6827053418368323, 'subsample': 0.8827994532173611}. Best is trial 0 with value: 0.5085029653995343.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:23:26,605][0m Trial 1 finished with value: 0.4794433632983127 and parameters: {'max_depth': 4, 'learning_rate': 0.0027765485982789414, 'n_estimators': 2192, 'gamma': 2.7654270668185674, 'min_child_weight': 69, 'colsample_bytree': 0.3716522890814079, 'subsample': 0.4392124782358095}. Best is trial 0 with value: 0.5085029653995343.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:23:48,185][0m Trial 2 finished with value: 0.4508549018646223 and parameters: {'max_depth': 3, 'learning_rate': 0.001533538496610234, 'n_estimators': 5552, 'gamma': 7.236939482341671, 'min_child_weight': 24, 'colsample_bytree': 0.38620147551103035, 'subsample': 0.3151538631939168}. Best is trial 0 with value: 0.5085029653995343.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:24:31,266][0m Trial 3 finished with value: 0.4697288460193517 and parameters: {'max_depth': 5, 'learning_rate': 0.000675696872988292, 'n_estimators': 9465, 'gamma': 3.098251838077214, 'min_child_weight': 31, 'colsample_bytree': 0.2900397512983707, 'subsample': 0.21076926693415593}. Best is trial 0 with value: 0.5085029653995343.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:24:38,889][0m Trial 4 finished with value: 0.39225522451128547 and parameters: {'max_depth': 7, 'learning_rate': 0.0003632985904051706, 'n_estimators': 1212, 'gamma': 2.620278707994669, 'min_child_weight': 60, 'colsample_bytree': 0.6033466435797181, 'subsample': 0.6800076328874772}. Best is trial 0 with value: 0.5085029653995343.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:24:45,082][0m Trial 5 finished with value: 0.366230721908767 and parameters: {'max_depth': 4, 'learning_rate': 0.00015126802692217228, 'n_estimators': 1196, 'gamma': 1.4110669826939903, 'min_child_weight': 67, 'colsample_bytree': 0.7410842068135965, 'subsample': 0.2981044498702003}. Best is trial 0 with value: 0.5085029653995343.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:24:58,397][0m Trial 6 finished with value: 0.49793502940613543 and parameters: {'max_depth': 5, 'learning_rate': 0.003437804533379529, 'n_estimators': 2645, 'gamma': 3.6851990293885195, 'min_child_weight': 28, 'colsample_bytree': 0.2520970534091917, 'subsample': 0.8975351531363702}. Best is trial 0 with value: 0.5085029653995343.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:25:51,393][0m Trial 7 finished with value: 0.49929786766182493 and parameters: {'max_depth': 8, 'learning_rate': 0.013248817106441862, 'n_estimators': 9549, 'gamma': 0.8444636906090397, 'min_child_weight': 15, 'colsample_bytree': 0.30013436990415554, 'subsample': 0.38245945076930765}. Best is trial 0 with value: 0.5085029653995343.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:26:10,199][0m Trial 8 finished with value: 0.3999980274898351 and parameters: {'max_depth': 2, 'learning_rate': 0.0076360729560522944, 'n_estimators': 4087, 'gamma': 0.6433193102059886, 'min_child_weight': 81, 'colsample_bytree': 0.6703852493756621, 'subsample': 0.39051009871659303}. Best is trial 0 with value: 0.5085029653995343.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:26:44,840][0m Trial 9 finished with value: 0.5194729719503379 and parameters: {'max_depth': 8, 'learning_rate': 0.0010583383972237734, 'n_estimators': 3847, 'gamma': 0.45980375115162286, 'min_child_weight': 30, 'colsample_bytree': 0.8413304308896525, 'subsample': 0.5568429064855053}. Best is trial 9 with value: 0.5194729719503379.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:27:19,364][0m Trial 10 finished with value: 0.3906003930460835 and parameters: {'max_depth': 10, 'learning_rate': 0.0001621726558887553, 'n_estimators': 7462, 'gamma': 5.183282899184295, 'min_child_weight': 98, 'colsample_bytree': 0.8992853705544194, 'subsample': 0.5844949622481617}. Best is trial 9 with value: 0.5194729719503379.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:27:39,907][0m Trial 11 finished with value: 0.5085457213534882 and parameters: {'max_depth': 9, 'learning_rate': 0.09798510589202353, 'n_estimators': 5818, 'gamma': 8.520134975695235, 'min_child_weight': 4, 'colsample_bytree': 0.8055879337771517, 'subsample': 0.801277032643908}. Best is trial 9 with value: 0.5194729719503379.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:27:55,474][0m Trial 12 finished with value: 0.49417645682407746 and parameters: {'max_depth': 10, 'learning_rate': 0.09001748280972952, 'n_estimators': 3943, 'gamma': 8.888510844878995, 'min_child_weight': 41, 'colsample_bytree': 0.880363901083819, 'subsample': 0.7138302285399748}. Best is trial 9 with value: 0.5194729719503379.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:28:26,805][0m Trial 13 finished with value: 0.499072102482204 and parameters: {'max_depth': 9, 'learning_rate': 0.02534906099164336, 'n_estimators': 7409, 'gamma': 9.967299914244725, 'min_child_weight': 4, 'colsample_bytree': 0.7948789919702535, 'subsample': 0.5068862957561383}. Best is trial 9 with value: 0.5194729719503379.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:29:05,959][0m Trial 14 finished with value: 0.5130538916517584 and parameters: {'max_depth': 8, 'learning_rate': 0.0010101520454277885, 'n_estimators': 4091, 'gamma': 0.06188980201689731, 'min_child_weight': 45, 'colsample_bytree': 0.8008712313157906, 'subsample': 0.7543966772168464}. Best is trial 9 with value: 0.5194729719503379.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:29:34,622][0m Trial 15 finished with value: 0.5188856303038637 and parameters: {'max_depth': 7, 'learning_rate': 0.0009910549638214169, 'n_estimators': 3874, 'gamma': 0.09123515286066092, 'min_child_weight': 46, 'colsample_bytree': 0.5513148711668796, 'subsample': 0.6108911982985201}. Best is trial 9 with value: 0.5194729719503379.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:29:53,457][0m Trial 16 finished with value: 0.5099625681666311 and parameters: {'max_depth': 6, 'learning_rate': 0.0005566959177259191, 'n_estimators': 2925, 'gamma': 1.6640683721616991, 'min_child_weight': 52, 'colsample_bytree': 0.5177935409617288, 'subsample': 0.6048476429047772}. Best is trial 9 with value: 0.5194729719503379.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:29:54,978][0m Trial 17 finished with value: 0.47240833380031544 and parameters: {'max_depth': 7, 'learning_rate': 0.00030956620036849724, 'n_estimators': 240, 'gamma': 0.08939723875993226, 'min_child_weight': 35, 'colsample_bytree': 0.5551610409887114, 'subsample': 0.5280551826047322}. Best is trial 9 with value: 0.5194729719503379.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:30:22,114][0m Trial 18 finished with value: 0.5105044352130692 and parameters: {'max_depth': 6, 'learning_rate': 0.0015505111239113244, 'n_estimators': 4914, 'gamma': 1.7198653177123797, 'min_child_weight': 19, 'colsample_bytree': 0.48976205458321564, 'subsample': 0.6369501405021187}. Best is trial 9 with value: 0.5194729719503379.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:30:57,359][0m Trial 19 finished with value: 0.40456182481292374 and parameters: {'max_depth': 8, 'learning_rate': 0.00010480400379578809, 'n_estimators': 6872, 'gamma': 4.3437269655641, 'min_child_weight': 49, 'colsample_bytree': 0.6225837666340059, 'subsample': 0.5180627778947544}. Best is trial 9 with value: 0.5194729719503379.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:31:11,050][0m Trial 20 finished with value: 0.5152332334128611 and parameters: {'max_depth': 9, 'learning_rate': 0.002886889572697518, 'n_estimators': 3330, 'gamma': 1.773514969584552, 'min_child_weight': 85, 'colsample_bytree': 0.7330070659888385, 'subsample': 0.6494948799708096}. Best is trial 9 with value: 0.5194729719503379.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:31:24,755][0m Trial 21 finished with value: 0.5216547210012331 and parameters: {'max_depth': 9, 'learning_rate': 0.0027905654449033503, 'n_estimators': 3084, 'gamma': 0.9969456811320532, 'min_child_weight': 76, 'colsample_bytree': 0.7431582760184835, 'subsample': 0.6453494091541063}. Best is trial 21 with value: 0.5216547210012331.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:32:03,096][0m Trial 22 finished with value: 0.5248974143171334 and parameters: {'max_depth': 8, 'learning_rate': 0.0016774237649515198, 'n_estimators': 4863, 'gamma': 0.08765622887047725, 'min_child_weight': 57, 'colsample_bytree': 0.851367667390416, 'subsample': 0.5925525343085731}. Best is trial 22 with value: 0.5248974143171334.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:32:23,387][0m Trial 23 finished with value: 0.522020422748992 and parameters: {'max_depth': 9, 'learning_rate': 0.004629494420844623, 'n_estimators': 4892, 'gamma': 0.9356942741905432, 'min_child_weight': 75, 'colsample_bytree': 0.8535905498692357, 'subsample': 0.584718493623243}. Best is trial 22 with value: 0.5248974143171334.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:32:43,189][0m Trial 24 finished with value: 0.5197149506137142 and parameters: {'max_depth': 10, 'learning_rate': 0.004548823245333882, 'n_estimators': 4647, 'gamma': 1.2230846478679414, 'min_child_weight': 79, 'colsample_bytree': 0.862653998609899, 'subsample': 0.6945863984446452}. Best is trial 22 with value: 0.5248974143171334.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:33:06,091][0m Trial 25 finished with value: 0.5208505708281754 and parameters: {'max_depth': 9, 'learning_rate': 0.00787497967463716, 'n_estimators': 5237, 'gamma': 1.9365773337414003, 'min_child_weight': 71, 'colsample_bytree': 0.7607860468022383, 'subsample': 0.581844019372575}. Best is trial 22 with value: 0.5248974143171334.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:33:44,129][0m Trial 26 finished with value: 0.5198093610209442 and parameters: {'max_depth': 9, 'learning_rate': 0.002104751375900819, 'n_estimators': 8268, 'gamma': 0.9832868093251742, 'min_child_weight': 96, 'colsample_bytree': 0.8370295107457163, 'subsample': 0.6529087287872735}. Best is trial 22 with value: 0.5248974143171334.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:33:52,943][0m Trial 27 finished with value: 0.5155876410574717 and parameters: {'max_depth': 10, 'learning_rate': 0.004972864357075661, 'n_estimators': 1874, 'gamma': 2.3051986125920783, 'min_child_weight': 88, 'colsample_bytree': 0.8987332834885708, 'subsample': 0.7421096847580667}. Best is trial 22 with value: 0.5248974143171334.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:34:25,945][0m Trial 28 finished with value: 0.5216888422082337 and parameters: {'max_depth': 8, 'learning_rate': 0.0019618824708241885, 'n_estimators': 6201, 'gamma': 1.1187673197336756, 'min_child_weight': 59, 'colsample_bytree': 0.8293048821841895, 'subsample': 0.47241471215293596}. Best is trial 22 with value: 0.5248974143171334.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:35:00,977][0m Trial 29 finished with value: 0.5161036888666349 and parameters: {'max_depth': 7, 'learning_rate': 0.0017479917496013922, 'n_estimators': 6408, 'gamma': 2.293497872173269, 'min_child_weight': 59, 'colsample_bytree': 0.7104909332296258, 'subsample': 0.48584737975007275}. Best is trial 22 with value: 0.5248974143171334.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:35:25,337][0m Trial 30 finished with value: 0.525318569237453 and parameters: {'max_depth': 8, 'learning_rate': 0.026122871155099082, 'n_estimators': 6402, 'gamma': 3.1396271213562654, 'min_child_weight': 60, 'colsample_bytree': 0.8410522417328885, 'subsample': 0.5455978571912129}. Best is trial 30 with value: 0.525318569237453.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:35:55,920][0m Trial 31 finished with value: 0.5154270903865 and parameters: {'max_depth': 8, 'learning_rate': 0.04325746467585684, 'n_estimators': 6356, 'gamma': 1.0414417834780145, 'min_child_weight': 61, 'colsample_bytree': 0.8288931309842342, 'subsample': 0.5592841772325492}. Best is trial 30 with value: 0.525318569237453.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:36:27,424][0m Trial 32 finished with value: 0.5236469124193771 and parameters: {'max_depth': 7, 'learning_rate': 0.01789579590440998, 'n_estimators': 7726, 'gamma': 3.0510717896703845, 'min_child_weight': 55, 'colsample_bytree': 0.783517305642641, 'subsample': 0.4623067054997251}. Best is trial 30 with value: 0.525318569237453.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:37:01,170][0m Trial 33 finished with value: 0.5254595310602787 and parameters: {'max_depth': 7, 'learning_rate': 0.01987983313416184, 'n_estimators': 8426, 'gamma': 3.4103969327748294, 'min_child_weight': 53, 'colsample_bytree': 0.7788197441332908, 'subsample': 0.4567715034504576}. Best is trial 33 with value: 0.5254595310602787.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:37:40,911][0m Trial 34 finished with value: 0.522336329883723 and parameters: {'max_depth': 6, 'learning_rate': 0.05117714701280564, 'n_estimators': 8568, 'gamma': 3.0368809732933966, 'min_child_weight': 54, 'colsample_bytree': 0.7930258065435288, 'subsample': 0.4621850609796544}. Best is trial 33 with value: 0.5254595310602787.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:38:17,655][0m Trial 35 finished with value: 0.5261690076603357 and parameters: {'max_depth': 7, 'learning_rate': 0.022667047240201684, 'n_estimators': 8601, 'gamma': 3.671879471101324, 'min_child_weight': 64, 'colsample_bytree': 0.6902992261005392, 'subsample': 0.41937284537087716}. Best is trial 35 with value: 0.5261690076603357.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:38:53,790][0m Trial 36 finished with value: 0.5264724022238702 and parameters: {'max_depth': 5, 'learning_rate': 0.02843876708619157, 'n_estimators': 8844, 'gamma': 5.158819701888897, 'min_child_weight': 64, 'colsample_bytree': 0.6951789775413684, 'subsample': 0.42066221683759925}. Best is trial 36 with value: 0.5264724022238702.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:39:27,967][0m Trial 37 finished with value: 0.5274373986835524 and parameters: {'max_depth': 5, 'learning_rate': 0.031005167601864923, 'n_estimators': 8851, 'gamma': 5.555121417819512, 'min_child_weight': 68, 'colsample_bytree': 0.6878264653902313, 'subsample': 0.41423100774189403}. Best is trial 37 with value: 0.5274373986835524.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:40:08,639][0m Trial 38 finished with value: 0.5232539950894436 and parameters: {'max_depth': 5, 'learning_rate': 0.049126290331453284, 'n_estimators': 9937, 'gamma': 5.425282013005059, 'min_child_weight': 67, 'colsample_bytree': 0.6841530473909936, 'subsample': 0.39991267025781}. Best is trial 37 with value: 0.5274373986835524.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:40:48,479][0m Trial 39 finished with value: 0.5178936320253843 and parameters: {'max_depth': 4, 'learning_rate': 0.06089135517478072, 'n_estimators': 8770, 'gamma': 5.864537756581539, 'min_child_weight': 67, 'colsample_bytree': 0.671353895446417, 'subsample': 0.43111239379423444}. Best is trial 37 with value: 0.5274373986835524.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:41:24,211][0m Trial 40 finished with value: 0.5238780090331367 and parameters: {'max_depth': 4, 'learning_rate': 0.034795997874491176, 'n_estimators': 9034, 'gamma': 4.486175490983368, 'min_child_weight': 65, 'colsample_bytree': 0.6373303983055624, 'subsample': 0.3341626721329876}. Best is trial 37 with value: 0.5274373986835524.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:41:56,345][0m Trial 41 finished with value: 0.5214984022006196 and parameters: {'max_depth': 5, 'learning_rate': 0.028251430619415076, 'n_estimators': 8054, 'gamma': 3.889312276774965, 'min_child_weight': 73, 'colsample_bytree': 0.7599673613153756, 'subsample': 0.4369443321156835}. Best is trial 37 with value: 0.5274373986835524.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:42:32,693][0m Trial 42 finished with value: 0.519098451058144 and parameters: {'max_depth': 6, 'learning_rate': 0.020749186464712577, 'n_estimators': 9268, 'gamma': 3.6038361869990254, 'min_child_weight': 64, 'colsample_bytree': 0.713643034379977, 'subsample': 0.343130450149677}. Best is trial 37 with value: 0.5274373986835524.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:43:14,286][0m Trial 43 finished with value: 0.5157884774577184 and parameters: {'max_depth': 5, 'learning_rate': 0.014149724968483391, 'n_estimators': 9857, 'gamma': 5.956547301860306, 'min_child_weight': 40, 'colsample_bytree': 0.7632303036824397, 'subsample': 0.43095789777866095}. Best is trial 37 with value: 0.5274373986835524.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:43:46,798][0m Trial 44 finished with value: 0.5264100482557034 and parameters: {'max_depth': 6, 'learning_rate': 0.06567790400842985, 'n_estimators': 6924, 'gamma': 4.628562039775121, 'min_child_weight': 50, 'colsample_bytree': 0.70062939526261, 'subsample': 0.37348730349683}. Best is trial 37 with value: 0.5274373986835524.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:44:14,655][0m Trial 45 finished with value: 0.5252001059102014 and parameters: {'max_depth': 3, 'learning_rate': 0.0685847805258, 'n_estimators': 7021, 'gamma': 4.675066153962026, 'min_child_weight': 47, 'colsample_bytree': 0.6822263620470372, 'subsample': 0.36675289497345864}. Best is trial 37 with value: 0.5274373986835524.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:44:46,218][0m Trial 46 finished with value: 0.5227638416693752 and parameters: {'max_depth': 6, 'learning_rate': 0.04268696465967494, 'n_estimators': 8099, 'gamma': 4.905465463506295, 'min_child_weight': 37, 'colsample_bytree': 0.6431808089783985, 'subsample': 0.2868477023273117}. Best is trial 37 with value: 0.5274373986835524.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:45:21,069][0m Trial 47 finished with value: 0.52442386967912 and parameters: {'max_depth': 5, 'learning_rate': 0.06448770549718999, 'n_estimators': 8587, 'gamma': 4.0241892549138765, 'min_child_weight': 53, 'colsample_bytree': 0.5972652787294238, 'subsample': 0.4109372286493358}. Best is trial 37 with value: 0.5274373986835524.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:45:52,474][0m Trial 48 finished with value: 0.5211516995739742 and parameters: {'max_depth': 6, 'learning_rate': 0.033468856471462576, 'n_estimators': 9142, 'gamma': 5.467375625464141, 'min_child_weight': 70, 'colsample_bytree': 0.7241407391058825, 'subsample': 0.37232947007738193}. Best is trial 37 with value: 0.5274373986835524.[0m


0 1 2 3 4 

[32m[I 2023-02-03 00:46:19,343][0m Trial 49 finished with value: 0.522761525250149 and parameters: {'max_depth': 3, 'learning_rate': 0.09770337536146823, 'n_estimators': 7627, 'gamma': 3.534787553175006, 'min_child_weight': 63, 'colsample_bytree': 0.6870582102458276, 'subsample': 0.49150059163150434}. Best is trial 37 with value: 0.5274373986835524.[0m


In [101]:
XGB_cv_scores, XGB_imp = list(), list()
preds = list()

## Running 5 times CV
for i in range(5):
    
    skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
    
    for train_ix, test_ix in skf.split(X, Y):
        
        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
                
        ## Building RF model
        XGB_md = XGBClassifier(tree_method = 'hist',
                               colsample_bytree = 0.7, 
                               gamma = 5.5, 
                               learning_rate = 0.031, 
                               max_depth = 5, 
                               min_child_weight = 68, 
                               n_estimators = 8800, 
                               subsample = 0.41, 
                               random_state = 42).fit(X_train, Y_train)
        XGB_imp.append(XGB_md.feature_importances_)
        
        ## Predicting on X_test and test
        XGB_pred_1 = XGB_md.predict(X_test)
        XGB_pred_2 = XGB_md.predict(test_md)
        
        ## Computing roc-auc score
        XGB_cv_scores.append(cohen_kappa_score(Y_test, XGB_pred_1, weights = 'quadratic'))
        preds.append(XGB_pred_2)

XGB_cv_score = np.mean(XGB_cv_scores)    
print('The average roc-auc score over 5-folds (run 5 times) is:', XGB_cv_score)

The average roc-auc score over 5-folds (run 5 times) is: 0.5280753543267158


In [102]:
XGB_preds_test = pd.DataFrame(preds).mode(axis = 0).loc[0, ] + 3

submission['quality'] = XGB_preds_test.astype(int)
submission.head()

Unnamed: 0,Id,quality
0,2056,5
1,2057,6
2,2058,5
3,2059,6
4,2060,6


In [103]:
submission['quality'].value_counts()

5    616
6    564
7    192
Name: quality, dtype: int64

In [104]:
submission.to_csv('XGB_baseline_FE_9.csv', index = False)

# LightGBM

In [55]:
lgb_cv_scores, lgb_imp = list(), list()
preds = list()

## Running 5 times CV
for i in range(5):
    
    skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
    
    for train_ix, test_ix in skf.split(X, Y):
        
        ## Splitting the data 
        X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
        Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
                
        ## Building RF model
        lgb_md = LGBMClassifier(n_estimators = 1000,
                                max_depth = 5,
                                learning_rate = 0.01,
                                num_leaves = 10,
                                lambda_l1 = 3,
                                lambda_l2 = 3,
                                bagging_fraction = 0.7,
                                feature_fraction = 0.7).fit(X_train, Y_train)
        lgb_imp.append(lgb_md.feature_importances_)
        
        ## Predicting on X_test and test
        lgb_pred_1 = lgb_md.predict(X_test)
        lgb_pred_2 = lgb_md.predict(test_md)
        
        ## Computing roc-auc score
        lgb_cv_scores.append(cohen_kappa_score(Y_test, lgb_pred_1, weights = 'quadratic'))
        preds.append(lgb_pred_2)

lgb_cv_score = np.mean(lgb_cv_scores)    
print('The average oof roc-auc score over 5-folds (run 5 times) is:', lgb_cv_score)

The average oof roc-auc score over 5-folds (run 5 times) is: 0.5408678479398848


In [56]:
lgb_preds_test = pd.DataFrame(preds).mode(axis = 0).loc[0, ] + 5

submission['quality'] = lgb_preds_test.astype(int)
submission.head()

Unnamed: 0,Id,quality
0,2056,5
1,2057,6
2,2058,6
3,2059,6
4,2060,6


In [57]:
submission['quality'].value_counts()

5    603
6    602
7    167
Name: quality, dtype: int64