In [1]:
pip install lightgbm optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.9.2-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 kB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.

In [2]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.cluster import KMeans
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score, cohen_kappa_score, davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor
# from xgboost import XGBClassifier, XGBRegressor
# from catboost import CatBoostClassifier

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/TS-S3-Ep5/train.csv'
file_key_2 = 'Tabular-Playground-Series/TS-S3-Ep5/test.csv'
file_key_3 = 'Tabular-Playground-Series/TS-S3-Ep5/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

## Enginering features
train['alcohol_density'] = train['alcohol'] * train['density']
train['sulphate/density'] = train['sulphates']  / train['density']
train['alcohol_sulphate'] = train['alcohol'] * train['sulphates']

test['alcohol_density'] = test['alcohol']  * test['density']
test['sulphate/density'] = test['sulphates']  / test['density']
test['alcohol_sulphate'] = test['alcohol'] * test['sulphates']

test_md = test.copy()

X = train[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates']]
Y = train['quality'] 

test_md = test_md[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates']]

# Optimal Rounder

In [3]:
from functools import partial
import numpy as np
import scipy as sp

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8

        ll = cohen_kappa_score(y, X_p, weights = 'quadratic')
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X = X, y = y)
        initial_coef = [3.5, 4.5, 5.5, 6.5, 7.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method = 'nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8
        return X_p

    def coefficients(self):
        return self.coef_['x']

# Optuna Optimzation

In [5]:
def objective(trial):
    
    ## Parameters to be evaluated
    param = dict(objective = 'regression_l1',
                 verbose = -100,
                 boosting_type = 'gbdt', 
                 random_state = 929,
                 n_estimators = trial.suggest_int('n_estimators', 300, 10000),
                 learning_rate = trial.suggest_float('learning_rate', 0.001, 1, log=True),
                 max_depth = trial.suggest_int('max_depth', 3, 12),
                 lambda_l1 = trial.suggest_float('lambda_l1', 0.01, 10.0, log=True),
                 lambda_l2 = trial.suggest_float('lambda_l2', 0.01, 10.0, log=True),
                 num_leaves = trial.suggest_int('num_leaves', 2, 100),
                 bagging_fraction = trial.suggest_float('bagging_fraction', 0.2, 0.9),
                 feature_fraction = trial.suggest_float('feature_fraction', 0.2, 0.9)
                 )

    
    scores = []

#     skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    skf = KFold(n_splits = 5, shuffle = True, random_state = 929)
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):
        
        print(fold, end = ' ')
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train , y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]

        model = LGBMRegressor(**param)
        model.fit(X_train, y_train)

        preds_valid = model.predict(X_valid)
        
        optR = OptimizedRounder()
        optR.fit(preds_valid, y_valid)
        coef = optR.coefficients()
        preds_valid = optR.predict(preds_valid, coef).astype(int)
        
        score = cohen_kappa_score(y_valid,  preds_valid, weights = "quadratic")
        scores.append(score)
        
    return np.mean(scores)

In [6]:
study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials = 50, timeout = 3600)

[32m[I 2023-02-07 02:29:38,277][0m A new study created in memory with name: no-name-338afae0-cbfa-488d-b979-11012939aa7b[0m




[32m[I 2023-02-07 02:29:57,610][0m Trial 0 finished with value: 0.5368140012168374 and parameters: {'n_estimators': 9296, 'learning_rate': 0.026558910851302436, 'max_depth': 6, 'lambda_l1': 4.582742665666303, 'lambda_l2': 0.03460127173380391, 'num_leaves': 37, 'bagging_fraction': 0.6598751261273172, 'feature_fraction': 0.5939549378494506}. Best is trial 0 with value: 0.5368140012168374.[0m




[32m[I 2023-02-07 02:30:08,502][0m Trial 1 finished with value: 0.5288062218829817 and parameters: {'n_estimators': 3208, 'learning_rate': 0.1262903061626408, 'max_depth': 7, 'lambda_l1': 0.21561964317607024, 'lambda_l2': 0.013699935120781047, 'num_leaves': 72, 'bagging_fraction': 0.3260913281683937, 'feature_fraction': 0.8456116454653508}. Best is trial 0 with value: 0.5368140012168374.[0m




[32m[I 2023-02-07 02:30:15,267][0m Trial 2 finished with value: 0.56832660477582 and parameters: {'n_estimators': 2221, 'learning_rate': 0.00188655572643738, 'max_depth': 10, 'lambda_l1': 0.9228415924358394, 'lambda_l2': 4.780670669490834, 'num_leaves': 99, 'bagging_fraction': 0.523177551845722, 'feature_fraction': 0.24513576962676087}. Best is trial 2 with value: 0.56832660477582.[0m




[32m[I 2023-02-07 02:30:17,764][0m Trial 3 finished with value: 0.5326722781361822 and parameters: {'n_estimators': 362, 'learning_rate': 0.17577522753452496, 'max_depth': 11, 'lambda_l1': 4.819756502079283, 'lambda_l2': 0.015716365222118913, 'num_leaves': 90, 'bagging_fraction': 0.34981793938828015, 'feature_fraction': 0.6913906927695924}. Best is trial 2 with value: 0.56832660477582.[0m




[32m[I 2023-02-07 02:30:27,716][0m Trial 4 finished with value: 0.5328000674665038 and parameters: {'n_estimators': 9099, 'learning_rate': 0.23746989473519495, 'max_depth': 4, 'lambda_l1': 6.943498075706224, 'lambda_l2': 3.2604732137553962, 'num_leaves': 55, 'bagging_fraction': 0.5041460095035251, 'feature_fraction': 0.39232870471777576}. Best is trial 2 with value: 0.56832660477582.[0m




[32m[I 2023-02-07 02:30:40,194][0m Trial 5 finished with value: 0.5598984907909614 and parameters: {'n_estimators': 3255, 'learning_rate': 0.010862714152269238, 'max_depth': 10, 'lambda_l1': 0.01934868569799695, 'lambda_l2': 1.1075445043065988, 'num_leaves': 71, 'bagging_fraction': 0.2859486256480403, 'feature_fraction': 0.5649866543079058}. Best is trial 2 with value: 0.56832660477582.[0m




[32m[I 2023-02-07 02:30:54,208][0m Trial 6 finished with value: 0.5459720105271233 and parameters: {'n_estimators': 6580, 'learning_rate': 0.006274196168024323, 'max_depth': 6, 'lambda_l1': 1.7779229755633081, 'lambda_l2': 0.3254737865972963, 'num_leaves': 19, 'bagging_fraction': 0.25338661209859237, 'feature_fraction': 0.8594800467389019}. Best is trial 2 with value: 0.56832660477582.[0m




[32m[I 2023-02-07 02:31:01,438][0m Trial 7 finished with value: 0.5591916107313052 and parameters: {'n_estimators': 6130, 'learning_rate': 0.003982207394874493, 'max_depth': 3, 'lambda_l1': 0.012286382587799257, 'lambda_l2': 0.01777298432253797, 'num_leaves': 28, 'bagging_fraction': 0.7832096743066028, 'feature_fraction': 0.43752438855042286}. Best is trial 2 with value: 0.56832660477582.[0m




[32m[I 2023-02-07 02:31:04,616][0m Trial 8 finished with value: 0.5269093322813712 and parameters: {'n_estimators': 435, 'learning_rate': 0.3914078438863934, 'max_depth': 9, 'lambda_l1': 1.469486682408759, 'lambda_l2': 0.0144365452864539, 'num_leaves': 51, 'bagging_fraction': 0.7874365146739215, 'feature_fraction': 0.8326830501382243}. Best is trial 2 with value: 0.56832660477582.[0m




[32m[I 2023-02-07 02:31:14,010][0m Trial 9 finished with value: 0.5183176995881833 and parameters: {'n_estimators': 5466, 'learning_rate': 0.014474920766450355, 'max_depth': 12, 'lambda_l1': 0.08174616310726539, 'lambda_l2': 6.180787444262563, 'num_leaves': 12, 'bagging_fraction': 0.29175422702908316, 'feature_fraction': 0.6748535589224531}. Best is trial 2 with value: 0.56832660477582.[0m




[32m[I 2023-02-07 02:31:21,822][0m Trial 10 finished with value: 0.5702790253433532 and parameters: {'n_estimators': 2708, 'learning_rate': 0.0012888666438808346, 'max_depth': 9, 'lambda_l1': 0.6537151002553993, 'lambda_l2': 7.841836180289107, 'num_leaves': 99, 'bagging_fraction': 0.5268764937391888, 'feature_fraction': 0.2182839576359682}. Best is trial 10 with value: 0.5702790253433532.[0m




[32m[I 2023-02-07 02:31:29,867][0m Trial 11 finished with value: 0.5617701919188737 and parameters: {'n_estimators': 2899, 'learning_rate': 0.0012465824558000277, 'max_depth': 9, 'lambda_l1': 0.7488144444196517, 'lambda_l2': 6.001755496923743, 'num_leaves': 100, 'bagging_fraction': 0.5072105092695441, 'feature_fraction': 0.21774942421185625}. Best is trial 10 with value: 0.5702790253433532.[0m




[32m[I 2023-02-07 02:31:37,318][0m Trial 12 finished with value: 0.5662813971154071 and parameters: {'n_estimators': 2214, 'learning_rate': 0.0013665465554101308, 'max_depth': 9, 'lambda_l1': 0.3987575872918385, 'lambda_l2': 2.2289708289641164, 'num_leaves': 86, 'bagging_fraction': 0.610828680944249, 'feature_fraction': 0.2017746222390157}. Best is trial 10 with value: 0.5702790253433532.[0m




[32m[I 2023-02-07 02:31:43,408][0m Trial 13 finished with value: 0.5646626493883706 and parameters: {'n_estimators': 1896, 'learning_rate': 0.002607197145212494, 'max_depth': 12, 'lambda_l1': 0.22505378906955809, 'lambda_l2': 8.932972573445591, 'num_leaves': 80, 'bagging_fraction': 0.4576198085168098, 'feature_fraction': 0.3057398202544042}. Best is trial 10 with value: 0.5702790253433532.[0m




[32m[I 2023-02-07 02:31:58,133][0m Trial 14 finished with value: 0.5635360888658199 and parameters: {'n_estimators': 4310, 'learning_rate': 0.0025923042996312986, 'max_depth': 8, 'lambda_l1': 1.4543610797560076, 'lambda_l2': 1.4779174956211392, 'num_leaves': 99, 'bagging_fraction': 0.42268818958080434, 'feature_fraction': 0.31998777808929124}. Best is trial 10 with value: 0.5702790253433532.[0m




[32m[I 2023-02-07 02:32:13,965][0m Trial 15 finished with value: 0.5705895610348328 and parameters: {'n_estimators': 4575, 'learning_rate': 0.0014062053449784818, 'max_depth': 10, 'lambda_l1': 0.5222377117960593, 'lambda_l2': 9.760421460375074, 'num_leaves': 62, 'bagging_fraction': 0.5804601027862397, 'feature_fraction': 0.42158491445179647}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:32:31,971][0m Trial 16 finished with value: 0.562606893821896 and parameters: {'n_estimators': 6979, 'learning_rate': 0.004723861394561081, 'max_depth': 7, 'lambda_l1': 0.10125182023829092, 'lambda_l2': 9.2576285774692, 'num_leaves': 58, 'bagging_fraction': 0.6175531628857972, 'feature_fraction': 0.45232632133004047}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:32:36,127][0m Trial 17 finished with value: 0.5381676162293615 and parameters: {'n_estimators': 4479, 'learning_rate': 0.06255820483600266, 'max_depth': 11, 'lambda_l1': 0.4798238397954659, 'lambda_l2': 0.8689171601201627, 'num_leaves': 3, 'bagging_fraction': 0.8952316161543652, 'feature_fraction': 0.3254150286595468}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:32:48,472][0m Trial 18 finished with value: 0.5621378000637034 and parameters: {'n_estimators': 4182, 'learning_rate': 0.0010725712777571728, 'max_depth': 8, 'lambda_l1': 2.8197408581937804, 'lambda_l2': 2.558022663842123, 'num_leaves': 38, 'bagging_fraction': 0.40374574780658923, 'feature_fraction': 0.4767173654735512}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:33:14,425][0m Trial 19 finished with value: 0.5586996320814259 and parameters: {'n_estimators': 7537, 'learning_rate': 0.001035001625740501, 'max_depth': 10, 'lambda_l1': 0.7799374866704374, 'lambda_l2': 3.444508355341562, 'num_leaves': 70, 'bagging_fraction': 0.5635535737280362, 'feature_fraction': 0.3750459858747271}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:33:20,698][0m Trial 20 finished with value: 0.5649427335864369 and parameters: {'n_estimators': 4824, 'learning_rate': 0.006449371464044226, 'max_depth': 5, 'lambda_l1': 8.206327694478105, 'lambda_l2': 9.810588088756903, 'num_leaves': 47, 'bagging_fraction': 0.21117378313555607, 'feature_fraction': 0.26055568699420745}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:33:26,393][0m Trial 21 finished with value: 0.5704058494439211 and parameters: {'n_estimators': 1810, 'learning_rate': 0.0022495602540919726, 'max_depth': 10, 'lambda_l1': 0.8061963535718971, 'lambda_l2': 4.561114045792595, 'num_leaves': 90, 'bagging_fraction': 0.5225771423121834, 'feature_fraction': 0.2675809176113483}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:33:31,315][0m Trial 22 finished with value: 0.5685385715246746 and parameters: {'n_estimators': 1360, 'learning_rate': 0.0027168311259498767, 'max_depth': 11, 'lambda_l1': 0.43420263049686747, 'lambda_l2': 4.777386069557158, 'num_leaves': 85, 'bagging_fraction': 0.4622607672451613, 'feature_fraction': 0.29750638927102685}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:33:39,703][0m Trial 23 finished with value: 0.5625543639915218 and parameters: {'n_estimators': 3689, 'learning_rate': 0.0022089080494461454, 'max_depth': 9, 'lambda_l1': 2.4215487817860413, 'lambda_l2': 2.007534439976968, 'num_leaves': 79, 'bagging_fraction': 0.5599558205878411, 'feature_fraction': 0.20423840716965955}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:33:44,987][0m Trial 24 finished with value: 0.5639309086424912 and parameters: {'n_estimators': 1623, 'learning_rate': 0.0036923025575963555, 'max_depth': 10, 'lambda_l1': 0.9074837243914244, 'lambda_l2': 3.9684826083033955, 'num_leaves': 63, 'bagging_fraction': 0.3765080630184825, 'feature_fraction': 0.36693303795587995}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:34:01,090][0m Trial 25 finished with value: 0.5660866521089843 and parameters: {'n_estimators': 5389, 'learning_rate': 0.0017262714685939834, 'max_depth': 8, 'lambda_l1': 0.26284648429317103, 'lambda_l2': 0.6714524926030995, 'num_leaves': 92, 'bagging_fraction': 0.458447502021483, 'feature_fraction': 0.26796753055976025}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:34:05,147][0m Trial 26 finished with value: 0.5693175263149979 and parameters: {'n_estimators': 965, 'learning_rate': 0.0010492363892694748, 'max_depth': 11, 'lambda_l1': 0.6039889776250604, 'lambda_l2': 6.326077019409926, 'num_leaves': 80, 'bagging_fraction': 0.6495112034570562, 'feature_fraction': 0.26470567751057755}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:34:12,282][0m Trial 27 finished with value: 0.5583142525479283 and parameters: {'n_estimators': 2628, 'learning_rate': 0.008560182443561765, 'max_depth': 12, 'lambda_l1': 1.2378788718475535, 'lambda_l2': 9.752508259220823, 'num_leaves': 65, 'bagging_fraction': 0.5798560383697204, 'feature_fraction': 0.36665387350380646}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:34:20,545][0m Trial 28 finished with value: 0.5548378012991085 and parameters: {'n_estimators': 3555, 'learning_rate': 0.004158775108038928, 'max_depth': 9, 'lambda_l1': 2.4588464132391943, 'lambda_l2': 1.860665089059172, 'num_leaves': 92, 'bagging_fraction': 0.5178020519510669, 'feature_fraction': 0.3291982146446412}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:34:37,345][0m Trial 29 finished with value: 0.5466556839694 and parameters: {'n_estimators': 8592, 'learning_rate': 0.020435251413673767, 'max_depth': 6, 'lambda_l1': 4.059412434635193, 'lambda_l2': 0.08556073944668795, 'num_leaves': 39, 'bagging_fraction': 0.6820571228145796, 'feature_fraction': 0.4818313950608871}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:34:47,130][0m Trial 30 finished with value: 0.557381437811663 and parameters: {'n_estimators': 2408, 'learning_rate': 0.0017250381004024183, 'max_depth': 10, 'lambda_l1': 0.5027269982741891, 'lambda_l2': 3.233063042127908, 'num_leaves': 74, 'bagging_fraction': 0.4097518976283062, 'feature_fraction': 0.5150156227097582}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:34:54,816][0m Trial 31 finished with value: 0.5669721926128343 and parameters: {'n_estimators': 1149, 'learning_rate': 0.0016705000771732985, 'max_depth': 11, 'lambda_l1': 0.5864453577448293, 'lambda_l2': 4.371852139626282, 'num_leaves': 80, 'bagging_fraction': 0.6538473898057886, 'feature_fraction': 0.25864236330776086}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:34:59,046][0m Trial 32 finished with value: 0.5695568899834003 and parameters: {'n_estimators': 1044, 'learning_rate': 0.001065680716252395, 'max_depth': 11, 'lambda_l1': 1.0038082622216233, 'lambda_l2': 6.5843910343039695, 'num_leaves': 85, 'bagging_fraction': 0.6035271056597108, 'feature_fraction': 0.2726129428771375}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:35:06,103][0m Trial 33 finished with value: 0.548413237801916 and parameters: {'n_estimators': 1682, 'learning_rate': 0.0023186066362699273, 'max_depth': 10, 'lambda_l1': 0.9588743887477075, 'lambda_l2': 6.251723530765726, 'num_leaves': 91, 'bagging_fraction': 0.5606560813627773, 'feature_fraction': 0.40306390160173294}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:35:14,648][0m Trial 34 finished with value: 0.5699664806552185 and parameters: {'n_estimators': 2862, 'learning_rate': 0.0015145817674514022, 'max_depth': 11, 'lambda_l1': 0.9851993371210503, 'lambda_l2': 2.9249915870408008, 'num_leaves': 94, 'bagging_fraction': 0.5953477634082056, 'feature_fraction': 0.2927376701400953}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:35:23,611][0m Trial 35 finished with value: 0.5600531184604418 and parameters: {'n_estimators': 2997, 'learning_rate': 0.0031933313875389417, 'max_depth': 8, 'lambda_l1': 0.3001871552695735, 'lambda_l2': 2.9808984596058057, 'num_leaves': 97, 'bagging_fraction': 0.536314428423581, 'feature_fraction': 0.23402478599051757}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:35:33,052][0m Trial 36 finished with value: 0.5630340357627288 and parameters: {'n_estimators': 3836, 'learning_rate': 0.0017545680722876045, 'max_depth': 10, 'lambda_l1': 1.8462834166779518, 'lambda_l2': 4.182910982751696, 'num_leaves': 95, 'bagging_fraction': 0.49165931236448535, 'feature_fraction': 0.34781535383658746}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:35:46,992][0m Trial 37 finished with value: 0.5624405818267959 and parameters: {'n_estimators': 3249, 'learning_rate': 0.0017426979191030073, 'max_depth': 12, 'lambda_l1': 0.3483805895890914, 'lambda_l2': 2.5507055651941997, 'num_leaves': 85, 'bagging_fraction': 0.5124370167522067, 'feature_fraction': 0.40912892310462545}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:36:09,802][0m Trial 38 finished with value: 0.559063225567731 and parameters: {'n_estimators': 9932, 'learning_rate': 0.005973748300291943, 'max_depth': 7, 'lambda_l1': 0.657844830711665, 'lambda_l2': 1.5411683414710549, 'num_leaves': 75, 'bagging_fraction': 0.5526121266952019, 'feature_fraction': 0.2353245811551184}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:36:24,086][0m Trial 39 finished with value: 0.5560360447033322 and parameters: {'n_estimators': 5930, 'learning_rate': 0.003395227003068612, 'max_depth': 9, 'lambda_l1': 1.1242338826754519, 'lambda_l2': 1.1455407370148505, 'num_leaves': 31, 'bagging_fraction': 0.47749376205916133, 'feature_fraction': 0.3000869723775755}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:36:30,115][0m Trial 40 finished with value: 0.5549264164789673 and parameters: {'n_estimators': 2168, 'learning_rate': 0.008459322477549994, 'max_depth': 10, 'lambda_l1': 1.3259062669498765, 'lambda_l2': 5.132536220859812, 'num_leaves': 46, 'bagging_fraction': 0.5347190301687673, 'feature_fraction': 0.349442142775527}. Best is trial 15 with value: 0.5705895610348328.[0m




[32m[I 2023-02-07 02:36:33,292][0m Trial 41 finished with value: 0.5707658047285621 and parameters: {'n_estimators': 626, 'learning_rate': 0.0014444169166542527, 'max_depth': 11, 'lambda_l1': 0.9211405943114689, 'lambda_l2': 7.358317686265839, 'num_leaves': 86, 'bagging_fraction': 0.5951456101240298, 'feature_fraction': 0.28404195228514567}. Best is trial 41 with value: 0.5707658047285621.[0m




[32m[I 2023-02-07 02:36:37,063][0m Trial 42 finished with value: 0.5692630360705746 and parameters: {'n_estimators': 862, 'learning_rate': 0.0014537585653109275, 'max_depth': 11, 'lambda_l1': 0.7583010251046329, 'lambda_l2': 7.650725838321539, 'num_leaves': 88, 'bagging_fraction': 0.5928417503973653, 'feature_fraction': 0.28949812925672563}. Best is trial 41 with value: 0.5707658047285621.[0m




[32m[I 2023-02-07 02:36:40,349][0m Trial 43 finished with value: 0.569518927318178 and parameters: {'n_estimators': 618, 'learning_rate': 0.002185447428075791, 'max_depth': 11, 'lambda_l1': 0.3558817460761094, 'lambda_l2': 5.440352298330589, 'num_leaves': 95, 'bagging_fraction': 0.6273111493369659, 'feature_fraction': 0.24468713586533822}. Best is trial 41 with value: 0.5707658047285621.[0m




[32m[I 2023-02-07 02:36:47,398][0m Trial 44 finished with value: 0.5617590337785515 and parameters: {'n_estimators': 2736, 'learning_rate': 0.0030407848494947297, 'max_depth': 12, 'lambda_l1': 1.7140450495960222, 'lambda_l2': 7.6819315028698085, 'num_leaves': 68, 'bagging_fraction': 0.6743836206266386, 'feature_fraction': 0.207031519460888}. Best is trial 41 with value: 0.5707658047285621.[0m




[32m[I 2023-02-07 02:36:54,549][0m Trial 45 finished with value: 0.5625419514572576 and parameters: {'n_estimators': 2063, 'learning_rate': 0.0013640996556546396, 'max_depth': 10, 'lambda_l1': 0.18096877700797434, 'lambda_l2': 3.9977053944358816, 'num_leaves': 100, 'bagging_fraction': 0.5840573011150644, 'feature_fraction': 0.2920003906525429}. Best is trial 41 with value: 0.5707658047285621.[0m




[32m[I 2023-02-07 02:36:57,113][0m Trial 46 finished with value: 0.5753883618519515 and parameters: {'n_estimators': 407, 'learning_rate': 0.004849615548816945, 'max_depth': 9, 'lambda_l1': 0.5761640772659122, 'lambda_l2': 7.739502415963833, 'num_leaves': 59, 'bagging_fraction': 0.5302549229601505, 'feature_fraction': 0.3291806876448777}. Best is trial 46 with value: 0.5753883618519515.[0m




[32m[I 2023-02-07 02:36:59,615][0m Trial 47 finished with value: 0.5725006231835612 and parameters: {'n_estimators': 415, 'learning_rate': 0.004407187882977995, 'max_depth': 9, 'lambda_l1': 0.5550321046562284, 'lambda_l2': 7.286319773639334, 'num_leaves': 57, 'bagging_fraction': 0.5268789475153901, 'feature_fraction': 0.33753876795855986}. Best is trial 46 with value: 0.5753883618519515.[0m




[32m[I 2023-02-07 02:37:02,139][0m Trial 48 finished with value: 0.570336369291824 and parameters: {'n_estimators': 436, 'learning_rate': 0.005238856446230232, 'max_depth': 9, 'lambda_l1': 0.470888859433325, 'lambda_l2': 7.221179908301856, 'num_leaves': 57, 'bagging_fraction': 0.5078817257920297, 'feature_fraction': 0.3317943644828354}. Best is trial 46 with value: 0.5753883618519515.[0m




[32m[I 2023-02-07 02:37:04,801][0m Trial 49 finished with value: 0.5720761594665239 and parameters: {'n_estimators': 367, 'learning_rate': 0.004252974713284697, 'max_depth': 8, 'lambda_l1': 0.6242021442079363, 'lambda_l2': 9.90978821994292, 'num_leaves': 61, 'bagging_fraction': 0.44319481913981007, 'feature_fraction': 0.40271796113998604}. Best is trial 46 with value: 0.5753883618519515.[0m


# LightGBM Modeling

In [5]:
lgb_cv_scores, lgb_imp = list(), list()
preds = list()

skf = KFold(n_splits = 5, shuffle = True, random_state = 42)
    
for train_ix, test_ix in skf.split(X, Y):
        
    ## Splitting the data 
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
    
    ## Building the model
    lgb_md = LGBMRegressor(n_estimators = 1147,
                            max_depth = 10,
                            learning_rate = 0.018,
                            num_leaves = 51,
                            lambda_l1 = 0.2547,
                            lambda_l2 = 0.09305,
                            bagging_fraction = 0.6074,
                            feature_fraction = 0.2868,
                            random_state = 42).fit(X_train, Y_train)
    lgb_imp.append(lgb_md.feature_importances_)
    
    ## Predicting on X_test and test
    lgb_pred_1 = lgb_md.predict(X_test)
    lgb_pred_2 = lgb_md.predict(test_md)
        
    ## Applying Optimal Rounder (using abhishek approach)
    optR = OptimizedRounder()
    optR.fit(lgb_pred_1, Y_test)
    coef = optR.coefficients()
    lgb_pred_1 = optR.predict(lgb_pred_1, coef).astype(int)
    lgb_pred_2 = optR.predict(lgb_pred_2, coef).astype(int)
        
    ## Computing roc-auc score
    lgb_cv_scores.append(cohen_kappa_score(Y_test, lgb_pred_1, weights = 'quadratic'))
    preds.append(lgb_pred_2)

lgb_cv_score = np.mean(lgb_cv_scores)    
print('The average roc-auc score over 5-folds (run 5 times) is:', lgb_cv_score)

The average roc-auc score over 5-folds (run 5 times) is: 0.5519794514993521


In [6]:
lgb_preds_test = pd.DataFrame(preds).mode(axis = 0).loc[0, ]

submission['quality'] = lgb_preds_test.astype(int)
submission.head()

Unnamed: 0,Id,quality
0,2056,5
1,2057,6
2,2058,6
3,2059,7
4,2060,6


In [7]:
submission['quality'].value_counts()

5    552
6    484
7    334
4      2
Name: quality, dtype: int64

In [8]:
submission.to_csv('LightGBM_Reg_FE_2.csv', index = False)