In [1]:
pip install lightgbm optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.9.3-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 kB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m 

In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.cluster import KMeans
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score, cohen_kappa_score, davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor
# from xgboost import XGBClassifier, XGBRegressor
# from catboost import CatBoostClassifier

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/TS-S3-Ep5/train.csv'
file_key_2 = 'Tabular-Playground-Series/TS-S3-Ep5/test.csv'
file_key_3 = 'Tabular-Playground-Series/TS-S3-Ep5/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

## Enginering features
train['alcohol_density'] = train['alcohol'] * train['density']
train['sulphate/density'] = train['sulphates']  / train['density']
train['alcohol_sulphate'] = train['alcohol'] * train['sulphates']

test['alcohol_density'] = test['alcohol']  * test['density']
test['sulphate/density'] = test['sulphates']  / test['density']
test['alcohol_sulphate'] = test['alcohol'] * test['sulphates']

test_md = test.copy()

X = train[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates']]
Y = train['quality'] 

test_md = test_md[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates']]

# Optimal Rounder

In [2]:
from functools import partial
import numpy as np
import scipy as sp

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8

        ll = cohen_kappa_score(y, X_p, weights = 'quadratic')
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X = X, y = y)
        initial_coef = [3.5, 4.5, 5.5, 6.5, 7.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method = 'nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8
        return X_p

    def coefficients(self):
        return self.coef_['x']

# Optuna Optimzation

In [17]:
def objective(trial):
    
    ## Parameters to be evaluated
    param = dict(objective = 'regression_l1',
                 verbose = -100,
                 boosting_type = 'gbdt', 
                 random_state = 906,
                 n_estimators = trial.suggest_int('n_estimators', 300, 10000),
                 learning_rate = trial.suggest_float('learning_rate', 0.001, 1, log = True),
                 max_depth = trial.suggest_int('max_depth', 3, 12),
                 lambda_l1 = trial.suggest_float('lambda_l1', 0.01, 10.0, log = True),
                 lambda_l2 = trial.suggest_float('lambda_l2', 0.01, 10.0, log = True),
                 num_leaves = trial.suggest_int('num_leaves', 2, 100),
                 bagging_fraction = trial.suggest_float('bagging_fraction', 0.2, 0.9),
                 feature_fraction = trial.suggest_float('feature_fraction', 0.2, 0.9)
                 )

    
    scores = []

#     skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    skf = KFold(n_splits = 5, shuffle = True, random_state = 906)
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):
        
        print(fold, end = ' ')
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train , y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]

        model = LGBMRegressor(**param)
        model.fit(X_train, y_train)
        
        preds_train = model.predict(X_train)
        preds_valid = model.predict(X_valid)
        
        optR = OptimizedRounder()
        optR.fit(preds_train, y_train)
        coef = optR.coefficients()
        preds_valid = optR.predict(preds_valid, coef).astype(int)
        
        score = cohen_kappa_score(y_valid,  preds_valid, weights = "quadratic")
        scores.append(score)
        
    return np.mean(scores)

In [18]:
study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials = 50, timeout = 3600)

[32m[I 2023-02-08 15:29:46,731][0m A new study created in memory with name: no-name-15e96651-4389-4e22-8513-a2bc1d95d5ca[0m




[32m[I 2023-02-08 15:30:03,698][0m Trial 0 finished with value: 0.516689131644715 and parameters: {'n_estimators': 3373, 'learning_rate': 0.16739079532185688, 'max_depth': 6, 'lambda_l1': 0.4611750875550995, 'lambda_l2': 1.5831600435314832, 'num_leaves': 14, 'bagging_fraction': 0.4347627657334713, 'feature_fraction': 0.655767761194281}. Best is trial 0 with value: 0.516689131644715.[0m




[32m[I 2023-02-08 15:30:12,212][0m Trial 1 finished with value: 0.5241736422613635 and parameters: {'n_estimators': 716, 'learning_rate': 0.0925117868493144, 'max_depth': 5, 'lambda_l1': 0.02534001537032737, 'lambda_l2': 5.342166436108739, 'num_leaves': 90, 'bagging_fraction': 0.7361433861295443, 'feature_fraction': 0.8264289814555326}. Best is trial 1 with value: 0.5241736422613635.[0m




[32m[I 2023-02-08 15:30:30,944][0m Trial 2 finished with value: 0.5425881135900469 and parameters: {'n_estimators': 2573, 'learning_rate': 0.01087724945654237, 'max_depth': 7, 'lambda_l1': 0.12980511327342548, 'lambda_l2': 0.04129710696240072, 'num_leaves': 20, 'bagging_fraction': 0.894019146644788, 'feature_fraction': 0.42035944797116087}. Best is trial 2 with value: 0.5425881135900469.[0m




[32m[I 2023-02-08 15:30:49,412][0m Trial 3 finished with value: 0.517077515530252 and parameters: {'n_estimators': 9521, 'learning_rate': 0.6473765921739143, 'max_depth': 3, 'lambda_l1': 7.979102294068591, 'lambda_l2': 0.034400792516741864, 'num_leaves': 77, 'bagging_fraction': 0.3370291317002282, 'feature_fraction': 0.34997406085937444}. Best is trial 2 with value: 0.5425881135900469.[0m




[32m[I 2023-02-08 15:31:28,876][0m Trial 4 finished with value: 0.5429988603678814 and parameters: {'n_estimators': 9183, 'learning_rate': 0.0059845617698605505, 'max_depth': 4, 'lambda_l1': 1.2804807244767717, 'lambda_l2': 0.015335549195759284, 'num_leaves': 26, 'bagging_fraction': 0.3529490976887196, 'feature_fraction': 0.5735661139763994}. Best is trial 4 with value: 0.5429988603678814.[0m




[32m[I 2023-02-08 15:31:50,195][0m Trial 5 finished with value: 0.5097238851865787 and parameters: {'n_estimators': 5791, 'learning_rate': 0.2649896455211532, 'max_depth': 7, 'lambda_l1': 7.2650216338158815, 'lambda_l2': 1.1707766080367055, 'num_leaves': 73, 'bagging_fraction': 0.7630614191881331, 'feature_fraction': 0.2946749850635341}. Best is trial 4 with value: 0.5429988603678814.[0m




[32m[I 2023-02-08 15:32:02,126][0m Trial 6 finished with value: 0.5408285516851545 and parameters: {'n_estimators': 1972, 'learning_rate': 0.015106322328108464, 'max_depth': 3, 'lambda_l1': 0.08977331404960187, 'lambda_l2': 5.499345640501282, 'num_leaves': 13, 'bagging_fraction': 0.5535308955781724, 'feature_fraction': 0.500258142764123}. Best is trial 4 with value: 0.5429988603678814.[0m




[32m[I 2023-02-08 15:32:13,953][0m Trial 7 finished with value: 0.5195684062096251 and parameters: {'n_estimators': 541, 'learning_rate': 0.00370144198684703, 'max_depth': 12, 'lambda_l1': 0.5831176361579506, 'lambda_l2': 0.1358389769710866, 'num_leaves': 25, 'bagging_fraction': 0.6597997991121771, 'feature_fraction': 0.7756784082849328}. Best is trial 4 with value: 0.5429988603678814.[0m




[32m[I 2023-02-08 15:32:50,446][0m Trial 8 finished with value: 0.5351211083453157 and parameters: {'n_estimators': 8854, 'learning_rate': 0.001542835801086175, 'max_depth': 10, 'lambda_l1': 0.554521495897459, 'lambda_l2': 0.6996920834744506, 'num_leaves': 13, 'bagging_fraction': 0.5858148334185626, 'feature_fraction': 0.31273333366266737}. Best is trial 4 with value: 0.5429988603678814.[0m




[32m[I 2023-02-08 15:33:11,403][0m Trial 9 finished with value: 0.5019593324867501 and parameters: {'n_estimators': 7491, 'learning_rate': 0.04160310678535334, 'max_depth': 4, 'lambda_l1': 0.791807540400206, 'lambda_l2': 0.14049233284942084, 'num_leaves': 85, 'bagging_fraction': 0.678515663011693, 'feature_fraction': 0.6268866198222791}. Best is trial 4 with value: 0.5429988603678814.[0m




[32m[I 2023-02-08 15:33:33,996][0m Trial 10 finished with value: 0.5430034430785515 and parameters: {'n_estimators': 6011, 'learning_rate': 0.0010016811623781863, 'max_depth': 9, 'lambda_l1': 2.286984981214179, 'lambda_l2': 0.010108658031251948, 'num_leaves': 52, 'bagging_fraction': 0.20808052988062034, 'feature_fraction': 0.21457006203541673}. Best is trial 10 with value: 0.5430034430785515.[0m




[32m[I 2023-02-08 15:33:56,713][0m Trial 11 finished with value: 0.5426573989859093 and parameters: {'n_estimators': 6297, 'learning_rate': 0.0011178841750136771, 'max_depth': 9, 'lambda_l1': 2.3416620269627444, 'lambda_l2': 0.01011107490609825, 'num_leaves': 49, 'bagging_fraction': 0.201582280972342, 'feature_fraction': 0.22233475766430305}. Best is trial 10 with value: 0.5430034430785515.[0m




[32m[I 2023-02-08 15:34:37,345][0m Trial 12 finished with value: 0.5252667360616083 and parameters: {'n_estimators': 7759, 'learning_rate': 0.004001634491031737, 'max_depth': 9, 'lambda_l1': 2.216512414479632, 'lambda_l2': 0.015706999221605084, 'num_leaves': 50, 'bagging_fraction': 0.22379421392118595, 'feature_fraction': 0.47642797030434936}. Best is trial 10 with value: 0.5430034430785515.[0m




[32m[I 2023-02-08 15:34:56,217][0m Trial 13 finished with value: 0.5467373961929349 and parameters: {'n_estimators': 4527, 'learning_rate': 0.001023018393103875, 'max_depth': 11, 'lambda_l1': 1.9368421058417875, 'lambda_l2': 0.010035104278333684, 'num_leaves': 33, 'bagging_fraction': 0.3517924694300438, 'feature_fraction': 0.21099337718263128}. Best is trial 13 with value: 0.5467373961929349.[0m




[32m[I 2023-02-08 15:35:17,880][0m Trial 14 finished with value: 0.5400968113570379 and parameters: {'n_estimators': 4570, 'learning_rate': 0.0013432678693236812, 'max_depth': 12, 'lambda_l1': 3.06898009457714, 'lambda_l2': 0.03432325833338344, 'num_leaves': 38, 'bagging_fraction': 0.4388665578916173, 'feature_fraction': 0.20998827342769943}. Best is trial 13 with value: 0.5467373961929349.[0m




[32m[I 2023-02-08 15:35:35,097][0m Trial 15 finished with value: 0.5386808624808486 and parameters: {'n_estimators': 4567, 'learning_rate': 0.002447477304854005, 'max_depth': 10, 'lambda_l1': 3.9842118674408162, 'lambda_l2': 0.01015218523083368, 'num_leaves': 64, 'bagging_fraction': 0.2891017167935228, 'feature_fraction': 0.20674264662862615}. Best is trial 13 with value: 0.5467373961929349.[0m




[32m[I 2023-02-08 15:36:13,084][0m Trial 16 finished with value: 0.5276370855552417 and parameters: {'n_estimators': 6491, 'learning_rate': 0.001002385566277975, 'max_depth': 11, 'lambda_l1': 1.4268672574581216, 'lambda_l2': 0.07833632437409202, 'num_leaves': 40, 'bagging_fraction': 0.42624809983170264, 'feature_fraction': 0.4077067115324843}. Best is trial 13 with value: 0.5467373961929349.[0m




[32m[I 2023-02-08 15:36:35,607][0m Trial 17 finished with value: 0.5474734563336925 and parameters: {'n_estimators': 4004, 'learning_rate': 0.0025839953941233194, 'max_depth': 9, 'lambda_l1': 0.24119972935955158, 'lambda_l2': 0.30799950377050833, 'num_leaves': 63, 'bagging_fraction': 0.25803808806093975, 'feature_fraction': 0.29948987372452307}. Best is trial 17 with value: 0.5474734563336925.[0m




[32m[I 2023-02-08 15:36:54,558][0m Trial 18 finished with value: 0.5330446442770136 and parameters: {'n_estimators': 3664, 'learning_rate': 0.007402176579133682, 'max_depth': 8, 'lambda_l1': 0.26442760879297705, 'lambda_l2': 0.3022106178495693, 'num_leaves': 65, 'bagging_fraction': 0.29346557967086445, 'feature_fraction': 0.2941002966689019}. Best is trial 17 with value: 0.5474734563336925.[0m




[32m[I 2023-02-08 15:37:19,230][0m Trial 19 finished with value: 0.5485466418031059 and parameters: {'n_estimators': 1961, 'learning_rate': 0.002284415495180285, 'max_depth': 11, 'lambda_l1': 0.24681174710323123, 'lambda_l2': 0.3830136368263303, 'num_leaves': 38, 'bagging_fraction': 0.4896415335895309, 'feature_fraction': 0.35826043075638603}. Best is trial 19 with value: 0.5485466418031059.[0m




[32m[I 2023-02-08 15:37:37,723][0m Trial 20 finished with value: 0.5365796942694743 and parameters: {'n_estimators': 1633, 'learning_rate': 0.018925823730361783, 'max_depth': 8, 'lambda_l1': 0.2505304500014106, 'lambda_l2': 0.36767997858588214, 'num_leaves': 59, 'bagging_fraction': 0.503504435977992, 'feature_fraction': 0.38599526770709525}. Best is trial 19 with value: 0.5485466418031059.[0m




[32m[I 2023-02-08 15:37:54,181][0m Trial 21 finished with value: 0.5456005928372407 and parameters: {'n_estimators': 3401, 'learning_rate': 0.0024405673798395754, 'max_depth': 11, 'lambda_l1': 0.15624837103152053, 'lambda_l2': 0.300287787851289, 'num_leaves': 36, 'bagging_fraction': 0.37291357893433347, 'feature_fraction': 0.2910547910285711}. Best is trial 19 with value: 0.5485466418031059.[0m




[32m[I 2023-02-08 15:38:16,185][0m Trial 22 finished with value: 0.5431676238380609 and parameters: {'n_estimators': 4606, 'learning_rate': 0.0022778261745038503, 'max_depth': 11, 'lambda_l1': 0.06200636743371233, 'lambda_l2': 0.0934189944861076, 'num_leaves': 32, 'bagging_fraction': 0.27515678832275015, 'feature_fraction': 0.34584441233871077}. Best is trial 19 with value: 0.5485466418031059.[0m




[32m[I 2023-02-08 15:38:25,085][0m Trial 23 finished with value: 0.5377775669515218 and parameters: {'n_estimators': 2515, 'learning_rate': 0.00342099297991379, 'max_depth': 10, 'lambda_l1': 0.31667397504940176, 'lambda_l2': 0.1662165219463892, 'num_leaves': 3, 'bagging_fraction': 0.3917392158880628, 'feature_fraction': 0.2839629570764546}. Best is trial 19 with value: 0.5485466418031059.[0m




[32m[I 2023-02-08 15:38:57,157][0m Trial 24 finished with value: 0.5253454142127781 and parameters: {'n_estimators': 3935, 'learning_rate': 0.006252875928861207, 'max_depth': 12, 'lambda_l1': 0.7852269482079011, 'lambda_l2': 0.062005966054188226, 'num_leaves': 42, 'bagging_fraction': 0.48746410573930077, 'feature_fraction': 0.2601249168470019}. Best is trial 19 with value: 0.5485466418031059.[0m




[32m[I 2023-02-08 15:39:28,410][0m Trial 25 finished with value: 0.5411199932884363 and parameters: {'n_estimators': 5322, 'learning_rate': 0.0017587512803832884, 'max_depth': 11, 'lambda_l1': 0.18513247808215172, 'lambda_l2': 0.603418905287889, 'num_leaves': 99, 'bagging_fraction': 0.3263391984278114, 'feature_fraction': 0.34960046951306695}. Best is trial 19 with value: 0.5485466418031059.[0m




[32m[I 2023-02-08 15:39:39,208][0m Trial 26 finished with value: 0.5528093820370178 and parameters: {'n_estimators': 1431, 'learning_rate': 0.0023220906472575607, 'max_depth': 10, 'lambda_l1': 0.05644849642443072, 'lambda_l2': 0.2276777791893894, 'num_leaves': 56, 'bagging_fraction': 0.39094304273905595, 'feature_fraction': 0.25379791570249327}. Best is trial 26 with value: 0.5528093820370178.[0m




[32m[I 2023-02-08 15:39:51,282][0m Trial 27 finished with value: 0.5296509090875194 and parameters: {'n_estimators': 1322, 'learning_rate': 0.004025799056558257, 'max_depth': 9, 'lambda_l1': 0.056921642715425325, 'lambda_l2': 0.1801759667556994, 'num_leaves': 59, 'bagging_fraction': 0.25467233011268686, 'feature_fraction': 0.45329769261941794}. Best is trial 26 with value: 0.5528093820370178.[0m




[32m[I 2023-02-08 15:40:15,055][0m Trial 28 finished with value: 0.526351357172835 and parameters: {'n_estimators': 2716, 'learning_rate': 0.009184099497120157, 'max_depth': 10, 'lambda_l1': 0.013356243116721105, 'lambda_l2': 0.25889456919468123, 'num_leaves': 71, 'bagging_fraction': 0.39955956719091523, 'feature_fraction': 0.3798061669224757}. Best is trial 26 with value: 0.5528093820370178.[0m




[32m[I 2023-02-08 15:40:27,173][0m Trial 29 finished with value: 0.5414716413685262 and parameters: {'n_estimators': 1119, 'learning_rate': 0.0019709908860821184, 'max_depth': 8, 'lambda_l1': 0.32169694607573657, 'lambda_l2': 1.3471828151442822, 'num_leaves': 47, 'bagging_fraction': 0.45712941795219253, 'feature_fraction': 0.43436848319200805}. Best is trial 26 with value: 0.5528093820370178.[0m




[32m[I 2023-02-08 15:40:41,802][0m Trial 30 finished with value: 0.5461696180068527 and parameters: {'n_estimators': 2038, 'learning_rate': 0.00586436842143968, 'max_depth': 7, 'lambda_l1': 0.0998261276270765, 'lambda_l2': 0.5202664076537102, 'num_leaves': 56, 'bagging_fraction': 0.31475485116165675, 'feature_fraction': 0.5182608440232123}. Best is trial 26 with value: 0.5528093820370178.[0m




[32m[I 2023-02-08 15:40:55,650][0m Trial 31 finished with value: 0.5550390377470522 and parameters: {'n_estimators': 2723, 'learning_rate': 0.001894372421097123, 'max_depth': 11, 'lambda_l1': 0.4027195442727825, 'lambda_l2': 0.7521393165306123, 'num_leaves': 29, 'bagging_fraction': 0.3710533949653182, 'feature_fraction': 0.2500180023066253}. Best is trial 31 with value: 0.5550390377470522.[0m




[32m[I 2023-02-08 15:41:09,744][0m Trial 32 finished with value: 0.5463737999946728 and parameters: {'n_estimators': 2895, 'learning_rate': 0.002703167231762968, 'max_depth': 10, 'lambda_l1': 0.40510192679412693, 'lambda_l2': 0.9324710190226286, 'num_leaves': 44, 'bagging_fraction': 0.40333444024271997, 'feature_fraction': 0.2563499965804407}. Best is trial 31 with value: 0.5550390377470522.[0m




[32m[I 2023-02-08 15:41:24,721][0m Trial 33 finished with value: 0.5388403812780682 and parameters: {'n_estimators': 3145, 'learning_rate': 0.0017257350777116567, 'max_depth': 12, 'lambda_l1': 0.2016734168048747, 'lambda_l2': 2.233141724679358, 'num_leaves': 27, 'bagging_fraction': 0.47835303648501176, 'feature_fraction': 0.2537434020197921}. Best is trial 31 with value: 0.5550390377470522.[0m




[32m[I 2023-02-08 15:41:31,009][0m Trial 34 finished with value: 0.5454950959223115 and parameters: {'n_estimators': 316, 'learning_rate': 0.004230944997733426, 'max_depth': 9, 'lambda_l1': 0.038158796841876216, 'lambda_l2': 0.4384592032604111, 'num_leaves': 19, 'bagging_fraction': 0.24920453063089715, 'feature_fraction': 0.3296852841406367}. Best is trial 31 with value: 0.5550390377470522.[0m




[32m[I 2023-02-08 15:41:44,293][0m Trial 35 finished with value: 0.5576314002889131 and parameters: {'n_estimators': 2057, 'learning_rate': 0.0028340232667834744, 'max_depth': 6, 'lambda_l1': 0.18078853991285831, 'lambda_l2': 0.7550241946424767, 'num_leaves': 82, 'bagging_fraction': 0.37125129346914726, 'feature_fraction': 0.3773044655610498}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:41:53,632][0m Trial 36 finished with value: 0.5460734877842965 and parameters: {'n_estimators': 1066, 'learning_rate': 0.0016305558682014624, 'max_depth': 6, 'lambda_l1': 0.10964961819082857, 'lambda_l2': 0.8189604665959226, 'num_leaves': 85, 'bagging_fraction': 0.3753797360548568, 'feature_fraction': 0.37738051689644947}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:42:08,222][0m Trial 37 finished with value: 0.5370486377090973 and parameters: {'n_estimators': 1952, 'learning_rate': 0.010074653313741492, 'max_depth': 6, 'lambda_l1': 0.13221115156784607, 'lambda_l2': 2.24495888183153, 'num_leaves': 100, 'bagging_fraction': 0.4294822140484257, 'feature_fraction': 0.4356833128652025}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:42:21,627][0m Trial 38 finished with value: 0.5466798993321624 and parameters: {'n_estimators': 2273, 'learning_rate': 0.004953237564397625, 'max_depth': 5, 'lambda_l1': 0.17690427151590543, 'lambda_l2': 0.5212962132171464, 'num_leaves': 79, 'bagging_fraction': 0.34101274771733, 'feature_fraction': 0.34046576760433017}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:42:33,038][0m Trial 39 finished with value: 0.5558277622049076 and parameters: {'n_estimators': 1685, 'learning_rate': 0.002939388002726449, 'max_depth': 5, 'lambda_l1': 0.0750907128134336, 'lambda_l2': 1.0359025925202638, 'num_leaves': 20, 'bagging_fraction': 0.5093239856974769, 'feature_fraction': 0.39780536507196895}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:42:38,511][0m Trial 40 finished with value: 0.49605533185538214 and parameters: {'n_estimators': 933, 'learning_rate': 0.003380060491141557, 'max_depth': 5, 'lambda_l1': 0.06612924347219362, 'lambda_l2': 1.7462615487514295, 'num_leaves': 4, 'bagging_fraction': 0.3116999159000776, 'feature_fraction': 0.4140409032690155}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:42:47,184][0m Trial 41 finished with value: 0.5436395591874339 and parameters: {'n_estimators': 2049, 'learning_rate': 0.0030330997299077463, 'max_depth': 4, 'lambda_l1': 0.09037592078616431, 'lambda_l2': 0.9018272158701278, 'num_leaves': 22, 'bagging_fraction': 0.5273672082969458, 'feature_fraction': 0.3692607185862401}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:42:56,747][0m Trial 42 finished with value: 0.5512730249688621 and parameters: {'n_estimators': 1634, 'learning_rate': 0.0016883818503746328, 'max_depth': 6, 'lambda_l1': 0.14088589826034814, 'lambda_l2': 0.6523244743528013, 'num_leaves': 31, 'bagging_fraction': 0.4600344723845823, 'feature_fraction': 0.3320742948917043}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:43:10,264][0m Trial 43 finished with value: 0.541645556936502 and parameters: {'n_estimators': 1508, 'learning_rate': 0.0015049831191382124, 'max_depth': 6, 'lambda_l1': 0.13376179244431927, 'lambda_l2': 0.6206041394456384, 'num_leaves': 17, 'bagging_fraction': 0.45380393446147965, 'feature_fraction': 0.31743534336917817}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:43:17,126][0m Trial 44 finished with value: 0.5567267043511374 and parameters: {'n_estimators': 486, 'learning_rate': 0.005285044638289517, 'max_depth': 5, 'lambda_l1': 0.035115830078967424, 'lambda_l2': 1.0320354863920331, 'num_leaves': 10, 'bagging_fraction': 0.5523450637405137, 'feature_fraction': 0.25053062847639884}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:43:23,921][0m Trial 45 finished with value: 0.5428283167950889 and parameters: {'n_estimators': 656, 'learning_rate': 0.005220731495732958, 'max_depth': 5, 'lambda_l1': 0.027278395014550955, 'lambda_l2': 1.5269195087775562, 'num_leaves': 7, 'bagging_fraction': 0.544678385366338, 'feature_fraction': 0.2451984043167164}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:43:30,638][0m Trial 46 finished with value: 0.5566195464307555 and parameters: {'n_estimators': 802, 'learning_rate': 0.007221119352156271, 'max_depth': 3, 'lambda_l1': 0.03489446351971202, 'lambda_l2': 1.106217124764079, 'num_leaves': 10, 'bagging_fraction': 0.6168959355458993, 'feature_fraction': 0.24259812614458906}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:43:36,266][0m Trial 47 finished with value: 0.5447255518379654 and parameters: {'n_estimators': 357, 'learning_rate': 0.007513603315844528, 'max_depth': 3, 'lambda_l1': 0.04040507332550002, 'lambda_l2': 1.0923461498341436, 'num_leaves': 6, 'bagging_fraction': 0.6021446672385868, 'feature_fraction': 0.27647534931977313}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:43:43,237][0m Trial 48 finished with value: 0.5395209142722307 and parameters: {'n_estimators': 778, 'learning_rate': 0.012838308749953665, 'max_depth': 4, 'lambda_l1': 0.018508053970504694, 'lambda_l2': 2.737264589384271, 'num_leaves': 13, 'bagging_fraction': 0.5879710299266914, 'feature_fraction': 0.32334467410853635}. Best is trial 35 with value: 0.5576314002889131.[0m




[32m[I 2023-02-08 15:43:51,925][0m Trial 49 finished with value: 0.5420252119823579 and parameters: {'n_estimators': 2499, 'learning_rate': 0.0048422561879380545, 'max_depth': 5, 'lambda_l1': 0.03339064349263303, 'lambda_l2': 1.187247856328268, 'num_leaves': 9, 'bagging_fraction': 0.6084339432148098, 'feature_fraction': 0.23284683514769403}. Best is trial 35 with value: 0.5576314002889131.[0m


In [22]:
study.best_trial

FrozenTrial(number=35, state=TrialState.COMPLETE, values=[0.5576314002889131], datetime_start=datetime.datetime(2023, 2, 8, 15, 41, 31, 10888), datetime_complete=datetime.datetime(2023, 2, 8, 15, 41, 44, 292933), params={'n_estimators': 2057, 'learning_rate': 0.0028340232667834744, 'max_depth': 6, 'lambda_l1': 0.18078853991285831, 'lambda_l2': 0.7550241946424767, 'num_leaves': 82, 'bagging_fraction': 0.37125129346914726, 'feature_fraction': 0.3773044655610498}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=10000, log=False, low=300, step=1), 'learning_rate': FloatDistribution(high=1.0, log=True, low=0.001, step=None), 'max_depth': IntDistribution(high=12, log=False, low=3, step=1), 'lambda_l1': FloatDistribution(high=10.0, log=True, low=0.01, step=None), 'lambda_l2': FloatDistribution(high=10.0, log=True, low=0.01, step=None), 'num_leaves': IntDistribution(high=100, log=False, low=2, step=1), 'bagging_fraction': FloatDistrib

In [20]:
study.best_trial.values

[0.5576314002889131]

In [23]:
study.best_trial.params

{'n_estimators': 2057,
 'learning_rate': 0.0028340232667834744,
 'max_depth': 6,
 'lambda_l1': 0.18078853991285831,
 'lambda_l2': 0.7550241946424767,
 'num_leaves': 82,
 'bagging_fraction': 0.37125129346914726,
 'feature_fraction': 0.3773044655610498}

# LightGBM Modeling

In [24]:
lgb_cv_scores, lgb_imp = list(), list()
preds = list()

skf = KFold(n_splits = 5, shuffle = True, random_state = 906)
    
for train_ix, test_ix in skf.split(X, Y):
        
    ## Splitting the data 
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
    
    ## Building the model
    lgb_md = LGBMRegressor(n_estimators = 2057,
                            max_depth = 6,
                            learning_rate = 0.0028340232667834744,
                            num_leaves = 82,
                            lambda_l1 = 0.18078853991285831,
                            lambda_l2 = 0.7550241946424767,
                            bagging_fraction = 0.37125129346914726,
                            feature_fraction = 0.3773044655610498,
                            random_state = 906).fit(X_train, Y_train)
    lgb_imp.append(lgb_md.feature_importances_)
    
    ## Predicting on X_test and test
    lgb_pred_1 = lgb_md.predict(X_test)
    lgb_pred_2 = lgb_md.predict(test_md)
        
    ## Applying Optimal Rounder (using abhishek approach)
    optR = OptimizedRounder()
    optR.fit(lgb_md.predict(X_train), Y_train)
    coef = optR.coefficients()
    lgb_pred_1 = optR.predict(lgb_pred_1, coef).astype(int)
    lgb_pred_2 = optR.predict(lgb_pred_2, coef).astype(int)
        
    ## Computing roc-auc score
    lgb_cv_scores.append(cohen_kappa_score(Y_test, lgb_pred_1, weights = 'quadratic'))
    preds.append(lgb_pred_2)

lgb_cv_score = np.mean(lgb_cv_scores)    
print('The average weighted kappa score score over 5-folds (run 5 times) is:', lgb_cv_score)

The average weighted kappa score score over 5-folds (run 5 times) is: 0.5427570369149294


In [25]:
lgb_preds_test = pd.DataFrame(preds).mode(axis = 0).loc[0, ]

submission['quality'] = lgb_preds_test.astype(int)
submission.head()

Unnamed: 0,Id,quality
0,2056,5
1,2057,6
2,2058,5
3,2059,6
4,2060,7


In [26]:
submission['quality'].value_counts()

5    576
6    431
7    365
Name: quality, dtype: int64

In [49]:
lgb_md = LGBMRegressor(n_estimators = 2057,
                            max_depth = 6,
                            learning_rate = 0.0028340232667834744,
                            num_leaves = 82,
                            lambda_l1 = 0.18078853991285831,
                            lambda_l2 = 0.7550241946424767,
                            bagging_fraction = 0.37125129346914726,
                            feature_fraction = 0.3773044655610498,
                            random_state = 906).fit(X, Y)

optR = OptimizedRounder()
optR.fit(lgb_md.predict(X), Y)
coef = optR.coefficients()
lgb_pred = lgb_md.predict(test_md)
lgb_pred = optR.predict(lgb_pred, coef).astype(int)



In [57]:
coef

array([3.35384325, 5.0094123 , 5.64198756, 6.14471119, 7.58699092])

In [58]:
submission['quality'] = lgb_pred
submission.head() 

Unnamed: 0,Id,quality
0,2056,5
1,2057,6
2,2058,5
3,2059,6
4,2060,6


In [59]:
submission['quality'].value_counts()

5    557
6    396
7    363
4     56
Name: quality, dtype: int64

In [60]:
submission.to_csv('LightGBM_Reg_full_FE_4.csv', index = False)

In [27]:
best_sub = pd.read_csv('kaggle_submission.csv')
best_sub['quality'].value_counts()

6    691
5    529
7    152
Name: quality, dtype: int64

In [28]:
oscar_best = pd.read_csv('XGB_Reg_FE_2.csv')
oscar_best['quality'].value_counts()

5    634
6    425
7    313
Name: quality, dtype: int64

In [None]:
submission.to_csv('LightGBM_Reg_FE_3.csv', index = False)

In [29]:
2 + 2

4

In [30]:
data_out = pd.merge(best_sub, oscar_best, on = 'Id')
data_out.head(10)

Unnamed: 0,Id,quality_x,quality_y
0,2056,5,5
1,2057,6,6
2,2058,5,5
3,2059,6,6
4,2060,6,6
5,2061,6,6
6,2062,6,5
7,2063,6,6
8,2064,7,6
9,2065,6,6


In [39]:
to_check = data_out[~(data_out['quality_x'] == data_out['quality_y'])]['Id'].tolist()

In [40]:
train.describe()

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,alcohol_density,sulphate/density,alcohol_sulphate
count,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0,2056.0
mean,1027.5,8.365175,0.527601,0.265058,2.398881,0.081856,16.955982,49.236868,0.996748,3.310569,0.641308,10.414972,5.720817,10.380431,0.643404,6.714287
std,593.660397,1.70539,0.173164,0.188267,0.858824,0.023729,10.00971,32.961141,0.001827,0.142321,0.137942,1.028825,0.853146,1.018378,0.138401,1.73627
min,0.0,5.0,0.18,0.0,1.2,0.012,1.0,7.0,0.99007,2.74,0.39,8.7,3.0,8.66781,0.390938,3.783
25%,513.75,7.2,0.39,0.09,1.9,0.071,8.0,22.0,0.9956,3.2,0.55,9.5,5.0,9.4829,0.550028,5.4
50%,1027.5,7.95,0.52,0.25,2.2,0.079,16.0,44.0,0.9967,3.31,0.61,10.1,6.0,10.06566,0.61182,6.326
75%,1541.25,9.2,0.64,0.42,2.6,0.09,24.0,65.0,0.9978,3.39,0.72,11.0,6.0,11.001925,0.719748,7.702
max,2055.0,15.9,1.58,0.76,14.0,0.414,68.0,289.0,1.00369,3.78,1.95,14.0,8.0,13.96248,1.95626,21.45


In [47]:
train[train['alcohol'] > 11]

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,alcohol_density,sulphate/density,alcohol_sulphate
0,0,8.0,0.50,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1,6,12.048212,0.773310,9.317
1,1,9.3,0.30,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8,6,12.781312,0.670980,8.576
2,2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.99660,3.52,0.73,11.3,7,11.261580,0.732490,8.249
6,6,7.2,0.87,0.00,2.3,0.080,6.0,18.0,0.99552,3.34,0.60,11.3,6,11.249376,0.602700,6.780
16,16,11.5,0.27,0.60,2.3,0.089,11.0,25.0,0.99628,3.10,0.44,12.0,7,11.955360,0.441643,5.280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2044,2044,8.2,0.29,0.42,4.3,0.085,6.0,12.0,0.99627,3.30,0.62,12.1,6,12.054867,0.622321,7.502
2050,2050,11.9,0.38,0.49,2.3,0.081,24.0,42.0,0.99940,3.15,0.59,11.5,6,11.493100,0.590354,6.785
2051,2051,6.6,0.31,0.13,2.0,0.056,29.0,42.0,0.99388,3.52,0.87,12.0,7,11.926560,0.875357,10.440
2053,2053,7.7,0.43,0.42,1.7,0.071,19.0,37.0,0.99258,3.32,0.77,12.5,8,12.407250,0.775756,9.625


In [45]:
test[np.isin(test['Id'], to_check)].describe()

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,alcohol_density,sulphate/density,alcohol_sulphate
count,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0,284.0
mean,2758.559859,8.634859,0.496232,0.29838,2.497007,0.082444,16.147887,42.376761,0.996756,3.3025,0.731268,10.549296,10.514598,0.733621,7.699254
std,399.555866,1.82401,0.177911,0.19309,0.942124,0.020255,9.157011,26.720295,0.001735,0.133514,0.13678,0.824771,0.815785,0.137098,1.469124
min,2062.0,5.0,0.2,0.0,1.5,0.041,3.0,8.0,0.99182,2.89,0.45,9.0,8.9694,0.452261,4.95
25%,2398.25,7.3,0.36,0.12,2.0,0.06975,7.0,22.0,0.9956,3.2,0.64,10.0,9.9604,0.644944,6.385
50%,2785.5,8.25,0.455,0.31,2.2,0.078,15.0,37.0,0.9966,3.3,0.73,10.5,10.47018,0.73351,7.91
75%,3092.75,9.825,0.61,0.4525,2.6,0.089,23.0,54.0,0.997885,3.3825,0.8025,11.0,10.950555,0.805491,8.585
max,3424.0,15.6,1.09,0.74,7.9,0.214,41.0,147.0,1.00289,3.78,1.62,14.0,13.9412,1.625527,16.2
