In [24]:
pip install lightgbm optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.cluster import KMeans
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score, cohen_kappa_score, davies_bouldin_score, calinski_harabasz_score, silhouette_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor
# from xgboost import XGBClassifier, XGBRegressor
# from catboost import CatBoostClassifier

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/TS-S3-Ep5/train.csv'
file_key_2 = 'Tabular-Playground-Series/TS-S3-Ep5/test.csv'
file_key_3 = 'Tabular-Playground-Series/TS-S3-Ep5/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

## Enginering features
train['alcohol_density'] = train['alcohol'] * train['density']
train['sulphate/density'] = train['sulphates']  / train['density']
train['alcohol_sulphate'] = train['alcohol'] * train['sulphates']

test['alcohol_density'] = test['alcohol']  * test['density']
test['sulphate/density'] = test['sulphates']  / test['density']
test['alcohol_sulphate'] = test['alcohol'] * test['sulphates']

test_md = test.copy()

X = train[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates']]
Y = train['quality'] 

test_md = test_md[['sulphate/density', 'alcohol_density', 'alcohol', 'sulphates']]

# Optimal Rounder

In [2]:
from functools import partial
import numpy as np
import scipy as sp

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8

        ll = cohen_kappa_score(y, X_p, weights = 'quadratic')
        return -ll

    def fit(self, X, y):
        loss_partial = partial(self._kappa_loss, X = X, y = y)
        initial_coef = [3.5, 4.5, 5.5, 6.5, 7.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method = 'nelder-mead')

    def predict(self, X, coef):
        X_p = np.copy(X)
        for i, pred in enumerate(X_p):
            if pred < coef[0]:
                X_p[i] = 3
            elif pred >= coef[0] and pred < coef[1]:
                X_p[i] = 4
            elif pred >= coef[1] and pred < coef[2]:
                X_p[i] = 5
            elif pred >= coef[2] and pred < coef[3]:
                X_p[i] = 6
            elif pred >= coef[3] and pred < coef[4]:
                X_p[i] = 7
            else:
                X_p[i] = 8
        return X_p

    def coefficients(self):
        return self.coef_['x']

# Optuna Optimzation

In [3]:
def objective(trial):
    
    ## Parameters to be evaluated
    param = dict(objective = 'regression_l1',
                 verbose = -100,
                 boosting_type = 'gbdt', 
                 random_state = 42,
                 n_estimators = trial.suggest_int('n_estimators', 300, 10000),
                 learning_rate = trial.suggest_float('learning_rate', 0.001, 1, log=True),
                 max_depth = trial.suggest_int('max_depth', 3, 12),
                 lambda_l1 = trial.suggest_float('lambda_l1', 0.01, 10.0, log=True),
                 lambda_l2 = trial.suggest_float('lambda_l2', 0.01, 10.0, log=True),
                 num_leaves = trial.suggest_int('num_leaves', 2, 100),
                 bagging_fraction = trial.suggest_float('bagging_fraction', 0.2, 0.9),
                 feature_fraction = trial.suggest_float('feature_fraction', 0.2, 0.9)
                 )

    
    scores = []

#     skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    skf = KFold(n_splits = 5, shuffle = True, random_state = 42)
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, Y)):
        
        print(fold, end = ' ')
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train , y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]

        model = LGBMRegressor(**param)
        model.fit(X_train, y_train)

        preds_valid = model.predict(X_valid)
        
        optR = OptimizedRounder()
        optR.fit(preds_valid, y_valid)
        coef = optR.coefficients()
        preds_valid = optR.predict(preds_valid, coef).astype(int)
        
        score = cohen_kappa_score(y_valid,  preds_valid, weights = "quadratic")
        scores.append(score)
        
    return np.mean(scores)

In [28]:
study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials = 50, timeout = 3600)

[32m[I 2023-02-06 20:53:22,549][0m A new study created in memory with name: no-name-105458ed-b7b8-4a45-8d60-972b8c18b1b0[0m




[32m[I 2023-02-06 20:53:29,090][0m Trial 0 finished with value: 0.3880404541003152 and parameters: {'n_estimators': 6968, 'learning_rate': 0.0030967402634097716, 'max_depth': 11, 'lambda_l1': 0.38218629383871416, 'lambda_l2': 0.027010052172435354, 'num_leaves': 2, 'bagging_fraction': 0.3319118604868666, 'feature_fraction': 0.7672166744399376}. Best is trial 0 with value: 0.3880404541003152.[0m




[32m[I 2023-02-06 20:53:33,155][0m Trial 1 finished with value: 0.5254418885456448 and parameters: {'n_estimators': 610, 'learning_rate': 0.32549346930959216, 'max_depth': 6, 'lambda_l1': 0.015170233035481328, 'lambda_l2': 1.253274261270629, 'num_leaves': 94, 'bagging_fraction': 0.5377442807200433, 'feature_fraction': 0.703251472437088}. Best is trial 1 with value: 0.5254418885456448.[0m




[32m[I 2023-02-06 20:53:52,018][0m Trial 2 finished with value: 0.5502906374368911 and parameters: {'n_estimators': 9418, 'learning_rate': 0.010448648763031115, 'max_depth': 8, 'lambda_l1': 1.2551387582780704, 'lambda_l2': 9.67294376014615, 'num_leaves': 15, 'bagging_fraction': 0.8659700547350371, 'feature_fraction': 0.2958456555563941}. Best is trial 2 with value: 0.5502906374368911.[0m




[32m[I 2023-02-06 20:53:58,847][0m Trial 3 finished with value: 0.5382543018325476 and parameters: {'n_estimators': 1530, 'learning_rate': 0.018706794891134866, 'max_depth': 8, 'lambda_l1': 0.2618107952024081, 'lambda_l2': 0.12833804591143266, 'num_leaves': 61, 'bagging_fraction': 0.6281663288880251, 'feature_fraction': 0.670661848039721}. Best is trial 2 with value: 0.5502906374368911.[0m




[32m[I 2023-02-06 20:54:15,708][0m Trial 4 finished with value: 0.5463294456110377 and parameters: {'n_estimators': 3778, 'learning_rate': 0.00260832606450151, 'max_depth': 11, 'lambda_l1': 0.2594846395974583, 'lambda_l2': 0.011216130723046053, 'num_leaves': 55, 'bagging_fraction': 0.587269849788161, 'feature_fraction': 0.5931908277861608}. Best is trial 2 with value: 0.5502906374368911.[0m




[32m[I 2023-02-06 20:54:30,501][0m Trial 5 finished with value: 0.5118112771348977 and parameters: {'n_estimators': 4693, 'learning_rate': 0.31979872469289455, 'max_depth': 8, 'lambda_l1': 0.16045227750768043, 'lambda_l2': 3.2815959055707293, 'num_leaves': 57, 'bagging_fraction': 0.41898406440744307, 'feature_fraction': 0.6499222237531525}. Best is trial 2 with value: 0.5502906374368911.[0m




[32m[I 2023-02-06 20:54:36,975][0m Trial 6 finished with value: 0.5404857726181251 and parameters: {'n_estimators': 2771, 'learning_rate': 0.9306851588097913, 'max_depth': 11, 'lambda_l1': 3.966149899134551, 'lambda_l2': 0.025885566286394147, 'num_leaves': 92, 'bagging_fraction': 0.8253701770624062, 'feature_fraction': 0.27735515154102236}. Best is trial 2 with value: 0.5502906374368911.[0m




[32m[I 2023-02-06 20:54:44,993][0m Trial 7 finished with value: 0.4925510314956177 and parameters: {'n_estimators': 2274, 'learning_rate': 0.9991450950790579, 'max_depth': 8, 'lambda_l1': 1.1926022324749614, 'lambda_l2': 0.2005476911987778, 'num_leaves': 65, 'bagging_fraction': 0.7221457594275438, 'feature_fraction': 0.8271986675849092}. Best is trial 2 with value: 0.5502906374368911.[0m




[32m[I 2023-02-06 20:54:55,525][0m Trial 8 finished with value: 0.5591330226377825 and parameters: {'n_estimators': 9494, 'learning_rate': 0.001283788600272066, 'max_depth': 4, 'lambda_l1': 4.287288975612532, 'lambda_l2': 0.48567893958477937, 'num_leaves': 100, 'bagging_fraction': 0.7573972922198822, 'feature_fraction': 0.20475314452014554}. Best is trial 8 with value: 0.5591330226377825.[0m




[32m[I 2023-02-06 20:54:57,572][0m Trial 9 finished with value: 0.5652895509115479 and parameters: {'n_estimators': 467, 'learning_rate': 0.05240981859986616, 'max_depth': 9, 'lambda_l1': 0.118994335900865, 'lambda_l2': 1.9163687705329426, 'num_leaves': 6, 'bagging_fraction': 0.3479590683812721, 'feature_fraction': 0.5709253871055042}. Best is trial 9 with value: 0.5652895509115479.[0m




[32m[I 2023-02-06 20:55:10,356][0m Trial 10 finished with value: 0.5407571194060747 and parameters: {'n_estimators': 7122, 'learning_rate': 0.054500339739466754, 'max_depth': 5, 'lambda_l1': 0.038795531062368584, 'lambda_l2': 1.4203332360312941, 'num_leaves': 26, 'bagging_fraction': 0.255480866678903, 'feature_fraction': 0.4764865254536225}. Best is trial 9 with value: 0.5652895509115479.[0m




[32m[I 2023-02-06 20:55:24,213][0m Trial 11 finished with value: 0.5300494387241805 and parameters: {'n_estimators': 9456, 'learning_rate': 0.0010531591424863286, 'max_depth': 3, 'lambda_l1': 8.52678229846395, 'lambda_l2': 0.5240348577880385, 'num_leaves': 32, 'bagging_fraction': 0.44067716291804343, 'feature_fraction': 0.45610680402524095}. Best is trial 9 with value: 0.5652895509115479.[0m




[32m[I 2023-02-06 20:55:31,235][0m Trial 12 finished with value: 0.5677019478328463 and parameters: {'n_estimators': 6622, 'learning_rate': 0.04793223771258008, 'max_depth': 3, 'lambda_l1': 0.08225945719813345, 'lambda_l2': 0.6021347560337426, 'num_leaves': 78, 'bagging_fraction': 0.728733119756391, 'feature_fraction': 0.20837171985254213}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:55:44,196][0m Trial 13 finished with value: 0.521468940225392 and parameters: {'n_estimators': 6466, 'learning_rate': 0.05156863107114966, 'max_depth': 6, 'lambda_l1': 0.07530037070297133, 'lambda_l2': 3.148019992105491, 'num_leaves': 79, 'bagging_fraction': 0.21117138587583456, 'feature_fraction': 0.5020300513301204}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:56:04,956][0m Trial 14 finished with value: 0.5192305949486263 and parameters: {'n_estimators': 5776, 'learning_rate': 0.08177798475395352, 'max_depth': 10, 'lambda_l1': 0.051289957691119305, 'lambda_l2': 0.7398717494224422, 'num_leaves': 41, 'bagging_fraction': 0.6678836346551112, 'feature_fraction': 0.40019344176501753}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:56:32,092][0m Trial 15 finished with value: 0.540985931585937 and parameters: {'n_estimators': 7932, 'learning_rate': 0.01973439661685472, 'max_depth': 9, 'lambda_l1': 0.017878889770942964, 'lambda_l2': 0.18722759843481038, 'num_leaves': 77, 'bagging_fraction': 0.5021022854525654, 'feature_fraction': 0.5721680518066218}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:56:41,529][0m Trial 16 finished with value: 0.5287703705383164 and parameters: {'n_estimators': 4500, 'learning_rate': 0.09967749274450048, 'max_depth': 6, 'lambda_l1': 0.10865783291932259, 'lambda_l2': 2.379294569380132, 'num_leaves': 76, 'bagging_fraction': 0.6590794353098028, 'feature_fraction': 0.38991511350469776}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:56:43,273][0m Trial 17 finished with value: 0.4039718316236881 and parameters: {'n_estimators': 480, 'learning_rate': 0.011270651432791757, 'max_depth': 3, 'lambda_l1': 0.03803491178105155, 'lambda_l2': 9.802919139105397, 'num_leaves': 43, 'bagging_fraction': 0.8920932984559278, 'feature_fraction': 0.8971247229668342}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:56:54,397][0m Trial 18 finished with value: 0.5576388106803416 and parameters: {'n_estimators': 8179, 'learning_rate': 0.1244633704083593, 'max_depth': 12, 'lambda_l1': 0.11051774440437333, 'lambda_l2': 0.4151758212376172, 'num_leaves': 7, 'bagging_fraction': 0.3561436149711876, 'feature_fraction': 0.5684265952126187}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:57:04,464][0m Trial 19 finished with value: 0.5461550750874069 and parameters: {'n_estimators': 3060, 'learning_rate': 0.039093060386562545, 'max_depth': 7, 'lambda_l1': 0.011120144434198953, 'lambda_l2': 0.862194092829523, 'num_leaves': 24, 'bagging_fraction': 0.5670025948618933, 'feature_fraction': 0.3986821389512214}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:57:23,315][0m Trial 20 finished with value: 0.5530399291586827 and parameters: {'n_estimators': 5393, 'learning_rate': 0.02755842266981102, 'max_depth': 9, 'lambda_l1': 0.02608543651640553, 'lambda_l2': 0.2818304166175938, 'num_leaves': 68, 'bagging_fraction': 0.47834868498751215, 'feature_fraction': 0.20120353292017296}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:57:37,496][0m Trial 21 finished with value: 0.5569275888878191 and parameters: {'n_estimators': 8395, 'learning_rate': 0.009784576811608927, 'max_depth': 4, 'lambda_l1': 0.5179068557051463, 'lambda_l2': 0.44901903268254395, 'num_leaves': 98, 'bagging_fraction': 0.7736121134091272, 'feature_fraction': 0.24784997196769115}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:57:50,747][0m Trial 22 finished with value: 0.5606602627216797 and parameters: {'n_estimators': 8654, 'learning_rate': 0.005162929046471226, 'max_depth': 4, 'lambda_l1': 0.08473079871939207, 'lambda_l2': 1.6598594534172855, 'num_leaves': 87, 'bagging_fraction': 0.7422502386942813, 'feature_fraction': 0.33845872952435974}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:58:00,887][0m Trial 23 finished with value: 0.5616495911120282 and parameters: {'n_estimators': 5931, 'learning_rate': 0.0045925533670990665, 'max_depth': 4, 'lambda_l1': 0.06276381274409497, 'lambda_l2': 1.8550516634661298, 'num_leaves': 88, 'bagging_fraction': 0.7208627228849043, 'feature_fraction': 0.31228895610948754}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:58:08,766][0m Trial 24 finished with value: 0.5554718724765291 and parameters: {'n_estimators': 6181, 'learning_rate': 0.02924394190931495, 'max_depth': 3, 'lambda_l1': 0.057131295647788245, 'lambda_l2': 4.3880763084961, 'num_leaves': 81, 'bagging_fraction': 0.687158321137505, 'feature_fraction': 0.32478418428002087}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:58:28,713][0m Trial 25 finished with value: 0.5520858608995871 and parameters: {'n_estimators': 7273, 'learning_rate': 0.006294566968486827, 'max_depth': 5, 'lambda_l1': 0.14437700076462917, 'lambda_l2': 1.682637219313357, 'num_leaves': 46, 'bagging_fraction': 0.6084032468238985, 'feature_fraction': 0.26669850016270963}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:58:37,835][0m Trial 26 finished with value: 0.558788379456262 and parameters: {'n_estimators': 4248, 'learning_rate': 0.027194745306915544, 'max_depth': 5, 'lambda_l1': 0.027645496760829204, 'lambda_l2': 0.8193711710368262, 'num_leaves': 69, 'bagging_fraction': 0.8254666179319955, 'feature_fraction': 0.34215865519159283}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:58:55,560][0m Trial 27 finished with value: 0.5352184814048122 and parameters: {'n_estimators': 5264, 'learning_rate': 0.017105847786013297, 'max_depth': 9, 'lambda_l1': 0.06905066743876292, 'lambda_l2': 5.397091446186392, 'num_leaves': 87, 'bagging_fraction': 0.6972894714821216, 'feature_fraction': 0.5197942752252654}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:59:04,128][0m Trial 28 finished with value: 0.5572099861794717 and parameters: {'n_estimators': 3468, 'learning_rate': 0.05665846190495744, 'max_depth': 7, 'lambda_l1': 0.14482019274347768, 'lambda_l2': 2.164528489498517, 'num_leaves': 35, 'bagging_fraction': 0.6349324034580863, 'feature_fraction': 0.2008989183915979}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:59:14,848][0m Trial 29 finished with value: 0.5573571770118588 and parameters: {'n_estimators': 6439, 'learning_rate': 0.002233005774910824, 'max_depth': 10, 'lambda_l1': 0.04838784824399976, 'lambda_l2': 0.8050369562481035, 'num_leaves': 11, 'bagging_fraction': 0.5593537609407931, 'feature_fraction': 0.4418571206379379}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:59:19,767][0m Trial 30 finished with value: 0.39193782808798605 and parameters: {'n_estimators': 7592, 'learning_rate': 0.003905401504656467, 'max_depth': 4, 'lambda_l1': 0.19944591738286535, 'lambda_l2': 1.2377419081959316, 'num_leaves': 2, 'bagging_fraction': 0.7897687923565024, 'feature_fraction': 0.5249918434676478}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:59:31,073][0m Trial 31 finished with value: 0.5550748333496054 and parameters: {'n_estimators': 8589, 'learning_rate': 0.004614092481679211, 'max_depth': 4, 'lambda_l1': 0.09228001516420256, 'lambda_l2': 2.59788803635261, 'num_leaves': 89, 'bagging_fraction': 0.7200312848706591, 'feature_fraction': 0.3201272801068543}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:59:34,128][0m Trial 32 finished with value: 0.5545653940144895 and parameters: {'n_estimators': 1648, 'learning_rate': 0.007150106706253386, 'max_depth': 3, 'lambda_l1': 0.07999625989552536, 'lambda_l2': 1.5082936209383282, 'num_leaves': 87, 'bagging_fraction': 0.7297056185232735, 'feature_fraction': 0.26085821456592506}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 20:59:45,342][0m Trial 33 finished with value: 0.5562693776961283 and parameters: {'n_estimators': 6654, 'learning_rate': 0.003868991359838406, 'max_depth': 5, 'lambda_l1': 0.1149866308295789, 'lambda_l2': 5.664568529706053, 'num_leaves': 84, 'bagging_fraction': 0.6511334869311137, 'feature_fraction': 0.3663779856758822}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 21:00:02,689][0m Trial 34 finished with value: 0.5635485992245819 and parameters: {'n_estimators': 8860, 'learning_rate': 0.01360063882962628, 'max_depth': 6, 'lambda_l1': 0.07059106589255425, 'lambda_l2': 1.051544517896921, 'num_leaves': 72, 'bagging_fraction': 0.7469550000136629, 'feature_fraction': 0.3082285352952409}. Best is trial 12 with value: 0.5677019478328463.[0m




[32m[I 2023-02-06 21:00:14,753][0m Trial 35 finished with value: 0.569335194817751 and parameters: {'n_estimators': 5801, 'learning_rate': 0.01386592547479894, 'max_depth': 6, 'lambda_l1': 0.39223995473686235, 'lambda_l2': 1.1020067319079518, 'num_leaves': 71, 'bagging_fraction': 0.6132401957724513, 'feature_fraction': 0.3091056606342296}. Best is trial 35 with value: 0.569335194817751.[0m




[32m[I 2023-02-06 21:00:38,113][0m Trial 36 finished with value: 0.5646900608078845 and parameters: {'n_estimators': 9850, 'learning_rate': 0.01024016902910065, 'max_depth': 7, 'lambda_l1': 0.34049111733610343, 'lambda_l2': 1.1473366860052066, 'num_leaves': 72, 'bagging_fraction': 0.5919658045770205, 'feature_fraction': 0.24080381039400467}. Best is trial 35 with value: 0.569335194817751.[0m




[32m[I 2023-02-06 21:01:05,239][0m Trial 37 finished with value: 0.5435560031937026 and parameters: {'n_estimators': 9942, 'learning_rate': 0.008342340573486186, 'max_depth': 7, 'lambda_l1': 0.3898339609696622, 'lambda_l2': 0.6987847571708531, 'num_leaves': 59, 'bagging_fraction': 0.527490816754782, 'feature_fraction': 0.611434618893371}. Best is trial 35 with value: 0.569335194817751.[0m




[32m[I 2023-02-06 21:01:12,062][0m Trial 38 finished with value: 0.5681756492173202 and parameters: {'n_estimators': 1608, 'learning_rate': 0.014719310746076355, 'max_depth': 10, 'lambda_l1': 0.22166862623726483, 'lambda_l2': 0.9887694029246434, 'num_leaves': 72, 'bagging_fraction': 0.6088997976038988, 'feature_fraction': 0.24356051834186077}. Best is trial 35 with value: 0.569335194817751.[0m




[32m[I 2023-02-06 21:01:23,222][0m Trial 39 finished with value: 0.5709286837925462 and parameters: {'n_estimators': 1147, 'learning_rate': 0.018886819794747486, 'max_depth': 10, 'lambda_l1': 0.2547569839550015, 'lambda_l2': 0.09305640452831758, 'num_leaves': 51, 'bagging_fraction': 0.6074516497392743, 'feature_fraction': 0.2868664156108183}. Best is trial 39 with value: 0.5709286837925462.[0m




[32m[I 2023-02-06 21:01:28,912][0m Trial 40 finished with value: 0.5655432714095092 and parameters: {'n_estimators': 1144, 'learning_rate': 0.022118519648825507, 'max_depth': 12, 'lambda_l1': 0.2182914723972292, 'lambda_l2': 0.08313812956839213, 'num_leaves': 52, 'bagging_fraction': 0.6269692322331828, 'feature_fraction': 0.27907293473344047}. Best is trial 39 with value: 0.5709286837925462.[0m




[32m[I 2023-02-06 21:01:35,137][0m Trial 41 finished with value: 0.5631780356907958 and parameters: {'n_estimators': 1298, 'learning_rate': 0.01515704828385605, 'max_depth': 12, 'lambda_l1': 0.23254905372082768, 'lambda_l2': 0.11231497474997365, 'num_leaves': 52, 'bagging_fraction': 0.6180757326821472, 'feature_fraction': 0.2878271087304469}. Best is trial 39 with value: 0.5709286837925462.[0m




[32m[I 2023-02-06 21:01:40,246][0m Trial 42 finished with value: 0.5666727275132282 and parameters: {'n_estimators': 1080, 'learning_rate': 0.02052765914573895, 'max_depth': 11, 'lambda_l1': 0.5391969757186842, 'lambda_l2': 0.07503981802049173, 'num_leaves': 64, 'bagging_fraction': 0.5984127676970329, 'feature_fraction': 0.23338183480097677}. Best is trial 39 with value: 0.5709286837925462.[0m




[32m[I 2023-02-06 21:01:47,359][0m Trial 43 finished with value: 0.5622978032150789 and parameters: {'n_estimators': 1909, 'learning_rate': 0.014584955645753182, 'max_depth': 10, 'lambda_l1': 0.6175657667487522, 'lambda_l2': 0.05413257277295514, 'num_leaves': 63, 'bagging_fraction': 0.585964567008524, 'feature_fraction': 0.23264690511532934}. Best is trial 39 with value: 0.5709286837925462.[0m




[32m[I 2023-02-06 21:01:55,777][0m Trial 44 finished with value: 0.5622741819159455 and parameters: {'n_estimators': 2261, 'learning_rate': 0.02287033255610984, 'max_depth': 11, 'lambda_l1': 0.29877579661884895, 'lambda_l2': 0.04603480387041988, 'num_leaves': 58, 'bagging_fraction': 0.546629319720624, 'feature_fraction': 0.2388972981307508}. Best is trial 39 with value: 0.5709286837925462.[0m




[32m[I 2023-02-06 21:02:04,129][0m Trial 45 finished with value: 0.5606814121949402 and parameters: {'n_estimators': 2443, 'learning_rate': 0.01247334323385397, 'max_depth': 10, 'lambda_l1': 0.7084623947063738, 'lambda_l2': 0.5429781247239907, 'num_leaves': 64, 'bagging_fraction': 0.6867238761955465, 'feature_fraction': 0.2867214003754529}. Best is trial 39 with value: 0.5709286837925462.[0m




[32m[I 2023-02-06 21:02:08,882][0m Trial 46 finished with value: 0.5639845627949966 and parameters: {'n_estimators': 873, 'learning_rate': 0.018116362223556025, 'max_depth': 11, 'lambda_l1': 0.1857298987998187, 'lambda_l2': 0.3077791647819188, 'num_leaves': 73, 'bagging_fraction': 0.5934534465341691, 'feature_fraction': 0.23748445208532556}. Best is trial 39 with value: 0.5709286837925462.[0m




[32m[I 2023-02-06 21:02:14,818][0m Trial 47 finished with value: 0.5631774476500565 and parameters: {'n_estimators': 1708, 'learning_rate': 0.03569780471633349, 'max_depth': 8, 'lambda_l1': 0.46320820289673226, 'lambda_l2': 0.19023462533545885, 'num_leaves': 68, 'bagging_fraction': 0.6498010133032265, 'feature_fraction': 0.22029736271660963}. Best is trial 39 with value: 0.5709286837925462.[0m




[32m[I 2023-02-06 21:02:28,877][0m Trial 48 finished with value: 0.5686545172007703 and parameters: {'n_estimators': 3993, 'learning_rate': 0.00828498794020625, 'max_depth': 11, 'lambda_l1': 0.28980304381838196, 'lambda_l2': 0.2605049133399094, 'num_leaves': 48, 'bagging_fraction': 0.53149979229554, 'feature_fraction': 0.2703811616120403}. Best is trial 39 with value: 0.5709286837925462.[0m




[32m[I 2023-02-06 21:02:41,238][0m Trial 49 finished with value: 0.5699614373224217 and parameters: {'n_estimators': 3744, 'learning_rate': 0.00874629729447718, 'max_depth': 10, 'lambda_l1': 0.2853887101315789, 'lambda_l2': 0.3556487592536847, 'num_leaves': 48, 'bagging_fraction': 0.5298697965942635, 'feature_fraction': 0.27392396545478515}. Best is trial 39 with value: 0.5709286837925462.[0m


# LightGBM Modeling

In [5]:
lgb_cv_scores, lgb_imp = list(), list()
preds = list()

skf = KFold(n_splits = 5, shuffle = True, random_state = 42)
    
for train_ix, test_ix in skf.split(X, Y):
        
    ## Splitting the data 
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]
    
    ## Building the model
    lgb_md = LGBMRegressor(n_estimators = 1147,
                            max_depth = 10,
                            learning_rate = 0.018,
                            num_leaves = 51,
                            lambda_l1 = 0.2547,
                            lambda_l2 = 0.09305,
                            bagging_fraction = 0.6074,
                            feature_fraction = 0.2868,
                            random_state = 42).fit(X_train, Y_train)
    lgb_imp.append(lgb_md.feature_importances_)
    
    ## Predicting on X_test and test
    lgb_pred_1 = lgb_md.predict(X_test)
    lgb_pred_2 = lgb_md.predict(test_md)
        
    ## Applying Optimal Rounder (using abhishek approach)
    optR = OptimizedRounder()
    optR.fit(lgb_pred_1, Y_test)
    coef = optR.coefficients()
    lgb_pred_1 = optR.predict(lgb_pred_1, coef).astype(int)
    lgb_pred_2 = optR.predict(lgb_pred_2, coef).astype(int)
        
    ## Computing roc-auc score
    lgb_cv_scores.append(cohen_kappa_score(Y_test, lgb_pred_1, weights = 'quadratic'))
    preds.append(lgb_pred_2)

lgb_cv_score = np.mean(lgb_cv_scores)    
print('The average roc-auc score over 5-folds (run 5 times) is:', lgb_cv_score)

The average roc-auc score over 5-folds (run 5 times) is: 0.5519794514993521


In [6]:
lgb_preds_test = pd.DataFrame(preds).mode(axis = 0).loc[0, ]

submission['quality'] = lgb_preds_test.astype(int)
submission.head()

Unnamed: 0,Id,quality
0,2056,5
1,2057,6
2,2058,6
3,2059,7
4,2060,6


In [7]:
submission['quality'].value_counts()

5    552
6    484
7    334
4      2
Name: quality, dtype: int64

In [8]:
submission.to_csv('LightGBM_Reg_FE_2.csv', index = False)