<h1> Load Dependencies

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
mpl.style.use('seaborn')
from matplotlib import pyplot as plt


from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report



<h1> Load the data

In [2]:
# Load the data from the csv files

train_data = pd.read_csv("preprocessed_train_3.csv", index_col=0)
test_data = pd.read_csv("preprocessed_test_3.csv", index_col=0) 

In [3]:
target = "change_type"
X = train_data.drop(columns = target)
Y = train_data[target].map({'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
       'Mega Projects': 5})

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, shuffle=True)

<h1> Giga Classifier : The one to gather all

In [5]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

def new_sampling(X, y, strategy):
    strat_under = {}

    for cat, nb in strategy.items():
        if nb < np.unique(y, return_counts=True)[1][cat]:
            strat_under[cat] = nb
        else : 
            strat_under[cat] = np.unique(y, return_counts=True)[1][cat]
    X, y = RandomUnderSampler(sampling_strategy=strat_under).fit_resample(X, y)
    X, y = RandomOverSampler(sampling_strategy=strategy).fit_resample(X, y)

    return X, y

class GigaClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, **params):
        self.params = params

        if "param_class0" in params.keys():
            self.clf0 = LGBMClassifier(**params["param_class0"])
        else :
            self.clf0 = LGBMClassifier()
        
        if "param_class1" in params.keys():
            self.clf1 = LGBMClassifier(**params["param_class1"])
        else :
            self.clf1 = LGBMClassifier()

        if "param_class2" in params.keys():
            self.clf2 = LGBMClassifier(**params["param_class2"])
        else :
            self.clf2 = LGBMClassifier()

        if "param_class3" in params.keys():
            self.clf3 = LGBMClassifier(**params["param_class3"])
        else :
            self.clf3 = LGBMClassifier()

        if "param_class4" in params.keys():
            self.clf4 = LGBMClassifier(**params["param_class4"])
        else :
            self.clf4 = LGBMClassifier()

        if "param_class5" in params.keys():
            self.clf5 = LGBMClassifier(**params["param_class5"])
        else :
            self.clf5 = LGBMClassifier()

        if "param_class23" in params.keys():
            self.clf23 = LGBMClassifier(**params["param_class23"])
        else :
            self.clf23 = LGBMClassifier()

        if "param_class02" in params.keys():
            self.clf02 = LGBMClassifier(**params["param_class02"])
        else:
            self.clf02 = LGBMClassifier()

        if "param_class_final" in params.keys():
            self.clf_final = LogisticRegression(**params["param_class_final"])
        else :
            self.clf_final = LogisticRegression()

    def fit(self, X, y):
        X, _ = check_X_y(X, y)
        
        self.classes_ = unique_labels(y)

        self.X_ = X
        self.y_ = y
    
        _, n = np.unique(self.y_, return_counts=True)
        
        # Classifier 0

        if "strategy0" in self.params.keys():
            strategy = self.params["strategy0"]
            print('ok')
        else :
            k = n[0]*4
            strategy = {0 : k, 1 : int(k/5), 2 : int(k/5), 3 : int(k/5), 4 : int(k/5), 5 : int(k/5)}

        X_train0, y_train0 = new_sampling(self.X_, self.y_, strategy)
        y_train0 = y_train0.apply(lambda x : 1 if x == 0 else 0)

        self.clf0 = self.clf0.fit(X_train0, y_train0)

        del X_train0
        del y_train0

        # Classifier 1
        if "strategy1" in self.params.keys():
            strategy = self.params["strategy1"]
        else :
            k = n[1]*7
            strategy = {0 : int(k/5), 1 : k, 2 : int(k/5), 3 : int(k/5), 4 : int(k/5), 5 : int(k/5)}

        X_train1, y_train1 = new_sampling(self.X_, self.y_, strategy)
        y_train1 = y_train1.apply(lambda x : 1 if x == 1 else 0)

        self.clf1 = self.clf1.fit(X_train1, y_train1)

        del X_train1
        del y_train1
        
        # Classifier 2

        if "strategy2" in self.params.keys():
            strategy = self.params["strategy2"]
        else :
            k = n[2]//2
            strategy = {0 : int(k/5), 1 : int(k/5), 2 : k, 3 : int(k/5), 4 : int(k/5), 5 : int(k/5)}

        X_train2, y_train2 = new_sampling(self.X_, self.y_, strategy)
        y_train2 = y_train2.apply(lambda x : 1 if x == 2 else 0)

        self.clf2 = self.clf2.fit(X_train2, y_train2)

        del X_train2
        del y_train2
        
        # Classifier 3
        if "strategy3" in self.params.keys():
            strategy = self.params["strategy3"]
        else :
            k = n[3]
            strategy = {0 : int(k/5), 1 : int(k/5), 2 : int(k/5), 3 : k, 4 : int(k/5), 5 : int(k/5)}

        X_train3, y_train3 = new_sampling(self.X_, self.y_, strategy)
        y_train3 = y_train3.apply(lambda x : 1 if x == 3 else 0)

        self.clf3 = self.clf3.fit(X_train3, y_train3)

        del X_train3
        del y_train3
        
        # Classifier 4
        if "strategy4" in self.params.keys():
            strategy = self.params["strategy4"]
        else :
            k = n[4]*15
            strategy = {0 : int(k/5), 1 : int(k/5), 2 : int(k/5), 3 : int(k/5), 4 : k, 5 : int(k/5)}
        X_train4, y_train4 = new_sampling(self.X_, self.y_, strategy)
        y_train4 = y_train4.apply(lambda x : 1 if x == 4 else 0)

        self.clf4 = self.clf4.fit(X_train4, y_train4)

        del X_train4
        del y_train4
        
        # Classifier 5
        if "strategy5" in self.params.keys():
            strategy = self.params["strategy5"]
        else :
            k = n[5]*30
            strategy = {0 : int(k/5), 1 : int(k/5), 2 : int(k/5), 3 : int(k/5), 4 : int(k/5), 5 : k}

        X_train5, y_train5 = new_sampling(self.X_, self.y_, strategy)
        y_train5 = y_train5.apply(lambda x : 1 if x == 5 else 0)

        self.clf5 = self.clf5.fit(X_train5, y_train5)

        del X_train5
        del y_train5

        # Classifier 2-3

        mask = (self.y_ >= 2)&(self.y_ <= 3)
        X_train_23 = self.X_[mask]
        y_train_23 = self.y_[mask].apply(lambda x : 0 if x == 2 else 1)

        self.clf23.fit(X_train_23, y_train_23)

        del X_train_23
        del y_train_23

        # Classifier 0-2

        mask = (self.y_ == 0) | (self.y_ == 2)
        X_train_02 = self.X_[mask]
        y_train_02 = self.y_[mask].apply(lambda x : 1 if x == 2 else 0)

        self.clf02.fit(X_train_02, y_train_02)

        del X_train_02
        del y_train_02


        # # Final classifier

        self.clf_final.fit(np.array([self.clf0.predict_proba(self.X_)[:, 1], self.clf1.predict_proba(self.X_)[:, 1], self.clf2.predict_proba(self.X_)[:, 1], self.clf3.predict_proba(self.X_)[:, 1], self.clf4.predict_proba(self.X_)[:, 1], self.clf5.predict_proba(self.X_)[:, 1], self.clf23.predict_proba(self.X_)[:, 1], self.clf02.predict_proba(self.X_)[:, 1]]).T, self.y_)

        return self

    def predict(self, X):

        check_is_fitted(self)
        X = check_array(X)

        y_pred = self.clf_final.predict(np.array([self.clf0.predict_proba(X)[:, 1], self.clf1.predict_proba(X)[:, 1], self.clf2.predict_proba(X)[:, 1], self.clf3.predict_proba(X)[:, 1], self.clf4.predict_proba(X)[:, 1], self.clf5.predict_proba(X)[:, 1], self.clf23.predict_proba(X)[:, 1], self.clf02.predict_proba(X)[:, 1]]).T)

        return y_pred

In [6]:
# Some errors from the computer made some issue with the data.
# It comes from nowhere ...

X_train = np.nan_to_num(X_train)
X_test = np.nan_to_num(X_test)
test_data = np.nan_to_num(test_data)

<h3> Optimisation of the sampling strategy

In [5]:
from sklearn.metrics import f1_score
import optuna
import lightgbm as lgb

def objective(trial):
    
    dtrain = lgb.Dataset(X_train, label=y_train)


    strategy0 = {
                      0: trial.suggest_int("k_0_0", 100, 70000),
                      1: trial.suggest_int("k_1_0", 100, 70000),
                      2: trial.suggest_int("k_2_0", 100, 70000),
                      3: trial.suggest_int("k_3_0", 100, 70000),
                      4: trial.suggest_int("k_4_0", 100, 70000),
                      5: trial.suggest_int("k_5_0", 100, 15000),


    }

    strategy1 = {
                      0: trial.suggest_int("k_0_1", 100, 70000),
                      1: trial.suggest_int("k_1_1", 100, 70000),
                      2: trial.suggest_int("k_2_1", 100, 70000),
                      3: trial.suggest_int("k_3_1", 100, 70000),
                      4: trial.suggest_int("k_4_1", 100, 70000),
                      5: trial.suggest_int("k_5_1", 100, 15000),

    }

    strategy2 = {
                      0: trial.suggest_int("k_0_2", 100, 70000),
                      1: trial.suggest_int("k_1_2", 100, 70000),
                      2: trial.suggest_int("k_2_2", 100, 70000),
                      3: trial.suggest_int("k_3_2", 100, 70000),
                      4: trial.suggest_int("k_4_2", 100, 70000),
                      5: trial.suggest_int("k_5_2", 100, 15000),

    }

    strategy3 = {
                      0: trial.suggest_int("k_0_3", 100, 70000),
                      1: trial.suggest_int("k_1_3", 100, 70000),
                      2: trial.suggest_int("k_2_3", 100, 70000),
                      3: trial.suggest_int("k_3_3", 100, 70000),
                      4: trial.suggest_int("k_4_3", 100, 70000),
                      5: trial.suggest_int("k_5_3", 100, 15000),

    }

    strategy4 = {
                      0: trial.suggest_int("k_0_4", 100, 70000),
                      1: trial.suggest_int("k_1_4", 100, 70000),
                      2: trial.suggest_int("k_2_4", 100, 70000),
                      3: trial.suggest_int("k_3_4", 100, 70000),
                      4: trial.suggest_int("k_4_4", 100, 70000),
                      5: trial.suggest_int("k_5_4", 100, 15000),

    }

    strategy5 = {
                      0: trial.suggest_int("k_0_5", 100, 70000),
                      1: trial.suggest_int("k_1_5", 100, 70000),
                      2: trial.suggest_int("k_2_5", 100, 70000),
                      3: trial.suggest_int("k_3_5", 100, 70000),
                      4: trial.suggest_int("k_4_5", 100, 70000),
                      5: trial.suggest_int("k_5_5", 100, 15000),
    }

    
 
    param = {'strategy0' : strategy0, 'strategy1' : strategy1, 'strategy2' : strategy2, 
             'strategy3' : strategy3, 'strategy4' : strategy4, 'strategy5' : strategy5,
             }
    Giga = GigaClassifier(param=param).fit(X_train, y_train)
    preds = Giga.predict(X_test)
    score = 0.8*f1_score(y_test, preds, average='micro') + 0.2*0.8*f1_score(y_test, preds, average='macro')
    return score

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

<h2> New parameters from another optimisation

In [None]:
params0 = {
    'lambda_l1': 8.065393401857228e-06,
    'lambda_l2': 0.029803367056138796,
    'num_leaves': 190,
    'feature_fraction': 0.797014941354135,
    'bagging_fraction': 0.6056293592061583,
    'bagging_freq': 6,
    'min_child_samples': 35,
}
params1 = {
    'lambda_l1': 0.0014150936181843796,
    'lambda_l2': 0.015200649198535246,
    'num_leaves': 213,
    'feature_fraction': 0.6891955399281373,
    'bagging_fraction': 0.6304609421580529,
    'bagging_freq': 3,
    'min_child_samples': 78,
}
params2 = {
    'lambda_l1': 0.00043925265669491026,
    'lambda_l2': 9.479211152048597e-08,
    'num_leaves': 216,
    'feature_fraction': 0.5456789734159285,
    'bagging_fraction': 0.699812579931553,
    'bagging_freq': 1,
    'min_child_samples': 93,
}
params3 = {
    'lambda_l1': 0.01636000817108605,
    'lambda_l2': 0.1681160879676377,
    'num_leaves': 82,
    'feature_fraction': 0.5035311652439594,
    'bagging_fraction': 0.6759265334335514,
    'bagging_freq': 5,
    'min_child_samples': 93,
}
params4 = {
    'lambda_l1': 3.23091933287523e-05,
    'lambda_l2': 1.576373921261149,
    'num_leaves': 81,
    'feature_fraction': 0.9004686195418676,
    'bagging_fraction': 0.5815720681139029,
    'bagging_freq': 1,
    'min_child_samples': 21,
}
params5 = {
    'lambda_l1': 7.3489291515387846e-06,
    'lambda_l2': 8.823411285304899e-08,
    'num_leaves': 212,
    'feature_fraction': 0.6363314441863244,
    'bagging_fraction':  0.48949866504397654,
    'bagging_freq': 5,
    'min_child_samples': 38,
}
params23 = {
    'lambda_l1': .1279932407823657e-07,
    'lambda_l2': 9.707107657903811e-07,
    'num_leaves': 3,
    'feature_fraction': 0.5895943660453787,
    'bagging_fraction': 0.4995724225169822,
    'bagging_freq': 7,
    'min_child_samples': 65,
}
strategy0 = {
    0 : 66269,
    1 : 47889,
    2 : 1300,
    3 : 59972,
    4 : 27613,
    5 : 4831,
}
strategy1 = {
    0 : 22202,
    1 : 63926,
    2 : 49998,
    3 : 20079,
    4 : 49417,
    5 : 8442,
}
strategy2 = {
    0 : 42206,
    1 : 63036,
    2 : 62078,
    3 : 51447,
    4 : 45821,
    5 : 3484,
}
strategy3 = {
    0 : 19449,
    1 : 18530,
    2 : 13774,
    3 : 32118,
    4 : 638,
    5 : 7377,
}
strategy4 = {
    0 : 48220,
    1 : 51799,
    2 : 9703,
    3 : 63043,
    4 : 4353,
    5 : 12506,
}
strategy5 = {
    0 : 46176,
    1 : 39871,
    2 : 52339,
    3 : 62464,
    4 : 51783,
    5 : 11293,
}

model = GigaClassifier(param_class0 = params0, param_class1 = params1, param_class2 = params2,
                       param_class3 = params3, param_class4 = params4, param_class5 = params5,
                       param_class23 = params23, strategy0 = strategy0, strategy1 = strategy1,
                       strategy2 = strategy2, strategy3 = strategy3, strategy4 = strategy4,
                       strategy5 = strategy5)
model = model.fit(X_train, y_train)

In [8]:
y_pred = model.predict(X_test)

print(classification_report(y_pred, y_test))
cf_matrix = confusion_matrix(y_pred, y_test)
print(cf_matrix)

              precision    recall  f1-score   support

           0       0.92      0.76      0.83      8995
           1       0.76      0.70      0.73      4054
           2       0.78      0.83      0.81     38627
           3       0.69      0.67      0.68     25583
           4       0.13      0.20      0.16       238
           5       0.07      0.40      0.12         5

    accuracy                           0.76     77502
   macro avg       0.56      0.59      0.55     77502
weighted avg       0.76      0.76      0.76     77502

[[ 6811    27  1482   665     9     1]
 [   38  2844   338   820    13     1]
 [  197   149 32023  6161    87    10]
 [  386   732  7002 17238   211    14]
 [    2     3    77   107    48     1]
 [    0     1     0     1     1     2]]


In [9]:
# For submission

y_pred_final = model.predict(test_data)
print(y_pred_final.shape)

pred_df = pd.DataFrame(y_pred_final, columns=['change_type'])
pred_df.to_csv("submissions/raph_sample_submission.csv", index=True, index_label='Id')

(121704,)


<h2> Global optimisation on micro f1-score and Cross-Validation

In [11]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

def new_sampling(X, y, strategy):
    strat_under = {}

    for cat, nb in strategy.items():
        if nb < np.unique(y, return_counts=True)[1][cat]:
            strat_under[cat] = nb
        else : 
            strat_under[cat] = np.unique(y, return_counts=True)[1][cat]
    X, y = RandomUnderSampler(sampling_strategy=strat_under).fit_resample(X, y)
    X, y = RandomOverSampler(sampling_strategy=strategy).fit_resample(X, y)

    return X, y

class GigaClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, **params):
        self.params = params

        if "param_class0" in params.keys():
            self.clf0 = LGBMClassifier(**params["param_class0"])
        else :
            self.clf0 = LGBMClassifier()
        
        if "param_class1" in params.keys():
            self.clf1 = LGBMClassifier(**params["param_class1"])
        else :
            self.clf1 = LGBMClassifier()

        if "param_class2" in params.keys():
            self.clf2 = LGBMClassifier(**params["param_class2"])
        else :
            self.clf2 = LGBMClassifier()

        if "param_class3" in params.keys():
            self.clf3 = LGBMClassifier(**params["param_class3"])
        else :
            self.clf3 = LGBMClassifier()

        if "param_class4" in params.keys():
            self.clf4 = LGBMClassifier(**params["param_class4"])
        else :
            self.clf4 = LGBMClassifier()

        if "param_class5" in params.keys():
            self.clf5 = LGBMClassifier(**params["param_class5"])
        else :
            self.clf5 = LGBMClassifier()

        if "param_class23" in params.keys():
            self.clf23 = LGBMClassifier(**params["param_class23"])
        else :
            self.clf23 = LGBMClassifier()

        if "param_class02" in params.keys():
            self.clf02 = LGBMClassifier(**params["param_class02"])
        else:
            self.clf02 = LGBMClassifier()

        if "param_class_final" in params.keys():
            self.clf_final = LogisticRegressionCV(cv = 5, **params["param_class_final"])
        else :
            self.clf_final = LogisticRegressionCV(cv = 5)

    def fit(self, X, y):
        X, _ = check_X_y(X, y)
        
        self.classes_ = unique_labels(y)

        self.X_ = X
        self.y_ = y
    
        _, n = np.unique(self.y_, return_counts=True)
        
        # Classifier 0

        if "strategy0" in self.params.keys():
            strategy = self.params["strategy0"]
            print('ok')
        else :
            k = n[0]*4
            strategy = {0 : k, 1 : int(k/5), 2 : int(k/5), 3 : int(k/5), 4 : int(k/5), 5 : int(k/5)}

        X_train0, y_train0 = new_sampling(self.X_, self.y_, strategy)
        y_train0 = y_train0.apply(lambda x : 1 if x == 0 else 0)

        self.clf0 = self.clf0.fit(X_train0, y_train0)

        del X_train0
        del y_train0

        # Classifier 1
        if "strategy1" in self.params.keys():
            strategy = self.params["strategy1"]
        else :
            k = n[1]*7
            strategy = {0 : int(k/5), 1 : k, 2 : int(k/5), 3 : int(k/5), 4 : int(k/5), 5 : int(k/5)}

        X_train1, y_train1 = new_sampling(self.X_, self.y_, strategy)
        y_train1 = y_train1.apply(lambda x : 1 if x == 1 else 0)

        self.clf1 = self.clf1.fit(X_train1, y_train1)

        del X_train1
        del y_train1
        
        # Classifier 2

        if "strategy2" in self.params.keys():
            strategy = self.params["strategy2"]
        else :
            k = n[2]//2
            strategy = {0 : int(k/5), 1 : int(k/5), 2 : k, 3 : int(k/5), 4 : int(k/5), 5 : int(k/5)}

        X_train2, y_train2 = new_sampling(self.X_, self.y_, strategy)
        y_train2 = y_train2.apply(lambda x : 1 if x == 2 else 0)

        self.clf2 = self.clf2.fit(X_train2, y_train2)

        del X_train2
        del y_train2
        
        # Classifier 3
        if "strategy3" in self.params.keys():
            strategy = self.params["strategy3"]
        else :
            k = n[3]
            strategy = {0 : int(k/5), 1 : int(k/5), 2 : int(k/5), 3 : k, 4 : int(k/5), 5 : int(k/5)}

        X_train3, y_train3 = new_sampling(self.X_, self.y_, strategy)
        y_train3 = y_train3.apply(lambda x : 1 if x == 3 else 0)

        self.clf3 = self.clf3.fit(X_train3, y_train3)

        del X_train3
        del y_train3
        
        # Classifier 4
        if "strategy4" in self.params.keys():
            strategy = self.params["strategy4"]
        else :
            k = n[4]*15
            strategy = {0 : int(k/5), 1 : int(k/5), 2 : int(k/5), 3 : int(k/5), 4 : k, 5 : int(k/5)}
        X_train4, y_train4 = new_sampling(self.X_, self.y_, strategy)
        y_train4 = y_train4.apply(lambda x : 1 if x == 4 else 0)

        self.clf4 = self.clf4.fit(X_train4, y_train4)

        del X_train4
        del y_train4
        
        # Classifier 5
        if "strategy5" in self.params.keys():
            strategy = self.params["strategy5"]
        else :
            k = n[5]*30
            strategy = {0 : int(k/5), 1 : int(k/5), 2 : int(k/5), 3 : int(k/5), 4 : int(k/5), 5 : k}

        X_train5, y_train5 = new_sampling(self.X_, self.y_, strategy)
        y_train5 = y_train5.apply(lambda x : 1 if x == 5 else 0)

        self.clf5 = self.clf5.fit(X_train5, y_train5)

        del X_train5
        del y_train5

        # Classifier 2-3

        mask = (self.y_ >= 2)&(self.y_ <= 3)
        X_train_23 = self.X_[mask]
        y_train_23 = self.y_[mask].apply(lambda x : 0 if x == 2 else 1)

        self.clf23.fit(X_train_23, y_train_23)

        del X_train_23
        del y_train_23

        # Classifier 0-2

        mask = (self.y_ == 0) | (self.y_ == 2)
        X_train_02 = self.X_[mask]
        y_train_02 = self.y_[mask].apply(lambda x : 1 if x == 2 else 0)

        self.clf02.fit(X_train_02, y_train_02)

        del X_train_02
        del y_train_02


        # # Final classifier

        self.clf_final.fit(np.array([self.clf0.predict_proba(self.X_)[:, 1], self.clf1.predict_proba(self.X_)[:, 1], self.clf2.predict_proba(self.X_)[:, 1], self.clf3.predict_proba(self.X_)[:, 1], self.clf4.predict_proba(self.X_)[:, 1], self.clf5.predict_proba(self.X_)[:, 1], self.clf23.predict_proba(self.X_)[:, 1], self.clf02.predict_proba(self.X_)[:, 1]]).T, self.y_)

        return self

    def predict(self, X):

        check_is_fitted(self)
        X = check_array(X)

        y_pred = self.clf_final.predict(np.array([self.clf0.predict_proba(X)[:, 1], self.clf1.predict_proba(X)[:, 1], self.clf2.predict_proba(X)[:, 1], self.clf3.predict_proba(X)[:, 1], self.clf4.predict_proba(X)[:, 1], self.clf5.predict_proba(X)[:, 1], self.clf23.predict_proba(X)[:, 1], self.clf02.predict_proba(X)[:, 1]]).T)

        return y_pred

In [14]:
import lightgbm as lgb

def objective(trial):
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    param_class0 = {
                      "objective": "binary",
                      "metric": "binary_logloss",
                      "verbosity": -1,
                      "boosting_type": "gbdt",
                      "lambda_l1": trial.suggest_float("lambda_l1_0", 1e-8, 10.0, log=True),
                      "lambda_l2": trial.suggest_float("lambda_l2_0", 1e-8, 10.0, log=True),
                      "num_leaves": trial.suggest_int("num_leaves_0", 2, 256),
                      "feature_fraction": trial.suggest_float("feature_fraction_0", 0.4, 1.0),
                      "bagging_fraction": trial.suggest_float("bagging_fraction_0", 0.4, 1.0),
                      "bagging_freq": trial.suggest_int("bagging_freq_0", 1, 7),
                      "min_child_samples": trial.suggest_int("min_child_samples_0", 5, 100),
    }

    param_class1 = {
                      "objective": "binary",
                      "metric": "binary_logloss",
                      "verbosity": -1,
                      "boosting_type": "gbdt",
                      "lambda_l1": trial.suggest_float("lambda_l1_1", 1e-8, 10.0, log=True),
                      "lambda_l2": trial.suggest_float("lambda_l2_1", 1e-8, 10.0, log=True),
                      "num_leaves": trial.suggest_int("num_leaves_1", 2, 256),
                      "feature_fraction": trial.suggest_float("feature_fraction_1", 0.4, 1.0),
                      "bagging_fraction": trial.suggest_float("bagging_fraction_1", 0.4, 1.0),
                      "bagging_freq": trial.suggest_int("bagging_freq_1", 1, 7),
                      "min_child_samples": trial.suggest_int("min_child_samples_1", 5, 100),
    }

    param_class2 = {
                      "objective": "binary",
                      "metric": "binary_logloss",
                      "verbosity": -1,
                      "boosting_type": "gbdt",
                      "lambda_l1": trial.suggest_float("lambda_l1_2", 1e-8, 10.0, log=True),
                      "lambda_l2": trial.suggest_float("lambda_l2_2", 1e-8, 10.0, log=True),
                      "num_leaves": trial.suggest_int("num_leaves_2", 2, 256),
                      "feature_fraction": trial.suggest_float("feature_fraction_2", 0.4, 1.0),
                      "bagging_fraction": trial.suggest_float("bagging_fraction_2", 0.4, 1.0),
                      "bagging_freq": trial.suggest_int("bagging_freq_2", 1, 7),
                      "min_child_samples": trial.suggest_int("min_child_samples_2", 5, 100),
    }

    param_class3 = {
                      "objective": "binary",
                      "metric": "binary_logloss",
                      "verbosity": -1,
                      "boosting_type": "gbdt",
                      "lambda_l1": trial.suggest_float("lambda_l1_3", 1e-8, 10.0, log=True),
                      "lambda_l2": trial.suggest_float("lambda_l2_3", 1e-8, 10.0, log=True),
                      "num_leaves": trial.suggest_int("num_leaves_3", 2, 256),
                      "feature_fraction": trial.suggest_float("feature_fraction_3", 0.4, 1.0),
                      "bagging_fraction": trial.suggest_float("bagging_fraction_3", 0.4, 1.0),
                      "bagging_freq": trial.suggest_int("bagging_freq_3", 1, 7),
                      "min_child_samples": trial.suggest_int("min_child_samples_3", 5, 100),
    }

    param_class4 = {
                      "objective": "binary",
                      "metric": "binary_logloss",
                      "verbosity": -1,
                      "boosting_type": "gbdt",
                      "lambda_l1": trial.suggest_float("lambda_l1_4", 1e-8, 10.0, log=True),
                      "lambda_l2": trial.suggest_float("lambda_l2_4", 1e-8, 10.0, log=True),
                      "num_leaves": trial.suggest_int("num_leaves_4", 2, 256),
                      "feature_fraction": trial.suggest_float("feature_fraction_4", 0.4, 1.0),
                      "bagging_fraction": trial.suggest_float("bagging_fraction_4", 0.4, 1.0),
                      "bagging_freq": trial.suggest_int("bagging_freq_4", 1, 7),
                      "min_child_samples": trial.suggest_int("min_child_samples_4", 5, 100),
    }

    param_class5 = {
                      "objective": "binary",
                      "metric": "binary_logloss",
                      "verbosity": -1,
                      "boosting_type": "gbdt",
                      "lambda_l1": trial.suggest_float("lambda_l1_5", 1e-8, 10.0, log=True),
                      "lambda_l2": trial.suggest_float("lambda_l2_5", 1e-8, 10.0, log=True),
                      "num_leaves": trial.suggest_int("num_leaves_5", 2, 256),
                      "feature_fraction": trial.suggest_float("feature_fraction_5", 0.4, 1.0),
                      "bagging_fraction": trial.suggest_float("bagging_fraction_5", 0.4, 1.0),
                      "bagging_freq": trial.suggest_int("bagging_freq_5", 1, 7),
                      "min_child_samples": trial.suggest_int("min_child_samples_5", 5, 100),
    }

    param_class23 = {
                      "objective": "binary",
                      "metric": "binary_logloss",
                      "verbosity": -1,
                      "boosting_type": "gbdt",
                      "lambda_l1": trial.suggest_float("lambda_l1_6", 1e-8, 10.0, log=True),
                      "lambda_l2": trial.suggest_float("lambda_l2_6", 1e-8, 10.0, log=True),
                      "num_leaves": trial.suggest_int("num_leaves_6", 2, 256),
                      "feature_fraction": trial.suggest_float("feature_fraction_6", 0.4, 1.0),
                      "bagging_fraction": trial.suggest_float("bagging_fraction_6", 0.4, 1.0),
                      "bagging_freq": trial.suggest_int("bagging_freq_6", 1, 7),
                      "min_child_samples": trial.suggest_int("min_child_samples_6", 5, 100),
    }

    strategy0 = {
                      0: trial.suggest_int("k_0_0", 100, 40000),
                      1: trial.suggest_int("k_1_0", 100, 20000),
                      2: trial.suggest_int("k_2_0", 100, 120000),
                      3: trial.suggest_int("k_3_0", 100, 70000),
                      4: trial.suggest_int("k_4_0", 100, 15000),
                      5: trial.suggest_int("k_5_0", 100, 8000),


    }

    strategy1 = {
                      0: trial.suggest_int("k_0_1", 100, 40000),
                      1: trial.suggest_int("k_1_1", 100, 30000),
                      2: trial.suggest_int("k_2_1", 100, 12000),
                      3: trial.suggest_int("k_3_1", 100, 70000),
                      4: trial.suggest_int("k_4_1", 100, 15000),
                      5: trial.suggest_int("k_5_1", 100, 8000),

    }

    strategy2 = {
                      0: trial.suggest_int("k_0_2", 100, 40000),
                      1: trial.suggest_int("k_1_2", 100, 70000),
                      2: trial.suggest_int("k_2_2", 100, 120000),
                      3: trial.suggest_int("k_3_2", 100, 70000),
                      4: trial.suggest_int("k_4_2", 100, 15000),
                      5: trial.suggest_int("k_5_2", 100, 8000),

    }

    strategy3 = {
                      0: trial.suggest_int("k_0_3", 100, 40000),
                      1: trial.suggest_int("k_1_3", 100, 70000),
                      2: trial.suggest_int("k_2_3", 100, 120000),
                      3: trial.suggest_int("k_3_3", 100, 70000),
                      4: trial.suggest_int("k_4_3", 100, 18000),
                      5: trial.suggest_int("k_5_3", 100, 8000),

    }

    strategy4 = {
                      0: trial.suggest_int("k_0_4", 100, 40000),
                      1: trial.suggest_int("k_1_4", 100, 70000),
                      2: trial.suggest_int("k_2_4", 100, 70000),
                      3: trial.suggest_int("k_3_4", 100, 70000),
                      4: trial.suggest_int("k_4_4", 100, 15000),
                      5: trial.suggest_int("k_5_4", 100, 15000),

    }

    strategy5 = {
                      0: trial.suggest_int("k_0_5", 100, 40000),
                      1: trial.suggest_int("k_1_5", 100, 70000),
                      2: trial.suggest_int("k_2_5", 100, 120000),
                      3: trial.suggest_int("k_3_5", 100, 70000),
                      4: trial.suggest_int("k_4_5", 100, 70000),
                      5: trial.suggest_int("k_5_5", 100, 15000),
    }

    


        
        
  
    param = {'strategy0' : strategy0, 'strategy1' : strategy1, 'strategy2' : strategy2, 
             'strategy3' : strategy3, 'strategy4' : strategy4, 'strategy5' : strategy5,
             'param_class0' : param_class0, 'param_class1' : param_class1, 'param_class2' : param_class2, 
             'param_class3' : param_class3, 'param_class4' : param_class4, 'param_class5' : param_class5,
             'param_class23' : param_class23}
    Giga = GigaClassifier(param=param).fit(X_train, y_train)
    preds = Giga.predict(X_test)
    score = f1_score(y_test, preds, average='micro')
    return score

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Parameters for best
# 'lambda_l1_0': 2.622499162058228e-06, 'lambda_l2_0': 1.0238839304813224e-07, 'num_leaves_0': 199, 'feature_fraction_0': 0.411873339991237, 'bagging_fraction_0': 0.7445149277661633, 'bagging_freq_0': 6, 'min_child_samples_0': 6, 'lambda_l1_1': 0.025357166676963343, 'lambda_l2_1': 5.943170115391853, 'num_leaves_1': 146, 'feature_fraction_1': 0.8157016350444458, 'bagging_fraction_1': 0.8265957747327676, 'bagging_freq_1': 1, 'min_child_samples_1': 30, 'lambda_l1_2': 1.809784483494622e-05, 'lambda_l2_2': 1.7020746364298178e-07, 'num_leaves_2': 133, 'feature_fraction_2': 0.8695021268037981, 'bagging_fraction_2': 0.8113784925789771, 'bagging_freq_2': 2, 'min_child_samples_2': 11, 'lambda_l1_3': 3.0433742028157333e-07, 'lambda_l2_3': 7.646445515896699e-07, 'num_leaves_3': 178, 'feature_fraction_3': 0.7069056371794968, 'bagging_fraction_3': 0.5362513785909214, 'bagging_freq_3': 4, 'min_child_samples_3': 10, 'lambda_l1_4': 0.00012122778217885948, 'lambda_l2_4': 1.6084994046334085e-08, 'num_leaves_4': 89, 'feature_fraction_4': 0.5863990514141588, 'bagging_fraction_4': 0.8280470534097605, 'bagging_freq_4': 3, 'min_child_samples_4': 21, 'lambda_l1_5': 0.00026352973454501734, 'lambda_l2_5': 0.39486459136812085, 'num_leaves_5': 171, 'feature_fraction_5': 0.7678570870540501, 'bagging_fraction_5': 0.7153548983282392, 'bagging_freq_5': 4, 'min_child_samples_5': 16, 'lambda_l1_6': 1.4725187161289567e-08, 'lambda_l2_6': 0.0002608849372253336, 'num_leaves_6': 171, 'feature_fraction_6': 0.7230555773598342, 'bagging_fraction_6': 0.42025252466404367, 'bagging_freq_6': 7, 'min_child_samples_6': 53, 'k_0_0': 9128, 'k_1_0': 15203, 'k_2_0': 20815, 'k_3_0': 68450, 'k_4_0': 3996, 'k_5_0': 6126, 'k_0_1': 35047, 'k_1_1': 27669, 'k_2_1': 6503, 'k_3_1': 57595, 'k_4_1': 1195, 'k_5_1': 2326, 'k_0_2': 29559, 'k_1_2': 18637, 'k_2_2': 67956, 'k_3_2': 57930, 'k_4_2': 7156, 'k_5_2': 831, 'k_0_3': 26681, 'k_1_3': 38048, 'k_2_3': 101189, 'k_3_3': 59517, 'k_4_3': 848, 'k_5_3': 1967, 'k_0_4': 35086, 'k_1_4': 66236, 'k_2_4': 65599, 'k_3_4': 65815, 'k_4_4': 327, 'k_5_4': 2039, 'k_0_5': 39448, 'k_1_5': 41879, 'k_2_5': 105054, 'k_3_5': 36181, 'k_4_5': 8930, 'k_5_5': 7432
