In [1]:
# Imported Libraries

import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import plot_roc_curve, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression


from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb

from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Training data: the first CSV column is used as the row index.
data_train = pd.read_csv(
    'gen_data.csv',
    index_col=0,
)
data_train.head(n=5)

Unnamed: 0_level_0,ps_ind_01,ps_ind_03,ps_ind_14,ps_ind_15,ps_reg_01,ps_reg_02,ps_car_11,ps_car_12,ps_car_13,ps_car_14,...,ps_car_04_cat_3,ps_car_04_cat_4,ps_car_04_cat_5,ps_car_04_cat_6,ps_car_04_cat_7,ps_car_04_cat_8,ps_car_04_cat_9,ps_car_07_cat_1,ps_car_08_cat_1,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0.285714,0.454545,0.0,0.846154,0.777778,0.111111,0.666667,0.230157,0.227024,0.495899,...,0,0,0,0,0,0,0,1,0,0
19,0.714286,0.363636,0.0,0.461538,1.0,1.0,0.666667,0.271085,0.225363,0.563076,...,0,0,0,0,0,0,0,1,1,0
20,0.285714,0.272727,0.0,0.615385,0.666667,0.055556,0.666667,0.155592,0.139523,0.49205,...,0,0,0,0,0,0,0,1,1,0
26,0.714286,0.272727,0.0,0.461538,1.0,0.388889,1.0,0.230157,0.190073,0.565409,...,0,0,0,0,0,0,0,1,1,0
35,0.285714,0.272727,0.0,0.692308,1.0,0.055556,1.0,0.207162,0.252418,0.512093,...,0,0,0,0,0,0,0,1,0,0


In [3]:
# Rows to be scored by the model: same loading convention as the training
# file, with the first CSV column taken as the row index.
data_val = pd.read_csv(
    'data_val.csv',
    index_col=0,
)
data_val.head(n=5)

Unnamed: 0_level_0,ps_ind_01,ps_ind_03,ps_ind_14,ps_ind_15,ps_reg_01,ps_reg_02,ps_car_11,ps_car_12,ps_car_13,ps_car_14,...,ps_car_04_cat_2,ps_car_04_cat_3,ps_car_04_cat_4,ps_car_04_cat_5,ps_car_04_cat_6,ps_car_04_cat_7,ps_car_04_cat_8,ps_car_04_cat_9,ps_car_07_cat_1,ps_car_08_cat_1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.727273,0.0,0.923077,0.555556,0.166667,0.333333,0.155592,0.150236,0.460456,...,0,0,0,0,0,0,0,0,1,1
1,0.571429,0.454545,0.0,0.384615,1.0,0.277778,0.333333,0.155592,0.127559,0.472211,...,0,0,0,0,0,0,0,0,1,1
2,0.714286,0.272727,0.0,0.769231,0.444444,0.0,1.0,0.230157,0.231528,0.548452,...,0,0,0,0,0,0,0,0,1,1
3,0.0,0.545455,0.0,0.307692,0.111111,0.111111,0.666667,0.207162,0.14398,0.516085,...,0,0,0,0,0,0,0,0,1,1
4,0.714286,0.636364,0.0,0.307692,1.0,0.222222,1.0,0.207162,0.201647,0.523018,...,0,0,0,0,0,0,0,0,1,1


In [4]:
# Features are every column except the label; the label is the 'target' column.
X_train = data_train.drop(columns='target')
y_train = data_train['target']

# Select the validation features by name, in the training column order.
# Ensemble.fit_predict converts its inputs to bare NumPy arrays, so column
# labels are lost there; a different column order in data_val.csv would
# otherwise be misread silently. A missing column raises KeyError right here
# instead of failing after the first model has already been trained.
X_val = data_val[X_train.columns]

In [5]:
# pca = PCA(n_components=0.95)
# X_train = pd.DataFrame(pca.fit_transform(X_train), index=X_train.index)
# X_val = pd.DataFrame(pca.transform(X_val), index=X_val.index)


In [6]:
class Ensemble:
    """Two-level stacking ensemble built on out-of-fold probabilities.

    Every base model is fitted once per stratified fold. Column 1 of its
    ``predict_proba`` output (the positive class for 0/1 labels) on the
    held-out part of each fold becomes one column of the level-1 training
    matrix; its probabilities for the prediction set are averaged over the
    folds to form the matching level-1 test column. The stacker is then
    fitted on the level-1 training matrix and used to score the level-1
    test matrix.

    Parameters
    ----------
    n_splits : int
        Number of stratified folds used to produce out-of-fold predictions.
    stacker : estimator
        Level-1 scikit-learn compatible classifier; must provide ``fit`` and
        ``predict_proba``.
    base_models : sequence of estimators
        Level-0 scikit-learn compatible classifiers; each must provide
        ``fit`` and ``predict_proba``. They are fitted in place, so after
        ``fit_predict`` each one holds the fit from the last fold.
    random_state : int, default 314
        Seed for the shuffled fold assignment.
    """

    def __init__(self, n_splits, stacker, base_models, random_state=314):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models
        self.random_state = random_state

    def fit_predict(self, X, y, T):
        """Fit the whole stack on ``(X, y)`` and return probabilities for ``T``.

        Parameters
        ----------
        X : array-like, 2-D
            Training features, one row per sample.
        y : array-like
            Training labels, one entry per row of ``X``.
        T : array-like, 2-D
            Features of the rows to score; must have as many columns as ``X``.

        Returns
        -------
        numpy.ndarray
            Column 1 of the stacker's ``predict_proba`` for every row of ``T``.

        Raises
        ------
        ValueError
            If there are no base models, if ``X`` or ``T`` is not 2-D, if
            their column counts differ, or if ``y`` does not have one entry
            per row of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        T = np.asarray(T)

        # Fail fast: without these checks a shape problem only surfaces at the
        # first predict_proba call, after a full fit + inner CV has been paid for.
        if len(self.base_models) == 0:
            raise ValueError("Ensemble needs at least one base model")
        if X.ndim != 2 or T.ndim != 2:
            raise ValueError(
                "X and T must be 2-D, got X.ndim=%d and T.ndim=%d" % (X.ndim, T.ndim)
            )
        if X.shape[1] != T.shape[1]:
            raise ValueError(
                "X has %d features but T has %d" % (X.shape[1], T.shape[1])
            )
        if y.ndim == 0 or y.shape[0] != X.shape[0]:
            raise ValueError(
                "y must have one entry per row of X (%d rows)" % X.shape[0]
            )

        splitter = StratifiedKFold(
            n_splits=self.n_splits, shuffle=True, random_state=self.random_state
        )
        # Materialise the folds once so every base model sees the same splits.
        folds = list(splitter.split(X, y))

        n_models = len(self.base_models)
        S_train = np.zeros((X.shape[0], n_models))  # out-of-fold predictions
        S_test = np.zeros((T.shape[0], n_models))   # fold-averaged predictions for T

        for i, clf in enumerate(self.base_models):
            # The class name is a stable label; parsing repr(clf) is not, because
            # not every estimator prints as "Name(...)".
            model_name = type(clf).__name__
            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_fit = X[train_idx]
                y_fit = y[train_idx]
                X_holdout = X[test_idx]

                print("Base model %d: fit %s model | fold %d" % (i + 1, model_name, j + 1))
                clf.fit(X_fit, y_fit)

                # Diagnostic only: an inner 3-fold CV on the training part of this
                # fold. scikit-learn clones the estimator for this, so the fit
                # above is untouched and is what produces the predictions below.
                cross_score = cross_val_score(clf, X_fit, y_fit, cv=3, scoring='roc_auc')
                mean_auc = cross_score.mean()
                print("cross_score [roc-auc]: %.5f [gini]: %.5f" % (mean_auc, 2 * mean_auc - 1))

                S_train[test_idx, i] = clf.predict_proba(X_holdout)[:, 1]
                S_test_i[:, j] = clf.predict_proba(T)[:, 1]

            # One level-1 test feature per base model: the mean over its fold fits.
            S_test[:, i] = S_test_i.mean(axis=1)

        # The stacker is scored with a fixed cv=3, independent of n_splits.
        results = cross_val_score(self.stacker, S_train, y, cv=3, scoring='roc_auc')
        # Calculate gini factor as 2 * AUC - 1
        print("Stacker score [gini]: %.5f" % (2 * results.mean() - 1))

        self.stacker.fit(S_train, y)
        return self.stacker.predict_proba(S_test)[:, 1]

In [7]:
# Level-0 learners: two gradient-boosting classifiers left at library defaults.
lgb_model = LGBMClassifier()
xgb_model = xgb.XGBClassifier()

# Level-1 learner that combines the base-model probabilities.
log_model = LogisticRegression()

stack = Ensemble(
    base_models=(xgb_model, lgb_model),
    stacker=log_model,
    n_splits=3,
)


In [8]:
# rus = RandomUnderSampler(sampling_strategy = 1, random_state=2021)
# X_resampled, y_resampled = rus.fit_resample(X_train, y_train)


In [12]:
# Fit the stack on the training features/labels and score the validation rows.
preds = stack.fit_predict(X=X_train, y=y_train, T=X_val)


Base model 1: fit XGBClassifier model | fold 1
cross_score [roc-auc]: 0.62174 [gini]: 0.24348
Base model 1: fit XGBClassifier model | fold 2
cross_score [roc-auc]: 0.61327 [gini]: 0.22655
Base model 1: fit XGBClassifier model | fold 3
cross_score [roc-auc]: 0.62168 [gini]: 0.24335
Base model 2: fit LGBMClassifier model | fold 1
cross_score [roc-auc]: 0.60445 [gini]: 0.20891
Base model 2: fit LGBMClassifier model | fold 2
cross_score [roc-auc]: 0.59603 [gini]: 0.19206
Base model 2: fit LGBMClassifier model | fold 3
cross_score [roc-auc]: 0.60610 [gini]: 0.21221
Stacker score [gini]: 0.24050


In [13]:
# Build the submission in one step: each validation id next to its predicted
# probability, then write it out without the positional row index.
submission = pd.DataFrame({'id': data_val.index, 'target': preds})
submission.to_csv('submit.csv', index=False)

In [14]:
# Preview the first five rows of the submission frame.
submission.head(n=5)

Unnamed: 0,id,target
0,0,0.029669
1,1,0.031113
2,2,0.030785
3,3,0.024797
4,4,0.036189


--------