In [1]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import warnings

warnings.filterwarnings('ignore')

pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 200)

Using TensorFlow backend.


# Base Line Model

In [2]:
def train_lgbm(X, y, plot=False):
    data = pd.DataFrame(y)

    models = []
    scores = []

    kf = StratifiedKFold(n_splits=5, random_state=42)
    for i, (tdx, vdx) in enumerate(kf.split(X, y)):
        print(f'Fold : {i}')
        X_train, X_val, y_train, y_val = X.loc[tdx], X.loc[vdx], y.loc[tdx], y.loc[vdx]
        y_true = y_val

        params = {
            'learning_rate': .05,
            'n_estimators': 2000,
            'num_leaves': 50,
            'min_split_gain': 0,
            'min_child_weight': 1e-3,
            'min_child_samples': 21,
            'subsample': .8,
            'colsample_bytree': .8,

            'n_jobs': -1,
            'random_state': 0
        }
        model = LGBMClassifier().set_params(**params)
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], early_stopping_rounds=50,
                  verbose=False)

        ## plot feature importance
        if plot:
            fscores = pd.Series(model.feature_importances_, X_train.columns).sort_values(ascending=False)
            fscores.plot(kind='bar', title='Feature Importance %d' % i, figsize=(20, 10))
            plt.ylabel('Feature Importance Score')
            plt.show()

        y_pred = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:,1]
        auc = roc_auc_score(y_true, y_pred)
        print("AUC score at %d floder: %f" % (i, auc))
        scores.append(auc)
        models.append(model)
        data.loc[vdx, 'y_pred'] = y_pred

    mean_score = np.mean(scores)
    print("5-floder total mean_score:", mean_score)
    print("----train lgbm finish!----")
    print(roc_auc_score(data['y'], data['y_pred']))

    return data['y_pred']

# Load process data

In [3]:
data = pd.read_csv('process_data/process_data.csv')

# Original Data

In [4]:
X = data.copy()
y = X.pop('y')
print(y.value_counts())
train_lgbm(X,y,plot=False)

0    37903
1     4905
Name: y, dtype: int64
Fold : 0
AUC score at 0 floder: 0.902332
Fold : 1
AUC score at 1 floder: 0.699276
Fold : 2
AUC score at 2 floder: 0.531219
Fold : 3
AUC score at 3 floder: 0.775808
Fold : 4
AUC score at 4 floder: 0.841640
5-floder total mean_score: 0.7500549005530306
----train lgbm finish!----
0.7709561019849934


0        0.054284
1        0.029658
2        0.017008
3        0.026988
4        0.043360
           ...   
42803    0.189308
42804    0.220058
42805    0.206039
42806    0.124980
42807    0.152191
Name: y_pred, Length: 42808, dtype: float64

# Over sampling

In [5]:
def over_sampler(df):
    # col = df.columns.tolist()
    X = df.copy()
    y = X.pop('y')
    print(X.shape, y.shape)
    smo = SMOTE(random_state=42)
    X_smo, y_smo = smo.fit_sample(X, y)
    print(X_smo.shape, y_smo.shape)
    return X_smo, y_smo

In [6]:
X,y = over_sampler(data)
print(y.value_counts())
train_lgbm(X,y,plot=False)

(42808, 64) (42808,)
(75806, 64) (75806,)
1    37903
0    37903
Name: y, dtype: int64
Fold : 0
AUC score at 0 floder: 0.902812
Fold : 1
AUC score at 1 floder: 0.993895
Fold : 2
AUC score at 2 floder: 0.990467
Fold : 3
AUC score at 3 floder: 0.994505
Fold : 4
AUC score at 4 floder: 0.952715
5-floder total mean_score: 0.9668789101713751
----train lgbm finish!----
0.9505115239554506


0        0.177953
1        0.133845
2        0.114435
3        0.131223
4        0.153713
           ...   
75801    0.987307
75802    0.782475
75803    0.920679
75804    0.824312
75805    0.818684
Name: y_pred, Length: 75806, dtype: float64

# under sampling

In [7]:
def under_sampler(df):
    X = df.copy()
    y = X.pop('y')
    print(X.shape, y.shape)
    under = RandomUnderSampler(random_state=42)
    X_under, y_under = under.fit_sample(X, y)
    print(X_under.shape, y_under.shape)
    return X_under, y_under

In [8]:
X,y = under_sampler(data)
print(y.value_counts())
train_lgbm(X,y,plot=False)

(42808, 64) (42808,)
(9810, 64) (9810,)
1    4905
0    4905
Name: y, dtype: int64
Fold : 0
AUC score at 0 floder: 0.883617
Fold : 1
AUC score at 1 floder: 0.842913
Fold : 2
AUC score at 2 floder: 0.808913
Fold : 3
AUC score at 3 floder: 0.863885
Fold : 4
AUC score at 4 floder: 0.931777
5-floder total mean_score: 0.8662209919146765
----train lgbm finish!----
0.8685269872740063


0       0.122363
1       0.066982
2       0.069140
3       0.165093
4       0.574217
          ...   
9805    0.842793
9806    0.370974
9807    0.934544
9808    0.908655
9809    0.938256
Name: y_pred, Length: 9810, dtype: float64