# Import the training data

## Read the training data

In [None]:
import pandas as pd
import numpy as np

# Main table
# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None
# NOTE(review): read_pickle can execute arbitrary code while unpickling;
# only load pickle files that come from a trusted source.
# Work on a 50,000-row random subsample. The draw is seeded (same seed as the
# train/validation split) so that every run compares the balancing techniques
# on the same rows.
train = pd.read_pickle('data/global_train_data.pkl').sample(
    n=50000, random_state=1)

## Split the training and validation sets
For some early model runs, we used as little as 10% of the training data. By the end, we were using 75% for training and holding back 25% for validation. The proportion of the train/validation split is controlled by the `test_size` parameter, which is set in the `train_test_split` call in the code cell below.

In [None]:
from sklearn.model_selection import train_test_split

# Labels as a plain array; they also drive the stratification below.
y = train['TARGET'].values
# Features are every column except the label and the SK_ID_CURR column.
# Hold back 25% for validation, keeping the class proportions equal in both
# splits, with a fixed seed for repeatability.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns=['TARGET', 'SK_ID_CURR']),
    y,
    test_size=0.25,
    stratify=y,
    random_state=1,
)

## Build the model pipeline

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Preprocessing: impute missing values, then scale the features.
pipe = Pipeline([
    # tried median, mean, constant strategies
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# The preprocessing statistics are learned from the training split only and
# then applied unchanged to the validation split.
X_train = pipe.fit_transform(X_train)
X_valid = pipe.transform(X_valid)

print('Shape of X_train:', X_train.shape)
print('Shape of X_valid:', X_valid.shape)
print('Shape of y:', y.shape)

# Balancing data analysis

## Define some functions

In [None]:
import lightgbm as lgb
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix


def get_statistics(_y_valid, _y_pred):
    """Compute confusion-matrix counts and derived rates for binary labels.

    Parameters
    ----------
    _y_valid : iterable
        True class labels (0 or 1).
    _y_pred : iterable
        Predicted class labels (0 or 1).

    Returns
    -------
    tuple
        ``(TN, FP, FN, TP, sensitivity, specifity, precision, accuracy)``.
        A rate whose denominator is zero (for example precision when no
        positive was predicted) is reported as 0.0 instead of NaN.
    """
    def _ratio(numerator, denominator):
        # A zero denominator means the rate is undefined; report 0.0 rather
        # than dividing by zero.
        return numerator / denominator if denominator else 0.0

    # labels=[0, 1] fixes the matrix layout so the four counts can be unpacked
    # in this order.
    TN, FP, FN, TP = confusion_matrix(
        list(_y_valid), list(_y_pred), labels=[0, 1]).ravel()
    # Sensitivity, hit rate, recall, or true positive rate
    sensitivity = _ratio(TP, TP + FN)
    # Specificity or true negative rate
    specifity = _ratio(TN, TN + FP)
    # Precision or positive predictive value
    precision = _ratio(TP, TP + FP)
    # Overall accuracy
    accuracy = _ratio(TP + TN, TP + FP + FN + TN)

    return TN, FP, FN, TP, sensitivity, specifity, precision, accuracy


def eval_error(_y_valid, _y_pred):
    """Score predictions as ``sensitivity * accuracy`` (higher is better).

    The return value is the ``(name, value, is_higher_better)`` triple, which
    is the shape LightGBM expects from a custom ``eval_metric`` callable.

    Parameters
    ----------
    _y_valid : iterable
        True class labels (0 or 1).
    _y_pred : iterable
        Either hard 0/1 predictions or positive-class probabilities.
        Values above 0.5 count as class 1, so hard labels pass through
        unchanged while probabilities (what LightGBM passes to an eval metric
        for the built-in binary objective) become usable by the confusion
        matrix.

    Returns
    -------
    tuple
        ``("error", value, True)``. The metric keeps the name "error" although
        larger values are better.
    """
    hard_pred = (np.asarray(_y_pred) > 0.5).astype(int)
    # Reuse the shared statistics helper instead of recomputing the matrix.
    stats = get_statistics(_y_valid, hard_pred)
    sensitivity, accuracy = stats[4], stats[7]
    value = sensitivity*accuracy
    return "error", value, True


def process_fitting(_name, _pipeline):
    """Fit ``_pipeline`` on the training split and summarise validation results.

    Reads the module-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid``. ``eval_error`` is forwarded as ``eval_metric`` to the
    pipeline step named ``lgbmclassifier``, so the pipeline must contain a
    step with that name.

    Parameters
    ----------
    _name : str
        Label stored in the ``Preprocess`` column of the result.
    _pipeline : estimator
        Pipeline exposing ``fit`` and ``predict``.

    Returns
    -------
    pandas.DataFrame
        One row with the confusion-matrix counts, sensitivity, accuracy and
        the ``eval_error`` score measured on the validation split.
    """
    _pipeline.fit(X_train, y_train,
                  lgbmclassifier__eval_metric=eval_error)
    # Predict once and reuse the result for both the statistics and the score.
    y_pred = _pipeline.predict(X_valid)
    TN, FP, FN, TP, sensitivity, _, _, accuracy = get_statistics(
        y_valid, y_pred)
    score = eval_error(y_valid, y_pred)[1]
    return pd.DataFrame({'Preprocess': [_name],
                         'TN': [TN],
                         'FP': [FP],
                         'FN': [FN],
                         'TP': [TP],
                         'sensitivity': [sensitivity],
                         'accuracy': [accuracy],
                         'score': [score]})

In [None]:
from lightgbm import LGBMClassifier

# Single LightGBM classifier instance shared by every pipeline below.
classifier = lgb.LGBMClassifier(
    # Task and tree shape
    boosting_type='gbdt',
    objective='binary',
    num_class=1,
    max_depth=18,
    num_leaves=30,
    # default learning rate is 0.1 but 0.02 feels like the sweet spot.
    learning_rate=0.02,
    n_estimators=1600,
    n_jobs=-1,
    # Feature binning
    max_bin=512,
    subsample_for_bin=200,
    # Row and column subsampling
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    # bumping up the alpha parameter gave us a little boost
    reg_alpha=80,
    reg_lambda=20,
    # Split / leaf constraints
    min_split_gain=0.5,
    min_child_weight=1,
    min_child_samples=10,
    # about 92% target=0 to 8% target=1 - ratio is about 11.5 to 1
    scale_pos_weight=11.5,
)

## Fit models without balancing data - baseline

In [None]:
# Empty scoreboard; one row is appended per fitted pipeline.
results = pd.DataFrame(
    columns=[
        'Preprocess',
        'TN',
        'FP',
        'FN',
        'TP',
        'sensitivity',
        'accuracy',
        'score',
    ],
)

In [None]:
from imblearn.pipeline import make_pipeline

# Baseline: the classifier on its own, with no resampling step before it.
pipeline = make_pipeline(classifier)

# Train on the training split and evaluate on the validation split.
result = process_fitting('None', pipeline)

# Append the baseline row to the model scoreboard.
results = pd.concat((results, result), ignore_index=True)

## Optimize SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

sampling_strategy_values = [0.2, 0.5]
k_values = [1, 5]

# Try every combination of target minority/majority ratio and neighbour count.
for s in sampling_strategy_values:
    for k in k_values:
        print('sampling_strategy_values:', s, 'k_values:', k)
        # Oversample with SMOTE, then classify. The sampler is seeded so the
        # synthetic samples, and therefore the scores, are repeatable.
        pipeline = make_pipeline(
            SMOTE(sampling_strategy=s, k_neighbors=k, random_state=1),
            classifier
        )

        # Fit and evaluate with the shared process_fitting helper.
        result = process_fitting(f'SMOTE - knn:{k}- sampling:{s}', pipeline)

        # update model scoreboard
        results = pd.concat([results, result], ignore_index=True)

## Optimize BorderlineSMOTE

In [None]:
from imblearn.over_sampling import BorderlineSMOTE

sampling_strategy_values = [0.2, 0.5]
k_values = [1, 5]

# Try every combination of target minority/majority ratio and neighbour count.
for s in sampling_strategy_values:
    for k in k_values:
        print('sampling_strategy_values:', s, 'k_values:', k)
        # Oversample with BorderlineSMOTE, then classify. The sampler is
        # seeded so the synthetic samples, and the scores, are repeatable.
        pipeline = make_pipeline(
            BorderlineSMOTE(k_neighbors=k, sampling_strategy=s, random_state=1),
            classifier
        )

        # Fit and evaluate with the shared process_fitting helper.
        result = process_fitting(
            f'BorderlineSMOTE - knn:{k}- sampling:{s}', pipeline)

        # update model scoreboard
        results = pd.concat([results, result], ignore_index=True)

## Optimize ADASYN

In [None]:
from imblearn.over_sampling import ADASYN

sampling_strategy_values = [0.2, 0.5]
k_values = [1, 5]

# Try every combination of target minority/majority ratio and neighbour count.
for s in sampling_strategy_values:
    for k in k_values:
        print('sampling_strategy_values:', s, 'k_values:', k)
        # Oversample with ADASYN, then classify. The sampler is seeded so the
        # synthetic samples, and therefore the scores, are repeatable.
        pipeline = make_pipeline(
            ADASYN(n_neighbors=k, sampling_strategy=s, random_state=1),
            classifier
        )

        # Fit and evaluate with the shared process_fitting helper.
        result = process_fitting(f'ADASYN - knn:{k}- sampling:{s}', pipeline)

        # update model scoreboard
        results = pd.concat([results, result], ignore_index=True)

## Optimize SMOTE with RandomUndersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

sampling_strategy_values = [0.2, 0.3, 0.5, 1]
# Minority/majority ratio that the SMOTE step produces before undersampling.
smote_ratio = 0.5

for s in sampling_strategy_values:
    print('sampling_strategy_values:', s)
    if s < smote_ratio:
        # Undersampling only removes majority samples, so it cannot lower the
        # minority/majority ratio below what SMOTE already produced;
        # imbalanced-learn rejects such a ratio with a ValueError. Lower
        # smote_ratio to explore these values.
        print('skipped: undersampling ratio', s,
              'is below the SMOTE ratio', smote_ratio)
        continue

    # Oversample with SMOTE, undersample the majority class, then classify.
    # Both samplers are seeded so the scores are repeatable.
    pipeline = make_pipeline(
        SMOTE(sampling_strategy=smote_ratio, k_neighbors=1, random_state=1),
        RandomUnderSampler(sampling_strategy=s, random_state=1),
        classifier
    )

    # Fit and evaluate with the shared process_fitting helper.
    result = process_fitting(f'SMOTE UNDER - undersampling:{s}', pipeline)

    # update model scoreboard
    results = pd.concat([results, result], ignore_index=True)

In [None]:
# Display the scoreboard collected from all the runs above.
results