# Import the training data

## Read the training data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Load the pickled training table and separate the label column from the
# feature columns. 'SK_ID_CURR' is dropped together with the target
# (presumably a row identifier -- TODO confirm against the data dictionary).
train = pd.read_pickle('data/global_train_data.pkl')
y = train['TARGET'].values
X = train.drop(['TARGET', 'SK_ID_CURR'], axis=1)

# Preprocessing: median imputation of missing values, then standardization.
prepro = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Split BEFORE fitting the preprocessing pipeline. Fitting the imputer and
# scaler on the full dataset would let statistics of the held-out rows
# (medians, means, variances) leak into the training features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.25, random_state=1)

# Learn imputation/scaling statistics from the training split only and apply
# the same fitted transformation to the held-out split.
X_train = prepro.fit_transform(X_train)
X_test = prepro.transform(X_test)

# Keep `X` as a transformed matrix (as it was before this change) so any later
# cell that reads `X` still receives preprocessed features; it is now
# transformed with the statistics learned from the training split.
X = prepro.transform(X)

print('Shape of X_train:', X_train.shape)
print('Shape of X_test:', X_test.shape)

# Baseline

In [None]:
# Empty accumulator table; rows describing evaluated methods are appended to
# it further down in the notebook.
results = pd.DataFrame(
    columns=[
        # label of the evaluated method
        'Method',
        # confusion-matrix cells
        'TN', 'FP', 'FN', 'TP',
        # derived metrics
        'sensitivity', 'accuracy', 'score',
    ],
)

In [None]:
from lightgbm import LGBMClassifier

# Hyper-parameters handed unchanged to LGBMClassifier as keyword arguments.
param = dict(
    boosting_type='gbdt',
    objective="binary",
    n_jobs=-1,
    verbosity=1,
    learning_rate=0.02,
    n_estimators=1600,
    subsample_for_bin=50000,
    subsample=0.8,
    subsample_freq=10,
    colsample_bytree=0.8,
    reg_lambda=30,
    reg_alpha=25,
    min_split_gain=0.5,
    min_child_weight=1,
    min_child_samples=10,
    scale_pos_weight=11.5,
    metrics='logloss',
)

classifier = LGBMClassifier(**param)

# Train on the training split; the held-out split is supplied as the
# evaluation set for the fit call.
classifier.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# Predictions for the held-out split, used by the evaluation cells below.
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix

def get_statistics(_y_test, _y_pred):
    """Summarize binary predictions as confusion-matrix counts and rates.

    Every entry of ``_y_pred`` is passed through ``round`` before being
    compared with ``_y_test``; the confusion matrix is computed for the
    labels 0 and 1.

    Returns:
        A tuple ``(TN, FP, FN, TP, sensitivity, specificity, precision,
        accuracy)``. Any rate whose denominator is zero is returned as 0.
    """

    def _safe_ratio(numerator, denominator):
        # Guarded division: a rate with an empty denominator is reported as 0.
        return numerator / denominator if denominator != 0 else 0

    hard_labels = list(map(round, _y_pred))
    matrix = confusion_matrix(list(_y_test), hard_labels, labels=[0, 1])
    TN, FP, FN, TP = matrix.ravel()

    # True positive rate (recall / hit rate).
    sensitivity = _safe_ratio(TP, TP + FN)
    # True negative rate.
    specificity = _safe_ratio(TN, TN + FP)
    # Positive predictive value.
    precision = _safe_ratio(TP, TP + FP)
    # Share of all samples that were classified correctly.
    accuracy = _safe_ratio(TP + TN, TP + FP + FN + TN)

    return (TN, FP, FN, TP, sensitivity, specificity, precision, accuracy)

def eval_error(_y_test, _y_pred):
    """Score predictions as sensitivity multiplied by accuracy.

    Every entry of ``_y_pred`` is passed through ``round`` before the
    confusion matrix (labels 0 and 1) is computed.

    Returns:
        The 3-tuple ``("error", value, True)`` where ``value`` is
        ``sensitivity * accuracy``. The tuple layout looks like a
        (name, value, higher-is-better) custom-metric result -- TODO confirm
        against the intended consumer.
    """
    hard_labels = list(map(round, _y_pred))
    cells = confusion_matrix(list(_y_test), hard_labels, labels=[0, 1]).ravel()
    TN, FP, FN, TP = cells

    # True positive rate; reported as 0 when there are no actual positives.
    actual_positives = TP + FN
    if actual_positives != 0:
        sensitivity = TP / actual_positives
    else:
        sensitivity = 0

    # Overall accuracy; reported as 0 when there are no samples at all.
    sample_count = TP + FP + FN + TN
    if sample_count != 0:
        accuracy = (TP + TN) / sample_count
    else:
        accuracy = 0

    return "error", sensitivity * accuracy, True

# Evaluate the held-out predictions and record them as one row of `results`.
TN, FP, FN, TP, sensitivity, specifity, precision, accuracy = get_statistics(
    y_test, y_pred)
result = pd.DataFrame({'Method': 'test',
                       'TN': [TN],
                       'FP': [FP],
                       'FN': [FN],
                       'TP': [TP],
                       'sensitivity': [sensitivity],
                       'accuracy': [accuracy],
                       # eval_error returns (name, value, flag); keep the value
                       'score': [eval_error(y_test, y_pred)[1]]})

# Leave empty frames out of the concatenation: passing the still-empty
# accumulator to pd.concat raises a FutureWarning on recent pandas releases,
# and in a future release its object-typed columns would influence the
# dtypes of the combined table.
results = pd.concat(
    [frame for frame in (results, result) if not frame.empty],
    ignore_index=True)

# Bare expression so the notebook displays the accumulated table.
results