In [None]:
# Fetch the course repository into ./ML-000; the CSV files loaded in a later
# cell are read from ./ML-000/Week09/data/.
!git clone https://github.com/DepInjoy/ML-000.git

Cloning into 'ML-000'...
remote: Enumerating objects: 31, done.[K
remote: Total 31 (delta 0), reused 0 (delta 0), pack-reused 31[K
Unpacking objects: 100% (31/31), done.


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgbm
from sklearn.metrics import accuracy_score, recall_score,precision_score, auc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [None]:
# Read the train and test tables from the cloned repository in one pass.
df_train_data, df_test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./ML-000/Week09/data/train_final.csv",
        "./ML-000/Week09/data/test_final.csv",
    )
)

In [None]:
# List every column of the training table, one name per line.
for column_name in df_train_data.columns.tolist():
    print(column_name)

continuous_annual_inc
continuous_annual_inc_joint
continuous_delinq_2yrs
continuous_dti
continuous_dti_joint
continuous_fico_range_high
continuous_fico_range_low
continuous_funded_amnt
continuous_funded_amnt_inv
continuous_inq_last_6mths
continuous_installment
continuous_int_rate
continuous_last_fico_range_high
continuous_last_fico_range_low
continuous_loan_amnt
loan_status
continuous_mths_since_last_delinq
continuous_mths_since_last_major_derog
continuous_mths_since_last_record
continuous_open_acc
continuous_pub_rec
discrete_addr_state_1_one_hot
discrete_addr_state_2_one_hot
discrete_addr_state_3_one_hot
discrete_addr_state_4_one_hot
discrete_addr_state_5_one_hot
discrete_addr_state_6_one_hot
discrete_addr_state_7_one_hot
discrete_addr_state_8_one_hot
discrete_addr_state_9_one_hot
discrete_addr_state_10_one_hot
discrete_addr_state_11_one_hot
discrete_addr_state_12_one_hot
discrete_addr_state_13_one_hot
discrete_addr_state_14_one_hot
discrete_addr_state_15_one_hot
discrete_addr_state_1

Set up hyperparameter search: 5-fold cross-validated LightGBM training with a fixed parameter set

In [None]:
def train_model(df_train_data, df_test_data, num_boost_round=10000,
                early_stopping_rounds=1000, kfold_splits=5, random_state=218):
  """Train one LightGBM binary classifier per stratified fold and score it.

  The training frame is split into `kfold_splits` stratified folds. For each
  fold a booster is trained on the remaining folds, early-stopped on the
  held-out fold, and then scored (accuracy at a 0.5 threshold) on the full
  `df_test_data` frame.

  Args:
    df_train_data: DataFrame holding the features plus a "loan_status" column.
    df_test_data: DataFrame with the same columns, used to score every fold.
    num_boost_round: upper bound on boosting iterations per fold.
    early_stopping_rounds: stop when the validation metric has not improved
      for this many rounds. The default (1000) is kept for backward
      compatibility; the recorded runs find their best iteration below 100,
      so a much smaller value would shorten training considerably.
    kfold_splits: number of stratified folds.
    random_state: seed used to shuffle rows before folding.

  Returns:
    dict with two lists, one entry per fold:
      "fold_scores": accuracy on `df_test_data`,
      "best_trees":  the booster's best iteration.
  """
  target = "loan_status"
  df_train_y = df_train_data[target]
  df_train_features = df_train_data.drop(target, axis=1)

  df_test_y = df_test_data[target]
  df_test_features = df_test_data.drop(target, axis=1)

  params = {"objective": "binary",
            "boosting_type": "gbdt",
            "learning_rate": 0.1,
            "num_leaves": 15,
            "max_bin": 256,
            "feature_fraction": 0.6,
            "verbosity": 0,
            # NOTE(review): per the LightGBM parameter docs, drop_rate and
            # max_drop are DART options and look unused with
            # boosting_type="gbdt" -- confirm, then remove or switch booster.
            "drop_rate": 0.1,
            "is_unbalance": False,
            "max_drop": 50,
            "min_child_samples": 10,
            "min_child_weight": 150,
            "min_split_gain": 0,
            # NOTE(review): subsample (bagging_fraction) appears to need
            # bagging_freq > 0 to take effect -- confirm against the docs.
            "subsample": 0.9
            }

  kfold = StratifiedKFold(n_splits=kfold_splits, shuffle=True,
                          random_state=random_state)

  best_trees = []
  fold_scores = []

  for i, (train_idx, valid_idx) in enumerate(kfold.split(df_train_features, df_train_y)):
      # split() yields positional indices, so select with .iloc; .loc would
      # only be correct when the frame carries a default RangeIndex.
      X_train = df_train_features.iloc[train_idx]
      X_validate = df_train_features.iloc[valid_idx]
      label_train = df_train_y.iloc[train_idx]
      label_validate = df_train_y.iloc[valid_idx]

      dtrain = lgbm.Dataset(X_train, label_train)
      dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
      bst = lgbm.train(params, dtrain, num_boost_round,
                      valid_sets=[dvalid],
                      verbose_eval=100,
                      early_stopping_rounds=early_stopping_rounds)
      best_trees.append(bst.best_iteration)

      # Every fold model is scored on the same external test frame.
      y_pred = bst.predict(df_test_features, num_iteration=bst.best_iteration)
      score = accuracy_score(df_test_y, np.where(y_pred >= 0.5, 1, 0))

      fold_scores.append(score)
      print("{} of folds, accuracy_score {} ".format(i, score))

  # Previously these lists were built and then discarded; hand them back so
  # runs with different feature sets can be compared programmatically.
  return {"fold_scores": fold_scores, "best_trees": best_trees}

In [None]:
# Baseline run: cross-validated training on the feature columns as loaded.
train_model(df_train_data, df_test_data)

Training until validation scores don't improve for 1000 rounds.
[100]	valid_0's binary_logloss: 0.194828
[200]	valid_0's binary_logloss: 0.19628
[300]	valid_0's binary_logloss: 0.196992
[400]	valid_0's binary_logloss: 0.197854
[500]	valid_0's binary_logloss: 0.198793
[600]	valid_0's binary_logloss: 0.199695
[700]	valid_0's binary_logloss: 0.200452
[800]	valid_0's binary_logloss: 0.201028
[900]	valid_0's binary_logloss: 0.2017
[1000]	valid_0's binary_logloss: 0.202231
Early stopping, best iteration is:
[78]	valid_0's binary_logloss: 0.194449
0 of folds, accuracy_score 0.91814 
Training until validation scores don't improve for 1000 rounds.
[100]	valid_0's binary_logloss: 0.192267
[200]	valid_0's binary_logloss: 0.193085
[300]	valid_0's binary_logloss: 0.194519
[400]	valid_0's binary_logloss: 0.194808
[500]	valid_0's binary_logloss: 0.196197
[600]	valid_0's binary_logloss: 0.196992
[700]	valid_0's binary_logloss: 0.197758
[800]	valid_0's binary_logloss: 0.198642
[900]	valid_0's binary_lo

In [None]:
def _add_cap(frame):
    """Return a copy of `frame` with an extra `cap` column.

    cap = (funded_amnt * int_rate + installment) / annual_inc
    (presumably a repayment-to-income ratio -- confirm the intended meaning).
    """
    amount_due = (
        frame['continuous_funded_amnt'] * frame['continuous_int_rate']
        + frame['continuous_installment']
    )
    return frame.assign(cap=amount_due / frame['continuous_annual_inc'])


# Same derived column for both tables; the original frames stay untouched.
df_train_data2 = _add_cap(df_train_data)
df_test_data2 = _add_cap(df_test_data)

In [None]:
# Second run: same training procedure on the frames that carry the extra
# `cap` column, for comparison with the run on the original columns.
train_model(df_train_data2, df_test_data2)

Training until validation scores don't improve for 1000 rounds.
[100]	valid_0's binary_logloss: 0.194957
[200]	valid_0's binary_logloss: 0.196355
[300]	valid_0's binary_logloss: 0.197229
[400]	valid_0's binary_logloss: 0.198316
[500]	valid_0's binary_logloss: 0.199163
[600]	valid_0's binary_logloss: 0.199722
[700]	valid_0's binary_logloss: 0.200389
[800]	valid_0's binary_logloss: 0.20098
[900]	valid_0's binary_logloss: 0.20157
[1000]	valid_0's binary_logloss: 0.202106
Early stopping, best iteration is:
[72]	valid_0's binary_logloss: 0.19462
0 of folds, accuracy_score 0.91826
Training until validation scores don't improve for 1000 rounds.
[100]	valid_0's binary_logloss: 0.192398
[200]	valid_0's binary_logloss: 0.193144
[300]	valid_0's binary_logloss: 0.193827
[400]	valid_0's binary_logloss: 0.194979
[500]	valid_0's binary_logloss: 0.195937
[600]	valid_0's binary_logloss: 0.196792
[700]	valid_0's binary_logloss: 0.197421
[800]	valid_0's binary_logloss: 0.198438
[900]	valid_0's binary_log