<a href="https://colab.research.google.com/github/Azimoj/Home-Credit-Default-Risk/blob/main/Model_Optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
plt.xkcd()

import gc
import time
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold

In [3]:
import os
PATH = '/content/drive/MyDrive/p7/data/'
print(os.listdir(PATH))

['POS_CASH_balance.csv', 'application_test.csv', 'application_train.csv', 'bureau_balance.csv', 'credit_card_balance.csv', 'installments_payments.csv', 'previous_application.csv', 'bureau.csv', 'bu_agg_final.csv', 'final_prev_df.csv', 'pos_agg_final.csv', 'ins_agg_final.csv', 'cc_agg_final.csv', 'bb_agg_final.csv', 'NEW_data_data.csv', 'X_scale_train.csv', 'X_train.csv', 'X_scale_test.csv', 'X_test.csv', 'result_XGB.csv', 'df_select_rf.csv', 'df_X_train_rf.csv', 'df_X_test_rf.csv', 'result_RF.csv', 'result_LightGBM.csv', 'X_train_final.csv', 'X_test_final.csv', 'y_test.csv', 'y_train.csv', 'LightGBM_impute_without_F0.csv', 'result_LightGBM_with_Feat0.csv', 'result_LightGBM_with_imput_F0.csv', 'result_LightGBM_with_imput_0F.csv', 'LightGBM_w_impute_without_0F.csv', 'result_LightGBM_wout_impute_w_0F.csv', 'result_LightGBM_wout_impute_0F.csv', 'Df_data.csv']


In [4]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [5]:
def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [6]:
application_train = import_data(PATH+'application_train.csv')
application_test = import_data(PATH+'application_test.csv')

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 59.54 MB
Decreased by 79.2%
Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 9.40 MB
Decreased by 79.1%


In [7]:
data = import_data(PATH+'NEW_data_data.csv')

Memory usage of dataframe is 1501.49 MB
Memory usage after optimization is: 384.46 MB
Decreased by 74.4%


In [8]:
data.head(3)

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,INSTAL_AMT_INSTALMENT_MEAN,INSTAL_AMT_INSTALMENT_SUM,INSTAL_AMT_PAYMENT_MIN,INSTAL_AMT_PAYMENT_MAX,INSTAL_AMT_PAYMENT_MEAN,INSTAL_AMT_PAYMENT_SUM,INSTAL_DAYS_ENTRY_PAYMENT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MEAN,INSTAL_DAYS_ENTRY_PAYMENT_SUM,INSTAL_COUNT
0,0,100002,1.0,0,0,0,0,202500.0,406597.5,24700.5,...,32428.117188,194568.703125,32426.279297,32428.484375,32428.117188,194568.703125,-68.0,-123.3125,-740.0,6.0
1,1,100003,0.0,1,0,1,0,270000.0,1293502.5,35698.5,...,6493.328613,233759.828125,2705.76001,19978.695312,6493.328613,233759.828125,-680.0,-1755.0,-63182.0,36.0
2,2,100004,0.0,0,1,0,0,67500.0,135000.0,6750.0,...,5244.454102,73422.359375,1346.400024,5250.509766,4494.381348,62921.339844,-597.0,-742.0,-10390.0,14.0


In [9]:
data.drop('Unnamed: 0',inplace=True, axis=1)

**Imputation**

In [10]:
# data missing values (in percent)
data_missing = (data.isnull().sum() / len(data)).sort_values(ascending = False)
data_missing.head()

BURO_NEW_FLAG_nan_MEAN_MEAN    0.749487
BURO_MONTHS_BALANCE_MIN_MIN    0.749487
BURO_STATUS_2_MEAN_MEAN        0.749487
BURO_STATUS_1_MEAN_MEAN        0.749487
BURO_STATUS_0_MEAN_MEAN        0.749487
dtype: float64

In [11]:
# Identify missing values above threshold
data_missing = data_missing.index[data_missing > 0.5]

print('There are %d columns with more than 50%% missing values' % len(data_missing))

There are 78 columns with more than 50% missing values


In [12]:
data_clean = data.drop(columns = data_missing)

print('data_clean set full shape: ', data_clean.shape)

data_clean set full shape:  (307505, 561)


In [13]:
data_clean.isnull().sum().sort_values(ascending=False)

CLOSED_AMT_CREDIT_SUM_DEBT_SUM         153541
CLOSED_AMT_CREDIT_SUM_MEAN             153541
CLOSED_DAYS_CREDIT_MIN                 153541
CLOSED_DAYS_CREDIT_MAX                 153541
CLOSED_DAYS_CREDIT_MEAN                153541
                                        ...  
ORGANIZATION_TYPE_Industry: type 13         0
ORGANIZATION_TYPE_Industry: type 12         0
ORGANIZATION_TYPE_Industry: type 11         0
ORGANIZATION_TYPE_Industry: type 10         0
SK_ID_CURR                                  0
Length: 561, dtype: int64

In [14]:
data_clean[data_clean==np.inf]=np.nan

In [15]:
# Imputaion with mod & median
def filling_nan_value(df):
  column_with_nan = df.columns[df.isnull().any()]
  for column in column_with_nan:
    df[column]= df[column].fillna(df[column].median())
  return (df)

In [16]:
data_clean = filling_nan_value(data_clean)

In [17]:
data_clean.isnull().sum().sort_values(ascending=False).head()

SK_ID_CURR                                              0
PREV_NAME_GOODS_CATEGORY_Construction Materials_MEAN    0
PREV_NAME_CLIENT_TYPE_Refreshed_MEAN                    0
PREV_NAME_CLIENT_TYPE_Repeater_MEAN                     0
PREV_NAME_CLIENT_TYPE_nan_MEAN                          0
dtype: int64

In [18]:
gc.enable()
del data 
gc.collect()

94

In [38]:
data_clean.to_csv('/content/drive/MyDrive/p7/data/data_clean.csv')

**Spliting**

In [19]:
X = data_clean.drop('TARGET',axis=1)
y= data_clean['TARGET']

In [20]:
X_train, y_train, X_test, y_test= train_test_split(X, y, test_size=0.3, random_state=0)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X ,y,test_size = 0.3,random_state=0)
print('The shape of X_train is', X_train.shape)
print('The shape of X_test is', X_test.shape)
print('The shape of y_train is', y_train.shape)
print('The shape of y_test is', y_test.shape)

The shape of X_train is (215253, 560)
The shape of X_test is (92252, 560)
The shape of y_train is (215253,)
The shape of y_test is (92252,)


In [22]:
import pickle

model = 'drive/MyDrive/p7/LGBM_wout_imput_wout_0F.sav'
pickle.load(open(model, 'rb'))

LGBMClassifier(boosting_type='goss', class_weight='balanced',
               n_estimators=10000, objective='binary')

In [44]:
import lightgbm as lgb
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from time import time

### **Bayesian Optimization**

Bayesian optimization uses probability to find the minimum of a function. The final aim is to find the input value to a function which can gives us the lowest possible output value.It usually performs better than random,grid and manual search providing better performance in the testing phase and reduced optimization time. In Hyperopt, Bayesian Optimization can be implemented giving 3 three main parameters to the function fmin.

* Objective Function = defines the loss function to minimize.
* Domain Space = defines the range of input values to test (in Bayesian Optimization this space creates a probability distribution for each of the used Hyperparameters).
* Optimization Algorithm = defines the search algorithm to use to select the best input values to use in each new iteration.

https://github.com/krishnaik06/All-Hyperparamter-Optimization/blob/master/Hyper%20Parameter%20Optimization.ipynb

In [23]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

In [35]:
space = {'colsample_bytree': hp.uniform('colsample_bytree', 0.8, 1),
          'learning_rate': hp.uniform('learning_rate', .01, .02), 
          'num_leaves': hp.choice('num_leaves', [33, 34, 35]), 
          'subsample': hp.uniform('subsample', 0.8, 1), 
          'max_depth': hp.choice('max_depth', [7, 8, 9]), 
          'reg_alpha': hp.uniform('reg_alpha', .03, .05), 
          'reg_lambda': hp.uniform('reg_lambda', .06, .08), 
          'min_split_gain': hp.uniform('min_split_gain', .01, .03),
          'min_child_weight': hp.choice('min_child_weight', [38, 39, 40])}

In [25]:
space

{'colsample_bytree': <hyperopt.pyll.base.Apply at 0x7f4aa7d2f510>,
 'learning_rate': <hyperopt.pyll.base.Apply at 0x7f4aa7d2f690>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x7f4aa7d2fb10>,
 'min_child_weight': <hyperopt.pyll.base.Apply at 0x7f4aa72a95d0>,
 'min_split_gain': <hyperopt.pyll.base.Apply at 0x7f4aa8f54310>,
 'num_leaves': <hyperopt.pyll.base.Apply at 0x7f4aa7d2f810>,
 'reg_alpha': <hyperopt.pyll.base.Apply at 0x7f4aa7d2fc90>,
 'reg_lambda': <hyperopt.pyll.base.Apply at 0x7f4aa7d2fe10>,
 'subsample': <hyperopt.pyll.base.Apply at 0x7f4aa7d2f990>}

In [36]:
def objective(space):
    model = LGBMClassifier( colsample_bytree = space['colsample_bytree'], 
                            learning_rate = space['learning_rate'],
                            num_leaves = space['num_leaves'],
                            subsample = space['subsample'],
                            max_depth = space['max_depth'],
                            reg_alpha = space['reg_alpha'],
                            reg_lambda = space['reg_lambda'],
                            min_split_gain = space['min_split_gain'],
                            min_child_weight = space['min_child_weight'], 
                            )
    
    accuracy = cross_val_score(model, X_train, y_train, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [39]:
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|██████████| 80/80 [5:37:30<00:00, 253.13s/it, best loss: -0.9189372506384468]


{'colsample_bytree': 0.897205838848013,
 'learning_rate': 0.01903391441841755,
 'max_depth': 1,
 'min_child_weight': 1,
 'min_split_gain': 0.023464913960387336,
 'num_leaves': 1,
 'reg_alpha': 0.03466317212004515,
 'reg_lambda': 0.06855724195516467,
 'subsample': 0.9904327467948528}

In [41]:
num_leav = {0: 33, 1: 34, 2: 35,}
max_dep = {0: 7, 1: 8, 2: 9}
min_child_weight = {0: 38, 1: 39, 2: 40}


print(num_leav[best['num_leaves']])
print(max_dep[best['max_depth']])
print(min_child_weight[best['min_child_weight']])

34
8
39


In [64]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

trainedLGBM = LGBMClassifier(colsample_bytree = best['colsample_bytree'], 
                            learning_rate = best['learning_rate'],
                            num_leaves = num_leav[best['num_leaves']],
                            subsample = best['subsample'],
                            max_depth = max_dep[best['max_depth']],
                            reg_alpha = best['reg_alpha'],
                            reg_lambda = best['reg_lambda'],
                            min_split_gain = best['min_split_gain'],
                            min_child_weight = min_child_weight[best['min_child_weight']],
                            boosting_type='goss', class_weight='balanced',
                            n_estimators=10000, objective='binary').fit(X_train,y_train)



[[76472  8404]
 [ 4881  2495]] /
0.8559922820101461 /
              precision    recall  f1-score   support

         0.0       0.94      0.90      0.92     84876
         1.0       0.23      0.34      0.27      7376

    accuracy                           0.86     92252
   macro avg       0.58      0.62      0.60     92252
weighted avg       0.88      0.86      0.87     92252



In [None]:
predictionLGBM = trainedLGBM.predict(X_test)
print(confusion_matrix(y_test,predictionLGBM),'/n')
print(accuracy_score(y_test,predictionLGBM),'/n')
print(classification_report(y_test,predictionLGBM))
acc5 = accuracy_score(y_test,predictionLGBM)

In [57]:
predictionLGBM = trainedLGBM.predict_proba(X_test)[:,1]

In [62]:
import sklearn.metrics as me
aa=me.confusion_matrix(y_test,predictionLGBM)

ValueError: ignored

In [68]:
from sklearn.metrics import roc_auc_score
AUROC_LGBM=roc_auc_score(y_test, predictionLGBM)

In [69]:
AUROC_LGBM

0.6196220926963607

In [66]:
AUROC_LGBM

0.7282185676585844

In [56]:
AUROC_LGBM

0.7452782320366504

In [49]:
accuracy_score(y_test,predictionLGBM

SyntaxError: ignored

In [None]:
    loss = [x['result']['loss'] for x in trials.trials]
    
    best_param_values = [x for x in best_param.values()]
    
    if best_param_values[0] == 0:
        boosting_type = 'gbdt'
    else:
        boosting_type= 'dart'
    
    clf_best = lgb.LGBMClassifier(learning_rate=best_param_values[2],
                                  num_leaves=int(best_param_values[5]),
                                  max_depth=int(best_param_values[3]),
                                  n_estimators=int(best_param_values[4]),
                                  boosting_type=boosting_type,
                                  colsample_bytree=best_param_values[1],
                                  reg_lambda=best_param_values[6],
                                 )
                                  
    clf_best.fit(X_train, y_train)
    
    print("")
    print("##### Results")
    print("Score best parameters: ", min(loss)*-1)
    print("Best parameters: ", best_param)
    print("Test Score: ", clf_best.score(X_test, y_test))
    print("Time elapsed: ", time.time() - start)
    print("Parameter combinations evaluated: ", num_eval)