In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

from pathlib import Path

import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.inspection import permutation_importance

from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [2]:
data_df = pd.read_csv("smaller_data_sets/master_df.csv",index_col=[0])
data_df.dropna(inplace=True)
data_df.drop(columns=["SK_ID_CURR","SK_ID_PREV"],inplace=True)

In [3]:
list(data_df.columns)

['TARGET',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'NAME_HOUSING_TYPE',
 'DAYS_EMPLOYED',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'MONTHS_BALANCE',
 'AMT_BALANCE',
 'AMT_CREDIT_LIMIT_ACTUAL',
 'AMT_DRAWINGS_ATM_CURRENT',
 'AMT_DRAWINGS_CURRENT',
 'AMT_DRAWINGS_OTHER_CURRENT',
 'AMT_DRAWINGS_POS_CURRENT',
 'AMT_INST_MIN_REGULARITY',
 'AMT_PAYMENT_CURRENT',
 'AMT_PAYMENT_TOTAL_CURRENT',
 'AMT_RECEIVABLE_PRINCIPAL',
 'AMT_RECIVABLE',
 'AMT_TOTAL_RECEIVABLE',
 'CNT_DRAWINGS_ATM_CURRENT',
 'CNT_DRAWINGS_CURRENT',
 'CNT_DRAWINGS_OTHER_CURRENT',
 'CNT_DRAWINGS_POS_CURRENT',
 'CNT_INSTALMENT_MATURE_CUM',
 'NAME_CONTRACT_STATUS',
 'SK_DPD_CC',
 'SK_DPD_DEF_CC',
 'SK_DPD',
 'SK_DPD_DEF']

In [4]:
# creating a list of categorical vars / setting up OneHotEncoder

# Creating list
categorical_vars = list(data_df.dtypes[data_df.dtypes == 'object'].index)
# print(categorical_vars)



# OneHotEncoder Instance
enc = OneHotEncoder(sparse = False)

# Encoding Data
encoded_data = enc.fit_transform(data_df[categorical_vars])

In [5]:
data_df.shape

(192655, 40)

In [6]:
# Creating Encoded DataFrame

encoded_df = pd.DataFrame(
    encoded_data,
    columns = enc.get_feature_names(categorical_vars)
)

# Reviewing DF
encoded_df.shape

# adding ID column for eas of use

encoded_df.index = data_df.index

In [7]:
encoded_df

Unnamed: 0,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Signed
100002,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
100007,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
100008,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
100009,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
100010,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
456244,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
456246,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
456247,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
456249,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [8]:
scaled_df = data_df.drop(columns = categorical_vars).drop(columns=["TARGET"])
list(scaled_df.columns)

['AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'DAYS_EMPLOYED',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'MONTHS_BALANCE',
 'AMT_BALANCE',
 'AMT_CREDIT_LIMIT_ACTUAL',
 'AMT_DRAWINGS_ATM_CURRENT',
 'AMT_DRAWINGS_CURRENT',
 'AMT_DRAWINGS_OTHER_CURRENT',
 'AMT_DRAWINGS_POS_CURRENT',
 'AMT_INST_MIN_REGULARITY',
 'AMT_PAYMENT_CURRENT',
 'AMT_PAYMENT_TOTAL_CURRENT',
 'AMT_RECEIVABLE_PRINCIPAL',
 'AMT_RECIVABLE',
 'AMT_TOTAL_RECEIVABLE',
 'CNT_DRAWINGS_ATM_CURRENT',
 'CNT_DRAWINGS_CURRENT',
 'CNT_DRAWINGS_OTHER_CURRENT',
 'CNT_DRAWINGS_POS_CURRENT',
 'CNT_INSTALMENT_MATURE_CUM',
 'SK_DPD_CC',
 'SK_DPD_DEF_CC',
 'SK_DPD',
 'SK_DPD_DEF']

In [9]:
scaler=StandardScaler()
scaled_data = scaler.fit_transform(scaled_df)

In [10]:
scaled_df = pd.DataFrame(scaled_data, index=encoded_df.index,columns=scaled_df.columns)
scaled_df

Unnamed: 0,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,DAYS_EMPLOYED,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_HOUR,...,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,SK_DPD_CC,SK_DPD_DEF_CC,SK_DPD,SK_DPD_DEF
100002,0.114718,-0.487210,-0.172031,-0.516908,-0.455847,0.243973,4.186201,0.253330,5.286047,-0.076662,...,-0.642939,-0.267879,-0.222999,-0.050801,-0.145746,-0.380733,-0.104838,-0.016639,-0.073209,-0.016703
100007,-0.171221,-0.217472,-0.373829,-0.068028,-0.472834,-0.616345,-0.326277,-0.614297,-0.279726,-0.076662,...,-0.642939,-0.267879,-0.222999,-0.050801,-0.145746,2.028839,-0.104838,-0.016639,-0.073209,-0.016703
100008,-0.250649,-0.274523,0.028486,-0.230124,-0.462575,-0.616345,-0.326277,-0.614297,-0.279726,-0.076662,...,-0.640450,-0.267879,-0.222999,-0.050801,-0.145746,1.275848,-0.104838,-0.016639,5.439420,-0.016703
100009,0.003520,2.438584,1.009610,2.375872,-0.473485,-0.186186,-0.326277,-0.180484,-0.279726,-0.076662,...,0.159350,-0.267879,-0.222999,-0.050801,-0.145746,3.183426,-0.104838,-0.016639,-0.073209,-0.016703
100010,0.670712,2.360691,1.064704,2.749938,-0.454517,0.243973,-0.326277,0.253330,-0.279726,-0.076662,...,-0.642939,-0.267879,-0.222999,-0.050801,-0.145746,2.179437,-0.104838,-0.016639,-0.073209,-0.016703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456244,0.321230,1.787289,0.630998,1.665146,-0.489021,-0.616345,-0.326277,-0.614297,-0.279726,-0.076662,...,-0.260845,-0.267879,12.377985,-0.050801,13.775492,-1.133725,-0.104838,-0.016639,-0.071787,-0.010369
456246,-0.266535,-0.947571,-1.174295,-0.866036,-0.513146,-0.616345,-0.326277,-0.614297,-0.279726,-0.076662,...,-0.642939,-0.267879,-0.222999,-0.050801,-0.145746,0.121261,-0.104838,-0.016639,-0.073209,-0.016703
456247,-0.202992,-0.642071,-0.665315,-0.803692,-0.454164,3.255087,6.442440,3.290028,5.286047,-0.076662,...,2.440931,2.779494,2.842105,-0.050801,2.111752,-0.029337,-0.104838,-0.016639,-0.073209,-0.016703
456249,-0.202992,-0.947571,-0.360696,-0.866036,2.132662,-0.616345,-0.326277,-0.614297,-0.279726,-0.076662,...,3.654339,-0.267879,-0.222999,-0.050801,-0.145746,-0.732129,-0.104838,-0.016639,-0.073209,-0.016703


In [11]:
processed_df = pd.concat([encoded_df,scaled_df], join="inner",axis=1)

In [12]:
processed_df.dropna()

Unnamed: 0,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,NAME_CONTRACT_STATUS_Active,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Signed,...,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,SK_DPD_CC,SK_DPD_DEF_CC,SK_DPD,SK_DPD_DEF
100002,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.642939,-0.267879,-0.222999,-0.050801,-0.145746,-0.380733,-0.104838,-0.016639,-0.073209,-0.016703
100007,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.642939,-0.267879,-0.222999,-0.050801,-0.145746,2.028839,-0.104838,-0.016639,-0.073209,-0.016703
100008,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.640450,-0.267879,-0.222999,-0.050801,-0.145746,1.275848,-0.104838,-0.016639,5.439420,-0.016703
100009,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.159350,-0.267879,-0.222999,-0.050801,-0.145746,3.183426,-0.104838,-0.016639,-0.073209,-0.016703
100010,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.642939,-0.267879,-0.222999,-0.050801,-0.145746,2.179437,-0.104838,-0.016639,-0.073209,-0.016703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456244,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.260845,-0.267879,12.377985,-0.050801,13.775492,-1.133725,-0.104838,-0.016639,-0.071787,-0.010369
456246,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.642939,-0.267879,-0.222999,-0.050801,-0.145746,0.121261,-0.104838,-0.016639,-0.073209,-0.016703
456247,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,2.440931,2.779494,2.842105,-0.050801,2.111752,-0.029337,-0.104838,-0.016639,-0.073209,-0.016703
456249,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,3.654339,-0.267879,-0.222999,-0.050801,-0.145746,-0.732129,-0.104838,-0.016639,-0.073209,-0.016703


In [13]:
target_df = pd.DataFrame(data_df["TARGET"],index=data_df.index)
target_df

Unnamed: 0,TARGET
100002,1
100007,0
100008,0
100009,0
100010,0
...,...
456244,0
456246,0
456247,0
456249,0


In [14]:
ready_to_split = pd.concat([processed_df, target_df], join="inner", axis=1)
ready_to_split = ready_to_split.dropna()

In [15]:
X=ready_to_split.drop(columns="TARGET")
y=ready_to_split["TARGET"]

In [16]:
(X_train, X_test, y_train, y_test) = train_test_split(X, y, random_state = 1)

In [45]:
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100),threshold=.04)
sel.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(), threshold=0.04)

In [48]:
sel.get_support()

array([False, False, False, False, False, False, False, False, False,
       False,  True,  True,  True,  True,  True, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False])

In [49]:
sel.threshold_

0.04

In [50]:
selected_feat= X_train.columns[(sel.get_support())]

In [51]:
print(selected_feat)

Index(['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'DAYS_EMPLOYED', 'MONTHS_BALANCE', 'AMT_PAYMENT_CURRENT',
       'CNT_INSTALMENT_MATURE_CUM'],
      dtype='object')


In [52]:
len(selected_feat)

8

In [53]:
point_04 = selected_feat

In [54]:
point_04

Index(['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'DAYS_EMPLOYED', 'MONTHS_BALANCE', 'AMT_PAYMENT_CURRENT',
       'CNT_INSTALMENT_MATURE_CUM'],
      dtype='object')