# Feature Importance

In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# Modeling 
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Load the input table from the working directory.
# NOTE(review): the later dataset.head() output shows a leading "Unnamed0" column, which
# suggests the CSV was saved together with its row index — confirm.
dataset = pd.read_csv('chapter2.csv')

In [4]:
# Separate the label column (y) from the predictor columns (X)
y = dataset['TARGET']
X = dataset.drop(columns=['TARGET'])

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the shape of every part of the split
split_shapes = [
    ('X_train data shape: ', X_train.shape),
    ('y_train data shape: ', y_train.shape),
    ('X_test data shape: ', X_test.shape),
    ('y_test data shape: ', y_test.shape),
]
for label, shape in split_shapes:
    print(label, shape)

X_train data shape:  (246005, 178)
y_train data shape:  (246005,)
X_test data shape:  (61502, 178)
y_test data shape:  (61502,)


In [5]:
# Zero-initialised NumPy array (not a DataFrame) with one slot per feature column;
# the training loop adds each fitted model's feature importances into it.
feature_imp = np.zeros(X_train.shape[1])

In [6]:
# LightGBM classifier with a binary objective, boosting_type='goss' and balanced class weights.
# n_estimators=10000 acts as an upper bound: the fit call further down uses early stopping.
model = lgb.LGBMClassifier(objective='binary', boosting_type='goss', n_estimators=10000, class_weight='balanced')

In [7]:
# Import the regular-expression module
import re

# Strip every character that is not a letter, digit, or underscore from the feature names
cleaned_names = [re.sub('[^A-Za-z0-9_]+', '', col) for col in X_train.columns]

# Make the cleaned names unique. This has to be done by position: a {old: new} rename
# dict collapses duplicate keys into a single entry, so every copy of a duplicated name
# would receive the same new name and the duplicate would survive. The first occurrence
# keeps its name; later occurrences get their column position appended.
seen_names = set()
unique_names = []
for i, name in enumerate(cleaned_names):
    candidate = f'{name}_{i}' if name in seen_names else name
    seen_names.add(candidate)
    unique_names.append(candidate)

# X_train and X_test come from the same split of X, so they share column order
# and the same positional names apply to both
X_train = X_train.set_axis(unique_names, axis=1)
X_test = X_test.set_axis(unique_names, axis=1)

In [8]:
# Fit the model on two different random train/validation splits and sum the feature importances.
# The accumulator is reset here so that re-running this cell does not double-count earlier runs.
feature_imp = np.zeros(X_train.shape[1])
for i in range(2):
    train_x1, train_x2, train_y1, train_y2 = train_test_split(X_train, y_train, test_size = 0.25, random_state = i)
    # Early stopping and periodic logging are passed as callbacks (available since LightGBM 3.3):
    # the `early_stopping_rounds` and `verbose` keyword arguments of fit() were removed in
    # LightGBM 4.0, where passing them raises a TypeError.
    model.fit(
        train_x1, train_y1,
        eval_set = [(train_x2, train_y2)],
        eval_metric = 'auc',
        callbacks = [lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=200)],
    )
    feature_imp += model.feature_importances_

[200]	valid_0's auc: 0.772702	valid_0's binary_logloss: 0.509231
[200]	valid_0's auc: 0.774067	valid_0's binary_logloss: 0.51332


In [9]:
# Average the summed importances over the two runs, then list the most important features
mean_importance = feature_imp / 2
feature_imp = pd.DataFrame({
    'feature': X_train.columns.tolist(),
    'importance': mean_importance,
})
feature_imp = feature_imp.sort_values('importance', ascending = False)
feature_imp.head(10)

Unnamed: 0,feature,importance
98,ANNUITY_CREDIT,197.0
14,EXT_SOURCE_1,182.0
15,EXT_SOURCE_2,181.0
16,EXT_SOURCE_3,172.5
167,cash_CNT_INSTALMENT_mean_mean,121.0
11,DAYS_BIRTH,114.0
162,install_DAYS_DIFF_min_mean,110.5
159,install_AMT_INSTALMENT_mean_mean,107.0
2,DAYS_EMPLOYED,95.0
158,install_DAYS_INSTALMENT_mean_mean,94.5


In [10]:
# Collect the names of the features whose averaged importance is exactly 0.0
is_zero = feature_imp['importance'] == 0.0
zero_imp = feature_imp.loc[is_zero, 'feature'].tolist()
print('count of features with 0 importance: ', len(zero_imp))
feature_imp.tail(10)

count of features with 0 importance:  49


Unnamed: 0,feature,importance
86,OCCUPATION_TYPE_Waitersbarmenstaff,0.0
85,OCCUPATION_TYPE_Securitystaff,0.0
82,OCCUPATION_TYPE_Realtyagents,0.0
76,OCCUPATION_TYPE_ITstaff,0.0
74,OCCUPATION_TYPE_HRstaff,0.0
71,OCCUPATION_TYPE_Cookingstaff,0.0
70,OCCUPATION_TYPE_Cleaningstaff,0.0
63,NAME_HOUSING_TYPE_Coopapartment,0.0
122,CREDIT_TYPE_Cashloannonearmarked,0.0
177,NAME_CONTRACT_STATUS_XNA_mean,0.0


In [11]:
zero_imp

['FLAG_DOCUMENT_6',
 'NAME_CONTRACT_STATUS_Amortizeddebt_mean',
 'NAME_CONTRACT_STATUS_Refused_mean',
 'FLAG_DOCUMENT_2',
 'NAME_TYPE_SUITE_Other_B',
 'NAME_TYPE_SUITE_Other_A',
 'NAME_TYPE_SUITE_Groupofpeople',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_21',
 'FLAG_DOCUMENT_20',
 'NAME_CONTRACT_STATUS_Demand_mean_y',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_14',
 'NAME_CONTRACT_STATUS_Approved_mean_x',
 'FLAG_DOCUMENT_12',
 'NAME_CONTRACT_STATUS_Canceled_mean',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_7',
 'NAME_INCOME_TYPE_Businessman',
 'CREDIT_TYPE_Anothertypeofloan',
 'NAME_INCOME_TYPE_Student',
 'NAME_INCOME_TYPE_Unemployed',
 'CREDIT_CURRENCY_currency4',
 'CREDIT_TYPE_Interbankcredit',
 'CREDIT_TYPE_Loanforbusinessdevelopment',
 'CREDIT_TYPE_Loanforpurchaseofsharesmarginlending',
 'CREDIT_TYPE_Loanforthepurchaseofequipment',
 'CREDIT_CURRENCY_currency2',
 'CREDIT_TYPE_Mobileoperatorloan',
 'CREDIT_ACTIVE_Baddebt',
 'CREDIT_TYPE_Reale

zero_imp adalah daftar fitur dengan importance 0.0 yang akan dihapus untuk mempercepat proses pemodelan.

In [12]:
# Strip every character that is not a letter, digit, or underscore from the column names
cleaned_names = [re.sub('[^A-Za-z0-9_]+', '', col) for col in dataset.columns]

# Make the cleaned names unique. This has to be done by position: a {old: new} rename
# dict collapses duplicate keys into a single entry, so every copy of a duplicated name
# would receive the same new name and the duplicate would survive. The first occurrence
# keeps its name; later occurrences get their column position appended.
seen_names = set()
unique_names = []
for i, name in enumerate(cleaned_names):
    candidate = f'{name}_{i}' if name in seen_names else name
    seen_names.add(candidate)
    unique_names.append(candidate)

# set_axis returns a new frame, so the original `dataset` keeps its raw column names
dataset1 = dataset.set_axis(unique_names, axis=1)

In [13]:
# Keep SK_ID_CURR out of the drop list. Filtering (instead of list.remove) does not raise
# ValueError when the name is absent, so this cell can be re-run safely.
zero_imp = [name for name in zero_imp if name != "SK_ID_CURR"]

In [14]:
# Remove the zero-importance features from the full table and from both splits
dataset1 = dataset1.drop(zero_imp, axis = 'columns')
x_train = X_train.drop(zero_imp, axis = 'columns')
x_test = X_test.drop(zero_imp, axis = 'columns')

In [15]:
# The value printed is the shape of dataset1 (the full reduced table), so label it as such;
# it was previously labelled as x_train.
print('dataset1 data shape: ', dataset1.shape)

x_train data shape:  (307507, 131)


# Handle Missing Values

In [16]:
# Rebind `dataset` to the reduced table; from here on both names refer to the same DataFrame
# object and the originally loaded table is no longer reachable through `dataset`.
dataset= dataset1

In [17]:
# Remove special characters from the column names
# NOTE(review): dataset1 was already produced with this same regex, so this looks like a
# no-op on the names — confirm whether the cell is still needed.
dataset = dataset.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))

In [18]:
dataset.head()

Unnamed: 0,Unnamed0,SK_ID_CURR,DAYS_EMPLOYED,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,install_AMT_DIFF_max_mean,install_AMT_DIFF_min_mean,cash_MONTHS_BALANCE_count_mean,cash_CNT_INSTALMENT_mean_mean,cash_SK_DPD_mean_mean,cash_SK_DPD_DEF_mean_mean,NAME_CONTRACT_STATUS_Approved_mean_y,NAME_CONTRACT_STATUS_Completed_mean_y,NAME_CONTRACT_STATUS_Returnedtothestore_mean,NAME_CONTRACT_STATUS_Signed_mean
0,0,100002,-637,1,0,1,0,202500.0,406597.5,24700.5,...,0.0,0.0,19.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,100003,-1188,0,0,0,0,270000.0,1293502.5,35698.5,...,0.0,0.0,9.714286,10.107143,0.0,0.0,0.0,0.571429,0.0,0.0
2,2,100004,-225,1,1,1,0,67500.0,135000.0,6750.0,...,0.0,0.0,4.0,3.75,0.0,0.0,0.0,1.0,0.0,0.0
3,3,100006,-3039,0,0,1,0,135000.0,312682.5,29686.5,...,0.0,0.0,8.238095,11.904762,0.0,0.0,0.0,0.52381,0.380952,0.0
4,4,100007,-3038,1,0,1,0,121500.0,513000.0,21865.5,...,5225.751136,0.0,13.727273,15.333333,0.0,0.0,0.0,0.666667,0.0,0.19697


In [19]:
# Drop the leftover index column by name instead of by position and without inplace=True:
# a positional in-place drop removes one more column every time the cell is re-run
# (the next one would be SK_ID_CURR). errors='ignore' makes a re-run a no-op.
dataset = dataset.drop(columns=['Unnamed0'], errors='ignore')

In [20]:
dataset.head()

Unnamed: 0,SK_ID_CURR,DAYS_EMPLOYED,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,...,install_AMT_DIFF_max_mean,install_AMT_DIFF_min_mean,cash_MONTHS_BALANCE_count_mean,cash_CNT_INSTALMENT_mean_mean,cash_SK_DPD_mean_mean,cash_SK_DPD_DEF_mean_mean,NAME_CONTRACT_STATUS_Approved_mean_y,NAME_CONTRACT_STATUS_Completed_mean_y,NAME_CONTRACT_STATUS_Returnedtothestore_mean,NAME_CONTRACT_STATUS_Signed_mean
0,100002,-637,1,0,1,0,202500.0,406597.5,24700.5,0.018801,...,0.0,0.0,19.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100003,-1188,0,0,0,0,270000.0,1293502.5,35698.5,0.003541,...,0.0,0.0,9.714286,10.107143,0.0,0.0,0.0,0.571429,0.0,0.0
2,100004,-225,1,1,1,0,67500.0,135000.0,6750.0,0.010032,...,0.0,0.0,4.0,3.75,0.0,0.0,0.0,1.0,0.0,0.0
3,100006,-3039,0,0,1,0,135000.0,312682.5,29686.5,0.008019,...,0.0,0.0,8.238095,11.904762,0.0,0.0,0.0,0.52381,0.380952,0.0
4,100007,-3038,1,0,1,0,121500.0,513000.0,21865.5,0.028663,...,5225.751136,0.0,13.727273,15.333333,0.0,0.0,0.0,0.666667,0.0,0.19697


In [21]:
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints the total number of columns in ``df`` and how many of them contain
    missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, indexed by the
        column name, with the columns ``'Missing Values'`` (count) and
        ``'% of Total Values'`` (share of rows, rounded to one decimal), sorted
        by the percentage in descending order.
    """
    # Count the nulls once and reuse the result for both output columns
    mis_val = df.isnull().sum()

    # Percentage of missing values. An empty frame would give 0/0 = NaN, and NaN
    # passes the `!= 0` filter below, so report 0% for every column instead.
    n_rows = len(df)
    if n_rows:
        mis_val_percent = 100 * mis_val / n_rows
    else:
        mis_val_percent = mis_val.astype(float)

    # Put counts and percentages side by side and give the columns readable names
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only the columns that have missing values, largest share first
    has_missing = mis_val_table_ren_columns['% of Total Values'] != 0
    mis_val_table_ren_columns = (
        mis_val_table_ren_columns[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

In [22]:
# Summarise the missing values of the reduced table; the result is sorted by the share of
# missing rows (largest first), so head(20) shows the 20 most incomplete columns.
missing_values = missing_values_table(dataset)
missing_values.head(20)

Your selected dataframe has 130 columns.
There are 69 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
credit_SK_DPD_DEF_mean_mean,220602,71.7
credit_SK_DPD_mean_mean,220602,71.7
credit_CNT_INSTALMENT_MATURE_CUM_mean_mean,220602,71.7
credit_CNT_DRAWINGS_CURRENT_mean_mean,220602,71.7
credit_AMT_INST_MIN_REGULARITY_mean_mean,220602,71.7
credit_AMT_DRAWINGS_CURRENT_mean_mean,220602,71.7
credit_AMT_CREDIT_LIMIT_ACTUAL_mean_mean,220602,71.7
credit_MONTHS_BALANCE_count_mean,220602,71.7
NAME_CONTRACT_STATUS_Completed_mean_x,220602,71.7
bureau_bal_STATUS_3_mean,215276,70.0


In [23]:
def fillna_df(df):
    """Fill the missing values of ``df`` in place and return it.

    Object-dtype columns are filled with their most frequent value and numeric
    columns with their median.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to fill. Its columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column in df.select_dtypes("object").columns:
        # mode() returns a Series holding the most frequent value(s). Passing that Series to
        # fillna aligns it on the index, so only rows whose label happens to be 0..k-1 could
        # be filled; take the first mode as a scalar instead.
        modes = df[column].mode()
        if not modes.empty:  # an all-missing column has no mode; leave it untouched
            df[column] = df[column].fillna(modes.iloc[0])

    for column in df.select_dtypes("number").columns:
        df[column] = df[column].fillna(df[column].median())
    return df

In [24]:
# fillna_df overwrites the columns of `dataset` in place; the trailing `;` suppresses the
# display of the frame it returns.
fillna_df(dataset);
# Re-run the missing-value summary to confirm that no column has missing values left
missing_values = missing_values_table(dataset)
missing_values.head(20)

Your selected dataframe has 130 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [25]:
# Write the filled table to disk; index=False keeps the row index out of the CSV
dataset.to_csv('dataclear.csv', index= False)