In [6]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

In [5]:
# Load the raw loan table from disk.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df = pd.read_csv(filepath_or_buffer="C:\\House_Loan\\loan_data.csv")

In [7]:
# Remove the SK_ID_CURR column (presumably a row identifier, confirm) so it is not used as a feature.
df = df.drop(columns=['SK_ID_CURR'])

In [8]:
# Replace every missing value in the frame with the sentinel -1.
df = df.fillna(value=-1)

In [10]:
# Turn every column that is not already int64/float64 into integer labels.
# The same LabelEncoder object is re-fitted for each column, so only the
# mapping of the last encoded column remains stored on `le` afterwards.
le = preprocessing.LabelEncoder()
columns = df.columns.values
for column in columns:
    if df[column].dtype in (np.int64, np.float64):
        # Already numeric in one of the two accepted dtypes: leave as is.
        continue
    # Values are cast to str first so mixed-type columns encode consistently.
    df[column] = le.fit_transform(df[column].astype(str))

In [11]:
# Names of all predictor columns (everything except TARGET), in frame order.
feature_name = df.drop(['TARGET'], axis=1).columns

# Balance the classes by undersampling: keep every TARGET==1 row and draw the
# same number of TARGET==0 rows without replacement.
# A fixed random_state makes the draw repeatable, so the balanced set and
# everything derived from it are the same on every run.
df_target_1 = df.loc[df['TARGET'] == 1].reset_index(drop=True)
df_target_0 = df.loc[df['TARGET'] == 0].sample(n=df_target_1.shape[0], random_state=0)

# Stack both classes into a single frame with a fresh 0..n-1 index.
df_sampled = pd.concat([df_target_1, df_target_0], ignore_index=True)

# Shuffle the rows (again with a fixed seed) so the two classes are interleaved.
df_sampled = df_sampled.sample(frac=1, random_state=0).reset_index(drop=True)

In [12]:
# Split the balanced frame into a feature matrix and a label vector.
X = df_sampled.drop(columns=['TARGET']).to_numpy()
y = df_sampled['TARGET'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit a default XGBoost classifier on the training split; only its
# per-feature importance scores are used from here on.
model = XGBClassifier().fit(X_train, y_train)
importance = model.feature_importances_

In [13]:
# Indices of the 20 largest importance scores. argpartition only guarantees
# that these 20 occupy the last 20 slots; they are not sorted among themselves.
top20 = np.argpartition(importance, -20)[-20:]

# Keep TARGET first, followed by the selected feature columns.
feature_name_selected = ['TARGET'] + [feature_name[i] for i in top20]
print('Top 20 features:')

processed_data = df_sampled[feature_name_selected]
print(processed_data)

Top 20 features:
       TARGET  REG_REGION_NOT_LIVE_REGION  NONLIVINGAPARTMENTS_MODE  \
0           0                           0                   -1.0000   
1           1                           0                   -1.0000   
2           1                           0                   -1.0000   
3           1                           0                   -1.0000   
4           0                           0                   -1.0000   
...       ...                         ...                       ...   
49645       0                           0                    0.0078   
49646       0                           0                   -1.0000   
49647       0                           0                    0.0000   
49648       1                           0                   -1.0000   
49649       1                           0                    0.0039   

       FLAG_DOCUMENT_18  REG_CITY_NOT_LIVE_CITY  NAME_INCOME_TYPE  DAYS_BIRTH  \
0                     0                       0  

In [16]:
# Write the reduced (TARGET + selected features) frame and the full balanced
# frame to CSV, with a header row and without the index column.
for frame, destination in (
    (processed_data, 'C:/House_Loan/train_data.csv'),
    (df_sampled, 'C:/House_Loan/balanced_data.csv'),
):
    frame.to_csv(destination, index=False, header=True)

In [21]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Largest number of top-ranked features tried by the sweep in this cell.
number_of_feature = 97

# CSV file read further down in this cell.
input_file = 'C:/House_Loan/train_data.csv'

# Empty list; NOTE(review): presumably meant to collect results, confirm where it is filled.
results = []

def df_preprocessing(df):
    """Scale every column of ``df`` by its largest absolute value.

    After scaling, each processed column lies in the range [-1, 1].

    Columns whose largest absolute value is 0 (all zeros) or NaN (empty or
    all-missing) are left untouched; dividing them would only turn every
    entry into NaN.

    Args:
        df: DataFrame of numeric columns. It is modified in place.

    Returns:
        The same DataFrame object, with its columns rescaled.
    """
    for i in df:
        scale = df[i].abs().max()
        # `not scale > 0` is true for both 0 and NaN, the two cases where
        # the division below would be meaningless.
        if not scale > 0:
            continue
        df[i] = df[i] / scale
    return df

# Read the CSV at `input_file`, skipping malformed lines.
# NOTE(review): the frame read into `df` is not used by the statements below;
# X and y are still built from the in-memory `df_sampled`. Confirm which data
# source is intended.
df = pd.read_csv(filepath_or_buffer=input_file, on_bad_lines='skip')

# Feature matrix / label vector from the balanced frame.
X = df_sampled.drop(columns=['TARGET']).to_numpy()
y = df_sampled['TARGET'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Default XGBoost classifier, fitted only to obtain per-feature importances.
model = XGBClassifier().fit(X_train, y_train)
importance = model.feature_importances_

# Sweep the number of selected features: for each j, keep the j features with
# the largest importance, retrain a default XGBoost classifier on them and
# print the accuracy on a held-out split.

# Never request more features than there are importance scores:
# np.argpartition raises ValueError when the requested position is out of bounds.
max_feature_count = min(number_of_feature, len(importance))

for j in range(1, max_feature_count + 1):

    # Indices of the j largest importances (unordered within the selection).
    top = np.argpartition(importance, -j)[-j:]
    feature_name_selected = ['TARGET']
    for i in top:
        feature_name_selected.append(feature_name[i])
    processed_data = df_sampled[feature_name_selected]

    # drop() returns a new frame, so the in-place scaling done by
    # df_preprocessing does not touch df_sampled.
    feature = processed_data.drop(['TARGET'], axis=1)
    feature = df_preprocessing(feature)
    X = feature.values
    y = processed_data['TARGET'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

    model = XGBClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # normalize=True reports the fraction (not the count) of correct predictions.
    accuracy = accuracy_score(y_test, y_pred, normalize=True)
    print('no. of features:', j, 'accuracy:', accuracy)

no. of features: 1 accuracy: 0.6078305002819625
no. of features: 2 accuracy: 0.643841134294691
no. of features: 3 accuracy: 0.6471441230967534
no. of features: 4 accuracy: 0.6493192620639652
no. of features: 5 accuracy: 0.6498026262789012
no. of features: 6 accuracy: 0.651494401031177
no. of features: 7 accuracy: 0.6612422460323854
no. of features: 8 accuracy: 0.6625312172722146
no. of features: 9 accuracy: 0.6605977604124708
no. of features: 10 accuracy: 0.6657536453717876
no. of features: 11 accuracy: 0.6622089744622573
no. of features: 12 accuracy: 0.6696205590912753
no. of features: 13 accuracy: 0.6671231773141062
no. of features: 14 accuracy: 0.671554015951019
no. of features: 15 accuracy: 0.6728429871908483
no. of features: 16 accuracy: 0.669781680496254
no. of features: 17 accuracy: 0.672037380165955
no. of features: 18 accuracy: 0.6721179408684443
no. of features: 19 accuracy: 0.671795698058487
no. of features: 20 accuracy: 0.6729235478933376
no. of features: 21 accuracy: 0.677