In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
train = pd.read_csv('data/HomeCreditDefault/application_train_cleaned.csv')
df_train = pd.read_csv('data/HomeCreditDefault/application_train.csv')
df_train = df_train[['SK_ID_CURR', 'TARGET', 'REGION_POPULATION_RELATIVE', 'DAYS_EMPLOYED', 
                     'EXT_SOURCE_2', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 
                     'AMT_GOODS_PRICE']].copy()
df_train = df_train.dropna()
print("\napp_train")
print(df_train.shape)


app_train
(306562, 9)


In [3]:
test = pd.read_csv('data/HomeCreditDefault/application_test.csv')


app_test = test[['SK_ID_CURR', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 
           'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 
           'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 
           'OWN_CAR_AGE']].copy()


In [4]:
own_car = pd.get_dummies(train['FLAG_OWN_CAR'])
own_realty = pd.get_dummies(train['FLAG_OWN_REALTY'])
app_test['OWN_CAR_AND_REALTY'] = own_car['Y'] * own_realty['Y']
app_test['GENDER'] = pd.get_dummies(app_test['CODE_GENDER'])['M']
app_test['CONTRACT_TYPE'] = pd.get_dummies(app_test['NAME_CONTRACT_TYPE'])['Cash loans']

app_test = app_test.drop(['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CODE_GENDER', 'OWN_CAR_AGE', 'NAME_CONTRACT_TYPE'], axis=1)
app_test = app_test.dropna()
print(app_test.shape)

(48720, 11)


OWN_CAR_AND_REALTY: 1 for yes, 0 for no  
GENDER: 1 for Male, 0 for Female  
CONTRACT_TYPE: 1 for Cash Loan, 0 for Revolving loan

In [5]:
own_car = pd.get_dummies(train['FLAG_OWN_CAR'])
own_realty = pd.get_dummies(train['FLAG_OWN_REALTY'])
train['OWN_CAR_AND_REALTY'] = own_car['Y'] * own_realty['Y']
train['GENDER'] = pd.get_dummies(train['CODE_GENDER'])['M']
train['CONTRACT_TYPE'] = pd.get_dummies(train['NAME_CONTRACT_TYPE'])['Cash loans']

train = train.drop(['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CODE_GENDER', 'OWN_CAR_AGE', 'NAME_CONTRACT_TYPE'], axis=1)
train = train.dropna()
target = train['TARGET']
train = train.drop(['TARGET'], axis=1)
print(np.shape(train))

(307221, 11)


In [6]:
X = train[['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
          'OWN_CAR_AND_REALTY', 'GENDER', 'CONTRACT_TYPE']]
y = target

# Logistic Regression

In [7]:
y_log = df_train['TARGET']
X_log = df_train.drop(['TARGET', 'SK_ID_CURR'], axis=1)


seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X_log, y_log, test_size=test_size, random_state=seed)


clf = LogisticRegression(random_state=0).fit(X, y)
clf.predict(X)



array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [8]:
clf.predict_proba(X)

array([[0.92708774, 0.07291226],
       [0.97777975, 0.02222025],
       [0.70441667, 0.29558333],
       ...,
       [0.91792972, 0.08207028],
       [0.89427548, 0.10572452],
       [0.96819319, 0.03180681]])

In [9]:
score = clf.score(X, y)
print("Accuracy: {}".format(np.round(score * 100.0, 2)) + '%')

Accuracy: 91.93%


# XGBoost

In [10]:
df_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,REGION_POPULATION_RELATIVE,DAYS_EMPLOYED,EXT_SOURCE_2,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE
0,100002,1,0.018801,-637,0.262949,202500.0,406597.5,24700.5,351000.0
1,100003,0,0.003541,-1188,0.622246,270000.0,1293502.5,35698.5,1129500.0
2,100004,0,0.010032,-225,0.555912,67500.0,135000.0,6750.0,135000.0
3,100006,0,0.008019,-3039,0.650442,135000.0,312682.5,29686.5,297000.0
4,100007,0,0.028663,-3038,0.322738,121500.0,513000.0,21865.5,513000.0


XGBoost model only uses REGION_POPULATION_RELATIVE, DAYS_EMPLOYED, EXT_SOURCE_2, AMT_INCOME_TOTAL, AMT_CREDIT, AMT_ANNUITY, and AMT_GOODS_PRICE. 

In [11]:
y_xgboost = df_train['TARGET']
X_xgboost = df_train.drop(['TARGET', 'SK_ID_CURR'], axis=1)

In [12]:
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X_xgboost, y_xgboost, test_size=test_size, random_state=seed)
# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [value for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: {}".format(np.round(accuracy * 100.0, 2)) + '%')

Accuracy: 92.06%


In [13]:
print(sum(predictions))

120


In [14]:
sum(y_test)


8014