In [126]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
%matplotlib inline

## Upload, Initial Exploration

In [127]:
df = pd.read_csv('application_data_updated.csv')
print(df.shape)
df.head()

(307511, 52)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,2.0,2.0,2.0,-1134.0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,1.0,0.0,-828.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,-815.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,2.0,0.0,-617.0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.0,-1106.0,0.0,0.0,0.0,0.0,0.0,0.0


In [128]:
df.TARGET.mean()
df.TARGET.value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64

In [129]:
df.dtypes

SK_ID_CURR                       int64
TARGET                           int64
NAME_CONTRACT_TYPE              object
CODE_GENDER                     object
FLAG_OWN_CAR                    object
FLAG_OWN_REALTY                 object
CNT_CHILDREN                     int64
AMT_INCOME_TOTAL               float64
AMT_CREDIT                     float64
AMT_ANNUITY                    float64
AMT_GOODS_PRICE                float64
NAME_TYPE_SUITE                 object
NAME_INCOME_TYPE                object
NAME_EDUCATION_TYPE             object
NAME_FAMILY_STATUS              object
NAME_HOUSING_TYPE               object
REGION_POPULATION_RELATIVE     float64
DAYS_BIRTH                       int64
DAYS_EMPLOYED                    int64
DAYS_REGISTRATION              float64
DAYS_ID_PUBLISH                  int64
FLAG_MOBIL                       int64
FLAG_EMP_PHONE                   int64
FLAG_WORK_PHONE                  int64
FLAG_CONT_MOBILE                 int64
FLAG_PHONE               

## Cleaning Null Values

In [130]:
null_columns = df.columns[df.isnull().any()]
df[null_columns].isnull().sum()

AMT_ANNUITY                      12
AMT_GOODS_PRICE                 278
NAME_TYPE_SUITE                1292
OCCUPATION_TYPE               96391
CNT_FAM_MEMBERS                   2
EXT_SOURCE_2                    660
OBS_30_CNT_SOCIAL_CIRCLE       1021
DEF_30_CNT_SOCIAL_CIRCLE       1021
OBS_60_CNT_SOCIAL_CIRCLE       1021
DEF_60_CNT_SOCIAL_CIRCLE       1021
DAYS_LAST_PHONE_CHANGE            1
AMT_REQ_CREDIT_BUREAU_HOUR    41519
AMT_REQ_CREDIT_BUREAU_DAY     41519
AMT_REQ_CREDIT_BUREAU_WEEK    41519
AMT_REQ_CREDIT_BUREAU_MON     41519
AMT_REQ_CREDIT_BUREAU_QRT     41519
AMT_REQ_CREDIT_BUREAU_YEAR    41519
dtype: int64

In [131]:
# NAME_TYPE_SUITE: who was accompanying client when he was appplying for loan
# Change the null values to "Unaccompanied"
# Data type all changes to object here??

In [132]:
# AMT_GOODS_PRICE: For consumer loans it is the price of the goods for which the loan is given
# Make nulls = 0?
df['AMT_GOODS_PRICE'].describe()

count    3.072330e+05
mean     5.383962e+05
std      3.694465e+05
min      4.050000e+04
25%      2.385000e+05
50%      4.500000e+05
75%      6.795000e+05
max      4.050000e+06
Name: AMT_GOODS_PRICE, dtype: float64

In [133]:
# CNT_FAM_MEMBERS
df['CNT_FAM_MEMBERS'].describe()
df.loc[df['CNT_FAM_MEMBERS'].isnull()] = 0.0

In [134]:
# DAYS_LAST_PHONE_CHANGE
df['DAYS_LAST_PHONE_CHANGE'].value_counts()
df.loc[df['DAYS_LAST_PHONE_CHANGE'].isnull()] = 0.0

In [135]:
#CNT_FAM_MEMBERS

In [136]:
# High correlation between 30 and 60 for social circle - just use 60?
social_subset = df.loc[:, ['OBS_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE', \
                           'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE']]
social_subset.corr()

Unnamed: 0,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE
OBS_30_CNT_SOCIAL_CIRCLE,1.0,0.99849,0.32934,0.253501
OBS_60_CNT_SOCIAL_CIRCLE,0.99849,1.0,0.331572,0.255571
DEF_30_CNT_SOCIAL_CIRCLE,0.32934,0.331572,1.0,0.860517
DEF_60_CNT_SOCIAL_CIRCLE,0.253501,0.255571,0.860517,1.0


In [137]:
# Change amt_req_credit_bureau
df['AMT_REQ_CREDIT_BUREAU'] = df['AMT_REQ_CREDIT_BUREAU_HOUR'] + df['AMT_REQ_CREDIT_BUREAU_DAY'] + \
                            df['AMT_REQ_CREDIT_BUREAU_WEEK'] + df['AMT_REQ_CREDIT_BUREAU_MON'] + \
                            df['AMT_REQ_CREDIT_BUREAU_QRT'] + df['AMT_REQ_CREDIT_BUREAU_YEAR']
# Remove old columns
df.drop(['AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', \
         'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR'], axis = 1, inplace = True)
# Value counts are really skewed
# df['AMT_REQ_CREDIT_BUREAU'].value_counts()

In [138]:
# Check to see how many are null
null_columns = df.columns[df.isnull().any()]
df[null_columns].isnull().sum()

AMT_ANNUITY                    12
AMT_GOODS_PRICE               276
NAME_TYPE_SUITE              1290
OCCUPATION_TYPE             96390
EXT_SOURCE_2                  659
OBS_30_CNT_SOCIAL_CIRCLE     1021
DEF_30_CNT_SOCIAL_CIRCLE     1021
OBS_60_CNT_SOCIAL_CIRCLE     1021
DEF_60_CNT_SOCIAL_CIRCLE     1021
AMT_REQ_CREDIT_BUREAU       41517
dtype: int64

In [142]:
# Final step: remove all without ;/ not sure if this is the best idea
# RUN LOGREG ON THIS, SEE IF OCCUPATION TYPE IS EVEN THAT GREAT, IF NOT THEN JUST NOT INCLUDE THE ROW AND USE ALL IMPUTED
df = df.dropna(subset = ['OCCUPATION_TYPE'])
df.shape

(211121, 47)

## EDA

In [None]:
# None really have that strong of a correlation with target...sad
df.corr()

## Target Analysis

In [None]:
# TARGET = our predicted value, 1 = default, 0 = no default
y = df['TARGET']
print(y.mean())
print(y.value_counts())

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()