In [50]:
import pandas as pd
import altair as alt
from collections import Counter

In [51]:
df = pd.read_csv('application_data_min.csv')
df.loc[df['CNT_FAM_MEMBERS'].isnull()] = 0
df = df.dropna(subset = ['AMT_ANNUITY'])

X = df[['CNT_FAM_MEMBERS', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'CODE_GENDER', 'NAME_CONTRACT_TYPE', 
       'NAME_EDUCATION_TYPE']]
X = pd.get_dummies(X, drop_first = True)

y = df['TARGET']

In [52]:
X.describe()

Unnamed: 0,CNT_FAM_MEMBERS,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,CODE_GENDER_F,CODE_GENDER_M,CODE_GENDER_XNA,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special
count,307499.0,307499.0,307499.0,307499.0,307499.0,307499.0,307499.0,307499.0,307499.0,307499.0,307499.0,307499.0,307499.0,307499.0
mean,2.152667,168795.1,599023.2,27108.36903,0.65834,0.34164,1.3e-05,0.904783,0.09521,0.000533,0.243438,0.033418,0.012403,0.710201
std,0.910705,237127.2,402495.5,14493.896738,0.474267,0.47426,0.003607,0.293514,0.293505,0.023088,0.429158,0.179726,0.110677,0.45367
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,112500.0,270000.0,16524.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,146812.5,513531.0,24903.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,3.0,202500.0,808650.0,34596.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
max,20.0,117000000.0,4050000.0,258025.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [53]:
y.value_counts() / len(y)

0    0.919268
1    0.080732
Name: TARGET, dtype: float64

In [54]:
y.value_counts()

0    282674
1     24825
Name: TARGET, dtype: int64

In [55]:
y_values = pd.DataFrame({'values':['0', '1'], 'count':[y.value_counts()[0], y.value_counts()[1]]})

In [56]:
alt.Chart(y_values).mark_bar().encode(
    x = 'values',
    y = 'count'
)

Note that we had to hardcode these values because of 

## Option 1: Random Sampling

In [57]:
defaults = df[df['TARGET'] == 1]
nondefaults = df[df['TARGET'] == 0]

In [58]:
num_defaults = int(5000 * 0.080732)
num_nondefaults = 5000 - num_defaults

In [59]:
default_sample = defaults.sample(n = num_defaults)
nondefault_sample = nondefaults.sample(n = num_nondefaults)
sample = pd.concat([default_sample, nondefault_sample])

In [60]:
print(sample['TARGET'].value_counts())

0    4597
1     403
Name: TARGET, dtype: int64


In [67]:
alt.Chart(sample).mark_bar().encode(
    x = 'TARGET:O',
    y = 'count()'
)

In [70]:
brush = alt.selection_interval(encodings=['x'])

base = alt.Chart(sample).mark_bar().encode(
    y='count():Q'
).properties(
    width=600,
    height=100
)

alt.vconcat(
  base.encode(
    alt.X('AMT_INCOME_TOTAL:Q',
      bin=alt.Bin(maxbins=30, extent=brush),
      scale=alt.Scale(domain=brush)
    )
  ),
  base.encode(
    alt.X('AMT_INCOME_TOTAL:Q', bin=alt.Bin(maxbins=100)),
  ).add_selection(brush)
)

In [73]:
alt.Chart(sample).mark_rect().encode(
    alt.X("AMT_CREDIT:Q", bin=True),
    alt.Y("AMT_ANNUITY:Q", bin=True),
    color='count()'
)

In [75]:
alt.Chart(sample).mark_rect().encode(
    alt.X("AMT_CREDIT:Q", bin=True),
    alt.Y("AMT_INCOME_TOTAL:Q", bin=True),
    color='count()'
)

## Option 2: Discretize into Bins

In [62]:
df['FAMILY_BIN'] = pd.cut(df['CNT_FAM_MEMBERS'], bins = [0, 5, 10, 15, 20])
df['FAMILY_BIN'].value_counts()

(0, 5]      306968
(5, 10]        518
(10, 15]         7
(15, 20]         4
Name: FAMILY_BIN, dtype: int64

In [63]:
# Splits into 10 bins
df['INCOME_BIN'] = pd.qcut(df['AMT_INCOME_TOTAL'], q=10)
df['INCOME_BIN'].value_counts().reindex(df['INCOME_BIN'].cat.categories)

(-0.001, 81000.0]          33393
(81000.0, 99000.0]         30278
(99000.0, 112500.0]        36907
(112500.0, 135000.0]       48849
(135000.0, 146812.5]        4324
(146812.5, 162000.0]       31126
(162000.0, 180000.0]       30702
(180000.0, 225000.0]       44805
(225000.0, 270000.0]       19957
(270000.0, 117000000.0]    27158
Name: INCOME_BIN, dtype: int64

Note: not really sure how to plot these, altair doesn't seem to like the fact that they're intervals