In [1]:
from __future__ import division
import glob
import os
import pandas as pd
import numpy as np
from scipy.stats import binned_statistic, linregress
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
import seaborn as sns
import statsmodels.api as sm
from IPython.display import clear_output, Image

from s3_connect import s3_connect

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from EDA_plotting_functions import (make_kdeplot, default_rate_binned_barplot, default_rate_categorical_barplot, 
                                   default_rate_by_state)

# Initialise plotly's offline notebook mode before any figure is rendered.
init_notebook_mode(connected=True)

# Local directory passed to the S3 helper as `tmp_localdir` when the data is pulled.
# NOTE(review): Python does not expand '~' on its own -- confirm the helper calls
# os.path.expanduser, otherwise this refers to a literal "~" directory.
tmp_localdir = '~/'

# Allow up to 999 columns when a DataFrame is displayed.
pd.options.display.max_columns = 999

# NOTE(review): %pylab star-imports numpy/matplotlib names into the global
# namespace; `%matplotlib inline` is the narrower alternative if no cell relies
# on those injected names.
%pylab inline
# Wipe this cell's output (presumably to hide the %pylab banner).
clear_output()

In [2]:
def default_rate_by_state(df):
    """Build a plotly choropleth figure of the mean default rate per state.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``addr_state`` column (used as plotly ``'USA-states'``
        locations -- presumably two-letter state codes) and a numeric
        ``default`` column whose per-state mean is plotted.

    Returns
    -------
    dict
        ``{'data': [...], 'layout': {...}}`` suitable for ``iplot``.
    """
    # NOTE(review): this definition shadows the `default_rate_by_state` imported
    # from EDA_plotting_functions at the top of the notebook.

    # Select the column *before* aggregating: averaging every column first is
    # wasted work and raises TypeError on non-numeric columns in pandas >= 2.0.
    state_defaults = df.groupby('addr_state')['default'].mean()

    # Light-to-dark purple ramp, as [position, colour] stops.
    scl = [
        [0.0, 'rgb(242,240,247)'],
        [0.2, 'rgb(218,218,235)'],
        [0.4, 'rgb(188,189,220)'],
        [0.6, 'rgb(158,154,200)'],
        [0.8, 'rgb(117,107,177)'],
        [1.0, 'rgb(84,39,143)'],
    ]

    data = [dict(
        type='choropleth',
        colorscale=scl,
        autocolorscale=False,
        locations=state_defaults.index,
        z=state_defaults,
        locationmode='USA-states',
        # White outlines between states.
        marker=dict(
            line=dict(
                color='rgb(255,255,255)',
                width=2,
            ),
        ),
        colorbar=dict(title="Default Rate"),
    )]

    layout = dict(
        title='Default Rates by State',
        geo=dict(
            scope='usa',
            projection=dict(type='albers usa'),
            showlakes=True,
            lakecolor='rgb(255, 255, 255)',
        ),
        yaxis={'fixedrange': True},
        xaxis={'fixedrange': True},
    )

    return dict(data=data, layout=layout)

# Quick Exploratory Data Analysis

Let's explore the relationship between features and default rates. Although the Lending Club dataset contains hundreds of features for each loan, we chose to include only the most influential features for simplicity's sake.

In [3]:
# Open the S3 connection with credentials taken from environment variables
# (os.environ[...] raises KeyError if either variable is unset).
s3_conn = s3_connect(access=os.environ['AWS_CLOUD_BUCKET_KEY'],
                     secret=os.environ['AWS_CLOUD_BUCKET_SECRET_KEY'],
                     bucketname='ds-cloud-public-shared')

# Pull the raw loan data; `df` is the frame every later cell plots from.
# NOTE(review): the object is a pickle -- unpickling can execute arbitrary code,
# so this should only ever point at a trusted bucket/key.
df = s3_conn.pull_pickle_from_s3(key='demos/loan-risk/data/raw_data.p',tmp_localdir=tmp_localdir)
# Discard whatever the download printed to the cell output.
clear_output()

## Interest Rate
Since Lending Club uses credit scores (among other variables) to determine interest rates, much of the signal related to default risk is already baked into interest rates.

In [4]:
# Plot default rate against `int_rate` using 20 bins; the binning itself is done
# inside the helper imported from EDA_plotting_functions.
fig = default_rate_binned_barplot(df=df, feature='int_rate', bins=20, xlabel='Interest Rate',
                     title='Default rate strongly correlates with interest rate')
# Render via the `py` client (plotly.plotly, imported at the top of the notebook).
py.iplot(fig)

## Debt to Income
Debt to Income (DTI) quantifies how much debt a borrower has as a fraction of their annual income. The higher the DTI, the less likely the borrower will be able to pay back the loan.

In [5]:
# Only loans with DTI below 40 are plotted (presumably to keep extreme values
# from stretching the bins). `.loc` replaces `.ix`, which was deprecated in
# pandas 0.20 and removed in 1.0; for a boolean mask the two are equivalent.
fig = default_rate_binned_barplot(
    df=df.loc[df['dti'] < 40],
    feature='dti',
    bins=30,
    xlabel='Debt to Income Ratio',
    title='Debt to income ratio is highly correlated with default rate',
)
py.iplot(fig)

## Interest Rate and DTI Interaction
Reporting multivariate figures can also be useful in understanding interactions between features. After all, complex events like defaults are rarely explained by a single variable.

Below is a kernel density estimation (KDE) plot that highlights the location of defaults in 2D space. Darker colors illustrate clusters of high default loans, while brighter colors illustrate clusters of low default loans.

Three insights can be derived from this figure:
* As expected, loans with high interest rates and DTI are the riskiest. 
* Having a high DTI is less risky if the interest rate is low, and vice versa.
* Interestingly, having a moderate DTI (10-15%) is **less** risky than a low DTI (0-10%) when interest rates are <15%. 

In [6]:
from scipy.stats import binned_statistic, linregress, gaussian_kde
def kde_scipy(vals1, vals2, x_range, y_range, N):
    """Evaluate a 2-D Gaussian KDE of ``(vals1, vals2)`` on an ``N x N`` grid.

    Parameters
    ----------
    vals1, vals2 : array-like, 1-D, equal length
        Paired observations. ``vals1`` is evaluated along the *y* grid and
        ``vals2`` along the *x* grid (see the ordering of ``positions`` below).
    x_range : (a, b)
        Lower/upper bound of the x grid.
    y_range : (c, d)
        Lower/upper bound of the y grid.
    N : int
        Number of grid points along each axis.

    Returns
    -------
    list
        ``[x, y, Z]`` where ``x`` and ``y`` are the 1-D grid coordinates and
        ``Z`` has shape ``(N, N)`` with ``Z[i, j]`` the density at
        ``(vals1=y[i], vals2=x[j])``.
    """
    # Tuple parameters in the signature (`def f((a, b))`) are Python-2-only
    # syntax (removed by PEP 3113); unpack in the body instead so positional
    # callers passing tuples keep working on both Python 2 and 3.
    a, b = x_range
    c, d = y_range

    x = np.linspace(a, b, N)
    y = np.linspace(c, d, N)
    X, Y = np.meshgrid(x, y)
    # Row order must match `values`: first row pairs with vals1, second with vals2.
    positions = np.vstack([Y.ravel(), X.ravel()])

    values = np.vstack([vals1, vals2])
    kernel = gaussian_kde(values)
    # kernel(...) returns a flat array of N*N densities; fold it back onto the grid.
    Z = np.reshape(kernel(positions), X.shape)

    return [x, y, Z]

def make_kdeplot(sample1, sample2, x_range, y_range, N, colorsc, title, x_label, y_label):
    """Contour plot of the difference between two 2-D kernel density estimates.

    Parameters
    ----------
    sample1, sample2 : (varX, varY)
        Pairs of lists, 1-D numpy arrays or DataFrame columns holding the two
        variables of each sample. The plotted surface is
        ``KDE(sample2) - KDE(sample1)``.
    x_range : (a, b)
        Range of the x axis / KDE grid.
    y_range : (c, d)
        Range of the y axis / KDE grid.
    N : int
        Grid resolution along each axis.
    colorsc
        Accepted for interface compatibility but currently unused: the contour
        always uses plotly's ``'Hot'`` scale with ``reversescale=True``.
    title, x_label, y_label : str
        Figure title and axis titles.

    Returns
    -------
    plotly.graph_objs.Figure
    """
    # NOTE(review): this definition shadows the `make_kdeplot` imported from
    # EDA_plotting_functions at the top of the notebook.

    # Tuple parameters in the signature are Python-2-only syntax (PEP 3113);
    # unpacking here keeps positional callers working on Python 2 and 3.
    varX1, varY1 = sample1
    varX2, varY2 = sample2
    a, b = x_range
    c, d = y_range

    # kde_scipy pairs its first argument with the y grid, hence (varY, varX).
    # Both calls share the same grid, so only the first call's x/y are kept.
    x1, y1, Z1 = kde_scipy(varY1, varX1, (a, b), (c, d), N)
    _, _, Z2 = kde_scipy(varY2, varX2, (a, b), (c, d), N)

    # Plain lists/dicts replace the deprecated go.Data / go.Contours / go.Font /
    # go.XAxis / go.YAxis / go.Margin wrappers.
    data = [
        go.Contour(
            z=Z2 - Z1,
            x=x1,
            y=y1,
            colorscale='Hot',
            reversescale=True,
            opacity=0.9,
            contours=dict(showlines=False),
        )
    ]

    # Shared styling for the two call-out labels.
    # arrowhead must be an integer style index; the previous 0.2 is not a valid
    # value (plotly.js falls back to its default, 1, so rendering is unchanged).
    label_style = dict(
        xref='x',
        yref='y',
        showarrow=True,
        arrowhead=1,
        ax=20,
        ay=30,
        bgcolor='rgb(240,240,240)',
        bordercolor='rgb(0,0,0)',
        font=dict(size=14),
    )

    layout = go.Layout(
        title=title,
        font=dict(family='Georgia, serif', color='#635F5D'),
        showlegend=False,
        autosize=False,
        width=650,
        height=650,
        xaxis=dict(
            range=[a, b],
            showgrid=False,
            nticks=7,
            title=x_label,
        ),
        yaxis=dict(
            range=[c, d],
            showgrid=False,
            nticks=7,
            title=y_label,
        ),
        margin=dict(
            l=40,
            r=40,
            b=85,
            t=100,
        ),
        # Annotation positions are hard-coded in data coordinates.
        annotations=[
            dict(label_style, x=7.9, y=13.43, text='Low Risk 👍'),
            dict(label_style, x=18.2, y=20.7, text='High Risk 👎'),
        ],
    )

    return go.Figure(data=data, layout=layout)

In [7]:
# Number of loans sampled from each class (defaulted / paid) for the KDE.
N = 20000
# Loans with DTI above this value are excluded before sampling.
dti_upper_limit = 40

# Light-to-dark colour stops passed to make_kdeplot as `colorsc`.
cubehelix_cs = [[0.0, '#fcf9f7'],
                [0.16666666666666666, '#edcfc9'],
                [0.3333333333333333, '#daa2ac'],
                [0.5, '#bc7897'],
                [0.6666666666666666, '#925684'],
                [0.8333333333333333, '#5f3868'],
                [1.0, '#2d1e3e']]


# `.loc` replaces `.ix` (deprecated in pandas 0.20, removed in 1.0); with a
# boolean mask the selection is identical. A fixed random_state keeps the
# sample, and therefore the figure, reproducible.
df_default = df.loc[(df['default'] == 1) & (df['dti'] <= dti_upper_limit)].sample(N, random_state=6)
df_paid = df.loc[(df['default'] == 0) & (df['dti'] <= dti_upper_limit)].sample(N, random_state=6)

# Paid loans are passed first and defaults second, so the plotted surface is
# (default density - paid density) on a 100 x 100 grid.
fig = make_kdeplot((df_paid['int_rate'], df_paid['dti']),
                   (df_default['int_rate'], df_default['dti']),
                   (0.1, 25),
                   (0.1, 25),
                   100, cubehelix_cs,
                   x_label='Interest Rate (%)',
                   y_label='Debt to Income Ratio',
                   title='Interaction effect of DTI and Interest Rate on Default Rate')

py.iplot(fig, filename='kde-2D-CSCE')

## State

We can also break down default rates by state. Interestingly, Mississippi and Nebraska have the highest default rates.

In [8]:
# Build the per-state default-rate choropleth from the full frame and render it.
fig = default_rate_by_state(df)
py.iplot(fig)

### <a href="https://demo.datascience.com/project/optimizing-your-investment-strategy/outputs/eda-deep-dive-UG9zdFR5cGU6MTU5" target="_blank">For a more comprehensive deep dive into EDA with Plotly, check out this report.</a>

## Next Step: Build a Model
Now that we've gotten a feel for some of the Lending Club features, let's use these data to build a model that can predict the probability of default.

To do this, we will **launch a notebook container** with the following specifications:
* Tool: Jupyter
* Compute Resources: 8 GB / 2 CPU
* Dependency Collection: Standard

And then open file `build_model.ipynb` to build our model.