In [1]:
from __future__ import division
import glob
import os
import pandas as pd
import numpy as np
from scipy.stats import binned_statistic
from IPython.display import clear_output, Image

from s3_connect import s3_connect

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Local directory handed to the S3 helper when the pickled data is pulled.
# NOTE(review): Python does not expand '~' on its own; whether the helper calls
# os.path.expanduser is not visible in this notebook -- confirm.
tmp_localdir = '~/'

# Set up plotly's offline notebook mode.
init_notebook_mode(connected=True)

# Show up to 999 columns when a DataFrame is displayed.
pd.options.display.max_columns = 999

# IPython magic: inline matplotlib plus pylab's star-import into the interactive namespace.
%pylab inline
# Clear whatever the setup statements above printed.
clear_output()

In [3]:
### Colour scale and function for plotting KDE difference heatmap
# Seven evenly spaced [position, hex colour] stops running from 0.0 to 1.0,
# going from a very light tone to a dark purple.
# NOTE(review): nothing in the visible part of the notebook references this
# constant; presumably it is meant to be passed as `colorsc` -- confirm.
cubehelix_cs=[[0.0, '#fcf9f7'],
 [0.16666666666666666, '#edcfc9'],
 [0.3333333333333333, '#daa2ac'],
 [0.5, '#bc7897'],
 [0.6666666666666666, '#925684'],
 [0.8333333333333333, '#5f3868'],
 [1.0, '#2d1e3e']]

def make_kdeplot(xy1, xy2, x_range, y_range, N, colorsc, title, x_label, y_label):
    """Build a contour figure of the difference between two 2-D density estimates.

    Both samples are passed through ``kde_scipy`` with the same ranges and ``N``;
    the contour shows ``Z2 - Z1`` (second sample minus first).

    Parameters
    ----------
    xy1, xy2 : pair ``(varX, varY)``
        Each element is a list, 1-D numpy array or DataFrame column storing the
        values of one variable.
    x_range : pair ``(a, b)``
        Forwarded to ``kde_scipy`` and used as the x-axis range of the figure.
    y_range : pair ``(c, d)``
        Forwarded to ``kde_scipy`` and used as the y-axis range of the figure.
    N
        Forwarded unchanged to ``kde_scipy`` (presumably the grid resolution --
        confirm against ``kde_scipy``).
    colorsc
        NOTE(review): accepted but not used; the contour always uses the
        reversed ``'Hot'`` colour scale. Kept as-is so existing plots do not
        change -- decide whether this argument should be honoured.
    title, x_label, y_label : str
        Figure title and axis titles.

    Returns
    -------
    go.Figure
        A 650x650 figure holding a single contour trace.
    """
    # Tuple parameters in the signature (``def f((a, b)): ...``) were removed
    # from the language by PEP 3113. Unpacking here accepts exactly the same
    # positional arguments and works on both Python 2 and Python 3.
    varX1, varY1 = xy1
    varX2, varY2 = xy2
    a, b = x_range
    c, d = y_range

    # kde_scipy is defined outside this section.
    # NOTE(review): the Y values are passed before the X values -- confirm this
    # matches kde_scipy's parameter order.
    x1, y1, Z1 = kde_scipy(varY1, varX1, (a, b), (c, d), N)
    # Only the density surface of the second sample is needed; the grid of the
    # first sample is used for the axes.
    _, _, Z2 = kde_scipy(varY2, varX2, (a, b), (c, d), N)

    data = go.Data([
        go.Contour(
            z=Z2 - Z1,
            x=x1,
            y=y1,
            colorscale='Hot',
            reversescale=True,
            opacity=0.9,
            contours=go.Contours(
                showlines=False)
        ),
    ])

    layout = go.Layout(
        title=title,
        font=go.Font(family='Georgia, serif', color='#635F5D'),
        showlegend=False,
        autosize=False,
        width=650,
        height=650,
        xaxis=go.XAxis(
            range=[a, b],
            showgrid=False,
            nticks=7,
            title=x_label
        ),
        yaxis=go.YAxis(
            range=[c, d],
            showgrid=False,
            nticks=7,
            title=y_label
        ),
        margin=go.Margin(
            l=40,
            r=40,
            b=85,
            t=100,
        ),
    )

    return go.Figure(data=data, layout=layout)

In [2]:
# Open the S3 connection; credentials are read from environment variables,
# so a missing variable raises KeyError here.
s3_conn = s3_connect(access=os.environ['AWS_CLOUD_BUCKET_KEY'],
                     secret=os.environ['AWS_CLOUD_BUCKET_SECRET_KEY'],
                     bucketname='ds-cloud-public-shared')

# Pull the raw loan data; every later cell in this notebook reads `df`.
# NOTE(review): the helper name suggests the object is unpickled -- loading a
# pickle executes arbitrary code, so only use this with a trusted bucket.
df = s3_conn.pull_pickle_from_s3(key='demos/loan-risk/data/raw_data.p',tmp_localdir=tmp_localdir)
clear_output()

# Exploratory Data Analysis Deep Dive
Let's explore the relationship between the following features and default rates.

In [4]:
def split_bin_data(df, feature, bins):
    """Histogram `feature` separately for paid and defaulted loans on shared bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `feature` and a `default` column (0 = paid, 1 = default).
    feature : str
        Name of the column to bin.
    bins : int, sequence of scalars or str
        Passed to ``numpy.histogram``; an int gives that many equal-width bins.

    Returns
    -------
    (paid_bins, default_bins)
        Two ``(counts, bin_edges)`` tuples as returned by ``numpy.histogram``.
        Both use the same ``bin_edges``, so the count arrays can be combined
        element-wise.
    """
    # Split data into default vs paid values
    paid_values = df.loc[df['default'] == 0, feature].values
    default_values = df.loc[df['default'] == 1, feature].values

    # With an integer `bins`, numpy.histogram spans the min/max of the data it
    # is given. Binning each class on its own therefore produced two different
    # sets of edges, and the per-bin counts did not describe the same
    # intervals. Derive the edges once from both classes together.
    _, edges = np.histogram(np.concatenate([paid_values, default_values]), bins=bins)

    # Bin each class on the shared edges
    paid_bins = np.histogram(paid_values, bins=edges)
    default_bins = np.histogram(default_values, bins=edges)

    return paid_bins, default_bins

# Plotting continuous feature vs default rate
def default_rate_binned_barplot(df, feature, bins, xlabel, title):
    """Make a bar plot of a binned continuous feature (x) vs default rate (y).

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed on to ``split_bin_data``.
    feature : str
        Column to bin along the x-axis.
    bins
        Bin specification passed on to ``split_bin_data``.
    xlabel : str
        Title of the x-axis.
    title : str
        Title of the figure.

    Returns
    -------
    go.Figure
        One bar per bin, placed at the bin centre; the height is
        defaults / (defaults + paid) within that bin. Bins without any loans
        yield NaN.
    """
    # Split data into default vs paid and bin
    paid_bins, default_bins = split_bin_data(df, feature, bins)
    paid_counts, edges = paid_bins
    default_counts = default_bins[0]

    # N bins come with N + 1 edges. Plotting against the raw edges put each
    # bar on the left edge of its bin and left one x value unused, so use the
    # bin midpoints instead.
    edges = np.asarray(edges, dtype=float)
    centers = (edges[:-1] + edges[1:]) / 2.0

    # Empty bins are 0 / 0: keep the NaN result but silence the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        rates = np.true_divide(default_counts, default_counts + paid_counts)

    # Make plotly figure
    data = [go.Bar(x=centers, y=rates)]

    layout = go.Layout(
        title=title,
        xaxis={'title': xlabel},
        yaxis={'title': 'Default Rate'}
    )

    return go.Figure(data=data, layout=layout)

    
# Plotting continuous feature vs default rate
def default_rate_scatter(df, feature, bins, xlabel, title):
    """Make a scatter plot of a binned continuous feature (x) vs default rate (y).

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed on to ``split_bin_data``.
    feature : str
        Column to bin along the x-axis.
    bins
        Bin specification passed on to ``split_bin_data``.
    xlabel : str
        Title of the x-axis.
    title : str
        Title of the figure.

    Returns
    -------
    go.Figure
        One marker per bin, placed at the bin centre; the y value is
        defaults / (defaults + paid) within that bin. Bins without any loans
        yield NaN.
    """
    # Split data into default vs paid and bin
    paid_bins, default_bins = split_bin_data(df, feature, bins)
    paid_counts, edges = paid_bins
    default_counts = default_bins[0]

    # N bins come with N + 1 edges; plot each rate at the midpoint of its bin
    # so x and y have the same length.
    edges = np.asarray(edges, dtype=float)
    centers = (edges[:-1] + edges[1:]) / 2.0

    # Empty bins are 0 / 0: keep the NaN result but silence the warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        rates = np.true_divide(default_counts, default_counts + paid_counts)

    # Make plotly figure
    trace = go.Scatter(
        x=centers,
        y=rates,
        mode='markers'
    )

    # `xlabel` and `title` used to be accepted but dropped; apply them the same
    # way the bar-plot helpers do.
    layout = go.Layout(
        title=title,
        xaxis={'title': xlabel},
        yaxis={'title': 'Default Rate'}
    )

    return go.Figure(data=[trace], layout=layout)
    

# Plotting categorical feature vs default rate
def default_rate_categorical_barplot(df, feature, xlabel, title):
    """Make a bar plot of a categorical feature (x) vs default rate (y).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `feature` and a numeric `default` column.
    feature : str
        Column whose distinct values become the bars.
    xlabel : str
        Title of the x-axis.
    title : str
        Title of the figure.

    Returns
    -------
    go.Figure
        One bar per category, sorted by ascending default rate; the height is
        the mean of `default` within that category.
    """
    # Select the target column before aggregating. Averaging every column and
    # then picking `default` does needless work and raises on non-numeric
    # columns in pandas >= 2.0.
    default_rate = df.groupby(feature)['default'].mean().sort_values()

    # Make plotly figure
    data = [go.Bar(x=default_rate.index, y=default_rate)]

    layout = go.Layout(
        title=title,
        xaxis={'title': xlabel},
        yaxis={'title': 'Default Rate'}
    )

    return go.Figure(data=data, layout=layout)
    
    
def default_rate_by_state(df):
    """Create a choropleth of the default rate per US state.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an `addr_state` column (used as plotly `USA-states`
        locations) and a numeric `default` column.

    Returns
    -------
    dict
        ``{'data': [...], 'layout': {...}}`` in the form plotly accepts as a
        figure. The colour of each state is the mean of `default` for it.
    """
    # Select the target column before aggregating. Averaging every column and
    # then picking `default` does needless work and raises on non-numeric
    # columns in pandas >= 2.0.
    state_defaults = df.groupby('addr_state')['default'].mean()

    # Light-to-dark purple colour scale as [position, colour] stops.
    scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'], [0.4, 'rgb(188,189,220)'],
           [0.6, 'rgb(158,154,200)'], [0.8, 'rgb(117,107,177)'], [1.0, 'rgb(84,39,143)']]

    data = [dict(
        type='choropleth',
        colorscale=scl,
        autocolorscale=False,
        locations=state_defaults.index,
        z=state_defaults,
        locationmode='USA-states',
        # White outline between states.
        marker=dict(
            line=dict(
                color='rgb(255,255,255)',
                width=2
            )),
        colorbar=dict(
            title="Default Rate")
    )]

    layout = dict(
        title='Default Rates by State',
        geo=dict(
            scope='usa',
            projection=dict(type='albers usa'),
            showlakes=True,
            lakecolor='rgb(255, 255, 255)'),
        yaxis={'fixedrange': True},
        xaxis={'fixedrange': True}
    )

    return dict(data=data, layout=layout)

1. Loan Amount
2. Interest Rate
3. Term (Loan Length)
4. Purpose of Loan
5. Debt/Income Ratio
6. State
7. Rent or Own Home
8. Annual Income
9. 30+ days past-due incidences of delinquency in the borrower's credit file for the past 2 years
10. Credit Lines Open

## 1. Amount Requested

In [20]:
# Default rate per bin of `loan_amnt`, using 20 bins.
fig = default_rate_binned_barplot(df=df, feature='loan_amnt', bins=20, xlabel='Loan Amount',
                     title='Lower loan amounts have a slightly higher incidence of defaults')

# `py` is plotly.plotly (imported in the first cell).
py.iplot(fig)

## 2. Interest Rate

In [21]:
# Default rate per bin of `int_rate`, using 20 bins.
fig = default_rate_binned_barplot(df=df, feature='int_rate', bins=20, xlabel='Interest Rate',
                     title='Default rate strongly correlates with interest rate')
py.iplot(fig)

## 3. Loan Term

In [22]:
# Default rate for each distinct value of `term`.
fig = default_rate_categorical_barplot(df=df, feature='term',xlabel='Loan Term', title='Longer loans have higher default rates')
py.iplot(fig)

## 4. Purpose

In [23]:
# Default rate for each distinct value of `purpose`.
# The result must be bound to `fig`: without the assignment the next line
# re-plots whatever figure the previously executed cell left behind.
fig = default_rate_categorical_barplot(
    df=df, feature='purpose', xlabel='Purpose',
    title='Business loans are the most risky while wedding loans are the least risky')
py.iplot(fig)

## 5. Debt to Income

In [24]:
# Default rate per bin of `dti`, using 30 bins; only rows with dti < 40 are kept
# (presumably to keep extreme values from stretching the bins -- confirm).
fig = default_rate_binned_barplot(df=df.loc[df['dti']<40], feature='dti',bins=30, xlabel='Debt to Income Ratio', title='Debt to income ratio is highly correlated with default rate')
py.iplot(fig)

## 6. State

In [25]:
# Default rate for each distinct value of `addr_state`, as a bar chart.
fig = default_rate_categorical_barplot(df=df, feature='addr_state',xlabel='States', title='Default Rate by State')
py.iplot(fig)

In [31]:
# Same per-state default rate, drawn as a choropleth map.
fig = default_rate_by_state(df)
py.iplot(fig)

## 7. Home Ownership

In [27]:
# Default rate per home-ownership status, restricted to the three statuses
# listed below; rows with any other value are left out.
fig = default_rate_categorical_barplot(
    df=df.loc[df['home_ownership'].isin(['MORTGAGE', 'RENT', 'OWN'])],
    feature='home_ownership', xlabel='Home Ownership',
    title='Renters have the highest default rate')
py.iplot(fig)

## 8. Annual Income 

In [28]:
# Default rate per bin of `annual_inc`, using 50 bins; only rows with
# annual_inc < 300000 are kept.
# The x-axis title used to read 'Debt to Income Ratio' (copied from another
# cell); it now names the plotted feature.
fig = default_rate_binned_barplot(
    df=df.loc[df['annual_inc'] < 300000], feature='annual_inc', bins=50,
    xlabel='Annual Income',
    title='Default rate generally decreases with annual income')
py.iplot(fig)

## 9. Number of delinquencies in past 2 years

In [29]:
# Default rate for each distinct value of `delinq_2yrs`; only rows with
# delinq_2yrs < 10 are kept.
# The x-axis title used to read 'Debt to Income Ratio' (copied from another
# cell); it now names the plotted feature.
fig = default_rate_categorical_barplot(
    df=df.loc[df['delinq_2yrs'] < 10], feature='delinq_2yrs',
    xlabel='Delinquencies in Past 2 Years',
    title='Generally higher default rates for 2+ delinquencies')
py.iplot(fig)

## 10. Open credit lines

In [30]:
# Default rate for each distinct value of `open_acc`; only rows with open_acc < 40 are kept.
fig = default_rate_categorical_barplot(df=df.loc[df['open_acc']<40], feature='open_acc', xlabel='Open Credit Lines', title='Higher default rates for borrowers with 0 credit lines')
py.iplot(fig)