In [1]:
from __future__ import division
import glob
import os
import cPickle as pickle
import pandas as pd
import numpy as np
from scipy.stats import binned_statistic, linregress
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
import seaborn as sns
import statsmodels.api as sm

from s3_connect import s3_connect

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)

pd.options.display.max_columns = 999

%pylab inline

tmp_localdir = '~/'

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


# Load Data From S3

In [3]:
# Credentials come from the environment so no secret is stored in the notebook.
s3_conn = s3_connect(
    access=os.environ['AWS_CLOUD_BUCKET_KEY'],
    secret=os.environ['AWS_CLOUD_BUCKET_SECRET_KEY'],
    bucketname='ds-cloud-public-shared',
)

# Pull the raw-data pickle through the project's S3 helper.
df = s3_conn.pull_pickle_from_s3(
    key='demos/loan-risk/data/raw_data.p',
    tmp_localdir=tmp_localdir,
)

Grabbed demos/loan-risk/data/raw_data.p from S3. Local file demos/loan-risk/data/raw_data.p is now available.


# Feature Engineering

### Basic feature engineering

In [3]:
def _percent_to_float(value):
    """Turn a percentage string such as '10.65%' into the float 10.65.

    Non-string values (already-numeric entries, NaN) are returned unchanged,
    so re-running the cell on converted data is harmless.
    """
    # (str, type(u'')) covers both byte and unicode strings on Python 2.
    if isinstance(value, (str, type(u''))):
        # rstrip only removes a '%' that is really there, instead of blindly
        # chopping the last character.
        return float(value.strip().rstrip('%'))
    return value


# Include only fully paid or default
if 'loan_status' in df:
    has_outcome = df['loan_status'].isin(['Fully Paid', 'Charged Off'])
    # .loc replaces the deprecated .ix indexer; .copy() gives us our own frame
    # so the column assignments below never write into a slice.
    df = df.loc[has_outcome].copy()
    # Create binary label: 1 = charged off, 0 = fully paid
    df['default'] = (df['loan_status'] == 'Charged Off').astype(int)

# Convert to datetime
if 'issue_d' in df:
    df['issue_d'] = pd.to_datetime(df['issue_d'], format='%b-%Y')

    # Include only loans issued from 2008 through 2015 (the upper bound is
    # exclusive), so we have plenty of time to observe a default
    in_window = (df['issue_d'] >= '2008') & (df['issue_d'] < '2016')
    df = df.loc[in_window].copy()

# Convert interest rate and revolving utilization to float
for pct_col in ('int_rate', 'revol_util'):
    df[pct_col] = df[pct_col].map(_percent_to_float)

### Define which features to use as-is and which to dummify

In [4]:
# Columns kept exactly as they are (the 'default' label is carried along here).
cols_as_is = [
    'loan_amnt', 'int_rate', 'dti', 'annual_inc',
    'delinq_2yrs', 'open_acc', 'revol_util', 'default',
]
# Categorical columns that will be expanded into dummy columns.
to_dummify = [
    'term', 'purpose', 'addr_state', 'home_ownership',
]

# Narrow the frame to just these columns, categoricals first.
df = df[to_dummify + cols_as_is]

### Dummify categorical variables  

In [5]:
# Dummify features: one call encodes every categorical column, names the new
# columns "<feature>_<value>" and drops the un-dummified originals.
df = pd.get_dummies(df, columns=to_dummify)

# Include dummified features in cols_to_use. Match on the "<feature>_" prefix
# rather than a substring test, so a column that merely contains a feature
# name somewhere in its label is not mistaken for a dummy column.
dummy_prefixes = tuple(feat + '_' for feat in to_dummify)
cols_to_use = [col for col in df.columns
               if col in cols_as_is or col.startswith(dummy_prefixes)]

# Drop NA and subset
df = df[cols_to_use].dropna()

### Define features and target

In [6]:
# Feature matrix: every selected column except the label. axis=1 is required
# because DataFrame.drop targets row labels by default, so drop('default')
# alone would look for a row named 'default' instead of the column.
X = df[cols_to_use].drop('default', axis=1)
# Target vector: the binary 'default' label.
y = df['default']

### Train / Test Split

In [7]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# reproducible, so re-running the notebook regenerates the same train/test
# sets that get pushed to S3 below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Bundle the four pieces so they can be serialized as a single object.
split_data = {'X_train':X_train,
             'y_train':y_train,
             'X_test':X_test,
             'y_test':y_test}

# Push processed data to S3

In [10]:
# Serialize the train/test bundle in memory, then upload the resulting string.
split_data_pickled = pickle.dumps(split_data)
s3_conn.push_file_to_s3(
    path=split_data_pickled,
    key='demos/loan-risk/data/split_data.p',
    string=True,
)

Sent string to S3 with key 'demos/loan-risk/data/split_data.p'


In [16]:
# Feature metadata: which columns were dummified, which were kept as-is, and
# the column labels of the training matrix.
feats = dict(
    to_dummify=to_dummify,
    as_is=cols_as_is,
    trained_features=X_train.columns,
)

# Serialize the metadata in memory, then upload the resulting string.
feats_pickled = pickle.dumps(feats)
s3_conn.push_file_to_s3(
    path=feats_pickled,
    key='demos/loan-risk/data/feats.p',
    string=True,
)

Sent string to S3 with key 'demos/loan-risk/data/feats.p'


In [36]:
# Show the (rows, columns) dimensions of the current `df`.
# NOTE(review): the saved output has only 5 rows, so it appears to come from a
# state where `df` had already been replaced by a 5-row slice (see the cells
# below); re-run the notebook in order to refresh it.
df.shape

(5, 81)

In [19]:
# Pull the pickled model stored under models/RF.p through the S3 helper.
# NOTE(review): presumably the helper unpickles the file - only load pickles
# from a bucket you trust.
clf = s3_conn.pull_pickle_from_s3(
    key='demos/loan-risk/models/RF.p',
    tmp_localdir=tmp_localdir,
)

Grabbed demos/loan-risk/models/RF.p from S3. Local file demos/loan-risk/models/RF.p is now available.


In [25]:
# Take the first five held-out rows as a small scoring sample.
df = X_test.head(5)

In [47]:
# Take the first five rows of `raw` as a small scoring sample.
# NOTE(review): `raw` is not assigned in any visible cell - it must come from
# a cell that was deleted or run out of order; define it before this cell.
df = raw.head(5)

In [51]:
# List the column labels of `raw` (the un-dummified feature columns plus the
# 'default' label, per the saved output).
raw.columns

Index([u'loan_amnt', u'int_rate', u'dti', u'annual_inc', u'delinq_2yrs',
       u'open_acc', u'revol_util', u'term', u'purpose', u'addr_state',
       u'home_ownership', u'default'],
      dtype='object')

In [48]:
# Dummify features
for dummy_feat in feats['to_dummify']:
    if dummy_feat in df:
        df = pd.concat([df, pd.get_dummies(df[dummy_feat], prefix=dummy_feat)],axis=1)
        
        # Drop un-dummified features
        df = df.drop(dummy_feat, axis=1)

    
# Include dummified features in cols_to_use
cols_to_use = [col for col in df.columns if col in feats['as_is'] or any([dummy_feat in col for dummy_feat in feats['to_dummify']])]

# Drop NA and subset
df = df[cols_to_use].dropna()


# If after dummifying we are missing any features wrt the training data feature space, add a column of zeros for that feature
for training_feat in feats['trained_features']:
    if training_feat not in df:
        df[training_feat] = np.zeros((df.shape[0],1))

In [4]:
# Example single-loan request: every value is passed as a string.
payload = {
    "addr_state": "AZ",
    "annual_inc": "24000.0",
    "delinq_2yrs": "0.0",
    "dti": "27.649999999999999",
    "home_ownership": "RENT",
    "int_rate": "10.65",
    "loan_amnt": "5000.0",
    "open_acc": "3.0",
    "purpose": "credit_card",
    "revol_util": "83.700000000000003",
    "term": " 36 months",
}

# NOTE(review): predict_default_probability is not defined in any visible
# cell - it must be defined or imported before this cell runs.
predict_default_probability(payload)

0.24928368084606212

In [None]:
# The same example loan wrapped under a top-level "data" key
# (presumably the request body for a deployed scoring service - confirm).
{
    "data": {
        "addr_state": "AZ",
        "annual_inc": "24000.0",
        "delinq_2yrs": "0.0",
        "dti": "27.649999999999999",
        "home_ownership": "RENT",
        "int_rate": "10.65",
        "loan_amnt": "5000.0",
        "open_acc": "3.0",
        "purpose": "credit_card",
        "revol_util": "83.700000000000003",
        "term": " 36 months",
    }
}