In [1]:
from __future__ import division
import glob
import os
import pandas as pd
import numpy as np
from scipy.stats import binned_statistic, linregress
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
import seaborn as sns
import statsmodels.api as sm
from IPython.display import clear_output, Image

from s3_connect import s3_connect

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

from EDA_plotting_functions import (make_kdeplot, default_rate_binned_barplot, default_rate_categorical_barplot, 
                                   default_rate_by_state)

# Enable plotly rendering inside the notebook.
init_notebook_mode(connected=True)

# Local scratch directory handed to the S3 helper when pulling pickles.
# NOTE(review): '~' is not expanded here; presumably the helper expands it -- confirm.
tmp_localdir = '~/'

# Show (practically) every column when a wide DataFrame is displayed.
pd.options.display.max_columns = 999

# IPython magic: inline matplotlib output plus the pylab namespace.
%pylab inline
# Hide whatever the setup statements above printed.
clear_output()

# Optimization Introduction
Now that we have explored the relationship between loan features and default rates, we can begin exploiting this information to choose to fund loans that will maximize our ROI. To do so, we will need to know the reward / risk of each loan. This notebook serves as a deep dive into how one can use machine learning to guide loan selection towards loans with favorable reward-to-risk profiles.

Briefly, deriving the reward for a loan is straightforward: given the interest rate and loan principal, one can compute the final amount paid back to the lender. 

Deriving the risk for a loan requires more sophistication: we will pass a multitude of features through a machine learning model to accurately predict the probability of default.

# Querying Our Deployed Model

For this report we will be calling the deployed API we set up for our loan default prediction model. The model can be called programmatically in Python using the following API endpoint.

```python
body = requests.post('https://beta.datascience.com/deploy/deploy-predict-default-probabili-849-v3/', 
                     json={"data": [ data ] },
                     cookies={'datascience-platform': secret_token})
```

In [2]:
# Connect to the shared demo bucket. Credentials are read from environment
# variables so they never appear in the notebook itself.
s3_conn = s3_connect(
    access=os.environ['AWS_CLOUD_BUCKET_KEY'],
    secret=os.environ['AWS_CLOUD_BUCKET_SECRET_KEY'],
    bucketname='ds-cloud-public-shared',
)

# Pull the pickled data split and the pickled model, in that order.
# NOTE(review): unpickling executes arbitrary code -- only safe while the
# bucket contents are trusted.
dat, clf = [
    s3_conn.pull_pickle_from_s3(key=s3_key, tmp_localdir=tmp_localdir)
    for s3_key in ('demos/loan-risk/data/split_data.p',
                   'demos/loan-risk/models/RF.p')
]
clear_output()

# Optimizing Loan Choices Visually
Before we dive into automated optimization of our loan selections, let's build an intuition with a visual example. We will be optimizing three variables to select the best loans:

1. Interest Rate (maximize)
2. Loan Amount (maximize)
3. Probability of Default (minimize)

We can visualize these three dimensions in the scatter plot below. Interest rate and default probability are plotted on the x- and y-axis, respectively, and the circle size represents loan amount. Ideally, we will select loans in the lower-right corner with big circles. 

There may be instances where we might want to fund a riskier loan with a higher interest rate or a safer loan with a lower interest rate. To represent this tradeoff, a linear model was fit to the data (solid fit line), and a confidence interval was plotted below it (dotted ROI Threshold line). Any loans below this line have the best reward-to-risk profile and may make excellent loan selections. 

In [3]:
# Draw a fixed-seed random subset of the training loans (without replacement).
n_samples = 5000
np.random.seed(6)
inds = np.random.choice(range(dat['X_train'].shape[0]), size=n_samples, replace=False)

# Rows of the training features / labels that were drawn.
X_sample = dat['X_train'].iloc[inds]
y_sample = dat['y_train'].iloc[inds]

# Second column of predict_proba is used as the probability of default.
p_default = list(clf.predict_proba(X_sample.values)[:, 1])

# The one-hot 'term_ 36 months' flag is mapped back to a term in months.
df_predict = pd.DataFrame({
    'p_default': p_default,
    'int_rate': X_sample['int_rate'].values,
    'loan_amnt': X_sample['loan_amnt'].values,
    'default': y_sample.values,
    'term': np.where(X_sample['term_ 36 months'].values == 1, 36, 60),
})

In [4]:
xi = df_predict['int_rate']
y = df_predict['p_default']
# Loan amount relative to the sample mean; used to scale the scatter markers.
marker_size = df_predict['loan_amnt'] / np.mean(df_predict['loan_amnt'])

# Ordinary least squares fit of P(default) on interest rate. With
# prepend=False the constant column is appended after 'int_rate'.
model = sm.OLS(y, sm.add_constant(xi, prepend=False))
result = model.fit()

# Look coefficients up by label rather than by position: `.ix` no longer
# exists in current pandas, and integer lookups on a label-indexed Series
# are ambiguous.
slope = result.params.loc['int_rate']
intercept = result.params.loc['const']

# Best-fit line evaluated at every sampled interest rate.
line = slope * xi + intercept

# Threshold lines: same slope as the fit, with the intercept replaced by a
# multiple of the lower confidence bound of the constant term (column 0 of
# conf_int()).
const_lower = result.conf_int().loc['const', 0]
low_risk_y = slope * xi + const_lower * 3
med_risk_y = slope * xi + const_lower * 2
high_risk_y = slope * xi + const_lower * 1

clear_output()

In [5]:
# One marker per loan: interest rate vs. predicted P(default), with the marker
# area scaled by the loan amount relative to the sample mean.
trace1a = go.Scatter(
    x=xi,
    y=y,
    mode='markers',
    marker=dict(
        line=dict(width=1),
        size=marker_size * 5,
    ),
    name='Loan Amount',
)

# Fitted regression line. A plain dict replaces the deprecated `go.Marker`
# class; the properties passed to the trace are unchanged.
trace2a = go.Scatter(
    x=xi,
    y=line,
    mode='lines',
    marker=dict(color='rgb(31, 119, 180)'),
    name='Linear Fit',
)


# Index labels of the loans with the lowest and highest interest rate; the
# threshold lines are drawn between these two x positions.
_x_lo_label = xi.idxmin()
_x_hi_label = xi.idxmax()


def _threshold_shape(y_values):
    """Build a dotted red plotly line shape for one threshold line.

    ``y_values`` must share its index with ``xi``. The endpoints are read at
    the rows holding the smallest and largest interest rate, so the line is
    drawn correctly whatever the sign of the fitted slope (the previous
    min(y)/max(y) pairing was only right for a non-negative slope).
    """
    return dict(type='line',
                xref='x', yref='y',
                x0=xi.loc[_x_lo_label], y0=y_values.loc[_x_lo_label],
                x1=xi.loc[_x_hi_label], y1=y_values.loc[_x_hi_label],
                line=dict(color='#ff0000', dash='dot'))


# One shape per risk tolerance. The "Low Risk" list used to contain the same
# shape twice, which only drew an identical line on top of itself.
low_risk_linea = [_threshold_shape(low_risk_y)]

med_risk_linea = [_threshold_shape(med_risk_y)]

high_risk_linea = [_threshold_shape(high_risk_y)]

def _risk_button(label, shapes):
    """Button that replaces the figure's shapes with ``shapes`` when clicked."""
    return dict(label=label, method='relayout', args=['shapes', shapes])


def _callout_box(x, y, text, bgcolor):
    """Boxed text annotation (no arrow) anchored at data coordinates (x, y)."""
    return dict(
        x=x,
        y=y,
        xref='x',
        yref='y',
        text=text,
        showarrow=False,
        font=dict(size=14, color='#030000'),
        bordercolor='#030000',
        borderwidth=2,
        borderpad=4,
        bgcolor=bgcolor,
        opacity=0.8,
    )


# Row of buttons that toggles between the three risk-tolerance threshold lines.
updatemenusa = [
    dict(type="buttons",
         x=0.3,
         xanchor='left',
         y=1.0,
         yanchor='top',
         direction='left',
         buttons=[
             _risk_button('Low Risk', low_risk_linea),
             _risk_button('Medium Risk', med_risk_linea),
             _risk_button('High Risk', high_risk_linea),
         ])
]

_white = 'rgb(255,255,255)'

layouta = go.Layout(
    plot_bgcolor='rgb(229, 229, 229)',
    title='Finding High ROI Loans',
    xaxis={'title': 'Interest Rate', 'zerolinecolor': _white, 'gridcolor': _white,
           'range': [5, 28]},
    yaxis={'title': 'Probability of Default', 'zerolinecolor': _white, 'gridcolor': _white,
           'range': [0, 0.6]},
    updatemenus=updatemenusa,
    annotations=[
        dict(
            x=21.75,
            y=0.185,
            xref='x',
            yref='y',
            text='High ROI',
            showarrow=True,
            # `arrowhead` is an integer style code in plotly; the previous
            # value 0.2 is not a valid code. 1 is plotly's documented default.
            arrowhead=1,
            ax=20,
            ay=30
        ),
        _callout_box(25, 0.035, 'Low Risk, High Return👍 ', '#3dff5d'),
        _callout_box(8, 0.565, 'High Risk, Low Return👎', '#ff433d'),
    ])

dataa = [trace1a, trace2a]
figa = go.Figure(data=dataa, layout=layouta)

py.iplot(figa, filename='compare_webgl')

# Invest in loans with the highest expected profit.

By taking into account the probability of default and how much we stand to profit for each loan, we can calculate the expected profit for each loan. We can pick out the loans with the highest expected profit to maximize our return. 


<a href="https://demo.datascience.com/project/optimizing-your-investment-strategy/outputs/expected-profit-deep-dive-UG9zdFR5cGU6MTUz" target="_blank"><b>See here for a deep dive into our methodology for calculating expected profit.</b></a>


While this method will **on average** produce the highest return, some investors may still want to throttle their risk tolerance. As a result, it is still useful to plot probability of default versus expected profit. The figure below does just that, while allowing the user to specify a risk tolerance (horizontal lines).

In [6]:
# Expected profit per loan from the standard fixed-payment (amortization) formula.
p = df_predict['loan_amnt']              # principal
r = df_predict['int_rate'] / 12 / 100    # annual percentage rate -> monthly fraction
n = df_predict['term']                   # number of monthly payments

# (1 + r)^n appears twice in the payment formula; name it once.
growth = (1+r)**n
payment = p * (r * growth) / (growth -1)

# Profit when the loan is repaid in full: everything paid back minus the principal.
df_predict['profit'] = (n * payment) - p

# Weight the full-repayment profit against losing the whole principal on default.
prob_default = df_predict['p_default']
gain_if_repaid = (1 - prob_default) * df_predict['profit']
loss_if_default = prob_default * df_predict['loan_amnt']
df_predict['expected_profit'] = gain_if_repaid - loss_if_default

In [7]:
# Loan amount relative to the sample mean, used to scale the markers.
marker_size = df_predict['loan_amnt'] / np.mean(df_predict['loan_amnt'])

# One marker per loan: expected profit vs. predicted probability of default.
trace0 = go.Scatter(
    x=df_predict['expected_profit'],
    y=df_predict['p_default'],
    mode='markers',
    marker=dict(line=dict(width=1), size=marker_size*5),
    name='Loan Amount',
)


def _tolerance_line(max_p_default):
    """Single-element shape list: a dotted red horizontal line at ``max_p_default``.

    The line extends 1000 beyond the smallest and largest expected profit.
    """
    return [dict(type='line',
                 xref='x', yref='y',
                 x0=min(df_predict['expected_profit'])-1000, y0=max_p_default,
                 x1=max(df_predict['expected_profit'])+1000, y1=max_p_default,
                 line=dict(color='#ff0000', dash='dot'))]


# Horizontal cut-offs for the three risk tolerances.
low_risk_line = _tolerance_line(0.1)

med_risk_line = _tolerance_line(0.2)

high_risk_line = _tolerance_line(0.3)

def _risk_button(label, shapes):
    """Button that replaces the figure's shapes with ``shapes`` when clicked."""
    return dict(label=label, method='relayout', args=['shapes', shapes])


def _callout_box(x, y, text, bgcolor):
    """Boxed text annotation (no arrow) anchored at data coordinates (x, y)."""
    return dict(
        x=x,
        y=y,
        xref='x',
        yref='y',
        text=text,
        showarrow=False,
        font=dict(size=14, color='#030000'),
        bordercolor='#030000',
        borderwidth=2,
        borderpad=4,
        bgcolor=bgcolor,
        opacity=0.8,
    )


# Row of buttons that toggles between the three risk-tolerance lines.
updatemenus = [
    dict(type="buttons",
         x=0.3,
         xanchor='left',
         y=1.0,
         yanchor='top',
         direction='left',
         buttons=[
             _risk_button('Low Risk', low_risk_line),
             _risk_button('Medium Risk', med_risk_line),
             _risk_button('High Risk', high_risk_line),
         ])
]

_white = 'rgb(255,255,255)'

layout = dict(
    plot_bgcolor='rgb(229, 229, 229)',
    title='P(default), Loan Amount, and Expected Profit',
    xaxis={'title': 'Expected Profit', 'zerolinecolor': _white, 'gridcolor': _white},
    yaxis={'title': 'Probability of Default', 'zerolinecolor': _white, 'gridcolor': _white},
    updatemenus=updatemenus,
    annotations=[
        dict(
            x=11713,
            y=0.19,
            xref='x',
            yref='y',
            text='High ROI',
            showarrow=True,
            # `arrowhead` is an integer style code in plotly; the previous
            # value 0.2 is not a valid code. 1 is plotly's documented default.
            arrowhead=1,
            ax=-40,
            ay=30
        ),
        _callout_box(9750, 0.02, 'Low Risk, High Return👍 ', '#3dff5d'),
        _callout_box(-7500, 0.65, 'High Risk, Low Return👎', '#ff433d'),
    ]
)

data = [trace0]
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='compare_webgl2')

While choosing loans with a fairly conservative risk threshold produces a lower yield on average, one can be more certain of a favorable payout. Illustrated below is the expected ROI (%) after funding the 10 loans with the greatest expected profit that meet 3 different risk tolerance thresholds.

In [8]:
# Maximum acceptable P(default) for each risk tolerance, and portfolio size.
thresholds = [0.1, 0.2, 0.3]
n_loans = 10

principal = []
profit = []
for thresh in thresholds:
    # Boolean-mask selection with .loc (`.ix` no longer exists in current pandas).
    eligible = df_predict.loc[df_predict['p_default'] < thresh]
    # Keep the n_loans eligible loans with the highest expected profit.
    top_loans = eligible.sort_values('expected_profit', ascending=False).head(n_loans)
    principal.append(np.sum(top_loans['loan_amnt']))
    profit.append(np.sum(top_loans['expected_profit']))

# One row per threshold: total principal invested and total expected profit.
p_thresholds_df = pd.DataFrame({'principal': principal, 'profit': profit}, index=thresholds)
clear_output()

In [9]:
# Category labels such as '< 10%'. Built as a real list (on Python 3 `map`
# returns a lazy iterator) and rounded before truncating so float error in
# `cutoff * 100` cannot shift a label down by one percent.
threshold_labels = ['< ' + str(int(round(cutoff * 100))) + '%' for cutoff in thresholds]

trace1 = go.Bar(
    x=threshold_labels,
    # Expected ROI (%) = total expected profit / total principal invested.
    y=100*p_thresholds_df['profit']/p_thresholds_df['principal'],
)

data = [trace1]
layout = go.Layout(
    xaxis={'type':'category',
          'title':'Probability of Default Threshold'},
    yaxis={'title':'ROI (%)'},
    title='Expected return at different risk tolerances'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='stacked-bar')

## Validate

We've seen that machine learning can help guide us towards loans with high expected profit. However, how well does this generalize to new data? This is what we really care about if we want to use this model for future investments.

To validate our model, we ran the following experiment:
1. We started with a bankroll of 1M USD
2. Grabbed a random sample of 100 loans from our testing set, simulating a daily batch of possible loans.
3. For our test group, we selected the loans with the best E[profit] until our bankroll was exhausted. For our control group, we randomly selected loans to fund.
4. We then calculated the actual profit for each group based on the observed default labels in the test set.
5. Finally, we repeated steps 2-4 for 10K iterations. The bar heights and error bars represent the mean and standard error for each of these groups, respectively. 

### Without a doubt, our model outperformed random guessing, consistently yielding a return of nearly 4% over the long term.

This is just a floor. With further optimization and model selection as well as feature engineering, one can improve how P(default) is modeled. Also, more advanced techniques of model selection can be implemented, such as diversifying one's risk further by investing only $X in each loan.

In [10]:
# Score the held-out test set and rebuild df_predict from it.
X_test = dat['X_test']
p_default = list(clf.predict_proba(X_test.values)[:, 1])

# The one-hot 'term_ 36 months' flag is mapped back to a term in months.
df_predict = pd.DataFrame({
    'p_default': p_default,
    'int_rate': X_test['int_rate'].values,
    'loan_amnt': X_test['loan_amnt'].values,
    'default': dat['y_test'].values,
    'term': np.where(X_test['term_ 36 months'].values == 1, 36, 60),
})

p = df_predict['loan_amnt']              # principal
r = df_predict['int_rate'] / 12 / 100    # annual percentage rate -> monthly fraction
n = df_predict['term']                   # number of monthly payments

# Fixed monthly payment from the amortization formula.
growth = (1+r)**n
payment = p * (r * growth) / (growth -1)

# Profit if repaid in full, then the default-weighted expectation, which
# treats a default as losing the whole principal.
df_predict['profit'] = (n * payment) - p
prob_default = df_predict['p_default']
df_predict['expected_profit'] = ((1 - prob_default) * df_predict['profit']) - (prob_default * df_predict['loan_amnt'])

In [11]:
def calc_profit(df):
    """Return the realized profit of funding every loan in ``df``.

    Loans with ``default == 0`` contribute their ``profit``; loans with
    ``default == 1`` are counted as a loss of the full ``loan_amnt``.
    """
    repaid = df['default'] == 0
    defaulted = df['default'] == 1
    # .loc replaces `.ix`, which no longer exists in current pandas.
    return df.loc[repaid, 'profit'].sum() - df.loc[defaulted, 'loan_amnt'].sum()


# Total amount (USD) available to invest in each simulated batch.
bankroll = 1000000


def _roi_within_budget(loans, budget):
    """ROI of funding ``loans`` in row order while the running principal stays below ``budget``."""
    funded = loans.loc[loans['loan_amnt'].cumsum() < budget]
    invested = float(funded['loan_amnt'].sum())
    if invested == 0:
        # Mirrors the division-by-zero failure of the original code, with a clearer message.
        raise ZeroDivisionError('no loan fits within the budget; ROI is undefined')
    return calc_profit(funded) / invested


def experiment_profits(df_predict, n_iter=10000, sample_size=100, budget=None, random_state=None):
    """Compare model-guided loan selection against random selection.

    Each iteration draws ``sample_size`` loans, then funds loans up to the
    budget twice: once in the random draw order (control) and once ordered by
    descending ``expected_profit`` (optimized). The realized ROI of both
    portfolios is recorded.

    Parameters
    ----------
    df_predict : DataFrame with 'loan_amnt', 'profit', 'default' and
        'expected_profit' columns.
    n_iter : number of simulated batches (default 10000).
    sample_size : loans per batch (default 100).
    budget : amount available per batch; defaults to the module-level ``bankroll``.
    random_state : optional int seed or ``np.random.RandomState`` for
        reproducible draws; ``None`` keeps the unseeded behavior.

    Returns
    -------
    (optimized_roi, random_roi) : two arrays of length ``n_iter``.
    """
    if budget is None:
        budget = bankroll

    # One generator shared across iterations. Passing the same integer seed to
    # every .sample() call would return the identical batch each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    rand = []
    opt = []

    for _ in range(n_iter):
        # Random batch; its row order doubles as the control funding order.
        df_subsample = df_predict.sample(sample_size, random_state=rng)
        rand.append(_roi_within_budget(df_subsample, budget))

        # Same batch, funded from the highest expected profit downwards.
        ranked = df_subsample.sort_values('expected_profit', ascending=False)
        opt.append(_roi_within_budget(ranked, budget))

    return np.array(opt), np.array(rand)

# Run the simulation on the scored loans; the first array returned holds the
# optimized-selection ROIs, the second the random-selection ROIs.
optimized_profit, random_profits = experiment_profits(df_predict)

# Hide whatever the run printed.
clear_output()

In [12]:
# ROI samples per strategy, in the same order as the bar labels.
roi_samples = [random_profits, optimized_profit]

# Bar height: mean ROI in percent. Error bar: standard deviation divided by
# sqrt(number of iterations), also in percent.
mean_roi_pct = [np.mean(rois)*100 for rois in roi_samples]
stderr_roi_pct = [np.std(rois)*100/len(rois)**0.5 for rois in roi_samples]

trace1 = go.Bar(
    x=['Random Guessing', 'Optimized Selection'],
    y=mean_roi_pct,
    name='Control',
    error_y=dict(type='data', array=stderr_roi_pct, visible=True),
)

data = [trace1]

layout = go.Layout(
    title='Validating Loan Selection',
    xaxis={'title':''},
    yaxis={'title':'ROI (%)'},
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='error-bar-bar')

# Profit

Now that we have validated the efficacy of our model, we can deploy it into production. Every day new loans are made available on Lending Club for investors to fund. With this deployed model, the savvy investor can schedule a script to run at the precise time these loans are posted; the script will pass the loan data through the model, calculate expected profit, and fund the loan if it looks profitable. All of this can be done in seconds using the platform's deploy and scheduled-run apps, giving you an edge over your investor competition.

# Next Steps: Scheduled Run

Let's use the data science platform to set up a job that runs every day at 8:00 AM PST when a new daily batch of loans is available. Our script will pull the loan data from the Lending Club API, use our deployed model API to score the expected profit for each loan, and return the loan ideas of the most attractive loans.

### <a href="https://demo.datascience.com/project/optimizing-your-investment-strategy/job/daily-loan-scoring-166439" target="_blank">Click here to see the scheduled run interface.</a>



### <a href="https://demo.datascience.com/project/optimizing-your-investment-strategy/outputs/july-20th-scheduled-run-summary-UG9zdFR5cGU6MTUy" target="_blank">Click here to see the report from one of the runs.</a>