In [95]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor, AdaBoostClassifier, AdaBoostRegressor
import datetime 
from bs4 import BeautifulSoup
import re
from sklearn.model_selection import cross_val_predict
import sklearn.metrics as metrics

### Read, clean and convert loanbook

In [234]:
df = pd.read_csv('loanbook.csv')

# only 2012 dates onwards
df = df[pd.to_datetime(df['loan_accepted_date']) > datetime.date(year=2012, month=1, day=1)]

# ignore repaying (possible over simplification)
# df = df[df['status'] != 'loan: repaying']
df = df[(df['status'] != 'repaying') | (df['payments_remaining'] < df['term']*0.1)]

# select only status text
df['status'] = df['status'].str[6:]

# independent successful overall repayment
df['success'] = (df['status'] == 'repaid') | (df['status'] == 'repaying')

# clean purpose, sector, location
df['loan_purpose'] = df['loan_purpose'].str.lower().str.replace('/', ' ')
df['sector'] = df['sector'].str.lower().str.replace('/', ' ')
df['region_name'] = df['region_name'].str.lower()

# risk band simplification
df['credit_band'] = df['credit_band'].str[:2]

print(df['status'].value_counts())
print(df['success'].value_counts())
df.tail(50)

repaying     19844
repaid        7005
defaulted     1131
late           343
Name: status, dtype: int64
True     26849
False     1474
Name: success, dtype: int64


Unnamed: 0,id,status,credit_band,loan_purpose,sector,business_type_name,region_name,loan_amount,recoveries,interest_rate,...,num_loan_parts,loan_accepted_date,date_repaid,whole_loan,related_auctions,repayment_type,loan_guaranteed,year_incorporated,security_taken,success
28772,32701,repaying,A,refinancing a loan,automotive,Limited Company,wales,105000.0,0.0,9.0,...,1,2017-02-17,2022-02-17,WL,,Amortising,1.0,2010.0,No asset security,True
28773,32702,repaying,A,expansion growth,property and construction,Limited Company,south east,31500.0,0.0,9.0,...,1,2017-02-17,2022-02-17,WL,,Amortising,1.0,2007.0,No asset security,True
28774,32703,repaying,A,tax payment,professional and business support,Limited Company,south east,52500.0,0.0,9.0,...,1,2017-02-17,2022-02-17,WL,,Amortising,1.0,2014.0,No asset security,True
28775,32704,repaying,D,working capital,professional and business support,Limited Company,midlands,110000.0,0.0,17.9,...,1,2017-02-15,2022-02-15,WL,,Amortising,1.0,2010.0,No asset security,True
28776,32705,repaying,C,working capital,property and construction,Limited Company,midlands,5275.0,0.0,12.5,...,122,2017-02-15,2019-02-15,PL,,Amortising,1.0,2014.0,No asset security,True
28777,32706,repaying,B,expansion growth,i.t and telecommunications,Limited Company,midlands,52500.0,0.0,10.0,...,882,2017-02-15,2020-02-15,PL,,Amortising,1.0,2009.0,No asset security,True
28778,32707,repaying,A+,working capital,manufacturing and engineering,Limited Company,north east,78000.0,0.0,7.5,...,1,2017-02-17,2022-02-17,WL,,Amortising,1.0,2000.0,No asset security,True
28779,32708,repaying,A,expansion growth,retail,Limited Company,london,21000.0,0.0,9.0,...,378,2017-02-15,2022-02-15,PL,7501,Amortising,1.0,2008.0,No asset security,True
28780,32709,repaying,A,expansion growth,retail,Limited Liability Partnership,london,84000.0,0.0,9.0,...,1041,2017-02-16,2022-02-16,PL,,Amortising,1.0,2014.0,No asset security,True
28781,32710,repaying,B,expansion growth,transport and logistics,Limited Company,north east,26500.0,0.0,10.5,...,1,2017-02-17,2022-02-17,WL,"10432, 26466",Amortising,1.0,2005.0,No asset security,True


### Generate one hot encodings for text terms

This is to understand the `loan_purpose` and `sector` fields, which are written inconsistently.

First extract the most common terms, then generate a dataframe with one hot encoding per row.

In [235]:
# extract terms across dataframe column
def term_extraction(df_col, min_size=1, min_count=5):
    words = df_col.tolist()
    words = [x if type(x) is str else '' for x in words]
    words_join = ' '.join(words).split(' ')
    terms = Counter(words_join).most_common(20)
    
    terms = [t for t, c in terms if (len(t) > min_size and c > min_count)]
    
    return terms, words

purpose_terms, purpose_words = term_extraction(df['loan_purpose'])
sector_terms, sector_words = term_extraction(df['sector'])
    
# generate one hot dataframe representation
def term_onehot_generator(words, index, terms=purpose_terms, prepend='purpose_'):
    m = np.zeros((len(words), len(terms)))

    for i, row in enumerate(words):
        m[i,:] = [int(t in row) for t in terms]

    return pd.DataFrame(data=m, index=index, columns=[prepend + t for t in terms])

df_purpose = term_onehot_generator(purpose_words, df.index, terms=purpose_terms, prepend='purpose_')
df_sector = term_onehot_generator(sector_words, df.index, terms=sector_terms, prepend='sector_')

### Generate X and y dataframes for estimation

In [236]:
# one hot credit dataframe
df_credit_onehot = pd.get_dummies(df['credit_band'])
df_region_onehot = pd.get_dummies(df['region_name'])

# select continuous columns
df_cont = df[['loan_amount', 'interest_rate', 'term']]

# concatenate dataframes for X
X = pd.concat([df_purpose, df_sector, df_credit_onehot, df_region_onehot, df_cont], axis=1)

# concatenate encoding for success and status
y = pd.concat([pd.get_dummies(df['status']), df['success']], axis=1)
# y = df['success']

print(df_purpose.shape, df_credit_onehot.shape, df_cont.shape)
print(X.shape, y.shape)

(28323, 18) (28323, 6) (28323, 3)
(28323, 56) (28323, 5)


### Learn prediction model

We use a basic ExtraTrees Regression model from scikit-learn

In [237]:
clfs = {col: AdaBoostRegressor() for col in y.columns.values}

for col, clf in clfs.items():
    clf.fit(X, y[col])
    
    preds = cross_val_predict(clf, X, y[col], cv=5)
    scores = metrics.accuracy_score(np.round(preds), y[col])
    print('%s, binary decision accuracy : %.2f+/-%.2f' %(col, np.mean(scores), np.std(scores)))

for col, clf in clfs.items():
    col = col.ljust(10)
    print("%s feature importances" % (col))
    clf_fi = clf.feature_importances_
    clf_fi_order = np.argsort(clf_fi)[::-1][:10]
    for c, v in zip(np.array(X.columns.values)[clf_fi_order], clf_fi[clf_fi_order]):
        print('%s %s: %.4f' % (col, c, v))

defaulted, binary decision accuracy : 0.96+/-0.00
repaid, binary decision accuracy : 0.71+/-0.00
success, binary decision accuracy : 0.95+/-0.00
repaying, binary decision accuracy : 0.65+/-0.00
late, binary decision accuracy : 0.99+/-0.00
defaulted  feature importances
defaulted  purpose_capital: 0.4457
defaulted  A+: 0.2946
defaulted  interest_rate: 0.1091
defaulted  A : 0.0771
defaulted  term: 0.0286
defaulted  B : 0.0226
defaulted  loan_amount: 0.0156
defaulted  scotland: 0.0028
defaulted  purpose_other: 0.0013
defaulted  purpose_expansion: 0.0012
repaid     feature importances
repaid     term: 0.7312
repaid     purpose_capital: 0.1095
repaid     purpose_residential: 0.0511
repaid     interest_rate: 0.0399
repaid     sector_other: 0.0372
repaid     A+: 0.0158
repaid     purpose_expansion: 0.0079
repaid     purpose_purchase: 0.0074
repaid     northern ireland: 0.0000
repaid     purpose_tax: 0.0000
success    feature importances
success    interest_rate: 0.4391
success    purpose_capi

### Predictor for new data

Generates a dataframe for some new data and prints the estimated return.

Returns are crudely generated by `rate` \* `success` \* (1.0 - `CF_FEE`),

where `success` is is the likelihood of the loan being repaid (ie. not defauling or being late)

In [238]:
def predict(
    purpose = 'Working Capital Loan', 
    sector = 'Manufacturing and Engineering', 
    location = 'London',
    risk = 'A+', 
    amount = 52000, 
    term = 60, 
    rate = 7.5):
    print('%s | %s | %s | %s | %d | %dm | %.1f%%' % (purpose, sector, location, risk, amount, term, rate))
    print('-----')
    ln_purpose = term_onehot_generator([purpose.strip().lower()], [0], terms=purpose_terms, prepend='purpose_')
    ln_sector = term_onehot_generator([sector.strip().lower()], [0], terms=sector_terms, prepend='sector_')
    ln_credit_onehot = pd.DataFrame([[int(c == risk.strip().lower().ljust(2)) for c in df_credit_onehot.columns.values]], columns=df_credit_onehot.columns.values)
    ln_region_onehot = pd.DataFrame([[int(c == location.strip().lower().lower()) for c in df_region_onehot.columns.values]], columns=df_region_onehot.columns.values)
    
    ln_cont = pd.DataFrame([[amount, rate, term]], columns=df_cont.columns.values)

    ln = pd.concat([ln_purpose, ln_sector, ln_credit_onehot, ln_region_onehot, ln_cont], axis=1)
    
    proba = clfs['success'].predict(ln)[0]
    print('success ~ %.2f%%' %(proba*100.0))
    print('return  ~ %.3f%%   (= %.4f * %.4f * .90)' % (rate *.9 * proba, rate, proba))
    
    print(' | '.join(['%s ~ %.0f%%' % (col, clfs[col].predict(ln)[0]*100.0) for col in ['defaulted', 'late', 'repaid', 'repaying']]))
    print('-----\n\n')

### Pre-defined examples to show some outcomes

In [239]:
predict(
    purpose = 'Working Capital Loan', 
    sector = 'Manufacturing and Engineering', 
    location = 'London',
    risk = 'A+', 
    amount = 52000, 
    term = 60, 
    rate = 7.5)

predict(
    purpose = 'Expansion And Growth Loan ', 
    sector = 'Retail', 
    location = 'London',
    risk = 'B', 
    amount = 262500, 
    term = 60, 
    rate = 10.5)

predict(
    purpose = 'Expansion And Growth Loan ', 
    sector = 'Wholesale',
    location = 'South East',
    risk = 'B', 
    amount = 21200, 
    term = 60, 
    rate = 10.5)

predict(
    purpose = 'Working Capital Loan ', 
    sector = 'Retail', 
    location = 'South East',
    risk = 'B', 
    amount = 21200, 
    term = 60, 
    rate = 10.5)

Working Capital Loan | Manufacturing and Engineering | London | A+ | 52000 | 60m | 7.5%
-----
success ~ 95.21%
return  ~ 6.427%   (= 7.5000 * 0.9521 * .90)
defaulted ~ 9% | late ~ 1% | repaid ~ 33% | repaying ~ 73%
-----


Expansion And Growth Loan  | Retail | London | B | 262500 | 60m | 10.5%
-----
success ~ 95.32%
return  ~ 9.007%   (= 10.5000 * 0.9532 * .90)
defaulted ~ 2% | late ~ 1% | repaid ~ 20% | repaying ~ 81%
-----


Expansion And Growth Loan  | Wholesale | South East | B | 21200 | 60m | 10.5%
-----
success ~ 95.32%
return  ~ 9.007%   (= 10.5000 * 0.9532 * .90)
defaulted ~ 2% | late ~ 1% | repaid ~ 20% | repaying ~ 81%
-----


Working Capital Loan  | Retail | South East | B | 21200 | 60m | 10.5%
-----
success ~ 95.21%
return  ~ 8.998%   (= 10.5000 * 0.9521 * .90)
defaulted ~ 9% | late ~ 1% | repaid ~ 20% | repaying ~ 73%
-----




# Predict from an HTML file

To use this function, save your current Funding Circle loanbook page `https://www.fundingcircle.com/lend/loan-requests/` to `fc-lb.html`.

Each loan row will be predicted for. 

In [240]:
soup = BeautifulSoup(open('fc-lr.html'), 'html.parser')
for elem in soup.find_all(id=re.compile('auction')):
    sector_location_uid = elem('span')[0].contents[0].strip().split(',')
    
    purpose = elem('a')[1].contents[0].strip().lower()
    sector = sector_location_uid[0].strip().lower()
    location = sector_location_uid[1].split('-')[0].strip().lower()
    risk = elem('td')[2].contents[0].strip()
    
    amount = int(elem('td')[3].contents[0].strip().replace(',','')[1:])
    term = int(elem('td')[4].contents[0].strip())
    rate = float(elem('td')[5].contents[0].strip()[:-1])
    
    uid = sector_location_uid[1].split('-')[1].strip()
    
    print('loan id: %s' % (uid))
    predict(purpose, sector, location, risk, amount, term, rate)

loan id: 33145
expansion and growth loan | manufacturing and engineering | midlands | B | 106000 | 60m | 10.5%
-----
success ~ 95.32%
return  ~ 9.007%   (= 10.5000 * 0.9532 * .90)
defaulted ~ 2% | late ~ 1% | repaid ~ 20% | repaying ~ 81%
-----


loan id: 33006
expansion and growth loan | retail | london | B | 262500 | 60m | 10.5%
-----
success ~ 95.32%
return  ~ 9.007%   (= 10.5000 * 0.9532 * .90)
defaulted ~ 2% | late ~ 1% | repaid ~ 20% | repaying ~ 81%
-----


loan id: 33098
expansion and growth loan | wholesale | south east | B | 212000 | 60m | 10.5%
-----
success ~ 95.32%
return  ~ 9.007%   (= 10.5000 * 0.9532 * .90)
defaulted ~ 2% | late ~ 1% | repaid ~ 20% | repaying ~ 81%
-----


loan id: 33127
expansion and growth loan | manufacturing and engineering | south east | B | 43258 | 60m | 10.5%
-----
success ~ 95.32%
return  ~ 9.007%   (= 10.5000 * 0.9532 * .90)
defaulted ~ 2% | late ~ 1% | repaid ~ 20% | repaying ~ 81%
-----


loan id: 33115
expansion and growth loan | wholesale |