In [332]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor, AdaBoostClassifier, AdaBoostRegressor
import datetime 
from bs4 import BeautifulSoup
import re
from sklearn.model_selection import cross_val_predict
import sklearn.metrics as metrics
import requests

LOAN_REQUESTS_URL = 'https://www.fundingcircle.com/lend/loan-requests/' # loan requests url to scrape from
SUCCESSFUL_TERM_FRAC = 0.1 # Mark repaying loans in last 50% of term as 'successful'
FC_FEE_PERC = 0.1 # FC fee percetnage (1%)

### Read, clean and convert loanbook

- Loans accepted before 2012 are excluded, following "Bad debt has remained stable since 2012" from https://www.fundingcircle.com/uk/statistics/.
- Loans classed as 'successful' are either repaid in full, or only have 50% of their term left to repay.
- Only basic information found on FC's loan request page is used to predict loan outcomes

In [337]:
df = pd.read_csv('loanbook.csv')

# only 2012 dates onwards
df = df[pd.to_datetime(df['loan_accepted_date']) > datetime.date(year=2012, month=1, day=1)]

# select only status text
df['status'] = df['status'].str[6:]

# include only loans that not repaying or successfully 50% through their term
df = df[(df['status'] != 'repaying') | (df['payments_remaining'] < df['term']*SUCCESSFUL_TERM_FRAC)]

# independent successful overall repayment
df['success'] = (df['status'] == 'repaid') | (df['status'] == 'repaying')

# clean purpose, sector, location
df['loan_purpose'] = df['loan_purpose'].str.lower().str.replace('/', ' ')
df['sector'] = df['sector'].str.lower().str.replace('/', ' ')
df['region_name'] = df['region_name'].str.lower()

# risk band simplification
df['credit_band'] = df['credit_band'].str[:2]

print(df['status'].value_counts())
print(df['success'].value_counts())
df.tail(5)

repaid       7005
defaulted    1131
late          343
repaying      298
Name: status, dtype: int64
True     7303
False    1474
Name: success, dtype: int64


Unnamed: 0,id,status,credit_band,loan_purpose,sector,business_type_name,region_name,loan_amount,recoveries,interest_rate,...,num_loan_parts,loan_accepted_date,date_repaid,whole_loan,related_auctions,repayment_type,loan_guaranteed,year_incorporated,security_taken,success
26541,30464,repaid,A,working capital,arts & entertainment,Limited Company,london,10500.0,0.0,9.0,...,1,2016-12-30,2021-12-30,WL,,Amortising,1.0,2011.0,No asset security,True
26798,30725,repaid,A,expansion growth,finance,Limited Company,scotland,52000.0,0.0,8.5,...,1,2017-01-10,2020-01-10,WL,"30913, 13150",Amortising,1.0,2010.0,No asset security,True
26874,30802,repaid,B,expansion growth,manufacturing and engineering,Limited Company,north east,15675.0,0.0,9.5,...,1,2017-01-12,2019-01-12,WL,"9106, 22694, 16978, 13346, 31578, 6712",Amortising,1.0,1993.0,No asset security,True
28437,32365,repaid,A+,property development,property and construction,Limited Company,london,490000.0,0.0,10.0,...,6194,2017-02-09,2018-02-09,PL,"32937, 32876, 32634, 32593, 32482, 32409, 1544...",Interest only,1.0,2015.0,First charge,True
28504,32433,repaid,A+,working capital,consumer services,Partnership (less than 4 partners),north west,26020.0,0.0,7.5,...,1,2017-02-13,2022-02-13,WL,,Amortising,,,No asset security,True


### Generate one hot encodings for text terms

- `loan_purpose` and `sector` fields can sometimes be written inconsistently.
- We extract a list of the most common terms from each one.
- Terms are encoded as binary (one hot) `purpose_<term>` and `sector_<term>` columns.

In [338]:
# extract terms across dataframe column
def term_extraction(df_col, min_size=1, min_count=1):
    words = df_col.tolist()
    words = [x if type(x) is str else '' for x in words]
    words_join = ' '.join(words).split(' ')
    terms = Counter(words_join).most_common()
    
    terms = [t for t, c in terms if (len(t) > min_size and c > min_count)]
    
    return terms, words

purpose_terms, purpose_words = term_extraction(df['loan_purpose'])
sector_terms, sector_words = term_extraction(df['sector'])
    
# generate one hot dataframe representation
def term_onehot_generator(words, index, terms=purpose_terms, prepend='purpose_'):
    m = np.zeros((len(words), len(terms)))

    for i, row in enumerate(words):
        m[i,:] = [int(t in row) for t in terms]

    return pd.DataFrame(data=m, index=index, columns=[prepend + t for t in terms])

df_purpose = term_onehot_generator(purpose_words, df.index, terms=purpose_terms, prepend='purpose_')
df_sector = term_onehot_generator(sector_words, df.index, terms=sector_terms, prepend='sector_')

### Generate X and y dataframes for estimation

- `X` (indepedent variables) are learnt from.
- `X` includes: `purpose_term`, `sector_term`, `loan_amount`, `interest_rate`, `term` columns

- `y ` (dependent variables) include: `status` and `success`
- A successful loan is defined above as either fully repaid, or beyond 50% repayment term
- Defaulted, Repaid, Repaying and Late statuses are represented as a binary (one hot) encoding of the status column

In [339]:
# one hot credit dataframe
df_credit_onehot = pd.get_dummies(df['credit_band'])
df_region_onehot = pd.get_dummies(df['region_name'])

# select continuous columns
df_cont = df[['loan_amount', 'interest_rate', 'term']]

# concatenate dataframes for X
X = pd.concat([df_purpose, df_sector, df_credit_onehot, df_region_onehot, df_cont], axis=1)

# concatenate encoding for success and status
y = pd.concat([pd.get_dummies(df['status']), df['success']], axis=1)
# y = df['success']

print(df_purpose.shape, df_credit_onehot.shape, df_cont.shape)
print(X.shape, y.shape)

(8777, 36) (8777, 6) (8777, 3)
(8777, 82) (8777, 5)


### Learn prediction model

- AdaBoost Regression is an ensemble learning method, which attempts to reduce overfitting to the training data.
- Regression (rather than classification) is used, so we can give an idea of the likelihood of an outcome (e.g. success).
- 5-Fold cross-validation shows the model is:
 - 83% accurate for binary `SUCCESS` predictions.
 - 87% accurate for binary `DEFAULTED` predictions.
 - 99% accurate for binary `LATE` predictions.
 - 66% accurate for binary `REPAYING` predictions.
 - 71% accurate for binary `REPAID` predictions.
- Important terms are printed, that contribute more than 2% to the prediction model.

In [340]:
clfs = {col: AdaBoostRegressor(n_estimators=50) for col in y.columns.values}

for col, clf in clfs.items():
    print(col.upper())
    preds = cross_val_predict(clf, X, y[col], cv=5)
    scores = metrics.accuracy_score(np.round(preds), y[col])
    print('binary decision accuracy : %.2f +-%.2f' %(np.mean(scores), np.std(scores)))

    clf.fit(X, y[col])
    
    clf_fi = clf.feature_importances_
    clf_fi_order = np.argsort(clf_fi)[::-1]
    for c, v in zip(np.array(X.columns.values)[clf_fi_order], clf_fi[clf_fi_order]):
        if v > 0.02:
            print('%s (%.4f)' % (c.ljust(20), v))

DEFAULTED
binary decision accuracy : 0.87 +-0.00
term                 (0.6230)
interest_rate        (0.2072)
purpose_working      (0.0450)
loan_amount          (0.0292)
A+                   (0.0224)
purpose_capital      (0.0218)
REPAID
binary decision accuracy : 0.80 +-0.00
term                 (0.6447)
interest_rate        (0.2420)
loan_amount          (0.0365)
purpose_property     (0.0234)
SUCCESS
binary decision accuracy : 0.83 +-0.00
term                 (0.6231)
interest_rate        (0.2312)
purpose_capital      (0.0474)
purpose_asset        (0.0246)
loan_amount          (0.0224)
REPAYING
binary decision accuracy : 0.97 +-0.00
term                 (0.5515)
east anglia          (0.1342)
loan_amount          (0.0999)
purpose_somerset     (0.0978)
interest_rate        (0.0675)
LATE
binary decision accuracy : 0.95 +-0.00
interest_rate        (0.3898)
purpose_property     (0.2113)
term                 (0.1363)
purpose_short        (0.1356)
purpose_capital      (0.0472)
loan_amount     

### Predictor for new data

- Given a new loan description, `predict` prints out the loan's outcome likelihoods.
- `Return` is crudely estimated by $rate * success * (1.0 - $CF_FEE$)$

In [330]:
def predict(
    purpose = 'Working Capital Loan', 
    sector = 'Manufacturing and Engineering', 
    location = 'London',
    risk = 'A+', 
    amount = 52000, 
    term = 60, 
    rate = 7.5):
    print('%s | %s | %s | %s | %d | %dm | %.1f%%' % (purpose, sector, location, risk, amount, term, rate))
    print('-----')
    ln_purpose = term_onehot_generator([purpose.strip().lower()], [0], terms=purpose_terms, prepend='purpose_')
    ln_sector = term_onehot_generator([sector.strip().lower()], [0], terms=sector_terms, prepend='sector_')
    ln_credit_onehot = pd.DataFrame([[int(c == risk.strip().lower().ljust(2)) for c in df_credit_onehot.columns.values]], columns=df_credit_onehot.columns.values)
    ln_region_onehot = pd.DataFrame([[int(c == location.strip().lower().lower()) for c in df_region_onehot.columns.values]], columns=df_region_onehot.columns.values)
    
    ln_cont = pd.DataFrame([[amount, rate, term]], columns=df_cont.columns.values)

    ln = pd.concat([ln_purpose, ln_sector, ln_credit_onehot, ln_region_onehot, ln_cont], axis=1)
    
    proba = clfs['success'].predict(ln)
    print('success ~ %.2f%%' %(proba*100.0))
    print('return  ~ %.3f%%   (= %.4f * %.4f * %.4f)' % (rate * FC_FEE_PERC * proba, rate, proba, FC_FEE_PERC))
    
    print(' | '.join(['%s ~ %.0f%%' % (col, clfs[col].predict(ln)[0]*100.0) for col in ['defaulted', 'late', 'repaid', 'repaying']]))
    print('-----\n\n')

### Pre-defined examples to show some outcomes

In [331]:
predict(
    purpose = 'Working Capital Loan', 
    sector = 'Manufacturing and Engineering', 
    location = 'London',
    risk = 'A+', 
    amount = 52000, 
    term = 60, 
    rate = 7.5)

predict(
    purpose = 'Expansion And Growth Loan ', 
    sector = 'Retail', 
    location = 'London',
    risk = 'B', 
    amount = 262500, 
    term = 60, 
    rate = 10.5)

predict(
    purpose = 'Expansion And Growth Loan ', 
    sector = 'Wholesale',
    location = 'South East',
    risk = 'B', 
    amount = 21200, 
    term = 60, 
    rate = 10.5)

predict(
    purpose = 'Working Capital Loan ', 
    sector = 'Retail', 
    location = 'South East',
    risk = 'B', 
    amount = 21200, 
    term = 60, 
    rate = 10.5)

Working Capital Loan | Manufacturing and Engineering | London | A+ | 52000 | 60m | 7.5%
-----
success ~ 28.93%
return  ~ 0.217%   (= 7.5000 * 0.2893 * 0.1000)
defaulted ~ 6% | late ~ 1% | repaid ~ 21% | repaying ~ 69%
-----


Expansion And Growth Loan  | Retail | London | B | 262500 | 60m | 10.5%
-----
success ~ 16.15%
return  ~ 0.170%   (= 10.5000 * 0.1615 * 0.1000)
defaulted ~ 2% | late ~ 1% | repaid ~ 16% | repaying ~ 79%
-----


Expansion And Growth Loan  | Wholesale | South East | B | 21200 | 60m | 10.5%
-----
success ~ 16.15%
return  ~ 0.170%   (= 10.5000 * 0.1615 * 0.1000)
defaulted ~ 2% | late ~ 1% | repaid ~ 16% | repaying ~ 79%
-----


Working Capital Loan  | Retail | South East | B | 21200 | 60m | 10.5%
-----
success ~ 28.93%
return  ~ 0.304%   (= 10.5000 * 0.2893 * 0.1000)
defaulted ~ 6% | late ~ 1% | repaid ~ 21% | repaying ~ 69%
-----




# Predict from an HTML file

- This loads data from CF's loan book page and generates predictions, to help you with your biding ;)

In [266]:
soup = BeautifulSoup(requests.get(LOAN_REQUESTS_URL).text, 'html.parser')
for elem in soup.find_all(id=re.compile('auction')):
    sector_location_uid = elem('span')[0].contents[0].strip().split(',')
    
    purpose = elem('a')[1].contents[0].strip().lower()
    sector = sector_location_uid[0].strip().lower()
    location = sector_location_uid[1].split('-')[0].strip().lower()
    risk = elem('td')[2].contents[0].strip()
    
    amount = int(elem('td')[3].contents[0].strip().replace(',','')[1:])
    term = int(elem('td')[4].contents[0].strip())
    rate = float(elem('td')[5].contents[0].strip()[:-1])
    
    uid = sector_location_uid[1].split('-')[1].strip()
    
    print('loan id: %s' % (uid))
    predict(purpose, sector, location, risk, amount, term, rate)

loan id: 32982
working capital loan | professional and business support | south east | A | 364000 | 36m | 8.5%
-----


ValueError: Number of features of the model must match the input. Model n_features is 56 and input n_features is 73 