In [265]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
import datetime 

In [164]:
df = pd.read_csv('loanbook.csv')
df

Unnamed: 0,id,status,credit_band,loan_purpose,sector,business_type_name,region_name,loan_amount,recoveries,interest_rate,...,payments_remaining,num_loan_parts,loan_accepted_date,date_repaid,whole_loan,related_auctions,repayment_type,loan_guaranteed,year_incorporated,security_taken
0,4,loan: repaid,A (Low risk),Asset purchase,Wholesale,Limited Company,East Anglia,25000.0,0.00,8.11,...,0,257,2010-08-31,2013-09-01,PL,,Amortising,1.0,1992.0,No asset security
1,6,loan: repaid,A (Low risk),Expansion/growth Capital,Manufacturing and Engineering,Limited Company,South East,50000.0,0.00,10.57,...,0,437,2010-09-03,2013-09-03,PL,1079,Amortising,1.0,1967.0,No asset security
2,8,loan: repaid,A (Low risk),Working capital,Leisure & Hospitality,Limited Company,South West,15000.0,0.00,7.99,...,0,112,2010-08-24,2013-08-10,PL,,Amortising,1.0,1968.0,No asset security
3,9,loan: repaid,B (Below average risk),Asset purchase,Other,Limited Company,South West,12000.0,0.00,8.35,...,0,53,2010-08-19,2013-08-05,PL,,Amortising,1.0,2005.0,No asset security
4,10,loan: defaulted,B (Below average risk),Expansion/growth Capital,Retail,Limited Company,London,30000.0,2.38,9.06,...,17,258,2010-09-01,2013-08-22,PL,,Amortising,,1990.0,No asset security
5,11,loan: repaid,A+ (Very low risk),Working capital,Manufacturing and Engineering,Limited Company,North West,35000.0,0.00,7.86,...,0,331,2010-09-02,2013-08-18,PL,,Amortising,,1896.0,No asset security
6,12,loan: repaid,B (Below average risk),Expansion/growth Capital,Other,Limited Company,North East,40000.0,0.00,10.69,...,0,445,2010-09-05,2013-09-03,PL,,Amortising,1.0,2001.0,No asset security
7,13,loan: repaid,A (Low risk),Expansion/growth Capital,Professional and Business Support,Limited Company,London,30000.0,0.00,8.12,...,0,225,2010-09-01,2011-09-01,PL,"3045, 516",Amortising,,2006.0,No asset security
8,14,loan: repaid,A (Low risk),Working capital,Manufacturing and Engineering,Limited Company,North East,20000.0,0.00,7.75,...,0,168,2010-09-10,2013-08-26,PL,,Amortising,,1997.0,No asset security
9,15,loan: repaid,A (Low risk),Working capital,Manufacturing and Engineering,Limited Company,Scotland,30000.0,0.00,7.74,...,0,192,2010-09-08,2011-09-08,PL,"18871, 3416, 1218",Amortising,1.0,1998.0,No asset security


In [258]:
# only dates 2012 onwards
df = df[pd.to_datetime(df['loan_accepted_date']) > datetime.date(year=2012, month=1, day=1)]

# only defaulted, late or repaid
df = df[df['status'] != 'loan: repaying']

# risk band simplification
df['credit_band'] = df['credit_band'].str[:1]

In [259]:
purpose_words = df['loan_purpose'].str.lower().str.replace('/', ' ').tolist()
purpose_words = [x if type(x) is str else '' for x in purpose_words]
purpose_words_join = ' '.join(purpose_words).split(' ')
purpose_terms = Counter(purpose_words_join).most_common(20)

def purpose_generator(words, index, terms=purpose_terms):
    m = np.zeros((len(words), len(terms)))

    for i, row in enumerate(words):
        m[i,:] = [int(t in row) for t, c in terms]

    return pd.DataFrame(data=m, index=index, columns=['purpose_' + t for t, c in terms])

df_purpose = purpose_generator(purpose_words, index=df.index)

In [262]:
df_credit_onehot = pd.get_dummies(df['credit_band'])
df_cont = df[['loan_amount', 'interest_rate', 'term']]

X = pd.concat([df_purpose, df_credit_onehot, df_cont], axis=1)

y = pd.get_dummies(df['status'])

print(df_purpose.shape, df_credit_onehot.shape, df_cont.shape)
print(X.shape, y.shape)

(8479, 20) (8479, 5) (8479, 3)
(8479, 28) (8479, 3)


In [263]:
print(y.columns.values)
clf = ExtraTreesRegressor()

clf.fit(X,y)
np.array(X.columns.values)[np.argsort(clf.feature_importances_)[::-1]]

['loan: defaulted' 'loan: late' 'loan: repaid']


array(['interest_rate', 'loan_amount', 'term', 'A', 'purpose_property',
       'purpose_capital', 'D', 'C', 'E', 'B', 'purpose_expansion',
       'purpose_asset', 'purpose_purchase', 'purpose_growth', 'purpose_a',
       'purpose_loan', 'purpose_working', 'purpose_refinancing',
       'purpose_other', 'purpose_short', 'purpose_development',
       'purpose_term', 'purpose_tax', 'purpose_finance', 'purpose_payment',
       'purpose_residential', 'purpose_liability', 'purpose_'], dtype=object)

In [264]:
ln_purpose    = 'expansion and growth loan'
ln_risk       = 'B'
ln_amount     = 262500
ln_term       = 60
ln_rate       = 10.5

ln_purpose = purpose_generator([ln_purpose], index=[0])
ln_credit_onehot = pd.DataFrame([[int(c == ln_risk) for c in df_credit_onehot.columns.values]], columns=df_credit_onehot.columns.values)
ln_cont = pd.DataFrame([[ln_amount, ln_rate, ln_term]], columns=df_cont.columns.values)

ln = pd.concat([ln_purpose, ln_credit_onehot, ln_cont], axis=1)
print(ln.columns.values == X.columns.values)
{c: p for c, p in zip(y.columns.values, clf.predict(ln)[0])}

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True]


{'loan: defaulted': 0.0, 'loan: late': 0.0, 'loan: repaid': 1.0}