In [30]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
import _pickle as cPickle

### Load Dataframe

In [54]:
df_loan = pd.read_csv('loan_data_new.csv')

In [55]:
df_loan.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,15.451941,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,7.561644,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,12.90411,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,7.397146,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,11.139726,4740,39.5,0,1,0,0


In [56]:
df_loan['days.with.cr.line'] = df_loan['days.with.cr.line']/365

In [57]:
# df_loan.to_csv('loan_data_new.csv',index=False)

In [58]:
pd.value_counts(df_loan["not.fully.paid"],normalize=True)*100



0    83.994571
1    16.005429
Name: not.fully.paid, dtype: float64

In [59]:
df_loan_payback = df_loan[df_loan['not.fully.paid']==0]
df_loan_notpayback = df_loan[df_loan['not.fully.paid']==1]

In [60]:
df_loan_payback_sample = df_loan_payback.sample(frac=.16)

In [61]:
df_loan = pd.concat((df_loan_notpayback,df_loan_payback_sample))

In [62]:
df_loan.columns

Index(['credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc',
       'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'not.fully.paid'],
      dtype='object')

### Raindom Forrest Model 1

In [63]:
X = pd.get_dummies(df_loan.drop(labels=['not.fully.paid'],axis=1))

In [64]:
y = df_loan['not.fully.paid']

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [66]:
rfcMODEL = RandomForestClassifier(n_estimators=300)

In [67]:
rfcMODEL.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [68]:
predictions = rfcMODEL.predict(X_test)

### Test to see how well this model performed

In [69]:
from sklearn.metrics import classification_report,confusion_matrix

In [70]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.60      0.52      0.56       419
          1       0.65      0.72      0.68       512

avg / total       0.63      0.63      0.63       931



### In the confusion matrix below, you can see the  the model did good when predicting if was going to pay the loan back.

### 0: the person will pay the loan back 
### 1: the person will not pay the loan back

In [71]:
print(confusion_matrix(y_test,predictions))

[[219 200]
 [144 368]]


### save model

In [149]:



# save model
with open("RFMODEL2",'wb') as Model:
    cPickle.dump(rfcMODEL,Model)

### Raindom Forrest Model 2

In [None]:
df_loan_Pay

### Test the model 1

In [152]:
#The reason I wrote the below code was to learn how to insert a vector inside the model.


df_loan = pd.read_csv('loan_data.csv')

# load model for testing
with open('RFMODEL','rb') as Model:
    rfK = cPickle.load(Model)


df_loan = df_loan.append({'credit.policy':0, 
               'purpose': 'credit_card',
               'int.rate': 0.1189, 
               'installment': 829.10, 
               'log.annual.inc': 19.48,
               'dti':737 , 
               'fico': 5639.958333,
               'days.with.cr.line':28854,
               'revol.bal': 28854, 
               'revol.util': 52.1,
               'inq.last.6mths': 0, 
               'delinq.2yrs': 0, 
               'pub.rec': 0, 
               'not.fully.paid': 1 # constant value
                         },ignore_index=True)


results = {}

results['vector'] = pd.get_dummies(df_loan.drop('not.fully.paid',axis=1)).iloc[-1:].to_dict('records')

results['precdiction'] = rfK.predict(pd.get_dummies(df_loan.drop('not.fully.paid',axis=1)).iloc[-1:]).item()

results['precdiction']

0

In [153]:
results

{'precdiction': 0,
 'vector': [{'credit.policy': 0.0,
   'days.with.cr.line': 28854.0,
   'delinq.2yrs': 0.0,
   'dti': 737.0,
   'fico': 5639.958333,
   'inq.last.6mths': 0.0,
   'installment': 829.1,
   'int.rate': 0.1189,
   'log.annual.inc': 19.48,
   'pub.rec': 0.0,
   'purpose_all_other': 0.0,
   'purpose_credit_card': 1.0,
   'purpose_debt_consolidation': 0.0,
   'purpose_educational': 0.0,
   'purpose_home_improvement': 0.0,
   'purpose_major_purchase': 0.0,
   'purpose_small_business': 0.0,
   'revol.bal': 28854.0,
   'revol.util': 52.1}]}