### Automatic Machine Learning

This notebook ingests a dataset, and trains many machine learning models intelligently searching their parameters for optimal values. A leaderboard is maintained. Finally, an ensemble is created stacking together some of the base learners and the result is added to the leaderboard. The best model is used ion production. 


In [44]:
import h2o
from h2o.automl import H2OAutoML

import plotly
import plotly.plotly as py
plotly.offline.init_notebook_mode(connected=True)
#myPlotlyKey = os.environ['SECRET_ENV_BRETTS_PLOTLY_KEY']
#py.sign_in(username='bretto777',api_key=myPlotlyKey)
py.sign_in(username='bretto777',api_key='conh5EnFad0Z9Lz6mVWr')

# Suppress unwatned warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

In [4]:
%%capture
h2o.init(nthreads=1, max_mem_size=2)

In [8]:
# Import some data from Amazon S3
h2oDF = h2o.import_file("https://s3-us-west-1.amazonaws.com/dsclouddata/LendingClubData/LoansGoodBad.csv")

# Stratified Split into Train/Test
stratsplit = h2oDF["Bad_Loan"].stratified_split(test_frac=0.3, seed=12349453)
train = h2oDF[stratsplit=="train"]
test = h2oDF[stratsplit=="test"]


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [14]:
h2oDF.group_by(by="State").sum().frame

State,sum_Longest_Credit_Length,sum_Loan_Amount,sum_Total_Accounts,sum_Revolving_Cr_Util,sum_Bad_Loan,sum_Employment_Years,sum_Interest_Rate,sum_Debt_to_Income,sum_Verification_Status,sum_Home_Ownership,sum_Term,sum_Delinquent_2yr,sum_Annual_Income,sum_Loan_Purpose,sum_RowID
,,,,,0,,,,,,,,,,688617.0
AK,6136.0,6321650.0,10057.0,25464.8,61,,5924.42,7099.96,380.0,1123.0,87.0,92.0,31005900.0,1208.0,34776300.0
AL,30634.0,25010000.0,51329.0,,411,,27328.4,34196.3,1940.0,4392.0,456.0,485.0,126494000.0,5823.0,165868000.0
AR,16909.0,13259900.0,28016.0,,212,,15080.4,19743.4,1054.0,2460.0,212.0,245.0,69630500.0,3253.0,91909900.0
AZ,58960.0,49538600.0,96444.0,,686,,54066.4,62254.9,3956.0,10965.0,823.0,875.0,268764000.0,11712.0,329551000.0
CA,419286.0,379287000.0,657644.0,,5109,,392424.0,428053.0,27652.0,101186.0,5394.0,6236.0,2169310000.0,86498.0,2333180000.0
CO,,47093000.0,,,506,,47705.6,57493.1,3517.0,9453.0,731.0,,250222000.0,9896.0,295017000.0
CT,,33292400.0,,,434,,34757.7,38050.8,2656.0,7639.0,558.0,,197127000.0,7882.0,199883000.0
DC,8305.0,7776600.0,13349.0,,67,,7625.1,7896.42,594.0,2219.0,100.0,139.0,48024100.0,1758.0,42818500.0
DE,6763.0,5571380.0,11553.0,,76,,6113.17,7444.13,420.0,1176.0,101.0,110.0,32213500.0,1413.0,35269700.0




In [36]:
dfSum = h2oDF.group_by(by="State").sum().frame
dfMean = h2oDF.group_by(by="State").mean().frame
stateData = dfSum.merge(dfMean).as_data_frame(use_pandas=True, header=True)

In [39]:
stateData = stateData.iloc[1:]
stateData.head(10)

Unnamed: 0,State,mean_Verification_Status,mean_Annual_Income,mean_RowID,mean_Debt_to_Income,mean_Bad_Loan,mean_Interest_Rate,mean_Term,mean_Home_Ownership,mean_Loan_Purpose,...,sum_Employment_Years,sum_Interest_Rate,sum_Debt_to_Income,sum_Verification_Status,sum_Home_Ownership,sum_Term,sum_Delinquent_2yr,sum_Annual_Income,sum_Loan_Purpose,sum_RowID
1,AK,0.920097,75074.78247,84204.145278,17.191186,0.1477,14.344843,0.210654,2.719128,2.924939,...,,,,,,,,,,688617
2,AL,0.987277,64373.552921,84411.264631,17.402707,0.20916,13.907583,0.232061,2.235115,2.963359,...,,27328.4,34196.32,1940.0,4392.0,456.0,485.0,126494000.0,5823.0,165868135
3,AR,0.95471,63071.069257,83251.72192,17.883524,0.192029,13.659783,0.192029,2.228261,2.946558,...,,15080.4,19743.41,1054.0,2460.0,212.0,245.0,69630460.0,3253.0,91909901
4,AZ,1.003806,68197.012971,83621.286729,15.796739,0.174067,13.71896,0.20883,2.782289,2.971835,...,,54066.42,62254.95,3956.0,10965.0,823.0,875.0,268764400.0,11712.0,329551491
5,CA,0.963417,75580.3596,81289.964532,14.913687,0.178002,13.672361,0.187931,3.525399,3.013658,...,,392424.11,428052.64,27652.0,101186.0,5394.0,6236.0,2169307000.0,86498.0,2333184562
6,CO,0.996035,70864.263786,83550.473237,16.282382,0.143302,13.510518,0.207024,2.677145,2.802605,...,,47705.64,57493.09,3517.0,9453.0,731.0,,250221700.0,9896.0,295016721
7,CT,1.036286,76912.555888,77987.909871,14.84618,0.169333,13.561334,0.217714,2.980492,3.075302,...,,34757.7,38050.76,2656.0,7639.0,558.0,,197126900.0,7882.0,199883013
8,DC,1.017123,82233.084486,73319.361301,13.521267,0.114726,13.056678,0.171233,3.799658,3.010274,...,,7625.1,7896.42,594.0,2219.0,100.0,139.0,48024120.0,1758.0,42818507
9,DE,0.94382,72389.984472,79257.860674,16.728382,0.170787,13.737461,0.226966,2.642697,3.175281,...,,6113.17,7444.13,420.0,1176.0,101.0,110.0,32213540.0,1413.0,35269748
10,FL,1.011144,67477.725996,81075.007459,16.02497,0.212531,13.700512,0.192348,2.956827,3.182608,...,,156131.03,182620.56,11523.0,33696.0,2192.0,,768976200.0,36269.0,923930785


In [63]:
for col in stateData.columns:
    stateData[col] = stateData[col].astype(str)

scl = [[0.0, 'rgb(164, 182, 216)'],[0.2, 'rgb(116, 141, 188)'],[0.4, 'rgb(69, 102, 165)'],\
            [0.6, 'rgb(45, 82, 153)'],[0.8, 'rgb(26, 62, 132)'],[1.0, 'rgb(4, 37, 99)']]

stateData['text'] = 'Avg Interest_Rate '+stateData['mean_Interest_Rate']+ '<br>' +\
    'Total Loan_Amount '+stateData['sum_Loan_Amount']+'<br>'+\
    'Avg Term '+stateData['mean_Term']+ '<br>' +\
    'Avg Income ' + stateData['mean_Annual_Income']

data = [ dict(
        type='choropleth',
        colorscale = scl,
        autocolorscale = False,
        locations = stateData['State'],
        z = stateData['sum_Bad_Loan'].astype(float),
        locationmode = 'USA-states',
        text = stateData['text'],
        marker = dict(
            line = dict (
                color = 'rgb(255,255,255)',
                width = 2
            ) ),
        colorbar = dict(
            title = "# Bad Loans")
        ) ]

layout = dict(
        title = 'Bad Loans by State<br>(Hover for breakdown)',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)'),
             )
    
fig = dict( data=data, layout=layout )
py.iplot( fig, filename='d3-cloropleth-map' )

In [60]:
# Identify predictors and response
x = train.columns
y = "Bad_Loan"
x.remove(y)

# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [61]:
# Run AutoML for 20 minutes or until leader fails to improve after 5 rounds
autoModel = H2OAutoML(max_runtime_secs = 30, stopping_rounds=5, stopping_tolerance=0.001)
autoModel.train(x = x, y = y,
          training_frame = train,
          leaderboard_frame = test)

AutoML progress: |████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


## Leaderboard
Display the best models, sorted by descending AUC

In [64]:
leaders = autoModel.leaderboard
leaders

C1,model_id,auc,logloss
0,DRF_model_1496459915419_4,0.697976,0.4407
1,StackedEnsemble_model_1496459915419_178,0.697972,0.440096




In [67]:
h2o.save_model(model=autoModel.leader)

u'/home/jupyter/Predict-Risk/DRF_model_1496459915419_4'

In [74]:
def approve_loan(Loan_Amount,Term,Interest_Rate,Employment_Years,Home_Ownership,Annual_Income,Verification_Status,Loan_Purpose,State,
                 Debt_to_Income,Delinquent_2yr,Revolving_Cr_Util,Total_Accounts,Longest_Credit_Length):
    # connect to the model scoring service
    h2o.connect()

    # open the downloaded model
    ChurnPredictor = h2o.load_model(path='DRF_model_1496459915419_4') 

    # define a feature vector to evaluate with the model
    newData = pd.DataFrame({'Loan_Amount' : Loan_Amount,
                            'Term' : Term,
                            'Interest_Rate' : Interest_Rate,
                            'Employment_Years' : Employment_Years,
                            'Home_Ownership' : Home_Ownership,
                            'Annual_Income' : Annual_Income,
                            'Verification_Status' : Verification_Status,
                            'Loan_Purpose' : Loan_Purpose,
                            'State' : State,
                            'Debt_to_Income' : Debt_to_Income,
                            'Delinquent_2yr' : Delinquent_2yr,
                            'Revolving_Cr_Util' : Revolving_Cr_Util,
                            'Total_Accounts' : Total_Accounts,
                            'Longest_Credit_Length' : Longest_Credit_Length}, index=[0])
    
    # evaluate the feature vector using the model
    predictions = ChurnPredictor.predict(h2o.H2OFrame(newData))
    predictionsOut = h2o.as_list(predictions, use_pandas=False)
    prediction = predictionsOut[1][0]
    probabilityBad = predictionsOut[1][1]
    probabilityGood = predictionsOut[1][2]
    return "Prediction: " + str(prediction) + " |Probability of Bad Loan: " + str(probabilityBad) + " |Probability of Good Loan: " + str(probabilityGood)

In [75]:
Loan_Amount = 5000
Term = "60 months"
Interest_Rate=13
Employment_Years=5
Home_Ownership="RENT"
Annual_Income=75000
Verification_Status="VERIFIED - income"
Loan_Purpose="credit_card"
State="CA"
Debt_to_Income="16.12"
Delinquent_2yr="0"
Revolving_Cr_Util=37
Total_Accounts=6
Longest_Credit_Length=97
approve_loan(Loan_Amount,Term,Interest_Rate,Employment_Years,Home_Ownership,Annual_Income,Verification_Status,Loan_Purpose,State,Debt_to_Income,Delinquent_2yr,Revolving_Cr_Util,Total_Accounts,Longest_Credit_Length)

Connecting to H2O server at http://localhost:54321... successful.


0,1
H2O cluster uptime:,2 hours 8 mins
H2O cluster version:,3.11.0.3901
H2O cluster version age:,1 day
H2O cluster name:,H2O_from_python_unknownUser_3mlsbt
H2O cluster total nodes:,1
H2O cluster free memory:,1.530 Gb
H2O cluster total cores:,16
H2O cluster allowed cores:,1
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


Parse progress: |█████████████████████████████████████████████████████████| 100%
drf prediction progress: |████████████████████████████████████████████████| 100%


'Prediction: 1 |Probability of Bad Loan: 0.6659056758880615 |Probability of Good Loan: 0.3340943241119385'