In [1]:
from sklearn.metrics import *
from IPython.core.display import HTML

import sys
import os
import time
import zipfile
import datetime
import glob
import pandas as pd
import numpy as np

# H2O AutoML

In [2]:
import h2o
from h2o.automl import H2OAutoML

In [3]:
h2o.init(max_mem_size=8)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.191-b12, mixed mode)
  Starting server from C:\Users\Abhinav\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Abhinav\AppData\Local\Temp\tmpw5jf9ulc
  JVM stdout: C:\Users\Abhinav\AppData\Local\Temp\tmpw5jf9ulc\h2o_Abhinav_started_from_python.out
  JVM stderr: C:\Users\Abhinav\AppData\Local\Temp\tmpw5jf9ulc\h2o_Abhinav_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.0.2
H2O cluster version age:,"7 days, 16 hours and 20 minutes"
H2O cluster name:,H2O_from_python_Abhinav_fjtuvb
H2O cluster total nodes:,1
H2O cluster free memory:,7.111 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [4]:
training_frame = h2o.import_file('historical_data1_time_Q12005.csv')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [5]:
training_frame = training_frame.head(10000)

In [6]:
training_frame['delinquent'] = (training_frame['delq_sts'] > 0)

In [7]:
training_frame['delinquent'] = training_frame['delinquent'].asfactor()

In [8]:
training_frame = training_frame.drop('delq_sts',axis=1)

In [9]:
testing_frame = h2o.import_file('historical_data1_time_Q22005.csv')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [10]:
testing_frame = testing_frame.head(10000)

In [11]:
testing_frame['delinquent'] = (testing_frame['delq_sts'] > 0)

In [12]:
testing_frame['delinquent'] = testing_frame['delinquent'].asfactor()

In [13]:
testing_frame = testing_frame.drop('delq_sts',axis=1)

In [14]:
X = training_frame.columns
y = 'delinquent'
X.remove(y)

#Unnecessary Columns
X.remove('loan_seq')

In [15]:
aml = H2OAutoML(max_runtime_secs=300)

In [16]:
aml.train(x = X, y = y, training_frame = training_frame, leaderboard_frame= testing_frame)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [17]:
aml.leaderboard.as_data_frame()

Unnamed: 0,model_id,auc,logloss,mean_per_class_error,rmse,mse
0,GBM_grid_1_AutoML_20181129_232951_model_3,0.731071,0.170347,0.393053,0.196077,0.038446
1,StackedEnsemble_BestOfFamily_AutoML_20181129_2...,0.729174,0.193488,0.352791,0.215663,0.046511
2,GBM_grid_1_AutoML_20181129_232951_model_5,0.726274,0.153573,0.302437,0.190421,0.03626
3,StackedEnsemble_AllModels_AutoML_20181129_232951,0.725138,0.194608,0.368214,0.213029,0.045382
4,GBM_grid_1_AutoML_20181129_232951_model_4,0.724881,0.191189,0.378779,0.210833,0.044451
5,GBM_grid_1_AutoML_20181129_232951_model_6,0.715824,0.164517,0.321505,0.195332,0.038155
6,GBM_1_AutoML_20181129_232951,0.714367,0.206178,0.388667,0.212965,0.045354
7,GBM_grid_1_AutoML_20181129_232951_model_1,0.710247,0.180101,0.394208,0.197137,0.038863
8,GBM_grid_1_AutoML_20181129_232951_model_2,0.709907,0.153836,0.357888,0.188795,0.035644
9,XRT_1_AutoML_20181129_232951,0.705804,0.168318,0.368168,0.198204,0.039285


# TPOT

In [1]:
from tpot import TPOTClassifier

In [2]:
tpot = TPOTClassifier(generations=2, population_size=50,
                     offspring_size=None,
                     mutation_rate=0.9,
                     verbosity=3,cv=2,n_jobs=-1)

In [5]:
df_train = pd.read_csv('historical_data1_time_Q12005.csv',low_memory=False,nrows=10000)

In [6]:
df_test = pd.read_csv('historical_data1_time_Q22005.csv',low_memory=False,nrows=10000)

In [7]:
def createDummies(df):
    dummies = pd.get_dummies(df['repch_flag']).rename(columns=lambda x: 'repch_flag' + str(x))
    df = pd.concat([df, dummies], axis=1)
    dummies1 = pd.get_dummies(df['cd_zero_bal']).rename(columns=lambda x: 'cd_zero_bal' + str(x))
    df = pd.concat([df, dummies1], axis=1)
    return df

In [8]:
def transformDF(df):
    df['delinquent'] = (df.delq_sts > 0).astype(int)
    df = df.drop(['cd_zero_bal'],axis = 1)
    df = df.drop('delq_sts', axis = 1)
    return df

In [9]:
def prepare_data_for_model(current_df,next_df):
    current_df = createDummies(current_df)
    next_df = createDummies(next_df)
    
    current_df = transformDF(current_df)
    next_df = transformDF(next_df)
    
    current_df = current_df._get_numeric_data()
    next_df = next_df._get_numeric_data()
    
    return current_df,next_df

In [10]:
processed_train,processed_test = prepare_data_for_model(df_train,df_test)

In [11]:
X_train = processed_train.drop('delinquent',axis=1)
y_train = processed_train['delinquent']

X_test = processed_test.drop('delinquent',axis=1)
y_test = processed_test['delinquent']

In [12]:
tpot.fit(X_train, y_train)

  from numpy.core.umath_tests import inner1d


29 operators have been imported by TPOT.


HBox(children=(IntProgress(value=0, description='Optimization Progress', max=150), HTML(value='')))

_pre_test decorator: _random_mutation_operator: num_test=0 Input X must be non-negative
_pre_test decorator: _random_mutation_operator: num_test=0 Unsupported set of arguments: The combination of penalty='l1' and loss='logistic_regression' are not supported when dual=True, Parameters: penalty='l1', loss='logistic_regression', dual=True
_pre_test decorator: _random_mutation_operator: num_test=0 Input X must be non-negative
Pipeline encountered that has previously been evaluated during the optimization process. Using the score from the previous evaluation.
Generation 1 - Current Pareto front scores:
-1	0.9633999985359999	LogisticRegression(input_matrix, LogisticRegression__C=0.5, LogisticRegression__dual=False, LogisticRegression__penalty=l2)
-2	0.9639000985560039	GaussianNB(SelectPercentile(input_matrix, SelectPercentile__percentile=15))

_pre_test decorator: _random_mutation_operator: num_test=0 Unsupported set of arguments: The combination of penalty='l2' and loss='hinge' are not supp

TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=2,
        disable_update_check=False, early_stop=None, generations=2,
        max_eval_time_mins=5, max_time_mins=None, memory=None,
        mutation_rate=0.9, n_jobs=-1, offspring_size=None,
        periodic_checkpoint_folder=None, population_size=50,
        random_state=None, scoring=None, subsample=1.0, use_dask=False,
        verbosity=3, warm_start=False)

In [13]:
print('The accuracy of TPOT Classifier is {}'.format(tpot.score(X_test,y_test)))

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


The RMSE of TPOT Regressor is 0.9631


In [14]:
tpot.export('tpot_freddiemac_classification_pipeline.py')

True