In [1]:
# Module: Data Science in Finance, AutoML 
# Version 1.0
# Topic : AutoML - auto-sklearn
# Example source: https://www.kaggle.com/wendykan/lending-club-loan-data
#####################################################################
# For support or questions, contact Sri Krishnamurthy at
# sri@quantuniversity.com
# Copyright 2018 QuantUniversity LLC.
#####################################################################

# AutoML with auto-sklearn

AutoML is the process of automating an end-to-end Machine Learning pipeline. [auto-sklearn](https://automl.github.io/auto-sklearn/stable/index.html) specifically uses Bayesian optimization, meta-learning and ensemble construction to optimise these pipelines by selecting the best model and its hyperparamters.

This notebook explains the basic workflow involved in an AutoML pipeline with auto-sklearn

### Imports

In [2]:
# for numerical analysis and data processing
import numpy as np
import pandas as pd

#AutoML
import sklearn.metrics
import autosklearn.regression

import warnings
warnings.filterwarnings('ignore')

  from numpy.core.umath_tests import inner1d


### Dataset

The data set is the lending data for lendingclub from August 2011 to December 2011 for some borrowers. The feature descriptions for the data are also provided. Not all the features are required for making predictions, some features are redundant in the original data file. The provided data file is already cleaned and only relevant features are provided. There are two types of features, numerical and categorical.

Reading the input data from csv file.

In [3]:
df = pd.read_csv("../data/LendingClubLoan.csv", low_memory=False)
del df['issue_d'] # removing issue date as it wont affect the prediction (redundant feature)
df_description = pd.read_excel('../data/LCDataDictionary.xlsx').dropna()

In [4]:
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,purpose,addr_state,dti,delinq_2yrs,inq_last_6mths,loan_status_Binary
0,5000,36 months,10.65,162.87,B,B2,10+ years,RENT,24000.0,Verified,credit_card,AZ,27.65,0,1,0
1,2500,60 months,15.27,59.83,C,C4,< 1 year,RENT,30000.0,Source Verified,car,GA,1.0,0,5,1
2,2400,36 months,15.96,84.33,C,C5,10+ years,RENT,12252.0,Not Verified,small_business,IL,8.72,0,2,0
3,10000,36 months,13.49,339.31,C,C1,10+ years,RENT,49200.0,Source Verified,other,CA,20.0,0,1,0
4,3000,60 months,12.69,67.79,B,B5,1 year,RENT,80000.0,Source Verified,other,OR,17.94,0,0,0


In [5]:
df.info()
feature_types = ['numerical']+['categorical']+(['numerical']*1)+(['categorical']*4)+['categorical']+(['categorical']*3)+(['numerical']*4)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 16 columns):
loan_amnt              9999 non-null int64
term                   9999 non-null object
int_rate               9999 non-null float64
installment            9999 non-null float64
grade                  9999 non-null object
sub_grade              9999 non-null object
emp_length             9644 non-null object
home_ownership         9999 non-null object
annual_inc             9999 non-null float64
verification_status    9999 non-null object
purpose                9999 non-null object
addr_state             9999 non-null object
dti                    9999 non-null float64
delinq_2yrs            9999 non-null int64
inq_last_6mths         9999 non-null int64
loan_status_Binary     9999 non-null int64
dtypes: float64(4), int64(4), object(8)
memory usage: 1.2+ MB


In [6]:
numeric_columns = df.select_dtypes(include=['float64','int64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

In [7]:
for col in categorical_columns:
    df[col] = df[col].astype('category')

#### Dictionary for categorical features.

In [8]:
categories={}
for cat in categorical_columns:
    categories[cat] = df[cat].cat.categories.tolist()

In [9]:
p_categories = df['purpose'].cat.categories.tolist()
s_categories = df['addr_state'].cat.categories.tolist()
df[categorical_columns] = df[categorical_columns].apply(lambda x: x.cat.codes)

In [10]:
df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,purpose,addr_state,dti,delinq_2yrs,inq_last_6mths,loan_status_Binary
0,5000,0,10.65,162.87,1,6,1,2,24000.0,2,1,3,27.65,0,1,0
1,2500,1,15.27,59.83,2,13,10,2,30000.0,1,0,10,1.0,0,5,1
2,2400,0,15.96,84.33,2,14,1,2,12252.0,0,10,12,8.72,0,2,0
3,10000,0,13.49,339.31,2,10,1,2,49200.0,1,8,4,20.0,0,1,0
4,3000,1,12.69,67.79,1,9,0,2,80000.0,1,8,31,17.94,0,0,0


Storing interest rate statistics

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 16 columns):
loan_amnt              9999 non-null int64
term                   9999 non-null int8
int_rate               9999 non-null float64
installment            9999 non-null float64
grade                  9999 non-null int8
sub_grade              9999 non-null int8
emp_length             9999 non-null int8
home_ownership         9999 non-null int8
annual_inc             9999 non-null float64
verification_status    9999 non-null int8
purpose                9999 non-null int8
addr_state             9999 non-null int8
dti                    9999 non-null float64
delinq_2yrs            9999 non-null int64
inq_last_6mths         9999 non-null int64
loan_status_Binary     9999 non-null int64
dtypes: float64(4), int64(4), int8(8)
memory usage: 703.1 KB


In [12]:
min_rate= df['int_rate'].min()
max_rate= df['int_rate'].max()
print(min_rate, max_rate, max_rate- min_rate)

5.42 24.11 18.689999999999998


In [13]:
df_max = df.max()
df_min = df.min()

## Preparing the dataset 

The data is split into training and testing data. x represents the input features whereas y represents the output i.e. the interest rate.As a rule of thumb, we split the data into 80% training data and 20% testing or validation data.

In [14]:
y = df.iloc[:,df.columns.isin(["int_rate"])]
x = df.loc[:, ~df.columns.isin(["int_rate"])]

total_samples=len(df)
split = 0.8

x_train = x[0:int(total_samples*split)]
x_test = x[int(total_samples*split):total_samples]
y_train = y[0:int(total_samples*split)]
y_test = y[int(total_samples*split):total_samples]

## AutoML

### The following is all the code needed to find the best model:

In [15]:
automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=600, # in seconds
    per_run_time_limit=60, # in seconds
)
automl.fit(x_train, y_train, dataset_name='finance',
           feat_type=feature_types)



AutoSklearnRegressor(delete_output_folder_after_terminate=True,
           delete_tmp_folder_after_terminate=True,
           disable_evaluator_output=False, ensemble_nbest=50,
           ensemble_size=50, exclude_estimators=None,
           exclude_preprocessors=None, get_smac_object_callback=None,
           include_estimators=None, include_preprocessors=None,
           initial_configurations_via_metalearning=25,
           ml_memory_limit=3072, output_folder=None, per_run_time_limit=60,
           resampling_strategy='holdout',
           resampling_strategy_arguments=None, seed=1, shared_mode=False,
           smac_scenario_args=None, time_left_for_this_task=600,
           tmp_folder=None)

### AutoML training details

#### A list of all the algorithm runs

In [16]:
automl.show_models()

"[(0.680000, SimpleRegressionPipeline({'categorical_encoding:__choice__': 'no_encoding', 'imputation:strategy': 'mean', 'preprocessor:__choice__': 'feature_agglomeration', 'regressor:__choice__': 'random_forest', 'rescaling:__choice__': 'standardize', 'preprocessor:feature_agglomeration:affinity': 'euclidean', 'preprocessor:feature_agglomeration:linkage': 'average', 'preprocessor:feature_agglomeration:n_clusters': 28, 'preprocessor:feature_agglomeration:pooling_func': 'max', 'regressor:random_forest:bootstrap': 'False', 'regressor:random_forest:criterion': 'mse', 'regressor:random_forest:max_depth': 'None', 'regressor:random_forest:max_features': 0.5619411763261347, 'regressor:random_forest:max_leaf_nodes': 'None', 'regressor:random_forest:min_impurity_decrease': 0.0, 'regressor:random_forest:min_samples_leaf': 13, 'regressor:random_forest:min_samples_split': 19, 'regressor:random_forest:min_weight_fraction_leaf': 0.0, 'regressor:random_forest:n_estimators': 100},\ndataset_properties={

#### A summary of all the algorithm runs

In [17]:
print(automl.sprint_statistics())

auto-sklearn results:
  Dataset name: finance
  Metric: r2
  Best validation score: 0.999178
  Number of target algorithm runs: 42
  Number of successful target algorithm runs: 34
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 5
  Number of target algorithms that exceeded the memory limit: 3



### Using the best pipeline to make predictions

In [18]:
predictions = automl.predict(x_test)
predictions_train = automl.predict(x_train)

#### Best model performance

In [19]:
print("MAE score:", sklearn.metrics.mean_absolute_error(y_test, predictions))
print("R2 score:", sklearn.metrics.r2_score(y_test, predictions))

MAE score: 0.5502034230995179
R2 score: 0.979238597993327


### Export the best model

In [20]:
import pickle
pickle.dump(automl, open('automl.model','wb'))

### MAPE (Mean Absolute Percentage Error)

In [21]:
# from sklearn.utils import check_arrays
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [22]:
mape_test = mean_absolute_percentage_error(y_test.values.ravel(), predictions)
mape_train = mean_absolute_percentage_error(y_train.values.ravel(), predictions_train)

In [25]:
print("Training-set MAPE: "+str(mape_train))
print("Test-set MAPE: "+str(mape_test))

Training-set MAPE: 0.4850749017858119
Test-set MAPE: 4.993423605084602


In [23]:
y_test.values[0:5].ravel()

array([13.49, 11.49, 13.99, 10.59,  7.49])

In [24]:
predictions[0:5]

array([14.21756 , 12.344653, 14.584254, 10.621102,  7.868778],
      dtype=float32)