GOAL - a simple application of TPOT AutoML to the UCI banking dataset to find the best-performing
model to predict whether to give a customer a loan

In [None]:
# ONE OFF INSTALL
# !pip install tpot

In [None]:
# Libraries
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

#### STEP 1 Connect directly to a datasource on the web and unzip it

In [None]:
# Reference: https://svaderia.github.io/articles/downloading-and-unzipping-a-zipfile/

from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

# Location of the zipped UCI bank dataset
zipurl = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip'

# Pull the whole archive into memory, unpack it into the current working
# directory, and list what it contained.
with urlopen(zipurl) as zipresp, ZipFile(BytesIO(zipresp.read())) as zfile:
    zfile.extractall()
    print(zfile.filelist)

In [None]:
# Names of the files contained in the downloaded archive

fnames = zfile.namelist()
fnames

#### STEP 2 Import & Explore data

In [None]:
import pandas as pd

# Load the semicolon-separated sample file; switch to bank-full.csv for a
# larger dataset (testing may take several hours)
df = pd.read_csv('bank.csv', sep=";")
df.head()

In [None]:
df.dtypes  # data type of each column

In [None]:
df.isna().sum()  # number of missing values in each column

#### STEP 3 Basic Data Wrangling

In [None]:
# Ordinal-encode every text column of `df` in place so the models receive
# numeric input. Each distinct string is replaced by the integer code of its
# pandas category (codes follow the sorted category order; missing values
# become -1).
for col in df.columns:
    col_dtype = df[col].dtype
    # Text columns are "object" on older pandas but use the dedicated string
    # dtype (pd.StringDtype) when that is enabled or is the default. Checking
    # only for "object" would silently leave those columns as strings.
    if col_dtype == "object" or isinstance(col_dtype, pd.StringDtype):
        df[col] = df[col].astype("category").cat.codes

In [None]:
# Preview the first rows to check the result of the encoding step
df.head()

#### STEP 4 Data partitioning

In [None]:
# Separate the predictors (X) from the target column (y) for modelling
X = df.loc[:, ["age", "job", "marital", "balance", "housing", "loan", "duration"]]
y = df.loc[:, "y"]

In [None]:
# Hold out half of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

In [None]:
# Display the training features produced by the split
X_train

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold

# Cross-validation scheme used for model evaluation:
# stratified 10-fold, repeated 3 times, with a fixed seed
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

#### STEP 5 AutoML: TPOT optimization

In [None]:
# TPOT optimization - may take up to 5 minutes

from tpot import TPOTClassifier

# Configure the TPOT search: 5 generations of 50 candidate pipelines each,
# scored by accuracy under the cross-validation scheme `cv`, using all cores
pipeline_optimizer = TPOTClassifier(
    generations=5,
    population_size=50,
    cv=cv,
    scoring='accuracy',
    verbosity=2,
    random_state=1,
    n_jobs=-1,
)

# Run the search on the training split
pipeline_optimizer.fit(X_train, y_train)

In [None]:
# Score the fitted optimizer on the held-out test split and print the result

print(pipeline_optimizer.score(X_test, y_test))

In [None]:
# Export the best pipeline found by the search to tpot_best_model.py

pipeline_optimizer.export('tpot_best_model.py')

Open the exported pipeline file tpot_best_model.py to view the best-performing algorithm and its corresponding hyperparameters.