In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from tabular_benchmark.utils.misc import set_seeds
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Comparison between the GBDT models from sklearn and tabular_benchmark for a classification task

## Sklearn

### Load and Split dataset

This file concerns credit card applications. All attribute names and values have been changed to meaningless symbols to protect confidentiality of the data.

This dataset is interesting because there is a good mix of attributes -- continuous, nominal with small numbers of values, and nominal with larger numbers of values. There are also a few missing values.

For more information:
https://archive.ics.uci.edu/ml/datasets/statlog+(australian+credit+approval)

In [2]:
# Download the Statlog (Australian Credit Approval) table. The file is
# whitespace-separated and has no header row, so pandas labels the columns
# with integers starting at 0.
# The separator is a raw string so that `\s` reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
dataset = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/australian/australian.dat',
                      sep=r'\s+', header=None)
dataset

There are 6 numerical and 8 categorical attributes. The labels have been changed for the convenience of the statistical algorithms. For example, attribute 4 originally had 3 labels p,g,gg and these have been changed to labels 1,2,3.

- A1: 0,1 CATEGORICAL (formerly: a,b)
- A2: continuous.
- A3: continuous.
- A4: 1,2,3 CATEGORICAL (formerly: p,g,gg)
- A5: 1, 2,3,4,5, 6,7,8,9,10,11,12,13,14 CATEGORICAL (formerly: ff,d,i,k,j,aa,m,c,w, e, q, r,cc, x)
- A6: 1, 2,3, 4,5,6,7,8,9 CATEGORICAL (formerly: ff,dd,j,bb,v,n,o,h,z)
- A7: continuous.
- A8: 1, 0 CATEGORICAL (formerly: t, f)
- A9: 1, 0 CATEGORICAL (formerly: t, f)
- A10: continuous.
- A11: 1, 0 CATEGORICAL (formerly t, f)
- A12: 1, 2, 3 CATEGORICAL (formerly: s, g, p)
- A13: continuous.
- A14: continuous.
- A15: 1,2 class attribute (formerly: +,-)

In [3]:
# Column label of the class attribute (A15) in the raw table.
target = 14
# Column labels of the categorical attributes listed above.
cat_features = [
    0,   # A1
    3,   # A4
    4,   # A5
    5,   # A6
    7,   # A8
    8,   # A9
    10,  # A11
    11,  # A12
]

In [4]:
# Feature table: every column of the raw data except the class column.
raw_X = dataset.drop(columns=[target])
raw_X

690 samples

The target variable is the approval or refusal of the credit card application. Therefore, we are dealing with a **classification** problem

In [5]:
# Label table: the class column kept as a one-column DataFrame and given a
# readable name. The column is addressed through `target` everywhere (instead
# of repeating the literal 14) so the selection and the rename cannot drift apart.
raw_y = dataset[[target]].rename(columns={target: 'Approved?'})
raw_y

In [6]:
# Histogram of the class labels, drawn on an explicit figure/axes pair and
# written to disk under the same file name as before.
fig, ax = plt.subplots()
ax.hist(raw_y)
fig.savefig('australian_credit_dist')

### Load default model

In [7]:
# Baseline model for the comparison: scikit-learn's gradient-boosted trees with
# every hyperparameter left at its default value.
model = GradientBoostingClassifier()  # classic GBDT from sklearn for classification

### Preprocess dataset

We need to do at least some preprocessing, starting by splitting the data into train, validation and test sets. The data is already in a DataFrame.

In [8]:
# Seed first, then split twice: raw data -> (train_data, test), and
# train_data -> (train, valid). Neither split receives a random_state, so the
# shuffles depend on whatever global RNG state set_seeds establishes
# (presumably numpy's global seed, per the inline note — TODO confirm).
seed = 42
set_seeds(seed)  # set seeds of python, numpy and pytorch altogether
pct_test = 0.2  # pct of total data
pct_valid = 0.1 * (1 - pct_test)  # pct of total data
# NOTE(review): pct_valid (0.08) is passed below as test_size on train_data,
# which already excludes the test share. The validation set therefore ends up
# at roughly 0.08 * 0.8 = 6.4% of all rows, not 8% — confirm which is intended.
train_data, X_test, train_target, y_test = train_test_split(raw_X, raw_y, test_size=pct_test)
X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_target, test_size=pct_valid)

In [9]:
# Display the training features produced by the split.
X_train

In [10]:
# Display the training labels produced by the split.
y_train

We need to ensure that there is no missing data. We will replace any missing data by the feature median for continuous data and by the feature mode for categorical data. (Note that for this particular dataset they have already replaced the missing data)

In [11]:
# Preprocess the training split. Every statistic and encoder fitted here
# (mode_nan, median_nan, label_encoder, encoder) is fitted on training data only.

# split categorical and continuous data
all_features = list(range(X_train.shape[1]))
# sorted() fixes the column order explicitly instead of relying on the
# iteration order of a set
cont_features = sorted(set(all_features) - set(cat_features))
data_cat = X_train.iloc[:, cat_features]
data_cont = X_train.iloc[:, cont_features]
# preprocess categorical: fill missing values with the first row of the per-column modes
mode_nan = data_cat.mode().iloc[0, :]
data_cat = data_cat.fillna(mode_nan)
# preprocess continuous: fill missing values with the per-column median
median_nan = data_cont.median()
data_cont = data_cont.fillna(median_nan)
# encode target
label_encoder = LabelEncoder()
label_encoder.fit(np.ravel(y_train))
y = pd.DataFrame(label_encoder.transform(np.ravel(y_train)), columns=y_train.columns)
# encode categorical data; categories not seen during fit are mapped to
# max(cat_dims), which is at least as large as every column's cardinality and
# therefore cannot collide with a fitted code
cat_dims = data_cat.nunique(dropna=False).to_list()
unknown_value = max(cat_dims)
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=unknown_value)
encoder.fit(X=data_cat, y=y)
data_cat = pd.DataFrame(encoder.transform(data_cat), index=data_cat.index)
# cast data
data_cont = data_cont.astype(np.float32)
data_cat = data_cat.astype(np.float32)
# join data: both frames already carry X_train's index, so they align row by
# row; the columns are relabelled with the original feature positions
data = pd.concat([data_cont, data_cat], axis=1).set_axis(cont_features + cat_features, axis=1)
# `target` is deliberately not reassigned here: it holds the label-column index
# used to slice the raw table, and overwriting it breaks re-running those cells
train_preprocessed = [data, y]

In [12]:
# Do the same for validation... (without refitting encoders, mode and medians)
# split categorical and continuous data
data_cat = X_valid.iloc[:, cat_features]
data_cont = X_valid.iloc[:, cont_features]
# preprocess categorical (training modes)
data_cat = data_cat.fillna(mode_nan)
# preprocess continuous (training medians)
data_cont = data_cont.fillna(median_nan)
# encode target
y = pd.DataFrame(label_encoder.transform(np.ravel(y_valid)), columns=y_valid.columns)
# encode categorical data
data_cat = pd.DataFrame(encoder.transform(data_cat), index=data_cat.index)
# cast data
data_cont = data_cont.astype(np.float32)
data_cat = data_cat.astype(np.float32)
# join data: both frames already carry X_valid's index, so they align row by row
data = pd.concat([data_cont, data_cat], axis=1).set_axis(cont_features + cat_features, axis=1)
# `target` is deliberately not reassigned here: it holds the label-column index
# used to slice the raw table, and overwriting it breaks re-running those cells
valid_preprocessed = [data, y]

In [13]:
# Do the same for test... (without refitting encoders, mode and medians)
# split categorical and continuous data
data_cat = X_test.iloc[:, cat_features]
data_cont = X_test.iloc[:, cont_features]
# preprocess categorical (training modes)
data_cat = data_cat.fillna(mode_nan)
# preprocess continuous (training medians)
data_cont = data_cont.fillna(median_nan)
# encode target
y = pd.DataFrame(label_encoder.transform(np.ravel(y_test)), columns=y_test.columns)
# encode categorical data
data_cat = pd.DataFrame(encoder.transform(data_cat), index=data_cat.index)
# cast data
data_cont = data_cont.astype(np.float32)
data_cat = data_cat.astype(np.float32)
# join data: both frames already carry X_test's index, so they align row by row
data = pd.concat([data_cont, data_cat], axis=1).set_axis(cont_features + cat_features, axis=1)
# `target` is deliberately not reassigned here: it holds the label-column index
# used to slice the raw table, and overwriting it breaks re-running those cells
test_preprocessed = [data, y]

### Fit the model

In [14]:
# Fit on the preprocessed training pair. The one-column label frame is
# flattened because sklearn expects y with shape (n_samples,).
model.fit(X=train_preprocessed[0], y=train_preprocessed[1].to_numpy().ravel())

### Visualizing the result

By default the model uses the log_loss

In [15]:
# Training scores stored on the fitted model; these are the values plotted as
# the log-loss curve.
model.train_score_

In [16]:
# Training-loss curve of the sklearn model.
train_loss_curve = model.train_score_
fig, ax = plt.subplots()
ax.plot(train_loss_curve)
ax.set_xlabel('iterations')
ax.set_ylabel('Logloss')

In [17]:
# Score the sklearn model on the held-out test split and start the comparison
# table with its row.
results_columns = ['Model', 'LogLoss', 'Accuracy']
y_pred = model.predict(test_preprocessed[0])
acc = accuracy_score(test_preprocessed[1], y_pred)
y_pred_proba = model.predict_proba(test_preprocessed[0])
logloss = log_loss(test_preprocessed[1], y_pred_proba)
res = ['Scikit-learn GBDT', logloss, acc]
res_row = pd.DataFrame(dict(zip(results_columns, res)), index=[0])
# Start the table from the first row rather than concatenating onto an empty
# frame: concatenating with an empty DataFrame is deprecated in recent pandas,
# and the empty frame's object columns influence the result dtypes.
results = res_row.copy()
results

## Tabular benchmark

In [18]:
from tabular_benchmark.utils.datasets import load_dataset
from tabular_benchmark.models.xgboost import XGBoostModel
from tabular_benchmark.utils.misc import train_valid_test_split

### Load and Split dataset

We can directly load the dataset using the load_dataset function...

In [19]:
# Load the Australian credit dataset by name through the benchmark helper.
dataset = load_dataset('australian_credit')

In [20]:
# Inspect what load_dataset returned.
dataset

For other datasets, we can construct a dictionary similar to the one returned by load_dataset

In [21]:
# Build by hand a dictionary with the keys used in the rest of this notebook.
name = 'australian_credit'
task = 'classification'
# Raw string for the separator so that `\s` reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
full_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/australian/australian.dat',
                        sep=r'\s+', header=None)
# Features are every column but the last; the target is the last column, kept
# as a one-column DataFrame. Both slices are expressed relative to the end of
# the table so they cannot disagree.
data = full_data.iloc[:, :-1]
target = full_data.iloc[:, [-1]]
# No separate, predefined test set for this dataset.
test_data = None
test_target = None
cat_features = [0, 3, 4, 5, 7, 8, 10, 11]
dataset = {
    'name': name,
    'task': task,
    'data': data,
    'target': target,
    'test_data': test_data,
    'test_target': test_target,
    'cat_features': cat_features,
}
dataset

In [22]:
# Feature table taken from the dataset dictionary.
raw_X = dataset['data']
raw_X

In [23]:
# Label table taken from the dataset dictionary.
raw_y = dataset['target']
raw_y

Note that we are already working with DataFrames

In [24]:
# Task type and categorical-column list, both read from the dataset dictionary.
task, cat_features = dataset['task'], dataset['cat_features']

In [25]:
# Re-seed and repeat the two-step split with the same fractions as in the
# sklearn section. The intent stated below is to compare both methods on the
# same split, which relies on set_seeds resetting the RNG state the splits
# draw from — TODO confirm.
# `seed` is the value defined in the sklearn section.
set_seeds(seed)
pct_test = 0.2  # pct of total data
pct_valid = 0.1 * (1 - pct_test)  # pct of total data
# NOTE(review): pct_valid (0.08) is applied to train_data, which already
# excludes the test share, so validation is roughly 6.4% of all rows, not 8%.
train_data, X_test, train_target, y_test = train_test_split(raw_X, raw_y, test_size=pct_test)
X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_target, test_size=pct_valid)
# Each split is packed as a [features, target] list.
train = [X_train, y_train]
valid = [X_valid, y_valid]
test = [X_test, y_test]
# We could run the code below, but we are using the same split as before to compare the methods
# pct_test = 0.2  # pct of total data
# pct_valid = 0.1 * (1 - pct_test)  # pct of total data
# train, valid, test = train_valid_test_split(raw_X, raw_y, pct_valid, pct_test, random_state=seed)

Each set is a list of [features, target]

In [26]:
# Show the [features, target] pair of the training split.
train

### Load default model

In [27]:
# Benchmark XGBoost model with only the random seed specified.
# This rebinds `model`, which previously held the sklearn classifier.
model = XGBoostModel(random_seed=seed)

Note that we do not need to specify whether it is a regressor or a classifier; the right model is automatically instantiated given the task.

### Preprocess dataset

In [28]:
# Let the model preprocess all three splits in one call.
# NOTE: train/valid/test are rebound to the returned values, so re-running this
# cell without re-running the split cell feeds already-preprocessed data back in.
train, valid, test, p_cat_features, p_cat_dims = model.preprocess_dataset(
    train=train,
    task=task,
    cat_features=cat_features,
    valid=valid,
    test=test,
)

This function automatically handles missing values, encodes categorical features, casts the variables to their correct types and normalizes the features following the recommendations for each model. It also returns the categorical features and the dimensions of each categorical feature, which are needed for some models.

### Fit the model

In [29]:
# Features and labels of the preprocessed training split.
X_train, y_train = train[0], train[1]
# Splits monitored during fitting, with the name each one is reported under.
eval_names = ['train', 'valid']
eval_sets = [train, valid]

In [30]:
# Fit the default model, tracking the train and valid splits.
model.fit(
    X_train=X_train,
    y_train=y_train,
    task=task,
    eval_sets=eval_sets,
    eval_names=eval_names,
    cat_features=p_cat_features,
    cat_dims=p_cat_dims,
)

### Visualizing the result

By default the model uses log_loss

In [31]:
# Evaluation history recorded during fit; it is indexed by eval name and then
# by metric key ('loss_logloss') when plotted.
model.evals_result_

In [32]:
# Log-loss curves of the default model, one line per monitored split.
fig, ax = plt.subplots()
for split_name in ('train', 'valid'):
    ax.plot(model.evals_result_[split_name]['loss_logloss'], label=split_name)
ax.legend()
ax.set_xlabel('iterations')
ax.set_ylabel('logloss')

Note that we use by default the valid set for early stopping (we could do the same with sklearn with some tweaking)

In [33]:
# Score the default XGBoost model on the test split and append its row to the
# comparison table (re-running this cell appends the row again).
test_features, test_labels = test[0], test[1]
y_pred = model.predict(test_features)
acc = accuracy_score(test_labels, y_pred)
y_pred_proba = model.predict_proba(test_features)
logloss = log_loss(test_labels, y_pred_proba)
res = ['XGBoost', logloss, acc]
res_row = pd.DataFrame({column: value for column, value in zip(results_columns, res)}, index=[0])
results = pd.concat([results, res_row], ignore_index=True)
results

### Beyond the default model... tuning the hyperparameters

In [34]:
from tabular_benchmark.tunners import DEHBTunner

In [35]:
# Fresh XGBoost model instance; its class is what gets handed to the tuner.
model = XGBoostModel(random_seed=seed)

We use the full training data, before splitting off the validation set, and we choose a budget_type ('iteration' or 'subsample') to perform the tuning of the hyperparameters.

In [38]:
# Configure the hyperparameter tuner: iteration-based budgets between 100 and
# 10000, evaluated with 5-fold cross-validation, using the same seed for the
# model and for the dataset resampling.
tunner = DEHBTunner(
    budget_type='iteration',
    min_budget=100,
    max_budget=10000,
    resample_strategy='k-fold_cv',
    k_folds=5,
    n_workers=1,
    seed_model=seed,
    seed_dataset=seed,
)
# Note that more than 1 worker can be used, but the results will then not be
# reproducible because of the asynchronous nature of the algorithm in its
# parallel version.

We run until one of the conditions is met (we should only set **one**):
- we evaluate a certain amount of models (fevals)
- we evaluate a certain amount of brackets (min budget to max budget)
- we evaluate for a certain amount of time in seconds (total_cost)

In [39]:
# Stopping criterion: only `brackets` is set; the other two stay disabled.
brackets = 10
fevals = None
total_cost = None
# The tuner receives the model class (not the instance) and the training data
# from before the validation split was taken.
trajectory, runtime, history = tunner.run(
    model.__class__, train_data, train_target, task, cat_features,
    fevals=fevals,
    brackets=brackets,
    total_cost=total_cost,
    verbose=False,
    save_intermediate=False,
)

In [40]:
# Show the incumbents reported by the underlying tuner; element 0 supplies the
# keyword arguments for the tuned model.
tunner.tunner.get_incumbents()

In [41]:
# Rebuild the model from the first incumbent, unpacked as keyword arguments
# (presumably a mapping of hyperparameter names to values — TODO confirm),
# with the same seed as before.
best_config = tunner.tunner.get_incumbents()[0]
model = XGBoostModel(**best_config, random_seed=seed)

In [42]:
# Fit the tuned model on the same preprocessed training data and monitored
# splits as the default model.
model.fit(
    X_train=X_train,
    y_train=y_train,
    task=task,
    eval_sets=eval_sets,
    eval_names=eval_names,
    cat_features=p_cat_features,
    cat_dims=p_cat_dims,
)

In [43]:
# Log-loss curves of the tuned model, one line per monitored split.
fig, ax = plt.subplots()
for split_name in ('train', 'valid'):
    ax.plot(model.evals_result_[split_name]['loss_logloss'], label=split_name)
ax.legend()
ax.set_xlabel('iterations')
ax.set_ylabel('logloss')

In [44]:
# Score the tuned XGBoost model on the test split and append its row to the
# comparison table (re-running this cell appends the row again).
test_features, test_labels = test[0], test[1]
y_pred = model.predict(test_features)
acc = accuracy_score(test_labels, y_pred)
y_pred_proba = model.predict_proba(test_features)
logloss = log_loss(test_labels, y_pred_proba)
res = ['XGBoost Tunned', logloss, acc]
res_row = pd.DataFrame({column: value for column, value in zip(results_columns, res)}, index=[0])
results = pd.concat([results, res_row], ignore_index=True)
results