In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer, load_iris, load_wine, fetch_openml
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from leakage_free_openfe import OpenFE
from leakage_free_openfe import tree_to_formula
from print_metrics import print_classification_metrics, print_regression_metrics

# we are now going to test a sample dataset with engineered features from Leakage-Free-OpenFE

In [2]:
# Load the OpenML 'diabetes' dataset (version 1) as a feature DataFrame and a target Series.
X, y = fetch_openml(name='diabetes', version=1, return_X_y=True, as_frame=True)

# Rename the columns positionally to col_1 ... col_N.
X.columns = ['col_' + str(pos) for pos in range(1, X.shape[1] + 1)]

# Encode the string class labels as integers (positive -> 1, negative -> 0).
y = y.map({'tested_positive': 1, 'tested_negative': 0}).astype(int)
print('Data dimensions (%d rows x %d cols)' % X.shape)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# NOTE: the two lines below report the total cell count (rows * cols), not a shape.
print('Train data dimensions (rows x cols) = %d dims' % X_train.size)
print('Test data dimensions (rows x cols) = %d dims' % X_test.size)

Data dimensions (768 rows x 8 cols)
Train data dimensions (rows x cols) = 4912 dims
Test data dimensions (rows x cols) = 1232 dims


In [3]:
# Two separate but identically configured random forests (100 trees, seed 42):
# `base` is fitted on the original features and `eng` on the engineered ones,
# so any score difference comes from the features rather than the model setup.
base, eng = (
    RandomForestClassifier(n_estimators=100, random_state=42) for _ in range(2)
)

In [4]:
# Training phase
# Create the feature-engineering object; it is fitted on the training split in a later cell.
ofe = OpenFE()

# Peek at the first two rows of the features and of the target.
X.iloc[:2], y.iloc[:2]

(   col_1  col_2  col_3  col_4  col_5  col_6  col_7  col_8
 0      6    148     72     35      0   33.6  0.627     50
 1      1     85     66     29      0   26.6  0.351     31,
 0    1
 1    0
 Name: class, dtype: int32)

In [5]:
# Fit OpenFE on the training split only (the test split is not passed in).
# The return value is kept as `features`; later cells slice it and render its
# entries with tree_to_formula. Presumably it is ordered by importance -- TODO confirm.
features = ofe.fit(X_train, y_train)

The number of candidate features is 428
Start stage I selection.


100%|██████████| 4/4 [00:15<00:00,  3.76s/it]


119 same features have been deleted.
Meet early-stopping in successive feature-wise halving.


100%|██████████| 4/4 [00:13<00:00,  3.41s/it]


The number of remaining candidate features is 283
Start stage II selection.


100%|██████████| 4/4 [00:09<00:00,  2.39s/it]


Finish data processing.
[LightGBM] [Info] Number of positive: 170, number of negative: 321
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003105 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20170
[LightGBM] [Info] Number of data points in the train set: 491, number of used features: 291


In [7]:
# Build the engineered training matrix from at most the first 100 entries of `features`.
# is_train=True marks this as the training pass -- presumably this is where the
# statistics reused for the test split are recorded (TODO confirm).
X_trans = ofe.transform(
    X_train,
    is_train=True,
    new_features_list=features[:100],
    n_jobs=4,
)
print(X_train.shape)
print('Transformed shape: ', X_trans.shape)

(614, 8)
Transformed shape:  (614, 28)


In [8]:
# Test phase: is_train=False, so (per the original author's note) the stored
# training statistics are reused. The same feature slice as for the training
# transform is passed so both matrices get matching columns.
X_test_trans = ofe.transform(
    X_test,
    is_train=False,
    new_features_list=features[:100],
    n_jobs=4,
)
print(X_test.shape)
print('Transformed test: ', X_test_trans.shape)

(154, 8)
Transformed test:  (154, 28)


In [10]:
# Report how many candidate features OpenFE returned, then show the first
# five as human-readable formulas (tree_to_formula is imported in the top cell).
print(len(features))
print('printing the first five features:')
for candidate in features[:5]:
    print(tree_to_formula(candidate))

283
printing the first five features:
(col_2*col_8)
(col_2*col_6)
(col_6*col_8)
(col_6+col_7)
log(col_2)


In [11]:
# Baseline: fit the first forest on the original training features and
# predict on the untouched test split (fit() returns the estimator itself,
# so the two steps can be chained).
y_pred = base.fit(X_train, y_train).predict(X_test)

In [12]:
# Baseline metrics: score the predictions currently held in `y_pred`
# (print_classification_metrics is imported in the top cell).
# The call is the cell's last expression, so its return value is displayed too.
print_classification_metrics(y_test, y_pred, verbose=1)

Bal accu 70%
              precision    recall  f1-score   support

           0       0.79      0.78      0.78        99
           1       0.61      0.62      0.61        55

    accuracy                           0.72       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.72      0.72      0.72       154

final average balanced accuracy score = 0.70


0.6979797979797979

### The baseline balanced accuracy score with the 8 original features is 70%

In [13]:
# Engineered features: fit the second forest on the transformed training
# matrix and predict on the transformed test matrix.
# NOTE: this rebinds `y_pred`, so the baseline predictions are no longer
# available after this cell runs.
y_pred = eng.fit(X_trans, y_train).predict(X_test_trans)

In [14]:
# Engineered-feature metrics: score the predictions currently held in `y_pred`
# (print_classification_metrics is imported in the top cell).
# The call is the cell's last expression, so its return value is displayed too.
print_classification_metrics(y_test, y_pred, verbose=1)

Bal accu 71%
              precision    recall  f1-score   support

           0       0.79      0.80      0.79        99
           1       0.63      0.62      0.62        55

    accuracy                           0.73       154
   macro avg       0.71      0.71      0.71       154
weighted avg       0.73      0.73      0.73       154

final average balanced accuracy score = 0.71


0.7080808080808081

### The Leakage-Free OpenFE enriched features give only about 1 percentage point of extra balanced accuracy (0.708 vs. 0.698) with 28 features
Though the increase in accuracy is very small, it is better than reporting a very high accuracy that comes from data leakage.