In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer, load_iris, load_wine, fetch_openml
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from leakage_free_openfe import OpenFE

# We are now going to test a sample dataset with engineered features from Leakage-Free-OpenFE

In [None]:
# Pull the 'diabetes' dataset (version 1) from OpenML as a DataFrame / Series pair.
X, y = fetch_openml(
    name='diabetes',
    version=1,
    return_X_y=True,
    as_frame=True,
)

# Replace the original column names with positional ones: col_1 ... col_N.
X.columns = ['col_' + str(pos) for pos in range(1, X.shape[1] + 1)]

# Encode the string class labels as a 0/1 integer target.
y = y.map({'tested_positive': 1, 'tested_negative': 0}).astype(int)
print('Data dimensions (%d rows x %d cols)' % X.shape)

# Hold out 20% of the rows for testing; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# DataFrame.size is rows * cols, i.e. the total number of cells in each split.
print('Train data dimensions (rows x cols) = %d dims' % X_train.size)
print('Test data dimensions (rows x cols) = %d dims' % X_test.size)

In [None]:
# Two independent, identically configured forests:
#   base -> fitted on the original columns
#   eng  -> fitted on the OpenFE-engineered columns
# Identical hyper-parameters and seed keep the comparison between them fair.
base, eng = (
    RandomForestClassifier(n_estimators=100, random_state=42)
    for _ in range(2)
)

In [None]:
# Training phase: create the feature-engineering object used by the following cells.
ofe = OpenFE()

# Peek at the first two rows of the (full) feature frame and of the target.
X.iloc[:2], y.iloc[:2]

In [None]:
# Fit OpenFE on the training split only; the held-out test rows are never passed in.
# The result is kept in `features`, which later cells slice (features[:10], features[:5])
# and render with tree_to_formula.
# NOTE(review): presumably the returned list is ordered by estimated usefulness - confirm
# against leakage_free_openfe before relying on "first N" meaning "best N".
features = ofe.fit(X_train, y_train)

In [None]:
# Training pass of the transform (is_train=True): build the first 10 candidate
# features on top of the training split, using 4 worker processes.
X_trans = ofe.transform(
    X_train,
    is_train=True,
    new_features_list=features[:10],
    n_jobs=4,
)

# Shape before vs. after feature engineering.
print(X_train.shape)
print('Transformed shape: ', X_trans.shape)

In [None]:
# Test phase - uses stored training stats (is_train=False), with the same
# 10 candidate features that were applied to the training split.
X_test_trans = ofe.transform(
    X_test,
    is_train=False,
    new_features_list=features[:10],
    n_jobs=4,
)

# Shape before vs. after feature engineering.
print(X_test.shape)
print('Transformed test: ', X_test_trans.shape)

In [None]:
from leakage_free_openfe import tree_to_formula

# How many candidate features did ofe.fit return?
print(len(features))

# Render the first five candidates as formulas, one per line.
print('printing the first five features:')
for formula in map(tree_to_formula, features[:5]):
    print(formula)

In [None]:
### train a model on original features
# scikit-learn's fit() returns the estimator itself, so fitting on the training
# split and predicting on the held-out split can be chained in one expression.
y_pred = base.fit(X_train, y_train).predict(X_test)

In [None]:
from print_metrics import (
    print_classification_metrics,
    print_regression_metrics,
)

# Score the baseline predictions (model fitted on the original columns) on the test split.
print_classification_metrics(y_test, y_pred, verbose=1)

### The baseline balanced accuracy score with 8 original features is 71%

In [None]:
# Shape before cleaning.
print(X_trans.shape)

# Step 1: turn +/-inf into NaN so both kinds of bad value are handled the same way.
X_trans = X_trans.replace({np.inf: np.nan, -np.inf: np.nan})
# Step 2: discard every row that has at least one NaN.
X_trans = X_trans.dropna(axis=0, how='any')

# Shape after cleaning - the row count shrinks by the number of discarded rows.
print(X_trans.shape)

In [None]:
# Keep only the labels whose rows are still present in the engineered training
# frame, so features and target line up row for row.
kept_rows = X_trans.index
y_train = y_train.loc[kept_rows]
y_train.shape

In [None]:
## train a model on engineered features
# X_trans and y_train must be row-aligned here: the preceding cells drop the
# rows containing inf/NaN from X_trans and then re-select y_train by X_trans.index.
eng.fit(X_trans, y_train)

In [None]:
# The engineered training frame has its inf/NaN rows removed before fitting, but the
# engineered test frame is never cleaned. An inf in the test features makes
# eng.predict() raise, and dropping test rows would leave y_pred out of step with y_test.
# So: map +/-inf to NaN and fill the gaps with medians computed from the TRAINING
# features only (no test statistics are used, keeping the evaluation leakage-free).
# When the test features are already finite this is a no-op.
train_medians = (
    X_trans
    .replace([np.inf, -np.inf], np.nan)  # harmless if X_trans is already clean
    .median(numeric_only=True)
)
X_test_eng = (
    X_test_trans
    .replace([np.inf, -np.inf], np.nan)
    .fillna(train_medians)               # Series is matched to columns by name
)

# One prediction per test row, in the same order as y_test.
y_pred = eng.predict(X_test_eng)

In [None]:
from print_metrics import (
    print_classification_metrics,
    print_regression_metrics,
)

# Score the predictions of the model fitted on the engineered features on the test split.
print_classification_metrics(y_test, y_pred, verbose=1)

### The Leakage-Free-OpenFE enriched features give only 1% extra balanced accuracy with 28 features
Although the increase in accuracy is very small, it is better than reporting a very high accuracy that comes from data leakage.