In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.datasets import load_iris, load_breast_cancer, load_wine
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from statistics import mean, stdev

# Todo: remove once have pip install
import sys  
sys.path.insert(0, 'C:\python_projects\ArithmeticFeatures_project\ArithmeticFeatures')
from ArithmeticFeatures import ArithmeticFeatures # todo: fix once have pip install

np.random.seed(0)

## Methods to load data

In [2]:
# This provides methods to test with 3 of the toy datasets provided by sklearn. These are used below
# to test the accuracy of models using RotationFeatures.

def get_iris():
    iris = load_iris()
    X, y = iris.data, iris.target
    X = pd.DataFrame(X, columns=iris['feature_names'])
    return X, y

def get_breast_cancer():
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    return X,y

def get_wine():
    X, y = load_wine(return_X_y=True, as_frame=True)
    return X,y

## Examples 1 to 4: Simple examples using the ArithmeticFeatures class

In [3]:
X,y = get_iris()
arith = ArithmeticFeatures()

# Example 1: Using fit() then transform(), and get_feature_names()
arith.fit(X)
X1 = pd.DataFrame(arith.transform(X), columns=arith.get_feature_names())
print("\nExample 1:")
display(X1.head())  

# Example 2: Using fit_transform() and get_params()
X2 = pd.DataFrame(arith.fit_transform(X), columns=arith.get_feature_names())
print("\nExample 2:")
display(X2.head())
print("Calling get_params(): ", arith.get_params())

# Example 3: Using a two-column numpy array
print("\nExample 3:")
X = np.arange(6).reshape(3, 2)
print("type of X: ", type(X))
X3 = pd.DataFrame(arith.fit_transform(X), columns=arith.get_feature_names())
display(X3.head())

# Example 4: Using a two-column python matrix
print("\nExample 4:")
X = np.arange(6).reshape(3, 2).tolist()
print("type of X: ", type(X))
X4 = pd.DataFrame(arith.fit_transform(X), columns=arith.get_feature_names())
display(X4.head())



Example 1:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),sepal length (cm) plus sepal width (cm),sepal length (cm) times sepal width (cm),sepal length (cm) minus sepal width (cm),sepal length (cm) divide sepal width (cm),sepal length (cm) plus petal length (cm),sepal length (cm) times petal length (cm),...,sepal width (cm) minus petal length (cm),sepal width (cm) divide petal length (cm),sepal width (cm) plus petal width (cm),sepal width (cm) times petal width (cm),sepal width (cm) minus petal width (cm),sepal width (cm) divide petal width (cm),petal length (cm) plus petal width (cm),petal length (cm) times petal width (cm),petal length (cm) minus petal width (cm),petal length (cm) divide petal width (cm)
0,5.1,3.5,1.4,0.2,8.6,17.85,1.6,1.457143,6.5,7.14,...,2.1,2.5,3.7,0.7,3.3,17.5,1.6,0.28,1.2,7.0
1,4.9,3.0,1.4,0.2,7.9,14.7,1.9,1.633333,6.3,6.86,...,1.6,2.142857,3.2,0.6,2.8,15.0,1.6,0.28,1.2,7.0
2,4.7,3.2,1.3,0.2,7.9,15.04,1.5,1.46875,6.0,6.11,...,1.9,2.461538,3.4,0.64,3.0,16.0,1.5,0.26,1.1,6.5
3,4.6,3.1,1.5,0.2,7.7,14.26,1.5,1.483871,6.1,6.9,...,1.6,2.066667,3.3,0.62,2.9,15.5,1.7,0.3,1.3,7.5
4,5.0,3.6,1.4,0.2,8.6,18.0,1.4,1.388889,6.4,7.0,...,2.2,2.571429,3.8,0.72,3.4,18.0,1.6,0.28,1.2,7.0



Example 2:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),sepal length (cm) plus sepal width (cm),sepal length (cm) times sepal width (cm),sepal length (cm) minus sepal width (cm),sepal length (cm) divide sepal width (cm),sepal length (cm) plus petal length (cm),sepal length (cm) times petal length (cm),...,sepal width (cm) minus petal length (cm),sepal width (cm) divide petal length (cm),sepal width (cm) plus petal width (cm),sepal width (cm) times petal width (cm),sepal width (cm) minus petal width (cm),sepal width (cm) divide petal width (cm),petal length (cm) plus petal width (cm),petal length (cm) times petal width (cm),petal length (cm) minus petal width (cm),petal length (cm) divide petal width (cm)
0,5.1,3.5,1.4,0.2,8.6,17.85,1.6,1.457143,6.5,7.14,...,2.1,2.5,3.7,0.7,3.3,17.5,1.6,0.28,1.2,7.0
1,4.9,3.0,1.4,0.2,7.9,14.7,1.9,1.633333,6.3,6.86,...,1.6,2.142857,3.2,0.6,2.8,15.0,1.6,0.28,1.2,7.0
2,4.7,3.2,1.3,0.2,7.9,15.04,1.5,1.46875,6.0,6.11,...,1.9,2.461538,3.4,0.64,3.0,16.0,1.5,0.26,1.1,6.5
3,4.6,3.1,1.5,0.2,7.7,14.26,1.5,1.483871,6.1,6.9,...,1.6,2.066667,3.3,0.62,2.9,15.5,1.7,0.3,1.3,7.5
4,5.0,3.6,1.4,0.2,8.6,18.0,1.4,1.388889,6.4,7.0,...,2.2,2.571429,3.8,0.72,3.4,18.0,1.6,0.28,1.2,7.0


Calling get_params():  {'scale_data': False, 'support_plus': True, 'support_times': True, 'support_minus': True, 'support_div': True, 'support_min': False, 'support_max': False}

Example 3:
type of X:  <class 'numpy.ndarray'>


Unnamed: 0,0,1,0 plus 1,0 times 1,0 minus 1,0 divide 1
0,0.0,1.0,1.0,0.0,-1.0,0.0
1,2.0,3.0,5.0,6.0,-1.0,0.666667
2,4.0,5.0,9.0,20.0,-1.0,0.8



Example 4:
type of X:  <class 'list'>


Unnamed: 0,0,1,0 plus 1,0 times 1,0 minus 1,0 divide 1
0,0.0,1.0,1.0,0.0,-1.0,0.0
1,2.0,3.0,5.0,6.0,-1.0,0.666667
2,4.0,5.0,9.0,20.0,-1.0,0.8


## Example 5: Comparing the accuracy when using the arithmetic-based generated features vs using only the original features.

In [4]:
# Test sklearn's decision tree using either the original or the original plus the generated features. In most cases there
# is an improvement in the f1 score, as well as a reduction in the variation between folds, and smaller decision trees, and 
# hence greater interpretability.

def test_classification(X, y):
    # The depth is limited to 4 in order to maintain the interpretability of the trees induced. 
    dt = tree.DecisionTreeClassifier(max_depth=4, random_state=0)
    scores = cross_validate(dt, X, y, cv=5, scoring='f1_macro', return_train_score=True, return_estimator=True)
    test_scores = scores['test_score']
    train_scores = scores['train_score']
    score_name = "f1_score: "
    avg_test_score = test_scores.mean()
    scores_std_dev = stdev(test_scores)
    avg_train_score = train_scores.mean()

    estimators = scores['estimator']    
    total_num_nodes = 0
    for est in estimators:
        total_num_nodes += est.tree_.node_count
    avg_num_nodes = total_num_nodes / len(estimators)

    print("\n Average f1 score on training data: ", round(avg_train_score,3))
    print(" Average f1 score on test data: ", round(avg_test_score,3))
    print(" Std dev of f1 scores on test data: ", round(scores_std_dev,3))
    print(" Average number of nodes: ", round(avg_num_nodes,3))

# Given a dataframe X, return an extended dataframe, having the same set of rows, but additional, generated columns
def get_extended_X(X):
    arith = ArithmeticFeatures()
    extended_X = pd.DataFrame(arith.fit_transform(X), columns=arith.get_feature_names())
    return extended_X

# Given a method to load a dataset, load the dataset and test the accuracy of a sklearn decision tree with and without
# the extended features.
def test_dataset(load_method, file_name):
    print("\n\n*********************************************")
    print("Calling for " + file_name)
    print("*********************************************")
    
    X,y = load_method()
    print("\nUsing original features only")
    test_classification(X, y)

    print("\nUsing arithmetic-based features")
    extended_X = get_extended_X(X)
    test_classification(extended_X, y)
    
test_dataset(get_iris, "Iris")
test_dataset(get_breast_cancer, "Breast Cancer")
test_dataset(get_wine, "Wine")



*********************************************
Calling for Iris
*********************************************

Using original features only

 Average f1 score on training data:  0.99
 Average f1 score on test data:  0.967
 Std dev of f1 scores on test data:  0.041
 Average number of nodes:  12.6

Using arithmetic-based features

 Average f1 score on training data:  0.992
 Average f1 score on test data:  0.966
 Std dev of f1 scores on test data:  0.024
 Average number of nodes:  9.0


*********************************************
Calling for Breast Cancer
*********************************************

Using original features only

 Average f1 score on training data:  0.983
 Average f1 score on test data:  0.909
 Std dev of f1 scores on test data:  0.022
 Average number of nodes:  20.6

Using arithmetic-based features

 Average f1 score on training data:  0.997
 Average f1 score on test data:  0.946
 Std dev of f1 scores on test data:  0.021
 Average number of nodes:  16.6


***********

## Example 6: Using an ArithmeticFeatures in a sklearn pipeline

In [5]:
X,y = get_iris()
pipe = Pipeline([('arith', ArithmeticFeatures()), ('dt', tree.DecisionTreeClassifier())])

# Example getting the training score
pipe.fit(X,y)
sc = pipe.score(X,y)
print('Training Accuracy: %.3f' % sc)

# Example getting the cross validated accuracy
n_scores = cross_val_score(pipe, X, y, scoring='accuracy', n_jobs=-1, error_score='raise')
print('Cross validated Accuracy: %.3f (%.3f)' % (mean(n_scores), stdev(n_scores)))


Training Accuracy: 1.000
Cross validated Accuracy: 0.953 (0.018)


## Example 7: Using a pipeline and grid search to optimize the hyperparameters used by ArithmeticFeatures and by the Decision Tree

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

parameters = {
    'arith__support_plus': (True, False),
    'arith__support_mult': (True, False),
    'arith__support_minus': (True, False),
    'arith__support_div': (True, False),
    'arith__support_min': (True, False),
    'arith__support_max': (True, False),    
    'dt__max_depth': (3,4,5)
}

gs_clf = GridSearchCV(pipe, parameters)
gs_clf.fit(X_train, y_train)
s = gs_clf.score(X_test, y_test)
print("score: ", s)
b = gs_clf.best_params_
b


score:  0.9473684210526315


{'arith__support_div': False,
 'arith__support_max': True,
 'arith__support_min': True,
 'arith__support_minus': True,
 'arith__support_mult': True,
 'arith__support_plus': False,
 'dt__max_depth': 3}