In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.datasets import load_iris, load_breast_cancer, load_wine
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from statistics import stdev
from RotationFeatures import RotationFeatures

# Seed NumPy's global random number generator so repeated runs start from the same state.
np.random.seed(0)

# Methods to load data

In [3]:
# Loader functions for three of the toy datasets bundled with sklearn. Each returns (X, y), with X as a pandas DataFrame.

def get_iris():
    """Load sklearn's iris dataset.

    Returns:
        (X, y): X is a DataFrame built from the dataset's `data`, with one
        column per entry in its `feature_names`; y is the dataset's `target`
        exactly as sklearn returns it.
    """
    bunch = load_iris()
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    return features, bunch.target

def get_breast_cancer():
    """Load sklearn's breast cancer dataset as an (X, y) pair.

    The loader is asked for pandas objects (`as_frame=True`) and for the
    (data, target) pair rather than the full dataset bundle
    (`return_X_y=True`).
    """
    features, labels = load_breast_cancer(as_frame=True, return_X_y=True)
    return features, labels

def get_wine():
    """Load sklearn's wine dataset as an (X, y) pair.

    The loader is asked for pandas objects (`as_frame=True`) and for the
    (data, target) pair rather than the full dataset bundle
    (`return_X_y=True`).
    """
    features, labels = load_wine(as_frame=True, return_X_y=True)
    return features, labels

## Examples 1 to 4: Simple examples using the RotationFeatures class

In [4]:
# Load iris (X is a DataFrame) and create one transformer, configured with
# degree_increment=30, that is reused by all four examples below.
X,y = get_iris()
rota = RotationFeatures(degree_increment=30)

# Example 1: Call fit() and transform() as two separate steps, then use
# get_feature_names() to label the columns of the transformed data.
rota.fit(X)
X1 = pd.DataFrame(rota.transform(X), columns=rota.get_feature_names())
print("\nExample 1:")
display(X1.head())  

# Example 2: Same data, but fitting and transforming in a single call to
# fit_transform(); then show the transformer's parameters via get_params().
X2 = pd.DataFrame(rota.fit_transform(X), columns= rota.get_feature_names())
print("\nExample 2:")
display(X2.head())
print("Calling get_params(): ", rota.get_params())

# Example 3: Pass a NumPy array (3 rows x 2 columns) instead of a DataFrame.
# NOTE: X is rebound here, so the iris DataFrame loaded above is no longer
# reachable through the name X after this point.
print("\nExample 3:")
X = np.arange(6).reshape(3, 2)
print("type of X: ", type(X))
X3 = pd.DataFrame(rota.fit_transform(X), columns= rota.get_feature_names())
display(X3.head())

# Example 4: Pass a plain Python list of lists holding the same values as Example 3.
print("\nExample 4:")
X = np.arange(6).reshape(3, 2).tolist()
print("type of X: ", type(X))
X4 = pd.DataFrame(rota.fit_transform(X), columns= rota.get_feature_names())
display(X4.head())



Example 1:


Unnamed: 0,0,1,2,3,R_0,R_1,R_2,R_3,R_4,R_5,...,R_14,R_15,R_16,R_17,R_18,R_19,R_20,R_21,R_22,R_23
0,5.1,3.5,1.4,0.2,-0.12005,0.652377,-0.430155,0.50495,0.158552,0.169825,...,0.253786,0.575164,0.520433,0.348584,0.276416,0.562099,0.03788,0.069983,-0.002186,0.079547
1,4.9,3.0,1.4,0.2,-0.063996,0.444177,-0.277511,0.352671,0.110439,0.142047,...,0.14962,0.394742,0.340011,0.244418,0.172249,0.381677,0.03788,0.069983,-0.002186,0.079547
2,4.7,3.2,1.3,0.2,-0.153775,0.488568,-0.377457,0.346225,0.070801,0.099591,...,0.205965,0.458436,0.412179,0.286084,0.213916,0.453846,0.023202,0.061508,-0.010661,0.064869
3,4.6,3.1,1.5,0.2,-0.156998,0.438595,-0.355262,0.301335,0.029796,0.115059,...,0.155775,0.439301,0.376095,0.265251,0.193082,0.417762,0.052559,0.078457,0.006288,0.094225
4,5.0,3.6,1.4,0.2,-0.16494,0.674572,-0.480128,0.501727,0.134496,0.155936,...,0.27462,0.611249,0.556517,0.369418,0.297249,0.598184,0.03788,0.069983,-0.002186,0.079547



Example 2:


Unnamed: 0,0,1,2,3,R_0,R_1,R_2,R_3,R_4,R_5,...,R_14,R_15,R_16,R_17,R_18,R_19,R_20,R_21,R_22,R_23
0,5.1,3.5,1.4,0.2,-0.12005,0.652377,-0.430155,0.50495,0.158552,0.169825,...,0.253786,0.575164,0.520433,0.348584,0.276416,0.562099,0.03788,0.069983,-0.002186,0.079547
1,4.9,3.0,1.4,0.2,-0.063996,0.444177,-0.277511,0.352671,0.110439,0.142047,...,0.14962,0.394742,0.340011,0.244418,0.172249,0.381677,0.03788,0.069983,-0.002186,0.079547
2,4.7,3.2,1.3,0.2,-0.153775,0.488568,-0.377457,0.346225,0.070801,0.099591,...,0.205965,0.458436,0.412179,0.286084,0.213916,0.453846,0.023202,0.061508,-0.010661,0.064869
3,4.6,3.1,1.5,0.2,-0.156998,0.438595,-0.355262,0.301335,0.029796,0.115059,...,0.155775,0.439301,0.376095,0.265251,0.193082,0.417762,0.052559,0.078457,0.006288,0.094225
4,5.0,3.6,1.4,0.2,-0.16494,0.674572,-0.480128,0.501727,0.134496,0.155936,...,0.27462,0.611249,0.556517,0.369418,0.297249,0.598184,0.03788,0.069983,-0.002186,0.079547


Calling get_params():  {'degree_increment': 30}

Example 3:
type of X:  <class 'numpy.ndarray'>


Unnamed: 0,0,1,R_0,R_1,R_2,R_3
0,0.0,1.0,0.0,0.0,0.0,0.0
1,2.0,3.0,0.183013,0.683013,-0.183013,0.683013
2,4.0,5.0,0.366025,1.366025,-0.366025,1.366025



Example 4:
type of X:  <class 'list'>


Unnamed: 0,0,1,R_0,R_1,R_2,R_3
0,0.0,1.0,0.0,0.0,0.0,0.0
1,2.0,3.0,0.183013,0.683013,-0.183013,0.683013
2,4.0,5.0,0.366025,1.366025,-0.366025,1.366025


## Example 5: Comparing macro-F1 scores when using the rotation-based generated features vs. using only the original features.

In [5]:
# Test sklearn's decision tree using either the original or the original plus the generated features
def test_classification(X, y):
    dt = tree.DecisionTreeClassifier(min_samples_split=50, max_depth=5, random_state=0)
    scores = cross_validate(dt, X, y, cv=5, scoring='f1_macro', return_train_score=True, return_estimator=True)
    test_scores = scores['test_score']
    train_scores = scores['train_score']
    score_name = "f1_score: "
    avg_test_score = test_scores.mean()
    scores_std_dev = stdev(test_scores)
    avg_train_score = train_scores.mean()

    estimators = scores['estimator']    
    total_num_nodes = 0
    for est in estimators:
        total_num_nodes += est.tree_.node_count
    avg_num_nodes = total_num_nodes / len(estimators)

    print("\nAverage f1 score on training data: ", round(avg_train_score,3))
    print("Average f1 score on test data: ", round(avg_test_score,3))
    print("Std dev of f1 scores on test data: ", round(scores_std_dev,3))
    print("Average number of nodes: ", round(avg_num_nodes,3))

# Given a dataframe X, return an extended dataframe, having the same set of rows, but additional, generated columns
def get_extended_X(X):
    """Return a DataFrame of RotationFeatures(degree_increment=30).fit_transform(X).

    Column names are taken from the transformer's get_feature_names() after
    fitting.

    NOTE(review): the transformer is fitted on all of X here, i.e. before any
    cross-validation split is made by the caller. If RotationFeatures.fit()
    learns anything from the data, the test folds influence the generated
    features -- confirm whether that matters for the comparison.
    """
    transformer = RotationFeatures(degree_increment=30)
    generated = transformer.fit_transform(X)
    column_names = transformer.get_feature_names()
    return pd.DataFrame(generated, columns=column_names)

# Given a method to load a dataset, load the dataset and test the accuracy of a sklearn decision tree with and without
# the extended features.
def test_dataset(load_method, file_name):
    print("\n\n*********************************************")
    print("Calling for " + file_name)
    print("*********************************************")
    
    X,y = load_method()
    test_classification(X, y)
    extended_X = get_extended_X(X)
    test_classification(extended_X, y)
    
# Run the with/without-extended-features comparison on each toy dataset in turn.
for loader, label in (
    (get_iris, "Iris"),
    (get_breast_cancer, "Breast Cancer"),
    (get_wine, "Wine"),
):
    test_dataset(loader, label)



*********************************************
Calling for Iris
*********************************************

Average f1 score on training data:  0.962
Average f1 score on test data:  0.933
Std dev of f1 scores on test data:  0.053
Average number of nodes:  5.0

Average f1 score on training data:  0.973
Average f1 score on test data:  0.966
Std dev of f1 scores on test data:  0.024
Average number of nodes:  5.0


*********************************************
Calling for Breast Cancer
*********************************************

Average f1 score on training data:  0.955
Average f1 score on test data:  0.912
Std dev of f1 scores on test data:  0.015
Average number of nodes:  16.6

Average f1 score on training data:  0.986
Average f1 score on test data:  0.936
Std dev of f1 scores on test data:  0.012
Average number of nodes:  13.8


*********************************************
Calling for Wine
*********************************************

Average f1 score on training data:  0.949
A

## Example using an sklearn pipeline and a grid search to optimize the hyperparameter used by RotationFeatures