In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.datasets import load_iris, load_breast_cancer, load_wine
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from statistics import mean, stdev

# TODO: remove this path hack once the package can be installed with pip.
# The path is written as a raw string so the Windows backslashes are taken literally
# instead of being parsed as (invalid) escape sequences such as "\p" and "\R".
import sys
sys.path.insert(0, r'C:\python_projects\RotationFeatures_project\RotationFeatures')
from RotationFeatures import RotationFeatures  # TODO: import from the installed package once pip install exists

np.random.seed(0)

In [None]:
# TODO:
# Do an accuracy test in another notebook with DatasetTester. It will be very similar to the one for
# ExtendedDecisionTree, just minus the EDT.

## Methods to load data

In [None]:
# This provides methods to test with 3 of the toy datasets provided by sklearn. These are used below
# to test the accuracy of models using RotationFeatures.

def get_iris():
    """Load sklearn's iris dataset.

    Returns a tuple (features, target): the features wrapped in a DataFrame whose
    columns are the dataset's feature names, and the target exactly as provided
    by load_iris().
    """
    bunch = load_iris()
    features = pd.DataFrame(bunch.data, columns=bunch['feature_names'])
    return features, bunch.target

def get_breast_cancer():
    """Load sklearn's breast cancer dataset as pandas objects and return (features, target)."""
    features, target = load_breast_cancer(as_frame=True, return_X_y=True)
    return features, target

def get_wine():
    """Load sklearn's wine dataset as pandas objects and return (features, target)."""
    features, target = load_wine(as_frame=True, return_X_y=True)
    return features, target

## Examples 1 to 4: Simple examples using the RotationFeatures class

In [None]:
# All four examples share one transformer configured with 30-degree increments.
X, y = get_iris()
rota = RotationFeatures(degree_increment=30)

# Example 1: fit() and transform() as separate calls; get_feature_names() labels the columns
rota.fit(X)
rotated = rota.transform(X)
X1 = pd.DataFrame(rotated, columns=rota.get_feature_names())
print("\nExample 1:")
display(X1.head())

# Example 2: fit_transform() in a single call, then inspect the settings with get_params()
rotated = rota.fit_transform(X)
X2 = pd.DataFrame(rotated, columns=rota.get_feature_names())
print("\nExample 2:")
display(X2.head())
print("Calling get_params(): ", rota.get_params())

# Example 3: the input is a two-column numpy array instead of a DataFrame
print("\nExample 3:")
X = np.arange(6).reshape((3, 2))
print("type of X: ", type(X))
rotated = rota.fit_transform(X)
X3 = pd.DataFrame(rotated, columns=rota.get_feature_names())
display(X3.head())

# Example 4: the input is a two-column nested python list (same values as Example 3)
print("\nExample 4:")
X = [[0, 1], [2, 3], [4, 5]]
print("type of X: ", type(X))
rotated = rota.fit_transform(X)
X4 = pd.DataFrame(rotated, columns=rota.get_feature_names())
display(X4.head())


## Example 5: Comparing the accuracy when using the rotation-based generated features vs using only the original features.

In [None]:
# Test sklearn's decision tree using either the original or the original plus the generated features. In most cases there
# is an improvement in the f1 score, as well as a reduction in the variation between folds, and smaller decision trees, and 
# hence greater interpretability.

def test_classification(X, y):
    """Cross-validate a shallow sklearn decision tree on (X, y) and print a summary.

    Runs 5-fold cross validation scored with macro F1 and prints the mean training
    score, the mean and sample standard deviation of the test scores, and the
    average node count of the trees fitted on each fold.

    Parameters:
        X: feature table, passed unchanged to sklearn's cross_validate.
        y: target labels for the rows of X.

    Returns:
        None. The results are only printed.
    """
    # The depth is limited to 5 in order to maintain the interpretability of the trees induced.
    dt = tree.DecisionTreeClassifier(min_samples_split=50, max_depth=5, random_state=0)
    scores = cross_validate(dt, X, y, cv=5, scoring='f1_macro', return_train_score=True, return_estimator=True)
    test_scores = scores['test_score']
    train_scores = scores['train_score']
    avg_test_score = test_scores.mean()
    scores_std_dev = stdev(test_scores)
    avg_train_score = train_scores.mean()

    # return_estimator=True yields the fitted tree for each fold; average their sizes.
    estimators = scores['estimator']
    avg_num_nodes = sum(est.tree_.node_count for est in estimators) / len(estimators)

    print("\n Average f1 score on training data: ", round(avg_train_score,3))
    print(" Average f1 score on test data: ", round(avg_test_score,3))
    print(" Std dev of f1 scores on test data: ", round(scores_std_dev,3))
    print(" Average number of nodes: ", round(avg_num_nodes,3))

# Given a dataframe X, return an extended dataframe, having the same set of rows, but additional, generated columns
def get_extended_X(X):
    """Run X through RotationFeatures (30-degree increments) and return the result as a DataFrame.

    The output columns are named using the transformer's get_feature_names().
    """
    rotator = RotationFeatures(degree_increment=30)
    transformed = rotator.fit_transform(X)
    return pd.DataFrame(transformed, columns=rotator.get_feature_names())

# Given a method to load a dataset, load the dataset and test the accuracy of a sklearn decision tree with and without
# the extended features.
def test_dataset(load_method, file_name):
    """Load a dataset and report decision-tree results with and without rotation features.

    Parameters:
        load_method: zero-argument callable returning (X, y).
        file_name: label for the dataset, printed in the banner.
    """
    print("\n\n*********************************************")
    print("Calling for " + file_name)
    print("*********************************************")

    features, labels = load_method()

    # Baseline: the dataset exactly as loaded.
    print("\nUsing original features only")
    test_classification(features, labels)

    # Same evaluation on the table produced by get_extended_X().
    print("\nUsing rotation-based features")
    test_classification(get_extended_X(features), labels)
    
# Run the comparison on each of the three sklearn toy datasets, in the same order as before.
for loader, label in ((get_iris, "Iris"), (get_breast_cancer, "Breast Cancer"), (get_wine, "Wine")):
    test_dataset(loader, label)

## Example 6: Using RotationFeatures in a sklearn pipeline

In [None]:
X,y = get_iris()

# The tree gets a fixed random_state, like the other decision trees in this notebook, so that
# the scores below (and any later search that reuses `pipe`) are repeatable between runs.
pipe = Pipeline([('rota', RotationFeatures()), ('dt', tree.DecisionTreeClassifier(random_state=0))])

# Example getting the training score
pipe.fit(X,y)
sc = pipe.score(X,y)
print('Training Accuracy: %.3f' % sc)

# Example getting the cross validated accuracy. error_score='raise' surfaces any failure
# inside a fold instead of recording it as a score.
n_scores = cross_val_score(pipe, X, y, scoring='accuracy', n_jobs=-1, error_score='raise')
print('Cross validated Accuracy: %.3f (%.3f)' % (mean(n_scores), stdev(n_scores)))


## Example 7: Using a pipeline and a grid search to optimize the hyperparameters used by RotationFeatures and by the Decision Tree

In [None]:
# Reuses `pipe`, `X` and `y` from the Example 6 cell: hold out a test split, then search
# over the pipeline's parameters on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Keys follow sklearn's "<step name>__<parameter>" convention for pipeline steps.
parameters = dict(
    rota__degree_increment=(10, 15, 30),
    dt__max_depth=(3, 4, 5),
)

gs_clf = GridSearchCV(pipe, parameters)
gs_clf.fit(X_train, y_train)

# Score of the fitted search on the held-out split.
s = gs_clf.score(X_test, y_test)
print("score: ", s)

# Leave the winning parameter combination as the cell's displayed value.
b = gs_clf.best_params_
b


## Example 8: Specifying the maximum number of output columns

In [None]:
# It's possible to specify the maximum number of columns that may be created by the RotationFeatures tool. 

X,y = get_iris()

# First example: Using 45 degree increments, few enough columns are created
rota_45= RotationFeatures(degree_increment=45, max_cols_created=20)
X_45 = pd.DataFrame(rota_45.fit_transform(X))
dt = tree.DecisionTreeClassifier(min_samples_split=50, max_depth=5, random_state=0)
scores = cross_validate(dt, X_45, y, cv=5, scoring='f1_macro')
test_scores = scores['test_score']
avg_test_score = test_scores.mean()
# Report the feature count of the transformer fitted in this cell (rota_45), not the
# `rota` instance left over from the earlier examples.
print("Number of output features: ", rota_45.n_output_features_ )
print("Average f1 score on test data: ", round(avg_test_score,3))

In [None]:
# Second example: Using 5 degree increments, too many columns would be created for the
# max_cols_created=20 limit, so this is expected to throw an exception. The exception is
# caught and displayed so that the demonstration does not abort the notebook.
rota_5= RotationFeatures(degree_increment=5, max_cols_created=20)
try:
    X_5 = pd.DataFrame(rota_5.fit_transform(X))
except Exception as err:  # the exact exception type raised by RotationFeatures is not visible here
    print("fit_transform raised " + type(err).__name__ + ": ", err)
else:
    # Only reached if the limit was not exceeded after all.
    dt = tree.DecisionTreeClassifier(min_samples_split=50, max_depth=5, random_state=0)
    scores = cross_validate(dt, X_5, y, cv=5, scoring='f1_macro')
    test_scores = scores['test_score']
    avg_test_score = test_scores.mean()
    print("Average f1 score on test data: ", round(avg_test_score,3))