In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local RotationFeatures module) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.model_selection import cross_validate
from sklearn.metrics import f1_score
from RotationFeatures import RotationFeatures

# Seed NumPy's global random generator so anything drawing from np.random is
# repeatable from one run of the notebook to the next.
np.random.seed(0)

# Methods to load data

In [16]:
# This notebook provides methods to test Rotation Features with five publicly available datasets.

def get_spam7():
    """Load the spam7 dataset and split it into features and target.

    Source: http://vincentarelbundock.github.io/Rdatasets/doc/DAAG/spam7.html

    Returns:
        X: DataFrame with one row per CSV record and the six predictor columns.
        y: Series holding the ``yesno`` label for each row of ``X``.
    """
    spam = pd.read_csv("https://raw.githubusercontent.com/Brett-Kennedy/DatasetTester/main/spam7.csv")
    feature_cols = ["dollar", "make", "bang", "money", "n000", "crl_tot"]
    # Select with a list of names so X is a 2-D DataFrame (rows = samples).
    # A comma-separated sequence of Series would build a tuple, which
    # array-based consumers read as (n_features, n_samples).
    X = spam[feature_cols]
    y = spam["yesno"]
    return X, y

def get_biomed():
    """Load the biomed dataset and split it into features and target.

    Source: https://www.openml.org/d/481

    Returns:
        X: DataFrame with one row per CSV record and the eight predictor columns.
        y: Series holding the ``class`` label for each row of ``X``.
    """
    biomed = pd.read_csv("https://raw.githubusercontent.com/Brett-Kennedy/DatasetTester/main/biomed.csv")
    feature_cols = [
        "Observation_number",
        "Hospital_identification_number_for_blood_sample",
        "Age_of_patient",
        "Date_that_blood_sample_was_taken",
        # NOTE(review): "ml" is kept exactly as the original attribute name;
        # confirm against the CSV header that it is not meant to be "m1".
        "ml",
        "m2",
        "m3",
        "m4",
    ]
    # A list-based selection yields a 2-D DataFrame (rows = samples) rather
    # than a tuple of per-column Series.
    X = biomed[feature_cols]
    # Keep one label per row; .unique() would collapse the target to the set
    # of distinct classes and no longer line up with X.
    y = biomed["class"]
    return X, y

def get_vowel():
    """Load the vowel dataset and split it into features and target.

    Source: https://www.openml.org/d/307

    Returns:
        X: DataFrame with one row per CSV record and the twelve predictor
           columns (Speaker_Number, Sex, Feature_0 .. Feature_9).
        y: Series holding the ``class`` label for each row of ``X``.
    """
    vowel = pd.read_csv("https://github.com/Brett-Kennedy/DatasetTester/raw/main/vowel.csv")
    feature_cols = ["Speaker_Number", "Sex"] + [f"Feature_{i}" for i in range(10)]
    # A list-based selection yields a 2-D DataFrame (rows = samples) rather
    # than a tuple of per-column Series.
    X = vowel[feature_cols]
    # Keep one label per row; .unique() would collapse the target to the set
    # of distinct classes and no longer line up with X.
    y = vowel["class"]
    return X, y
    
def get_statlog():
    """Load the Statlog (image segmentation) dataset as features and target.

    Source: https://archive.ics.uci.edu/ml/datasets/Statlog+%28Image+Segmentation%29

    Returns:
        X: DataFrame with one row per CSV record and columns a1 .. a19.
        y: Series holding the ``class`` label for each row of ``X``.
    """
    statlog = pd.read_csv("https://github.com/Brett-Kennedy/DatasetTester/raw/main/statlog.csv")
    feature_cols = [f"a{i}" for i in range(1, 20)]  # a1 .. a19
    # A list-based selection yields a 2-D DataFrame (rows = samples) rather
    # than a tuple of per-column Series.
    X = statlog[feature_cols]
    # Keep one label per row; .unique() would collapse the target to the set
    # of distinct classes and no longer line up with X.
    y = statlog["class"]
    return X, y

def get_ringnorm():
    """Load the ringnorm dataset and split it into features and target.

    Source: https://www.openml.org/d/1496

    Returns:
        X: DataFrame with one row per CSV record and columns V1 .. V20.
        y: Series holding the ``class`` label for each row of ``X``.
    """
    ringnorm = pd.read_csv("https://github.com/Brett-Kennedy/DatasetTester/raw/main/ringnorm.csv")
    feature_cols = [f"V{i}" for i in range(1, 21)]  # V1 .. V20
    # A list-based selection yields a 2-D DataFrame (rows = samples) rather
    # than a tuple of per-column Series.
    X = ringnorm[feature_cols]
    # Keep one label per row; .unique() would collapse the target to the set
    # of distinct classes and no longer line up with X.
    y = ringnorm["class"]
    return X, y

## Examples 1 to 4: Simple examples using the RotationFeatures class

In [None]:
# Load the spam7 features/labels and create one transformer instance that all
# four examples below reuse (it is refit in each example).
X,y = get_spam7()
rota = RotationFeatures(degree_increment=30)

# Example 1: Using fit() then transform() and get_feature_names()
# fit() and transform() are called as two separate steps on the same X; the
# names reported by the fitted transformer label the output columns.
rota.fit(X)
X1 = pd.DataFrame(rota.transform(X), columns=rota.get_feature_names())
print("\nExample 1:")
display(X1.head())  

# Example 2: Using fit_transform() and get_params()
# Same input as Example 1, but fitting and transforming in a single call.
X2 = pd.DataFrame(rota.fit_transform(X), columns= rota.get_feature_names())
print("\nExample 2:")
display(X2.head())
print("Calling get_params(): ", rota.get_params())

# Example 3: Using a NumPy array
# NOTE: X is rebound here (and again in Example 4), so from this point on X no
# longer refers to the spam7 features loaded at the top of the cell.
print("\nExample 3:")
X = np.arange(6).reshape(3, 2)
print("type of X: ", type(X))
X3 = pd.DataFrame(rota.fit_transform(X), columns= rota.get_feature_names())
display(X3.head())

# Example 4: Using a plain Python list of lists (the same 3x2 values as Example 3)
print("\nExample 4:")
X = np.arange(6).reshape(3, 2).tolist()
print("type of X: ", type(X))
X4 = pd.DataFrame(rota.fit_transform(X), columns= rota.get_feature_names())
display(X4.head())


## Example 5: Comparing accuracy when using the rotation-based generated features vs. using only the original features

In [None]:
# Test sklearn's decision tree using either the original or the original plus the generated features
def test_classification(X, y):
    dt = tree.DecisionTreeClassifier(min_samples_split=50, max_depth=5, random_state=0)
    scores = cross_validate(dt, X, y, cv=5, scoring='f1_macro', return_train_score=True, return_estimator=True)
    test_scores = scores['test_score']
    train_scores = scores['train_score']
    score_name = "f1_score: "
    avg_test_score = test_scores.mean()
    scores_std_dev = statistics.stdev(test_scores)
    avg_train_score = train_scores.mean()

    estimators = scores['estimator']    
    total_num_nodes = 0
    for est in estimators:
        total_num_nodes += est.tree_.node_count
    avg_num_nodes = total_num_nodes / len(estimators)

    print("\nAverage f1 score on training data: ", round(avg_train_score,3))
    print("Average f1 score on test data: ", round(avg_test_score,3))
    print("Std dev of f1 scores on test data: ", round(scores_std_dev,3))
    print("Average number of nodes: ", round(avg_num_nodes,3))

# Given a dataframe X, return an extended dataframe, having the same set of rows, but additional, generated columns
def get_extended_X(X):
    """Return X's rows with the rotation-based generated columns added.

    Fits a ``RotationFeatures`` transformer (30-degree increment) on ``X`` and
    wraps the transformed output in a DataFrame whose columns are labelled
    with the transformer's feature names.
    """
    rotator = RotationFeatures(degree_increment=30)
    transformed = rotator.fit_transform(X)
    # Names are requested only after fitting, matching the original call order.
    column_names = rotator.get_feature_names()
    return pd.DataFrame(transformed, columns=column_names)

# Given a method to load a dataset, load the dataset and test the accuracy of a sklearn decision tree with and without
# the extended features.
def test_dataset(load_method, file_name):
    print("\n\n*********************************************")
    print("Calling for " + file_name)
    print("*********************************************")
    
    X,y = load_method()
    test_classification(X, y)
    extended_X = get_extended_X(X)
    test_classification(extended_X, y)
    
# Run the with/without-rotation comparison on each dataset, in the same order
# as the loaders are defined above.
for load_method, dataset_name in (
    (get_spam7, "Spam7"),
    (get_biomed, "Biomed"),
    (get_vowel, "Vowel"),
    (get_statlog, "Statlog"),
    (get_ringnorm, "Ringnorm"),
):
    test_dataset(load_method, dataset_name)