In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from pandas.api.types import is_numeric_dtype

from genetic_decision_tree import GeneticDecisionTree

# List the OpenML datasets

In [None]:
# Names of the OpenML datasets used in the comparison. Each name is passed,
# in this order, to test_dataset(), which fetches version 1 of the dataset
# with fetch_openml().
real_files = [
    'soybean',
    'micro-mass',
    'mfeat-karhunen',
    'Amazon_employee_access',
    'abalone',
    'cnae-9',
    'semeion',
    'vehicle',
    'satimage',
    'analcatdata_authorship',
    'breast-w',
    'SpeedDating',
    'eucalyptus',
    'vowel',
    'wall-robot-navigation',
    'credit-approval',
    'artificial-characters',
    'splice',
    'har',
    'cmc',
    'segment',
    'JapaneseVowels',
    'jm1',
    'gas-drift',
    'mushroom',
    'irish',
    'profb',
    'adult',
    'higgs',
    'anneal',
    'credit-g',
    'blood-transfusion-service-center',
    'monks-problems-2',
    'tic-tac-toe',
    'qsar-biodeg',
    'wdbc',
    'phoneme',
    'diabetes',
    'ozone-level-8hr',
    'hill-valley',
    'kc2',
    'eeg-eye-state',
    'climate-model-simulation-crashes',
    'spambase',
    'ilpd',
    'one-hundred-plants-margin',
    'banknote-authentication',
    'mozilla4',
    'electricity',
    'madelon',
    'scene',
    'musk',
    'nomao',
    'bank-marketing',
    'MagicTelescope',
    'Click_prediction_small',
    'PhishingWebsites',
    'nursery',
    'page-blocks',
    'hypothyroid',
    'yeast',
    'kropt',
    'CreditCardSubset',
    'shuttle',
    'Satellite',
    'baseball',
    'mc1',
    'pc1',
    'cardiotocography',
    'kr-vs-k',
    'volcanoes-a1',
    'wine-quality-white',
    'car-evaluation',
    'solar-flare',
    'allbp',
    'allrep',
    'dis',
    'car',
    'steel-plates-fault'
    ]


In [None]:
# This defines a function to test a single file from OpenML. It creates
# a standard decision tree as well as four variations on the GeneticDecisionTree
# based on whether it performs mutations and/or combinations.

def test_dataset(dataset_name):
    """Compare a standard decision tree with four GeneticDecisionTree variants.

    Fetches version 1 of the named OpenML dataset, drops every non-numeric
    column with more than 10 unique values, passes the remaining frame
    through pd.get_dummies, holds out 30% of the rows for testing and scores
    each fitted model with macro-averaged F1 on that hold-out.

    Parameters
    ----------
    dataset_name : str
        Dataset name handed to fetch_openml.

    Returns
    -------
    list of float or None
        Macro F1 scores in the order [standard decision tree, GDT random
        only, GDT with mutations, GDT with combinations, GDT with both].
        None when no column survives the high-cardinality filter.
    """
    seed = 0
    max_depth = 4
    max_iterations = 4
    max_unique_categories = 10

    def test_model(clf):
        # Reads the train/test split from the enclosing scope; it is only
        # called after the split below has been assigned.
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        return f1_score(y_test, y_pred, average='macro')

    np.random.seed(seed)

    # Load the data
    data = fetch_openml(dataset_name, version=1, parser='auto')
    df = pd.DataFrame(data['data'])
    y_true = data['target']
    display(df)

    # One-hot encode categorical columns unless there are too many unique values,
    # in which case, we drop the column.
    drop_cols = [
        col_name
        for col_name in df.columns
        if not is_numeric_dtype(df[col_name])
        and df[col_name].nunique() > max_unique_categories
    ]
    df = df.drop(columns=drop_cols)
    if len(df.columns) == 0:
        print("All columns are categorical with many unique values")
        return None
    df = pd.get_dummies(df)

    # Divide the data into train and test
    X_train, X_test, y_train, y_test = train_test_split(df, y_true, test_size=0.3, random_state=42)

    # Fit and evaluate a standard decision tree
    clf = DecisionTreeClassifier(max_depth=max_depth)
    dt_score = test_model(clf)
    print("DT:", dt_score)

    # The four GeneticDecisionTree variants differ only in which genetic
    # operations are enabled: (label, allow_mutate, allow_combine).
    gdt_variants = [
        ("random only", False, False),
        ("with mutations", True, False),
        ("with combinations", False, True),
        ("with both", True, True),
    ]

    gdt_scores = []
    for label, allow_mutate, allow_combine in gdt_variants:
        # Reset the global NumPy seed before every variant so each one starts
        # from the same RNG state, rather than only the first.
        # NOTE(review): assumes GeneticDecisionTree draws from NumPy's global
        # RNG -- confirm, especially with n_jobs=-1.
        np.random.seed(seed)
        gdt = GeneticDecisionTree(
            max_depth=max_depth,
            max_iterations=max_iterations,
            allow_mutate=allow_mutate,
            allow_combine=allow_combine,
            n_jobs=-1,
            verbose=True)
        score = test_model(gdt)
        # Name the variant so the four score lines can be told apart.
        print(f"Genetic DT ({label}):", score)
        gdt_scores.append(score)

    return [dt_score] + gdt_scores

In [None]:
# Run test_dataset() on every name in real_files and keep a running table of
# the scores it returns, one row per dataset. The table is re-displayed after
# each dataset that produced results, and once more at the end.

score_columns = [
    'File Name',
    "DT",
    "GDT (random only)",
    "GDT (with mutations)",
    "GDT (with combinations)",
    "GDT (with both)",
]

display_rows = []
display_dt = None

for file_name in real_files:
    print(".................................................................")
    print(file_name)
    results = test_dataset(file_name)
    # test_dataset() returns None when no usable columns remain; such
    # datasets contribute no row.
    if results:
        display_rows.append([file_name, *results])
        display_dt = pd.DataFrame(display_rows, columns=score_columns)
        display(display_dt)

print()
print("Final Results")
display(display_dt)