In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from pandas.api.types import is_numeric_dtype

from genetic_decision_tree import GeneticDecisionTree

# The OpenML datasets used in this demonstration are listed in the next cell.

In [2]:
# Names of the OpenML datasets to evaluate. Each name is handed to
# test_dataset(), which fetches the dataset by name with fetch_openml.
real_files = ['Reading_Hydro', 'treasury']

In [3]:
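# This provides a demonstration of using GeneticDecisionTree with regression
# problems. On these datasets it does not provide significant benefits over a
# standard DecisionTreeRegressor. With the 'Reading_Hydro' dataset, the R2
# scores found are essentially the same (about 0.970 for every model). With the
# 'treasury' dataset, the R2 scores are only slightly higher (roughly 0.981 to
# 0.984, as opposed to 0.978 for the standard decision tree).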

def test_dataset(dataset_name):
    """
    Compare a standard decision tree with four GeneticDecisionTree setups on
    one OpenML dataset.

    The dataset (version 1) is fetched by name. Non-numeric columns with more
    than 10 unique values are dropped, and the remaining frame is passed
    through pd.get_dummies. The data is then split into train and test sets
    (30% test, random_state=42). Every model is fit on the training set with
    max_depth=4 and scored with R2 on the test set; each score is printed as
    soon as it is computed.

    Parameters
    ----------
    dataset_name : str
        Name of the OpenML dataset, passed to fetch_openml.

    Returns
    -------
    list or None
        The R2 scores in the order: standard decision tree, GDT with random
        trees only, GDT with mutations, GDT with combinations, GDT with both
        mutations and combinations. None is returned (after printing a
        message) when no columns are left once the high-cardinality
        non-numeric columns have been dropped.
    """
    np.random.seed(0)

    # Fetch the dataset and separate the features from the target.
    bunch = fetch_openml(dataset_name, version=1, parser='auto')
    features = pd.DataFrame(bunch['data'])
    target = bunch['target']

    # Non-numeric columns with more than 10 distinct values are dropped rather
    # than one-hot encoded, to avoid a very wide encoded frame.
    high_cardinality = [
        col for col in features.columns
        if not is_numeric_dtype(features[col]) and features[col].nunique() > 10
    ]
    features = features.drop(columns=high_cardinality)
    if features.shape[1] == 0:
        print("All columns are categorical with many unique values")
        return None
    features = pd.get_dummies(features)

    # Hold out 30% of the rows for testing.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42)

    # Baseline: a standard scikit-learn decision tree.
    baseline = DecisionTreeRegressor(max_depth=4)
    baseline.fit(X_train, y_train)
    dt_score = r2_score(y_test, baseline.predict(X_test))
    print("DT:", dt_score)
    scores = [dt_score]

    # The global NumPy seed is reset once here, before the first
    # GeneticDecisionTree; it is deliberately NOT reset between the runs below,
    # matching the original sequence of calls.
    np.random.seed(0)
    max_iterations = 4

    # (allow_mutate, allow_combine) for each run, in reporting order:
    # random trees only, mutations only, combinations only, both.
    flag_settings = [
        (False, False),
        (True, False),
        (False, True),
        (True, True),
    ]
    for allow_mutate, allow_combine in flag_settings:
        gdt = GeneticDecisionTree(
            max_depth=4,
            max_iterations=max_iterations,
            allow_mutate=allow_mutate,
            allow_combine=allow_combine,
            n_jobs=-1,
            verbose=True)
        gdt.fit(X_train, y_train)
        gdt_score = r2_score(y_test, gdt.predict(X_test))
        print("Genetic DT:", gdt_score)
        scores.append(gdt_score)

    return scores

In [4]:
# Headings for the comparison table: the dataset name followed by the five R2
# scores in the order returned by test_dataset().
summary_columns = [
    'File Name',
    "DT",
    "GDT (random only)",
    "GDT (with mutations)",
    "GDT (with combinations)",
    "GDT (with both)",
]

display_rows = []
display_dt = None

for file_name in real_files:
    print(".................................................................")
    print(file_name)
    results = test_dataset(file_name)
    if results:
        # Show the table accumulated so far after each dataset completes, so
        # progress is visible during the (slow) run. Datasets for which
        # test_dataset() returned nothing are left out of the table.
        display_rows.append([file_name, *results])
        display_dt = pd.DataFrame(display_rows, columns=summary_columns)
        display(display_dt)

print()
print("Final Results")
display(display_dt)

.................................................................
Reading_Hydro
DT: 0.970289105292342

Iteration: 1
Top (training) scores so far: ['0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979']

Iteration: 2
Top (training) scores so far: ['0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979']

Iteration: 3
Top (training) scores so far: ['0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979']

Iteration: 4
Top (training) scores so far: ['0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979']
Genetic DT: 0.9703255085446699

Iteration: 1
Top (training) scores so far: ['0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979']
Number in top 20 based on mutation: 18

Iteration: 2
Top (training) scores so far: ['0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979', '0.979']
Number in top 20 b

Unnamed: 0,File Name,DT,GDT (random only),GDT (with mutations),GDT (with combinations),GDT (with both)
0,Reading_Hydro,0.970289,0.970326,0.970119,0.970326,0.970161


.................................................................
treasury
DT: 0.9777979913620938

Iteration: 1
Top (training) scores so far: ['0.991', '0.991', '0.991', '0.990', '0.990', '0.990', '0.990', '0.990', '0.990', '0.990']

Iteration: 2
Top (training) scores so far: ['0.991', '0.991', '0.991', '0.991', '0.991', '0.990', '0.990', '0.990', '0.990', '0.990']

Iteration: 3
Top (training) scores so far: ['0.991', '0.991', '0.991', '0.991', '0.991', '0.991', '0.990', '0.990', '0.990', '0.990']

Iteration: 4
Top (training) scores so far: ['0.991', '0.991', '0.991', '0.991', '0.991', '0.991', '0.990', '0.990', '0.990', '0.990']
Genetic DT: 0.9807006937776327

Iteration: 1
Top (training) scores so far: ['0.991', '0.991', '0.991', '0.991', '0.991', '0.991', '0.991', '0.991', '0.991', '0.991']
Number in top 20 based on mutation: 6

Iteration: 2
Top (training) scores so far: ['0.991', '0.991', '0.991', '0.991', '0.991', '0.991', '0.991', '0.991', '0.991', '0.991']
Number in top 20 based 

Unnamed: 0,File Name,DT,GDT (random only),GDT (with mutations),GDT (with combinations),GDT (with both)
0,Reading_Hydro,0.970289,0.970326,0.970119,0.970326,0.970161
1,treasury,0.977798,0.980701,0.981066,0.984355,0.981778



Final Results


Unnamed: 0,File Name,DT,GDT (random only),GDT (with mutations),GDT (with combinations),GDT (with both)
0,Reading_Hydro,0.970289,0.970326,0.970119,0.970326,0.970161
1,treasury,0.977798,0.980701,0.981066,0.984355,0.981778
