In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import f1_score, r2_score
from pandas.api.types import is_numeric_dtype

from genetic_decision_tree import GeneticDecisionTree

# Wine

In [15]:
# This is an example of a real-world (though toy) multi-class classification
# problem (three classes). In this case, the genetic decision tree does
# significantly better than the standard decision tree limited to a depth of 2,
# achieving an f1 macro score on a hold-out test set of 0.97 as compared to 0.88

from sklearn.datasets import load_wine

np.random.seed(0)

# Build a feature frame with one named column per feature, plus the targets
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)
y_true = data.target
print(pd.Series(y_true).value_counts())

# Hold out 30% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y_true,
    test_size=0.3,
    random_state=42,
)

# Baseline: a standard decision tree limited to depth 2
clf = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(
    "DT:",
    f1_score(y_test, y_pred, average='macro'),
)

# GeneticDecisionTree at the same depth; NumPy's global RNG is re-seeded
# immediately before it is constructed and fitted
np.random.seed(0)
gdt = GeneticDecisionTree(
    max_depth=2,
    max_iterations=5,
    allow_mutate=True,
    allow_combine=True,
    verbose=True,
)
gdt.fit(X_train, y_train)
y_pred = gdt.predict(X_test)
print(
    "Genetic DT:",
    f1_score(y_test, y_pred, average='macro'),
)

1    71
0    59
2    48
dtype: int64
DT: 0.8807591761488132

Iteration: 1
Top (training) scores so far: ['0.963', '0.963', '0.955', '0.955', '0.955', '0.955', '0.955', '0.954', '0.948', '0.948']
Number in top 20 based on mutation: 0
Number in top 20 based on combination: 12

Iteration: 2
Top (training) scores so far: ['0.963', '0.963', '0.963', '0.963', '0.963', '0.963', '0.963', '0.963', '0.955', '0.955']
Number in top 20 based on mutation: 7
Number in top 20 based on combination: 11

Iteration: 3
Top (training) scores so far: ['0.963', '0.963', '0.963', '0.963', '0.963', '0.963', '0.963', '0.963', '0.963', '0.963']
Number in top 20 based on mutation: 10
Number in top 20 based on combination: 19

Iteration: 4
Top (training) scores so far: ['0.963', '0.963', '0.963', '0.963', '0.963', '0.963', '0.963', '0.963', '0.963', '0.963']
Number in top 20 based on mutation: 19
Number in top 20 based on combination: 19

Iteration: 5
Top (training) scores so far: ['0.963', '0.963', '0.963', '0.963

In [16]:
# Show the IF/ELSE rules of the tree held by `gdt` (fitted on the wine data above)
gdt.export_tree()


IF flavanoids < 1.4000
| IF color_intensity < 3.7250
| | 1
| ELSE color_intensity > 3.7250
| | 2
ELSE flavanoids > 1.4000
| IF proline < 724.5000
| | 1
| ELSE proline > 724.5000
| | 0


# Digits

In [4]:
# This is an example of a multi-class classification problem (10 classes).
# In this case, the Genetic Decision Tree does substantially better than
# the standard decision tree. Using a depth of 4 for both, it achieves an
# f1 macro score of 0.75 on the hold-out test set, compared to 0.53.

from sklearn.datasets import load_digits

np.random.seed(0)

# Build a feature frame with one named column per feature, plus the targets
data = load_digits()
df = pd.DataFrame(data.data, columns=data.feature_names)
y_true = data.target
print(pd.Series(y_true).value_counts())

# Hold out 30% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y_true,
    test_size=0.3,
    random_state=42,
)

# Baseline: a standard decision tree limited to depth 4
clf = DecisionTreeClassifier(max_depth=4).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(
    "DT:",
    f1_score(y_test, y_pred, average='macro'),
)

# GeneticDecisionTree at the same depth, here with mutation and combination
# both switched off; NumPy's global RNG is re-seeded immediately beforehand
np.random.seed(0)
gdt = GeneticDecisionTree(
    max_depth=4,
    max_iterations=5,
    allow_mutate=False,
    allow_combine=False,
    verbose=True,
)
gdt.fit(X_train, y_train)
y_pred = gdt.predict(X_test)
print(
    "Genetic DT:",
    f1_score(y_test, y_pred, average='macro'),
)

3    183
1    182
5    182
4    181
6    181
9    180
7    179
0    178
2    177
8    174
dtype: int64
DT: 0.5296251262630574

Iteration: 1
Top (training) scores so far: ['0.743', '0.722', '0.711', '0.709', '0.673', '0.671', '0.671', '0.638', '0.637', '0.629']

Iteration: 2
Top (training) scores so far: ['0.754', '0.743', '0.722', '0.711', '0.709', '0.673', '0.671', '0.671', '0.647', '0.639']

Iteration: 3
Top (training) scores so far: ['0.754', '0.743', '0.722', '0.714', '0.711', '0.709', '0.704', '0.673', '0.671', '0.671']

Iteration: 4
Top (training) scores so far: ['0.754', '0.743', '0.722', '0.714', '0.711', '0.709', '0.704', '0.673', '0.671', '0.671']

Iteration: 5
Top (training) scores so far: ['0.759', '0.754', '0.743', '0.722', '0.714', '0.711', '0.709', '0.704', '0.685', '0.673']
Genetic DT: 0.7500570191170912


In [5]:
# Show the IF/ELSE rules of the tree held by `gdt` (fitted on the digits data above)
gdt.export_tree()


IF pixel_5_3 < 2.5000
| IF pixel_3_4 < 1.5000
| | IF pixel_5_2 < 3.0000
| | | IF pixel_7_3 < 15.5000
| | | | 7
| | | ELSE pixel_7_3 > 15.5000
| | | | 2
| | ELSE pixel_5_2 > 3.0000
| | | 0
| ELSE pixel_3_4 > 1.5000
| | IF pixel_3_2 < 3.5000
| | | IF pixel_3_6 < 3.5000
| | | | 3
| | | ELSE pixel_3_6 > 3.5000
| | | | 9
| | ELSE pixel_3_2 > 3.5000
| | | IF pixel_2_5 < 7.5000
| | | | 5
| | | ELSE pixel_2_5 > 7.5000
| | | | 9
ELSE pixel_5_3 > 2.5000
| IF pixel_6_6 < 1.5000
| | IF pixel_4_6 < 0.5000
| | | IF pixel_2_4 < 15.5000
| | | | 8
| | | ELSE pixel_2_4 > 15.5000
| | | | 1
| | ELSE pixel_4_6 > 0.5000
| | | IF pixel_3_2 < 10.5000
| | | | 7
| | | ELSE pixel_3_2 > 10.5000
| | | | 4
| ELSE pixel_6_6 > 1.5000
| | IF pixel_4_2 < 8.5000
| | | IF pixel_5_5 < 7.5000
| | | | 2
| | | ELSE pixel_5_5 > 7.5000
| | | | 8
| | ELSE pixel_4_2 > 8.5000
| | | IF pixel_2_5 < 7.0000
| | | | 6
| | | ELSE pixel_2_5 > 7.0000
| | | | 0


# Synthetic Classification Example

In [17]:
# With a depth of 4, the GeneticDecisionTree achieves an F1 macro score of
# 0.81, compared to 0.57 for a standard decision tree.

np.random.seed(0)
n_rows = 1000

# Nine independent features drawn with np.random.rand. The comprehension
# iterates 'a'..'i' in order, so the random stream is consumed column by
# column exactly as if each column were written out by hand.
df = pd.DataFrame({col: np.random.rand(n_rows) for col in 'abcdefghi'})

# Assign the class label with vectorised rules instead of a row-by-row
# `.loc` loop. np.select takes the choice of the FIRST condition that is
# True for a row, which reproduces an if/elif chain evaluated top to bottom;
# rows matching no rule fall through to the default class 'W'.
df['Y'] = np.select(
    [
        (df['a'] > 0.9) & (df['b'] > 0.5) & (df['e'] > 0.5),  # -> 'X'
        (df['a'] > 0.5) & (df['b'] > 0.5) & (df['c'] > 0.9),  # -> 'X'
        (df['b'] > 0.9) & (df['c'] > 0.4),                    # -> 'Y'
        (df['b'] > 0.4) & (df['c'] > 0.9),                    # -> 'Y'
        (df['a'] > 0.7) & (df['c'] > 0.5) & (df['d'] > 0.3),  # -> 'Z'
        (df['a'] > 0.3) & (df['c'] > 0.5) & (df['d'] > 0.7),  # -> 'Z'
    ],
    ['X', 'X', 'Y', 'Y', 'Z', 'Z'],
    default='W',
)

print(df['Y'].value_counts())

# Split features (every column except 'Y') and target, holding out 30% for testing
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Y']),
    df['Y'],
    test_size=0.3,
    random_state=42,
)

# Baseline: a standard decision tree limited to depth 4
clf = DecisionTreeClassifier(max_depth=4).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(
    "DT:",
    f1_score(y_test, y_pred, average='macro'),
)

# GeneticDecisionTree at the same depth; NumPy's global RNG is re-seeded
# immediately before it is constructed and fitted
np.random.seed(0)
gdt = GeneticDecisionTree(max_depth=4, max_iterations=5,
                          allow_mutate=True, allow_combine=True,
                          n_jobs=1, verbose=True)
gdt.fit(X_train, y_train)
y_pred = gdt.predict(X_test)
print(
    "Genetic DT:",
    f1_score(y_test, y_pred, average='macro'),
)

# Show the rules of the fitted genetic tree
gdt.export_tree()


W    758
Z    105
Y     86
X     51
Name: Y, dtype: int64
DT: 0.570624521282416

Iteration: 1
Top (training) scores so far: ['0.841', '0.825', '0.822', '0.817', '0.814', '0.794', '0.792', '0.783', '0.781', '0.779']
Number in top 20 based on mutation: 0
Number in top 20 based on combination: 0

Iteration: 2
Top (training) scores so far: ['0.842', '0.842', '0.841', '0.841', '0.841', '0.840', '0.838', '0.837', '0.833', '0.832']
Number in top 20 based on mutation: 10
Number in top 20 based on combination: 7

Iteration: 3
Top (training) scores so far: ['0.843', '0.843', '0.842', '0.842', '0.842', '0.842', '0.842', '0.842', '0.842', '0.842']
Number in top 20 based on mutation: 20
Number in top 20 based on combination: 8

Iteration: 4
Top (training) scores so far: ['0.845', '0.845', '0.845', '0.844', '0.843', '0.843', '0.843', '0.843', '0.843', '0.843']
Number in top 20 based on mutation: 20
Number in top 20 based on combination: 13

Iteration: 5
Top (training) scores so far: ['0.845', '0.845

# Synthetic Regression Example

In [18]:
# A simple example creating a synthetic regression dataset, fitting a standard
# decision tree and a GeneticDecisionTree, and measuring the R2 score
# for both. The standard Decision Tree achieved 0.57 and the
# GeneticDecisionTree 0.86. This gain is, though, larger than normally
# seen with real data, as the true function here relies more heavily
# on conditional logic than is normally the case.

np.random.seed(0)
n_rows = 1000

# Nine independent features drawn with np.random.rand. The comprehension
# iterates 'a'..'i' in order, so the random stream is consumed column by
# column exactly as if each column were written out by hand.
df = pd.DataFrame({col: np.random.rand(n_rows) for col in 'abcdefghi'})

# Build the target with vectorised rules instead of a row-by-row `.loc`
# loop. np.select takes the choice of the FIRST condition that is True for a
# row, which reproduces an if/elif chain evaluated top to bottom; rows
# matching no rule keep the default target, column 'c'.
df['Y'] = np.select(
    [
        (df['a'] > 0.9) & (df['b'] > 0.5) & (df['e'] > 0.5),
        (df['a'] > 0.5) & (df['b'] > 0.5) & (df['c'] > 0.9),
        (df['b'] > 0.9) & (df['c'] > 0.4),
        (df['b'] > 0.4) & (df['c'] > 0.9),
        (df['a'] > 0.7) & (df['c'] > 0.5) & (df['d'] > 0.3),
        (df['a'] > 0.3) & (df['c'] > 0.5) & (df['d'] > 0.7),
    ],
    [
        df['a'] * df['b'],
        df['a'] + df['b'] + df['c'],
        df['b'] * df['c'],
        df['b'] * df['c'],
        df['e'],
        df['f'],
    ],
    default=df['c'],
)

# Split features (every column except 'Y') and target, holding out 30% for testing
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Y']),
    df['Y'],
    test_size=0.3,
    random_state=42,
)

# Baseline: a standard regression tree limited to depth 4
clf = DecisionTreeRegressor(max_depth=4).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(
    "DT:",
    r2_score(y_test, y_pred),
)

# GeneticDecisionTree at the same depth; NumPy's global RNG is re-seeded
# immediately before it is constructed and fitted
np.random.seed(0)
gdt = GeneticDecisionTree(max_depth=4, max_iterations=5,
                          allow_mutate=True, allow_combine=True,
                          n_jobs=1, verbose=True)
gdt.fit(X_train, y_train)
y_pred = gdt.predict(X_test)
print(
    "Genetic DT:",
    r2_score(y_test, y_pred),
)

# Show the rules of the fitted genetic tree
gdt.export_tree()


DT: 0.5655716777606731

Iteration: 1
Top (training) scores so far: ['0.904', '0.889', '0.887', '0.882', '0.881', '0.879', '0.868', '0.866', '0.864', '0.851']
Number in top 20 based on mutation: 0
Number in top 20 based on combination: 0

Iteration: 2
Top (training) scores so far: ['0.910', '0.910', '0.909', '0.908', '0.908', '0.908', '0.908', '0.908', '0.907', '0.907']
Number in top 20 based on mutation: 14
Number in top 20 based on combination: 6

Iteration: 3
Top (training) scores so far: ['0.911', '0.910', '0.910', '0.910', '0.910', '0.910', '0.910', '0.910', '0.910', '0.910']
Number in top 20 based on mutation: 14
Number in top 20 based on combination: 20

Iteration: 4
Top (training) scores so far: ['0.911', '0.911', '0.911', '0.911', '0.911', '0.911', '0.911', '0.911', '0.911', '0.911']
Number in top 20 based on mutation: 20
Number in top 20 based on combination: 20

Iteration: 5
Top (training) scores so far: ['0.912', '0.912', '0.912', '0.912', '0.911', '0.911', '0.911', '0.911',