In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

from GeneticFeatures.GeneticFeatureGenerator import *
from GeneticFeatures.Node import *

In [30]:
df = pd.read_csv('winequality-red.csv')
# x is everything except the last column
X = df.iloc[:, :-1].values
# y is the last column
Y = df.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [31]:
def add(x, y):
    return x + y

def sub(x, y):
    return x - y

def mul(x, y):
    return x * y

def div(x, y):
    return x / (y + 1e-10)

In [32]:
generator = GeneticFeatureGenerator(
    [add, sub, mul, div],
    operation_names = ["+", "-", "*", "/"],
    popsize = 50, 
    maxiter = 40, 
    clone_prob = 0.1, 
    mutation_rate = 0.05
)

In [33]:
multifeature = MultiFeatureGenerator(X_train, Y_train, generator, 10, 1000, verbose=True)

In [34]:
trees = [i for i in multifeature]

Split:  1
Generation:  0
Best fitness:  0.14713541666666663
Worst fitness:  0.0
Mean fitness:  0.020814732142857145
Median fitness:  0.0
Std fitness:  0.03119770128605255

Generation:  1
Best fitness:  0.25037202380952384
Worst fitness:  0.0
Mean fitness:  0.05947544642857143
Median fitness:  0.04845610119047622
Std fitness:  0.058128345951175935

Generation:  2
Best fitness:  0.24032738095238093
Worst fitness:  0.0
Mean fitness:  0.09297247023809524
Median fitness:  0.0767299107142857
Std fitness:  0.06990792335854175

Generation:  3
Best fitness:  0.24032738095238093
Worst fitness:  0.0
Mean fitness:  0.08767113095238095
Median fitness:  0.08193824404761901
Std fitness:  0.06825066966228388

Generation:  4
Best fitness:  0.25
Worst fitness:  0.0
Mean fitness:  0.1015811011904762
Median fitness:  0.08147321428571425
Std fitness:  0.0854734903625843

Generation:  5
Best fitness:  0.25074404761904767
Worst fitness:  0.0
Mean fitness:  0.11188244047619048
Median fitness:  0.0899367559523

In [35]:
trees

[<Node.Tree at 0x22e80b92e50>,
 <Node.Tree at 0x22e80bde8b0>,
 <Node.Tree at 0x22e81783430>,
 <Node.Tree at 0x22e90bfc5b0>,
 <Node.Tree at 0x22e9092f8b0>,
 <Node.Tree at 0x22e81984610>,
 <Node.Tree at 0x22e80d7d7f0>,
 <Node.Tree at 0x22e80a96700>,
 <Node.Tree at 0x22e90371490>,
 <Node.Tree at 0x22e81174640>]

In [36]:
new_train_features = np.array([tree(X_train) for tree in trees]).T
new_test_features = np.array([tree(X_test) for tree in trees]).T

new_train_features_concated = np.concatenate([X_train, new_train_features], axis=1)
new_test_features_concated = np.concatenate([X_test, new_test_features], axis=1)

In [44]:
from sklearn.ensemble import RandomForestRegressor

# test with old featues
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, Y_train)
print("Old features score: ", rf.score(X_test, Y_test))

# test with new features
rf = RandomForestRegressor(n_estimators=100)
rf.fit(new_train_features, Y_train)
print("New features score: ", rf.score(new_test_features, Y_test))

Old features score:  0.5138672126003079
New features score:  0.4610769437678387
