In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

from GeneticFeatures.GeneticFeatureGenerator import *
from GeneticFeatures.Node import *

In [2]:
df = pd.read_csv('winequality-red.csv')
# x is everything except the last column
X = df.iloc[:, :-1].values
# y is the last column
Y = df.iloc[:, -1].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [3]:
def add(x, y):
    return x + y

def sub(x, y):
    return x - y

def mul(x, y):
    return x * y

def div(x, y):
    return x / (y + 1e-10)

In [4]:
generator = GeneticFeatureGenerator(
    [add, sub, mul, div],
    operation_names = ["+", "-", "*", "/"],
    popsize = 100, 
    maxiter = 60, 
    mutation_rate = 0.05
)

In [5]:
multifeature = MultiFeatureGenerator(X_train, Y_train, generator, 8, 4, 5000, verbose=True)

In [6]:
trees = [i for i in multifeature]

Split:  0 Feature:  0
Split:  1 Feature:  1itness:  0.24914504716981134 Generation best 0.24914504716981134
Split:  2 Feature:  2itness:  0.27588443396226414 Generation best 0.25191627358490565
Split:  3 Feature:  3itness:  0.25229952830188684 Generation best 0.23558372641509429
Split:  0 Feature:  4itness:  0.2934846698113207 Generation best 0.27862617924528327
Split:  1 Feature:  5itness:  0.2675117924528302 Generation best 0.262028301886792586
Split:  2 Feature:  6itness:  0.2564563679245283 Generation best 0.239504716981132077
Split:  3 Feature:  7itness:  0.2929540094339622 Generation best 0.29295400943396223
Iteration:  59 Best fitness:  0.2581662735849056 Generation best 0.249262971698113182

In [7]:
trees

[<Node.Tree at 0x1d097bde8b0>,
 <Node.Tree at 0x1d0984651c0>,
 <Node.Tree at 0x1d097c26250>,
 <Node.Tree at 0x1d0978cc460>,
 <Node.Tree at 0x1d0999afac0>,
 <Node.Tree at 0x1d097bbc160>,
 <Node.Tree at 0x1d097e52df0>,
 <Node.Tree at 0x1d0986210a0>]

In [8]:
new_train_features = np.array([tree(X_train) for tree in trees]).T
new_test_features = np.array([tree(X_test) for tree in trees]).T

new_train_features_concated = np.concatenate([X_train, new_train_features], axis=1)
new_test_features_concated = np.concatenate([X_test, new_test_features], axis=1)

In [13]:
from sklearn.ensemble import RandomForestRegressor

# test with old featues
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, Y_train)
print("Old features score: ", rf.score(X_test, Y_test))

# test with new features
rf = RandomForestRegressor(n_estimators=100)
rf.fit(new_train_features_concated, Y_train)
print("New features score: ", rf.score(new_test_features_concated, Y_test))

Old features score:  0.5194271283193115
New features score:  0.5249301394222867
