In [1]:
# Imports for gplearn and pydotplus in order to see graph view

In [2]:

from IPython.display import Image
import pydotplus

from gplearn.genetic import SymbolicRegressor
from gplearn.fitness import make_fitness

In [3]:
#--Import the required libraries--
import math
import random
import matplotlib.pyplot as plt
import numpy as np

#--Debug switch: when True, intended to report on the evaluation of trees.--
# NOTE(review): no cell visible in this notebook reads this flag -- confirm it
# is still needed before relying on it.
debug_eval = False



# Import the scikit-learn Boston housing dataset
* Number of Instances: 506
* Number of Attributes: 13 predictive attributes, plus the target (MEDV)
* Attribute Information (in order):


<li>CRIM     per capita crime rate by town</li>
<li>ZN       proportion of residential land zoned for lots over 25,000 sq.ft.</li>
<li>INDUS    proportion of non-retail business acres per town</li>
<li>CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)</li>
<li>NOX      nitric oxides concentration (parts per 10 million)</li>
<li>RM       average number of rooms per dwelling</li>
<li>AGE      proportion of owner-occupied units built prior to 1940</li>
<li>DIS      weighted distances to five Boston employment centres</li>
<li>RAD      index of accessibility to radial highways</li>
<li>TAX      full-value property-tax rate per \$10,000</li>
<li>PTRATIO  pupil-teacher ratio by town</li>
<li>B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town</li>
<li>LSTAT    % lower status of the population</li>
<li>MEDV     Median value of owner-occupied homes in \$1000s</li>

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston

# Fetch the bundled data set as an (X, y) tuple of arrays.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn release to run.
data = load_boston(return_X_y=True)

# Fraction of the samples held out for validation.
split_percent = 0.2

# Fixed random_state keeps the train/test partition identical between runs.
train_x, test_x, train_y, test_y = train_test_split(
    data[0],
    data[1],
    test_size=split_percent,
    random_state=0,
)

# Report the shape of every array so the split can be checked at a glance.
print(
    "Shapes:\n data_x:{}\n data_y:{}\n train_x:{}\n test_x:{}\n train_y:{}\n test_y:{}".format(
        *(arr.shape for arr in (data[0], data[1], train_x, test_x, train_y, test_y))
    )
)

Shapes:
 data_x:(506, 13)
 data_y:(506,)
 train_x:(404, 13)
 test_x:(102, 13)
 train_y:(404,)
 test_y:(102,)


# Symbolic regression with grid search


In [None]:
from sklearn.model_selection import GridSearchCV
# Base estimator built with its default settings; it is handed to GridSearchCV
# below together with the `parameters` grid.
est_gp = SymbolicRegressor()


In [None]:
# Search grid handed to GridSearchCV: each key names an estimator setting and
# each list holds the candidate values to try for it. Settings with a single
# candidate (random_state, verbose, warm_start) are pinned rather than searched.
parameters = dict(
    function_set=[
        ('add', 'sub', 'mul', 'div'),
        ('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'abs', 'neg', 'inv', 'max', 'min'),
    ],
    init_depth=[(2, 6), (3, 7)],
    max_samples=[1.0, 0.9],
    p_crossover=[0.9, 0.8],
    p_hoist_mutation=[0.01, 0.05],
    p_point_mutation=[0.01, 0.02],
    random_state=[0],
    tournament_size=[20, 10, 30],
    verbose=[1],
    warm_start=[False],
)

In [None]:
# Wrap the base estimator in a 5-fold cross-validated grid search over
# `parameters`; n_jobs=-1 requests all available workers.
clf = GridSearchCV(
    estimator=est_gp,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
# Run the search on the training split.
clf.fit(train_x, train_y)

Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 378 tasks      | elapsed:  2.7min


In [None]:
# Display the parameter combination selected by the grid search.
clf.best_params_

# Scoring


In [None]:
 print(clf.best_estimator_._program)
clf.best_estimator_.score(test_x,test_y)

In [None]:
# Export the best program as Graphviz DOT text, parse it with pydotplus,
# and display the rendered PNG inline.
best_program_dot = clf.best_estimator_._program.export_graphviz()
graph = pydotplus.graphviz.graph_from_dot_data(best_program_dot)
Image(graph.create_png())