In [None]:
import itertools
import operator
import random

import numpy as np

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import gp

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.datasets import load_digits
try:
    # scikit-learn >= 0.18 exposes the splitter here
    from sklearn.model_selection import train_test_split
except ImportError:
    # fallback for older releases that still ship sklearn.cross_validation
    from sklearn.cross_validation import train_test_split

# Turn NumPy floating-point warnings into FloatingPointError exceptions
# (protectedDiv catches that exception for divisions).
np.seterr(all='raise')

# Fix the random state so that Restart & Run All reproduces the same split and
# the same evolutionary run. DEAP's operators are documented to draw from the
# standard `random` module, hence the explicit seed here.
RANDOM_SEED = 34092
random.seed(RANDOM_SEED)

digits = load_digits()
digit_features, digit_labels = digits.data, digits.target

# Stratified 75% / 25% train/test split
X_train, X_test, y_train, y_test = train_test_split(
    digit_features, digit_labels, stratify=digit_labels,
    train_size=0.75, test_size=0.25, random_state=RANDOM_SEED)

# Define a strongly typed GP primitive set: one float argument per feature
# column (named with the 'Feature' prefix) and a boolean result.
n_features = digit_features.shape[1]
pset = gp.PrimitiveSetTyped('MAIN', itertools.repeat(float, n_features), bool, 'Feature')

# boolean operators: (callable, number of boolean inputs), registered in order
for bool_op, arity in ((operator.and_, 2), (operator.or_, 2), (operator.not_, 1)):
    pset.addPrimitive(bool_op, [bool] * arity, bool)
# floating point operators
# Define a protected division function
def protectedDiv(left, right):
    """Divide ``left`` by ``right``, returning ``1.`` when the division fails.

    Both ``ZeroDivisionError`` and ``FloatingPointError`` are treated as a
    failed division; any other exception propagates to the caller.
    """
    try:
        quotient = left / right
    except (ZeroDivisionError, FloatingPointError):
        return 1.
    return quotient

# Binary float -> float arithmetic, registered in a fixed order.
for float_op in (operator.add, operator.sub, operator.mul, protectedDiv):
    pset.addPrimitive(float_op, [float, float], float)

# logic operators
# Define a new if-then-else function
def if_then_else(in1, output1, output2):
    """Return ``output1`` when ``in1`` is truthy, otherwise ``output2``."""
    return output1 if in1 else output2

# Comparisons map two floats to a boolean; if_then_else selects one of two
# floats depending on a boolean.
pset.addPrimitive(operator.lt, [float, float], bool)
pset.addPrimitive(operator.eq, [float, float], bool)
pset.addPrimitive(if_then_else, [bool, float, float], float)

# terminals
pset.addTerminal(False, bool)
pset.addTerminal(True, bool)
for val in np.arange(-10., 11.):
    # Register a built-in float rather than the np.float64 yielded by arange:
    # DEAP is understood to write a constant terminal's repr() into the
    # expression it later evaluates, and NumPy >= 2 reprs np.float64 scalars as
    # "np.float64(...)", which is not resolvable there. A plain float always
    # reprs as a bare literal such as "-10.0".
    pset.addTerminal(float(val), float)

# Fitness with a single objective weighted +1.0, and an individual type that
# is a GP primitive tree carrying that fitness.
creator.create(
    'FitnessMax',
    base.Fitness,
    weights=(1.0,),
)
creator.create(
    'Individual',
    gp.PrimitiveTree,
    fitness=creator.FitnessMax,
)

toolbox = base.Toolbox()

# expr -> individual -> population: each factory builds on the previous one.
toolbox.register(
    'expr',
    gp.genHalfAndHalf,
    pset=pset,
    min_=1,
    max_=3,
)
toolbox.register(
    'individual',
    tools.initIterate,
    creator.Individual,
    toolbox.expr,
)
toolbox.register(
    'population',
    tools.initRepeat,
    list,
    toolbox.individual,
)

# Turns a tree into a Python callable over the primitive set's arguments.
toolbox.register(
    'compile',
    gp.compile,
    pset=pset,
)

def evaluate_individual(individual):
    """Score a GP tree by the classifier trained on the rows it selects.

    The tree is compiled into a callable and applied to every row of
    ``X_train``; the collected results are used to index ``X_train`` and
    ``y_train``. A ``DecisionTreeClassifier`` is fitted on that selection and
    scored on ``X_test`` / ``y_test``.

    Returns a one-element tuple holding the score, or ``1e-20`` when the
    selection contains no rows.
    """
    # NOTE(review): fitness is measured on X_test / y_test, the same split the
    # later cells use to report accuracy -- confirm this is intended.
    row_selector = toolbox.compile(expr=individual)
    keep_mask = np.array([row_selector(*row) for row in X_train])

    selected_rows = X_train[keep_mask]
    if len(selected_rows) == 0:
        return (1e-20,)

    tree = DecisionTreeClassifier(random_state=34092)
    tree.fit(selected_rows, y_train[keep_mask])
    return (tree.score(X_test, y_test),)
    
# Evolutionary operators: fitness function, tournament selection (size 3),
# one-point crossover, and uniform mutation fed by full trees of depth 0-3.
toolbox.register('evaluate', evaluate_individual)
toolbox.register('select', tools.selTournament, tournsize=3)
toolbox.register('mate', gp.cxOnePoint)
toolbox.register('expr_mut', gp.genFull, min_=0, max_=3)
toolbox.register('mutate', gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

# Initial population of 100 trees; the hall of fame keeps a single individual.
population = toolbox.population(n=100)
halloffame = tools.HallOfFame(1)

# Per-generation summary statistics over the individuals' fitness values,
# registered in the order they should appear in the log.
stats = tools.Statistics(lambda ind: ind.fitness.values)
for stat_label, reducer in (('std', np.std), ('min', np.min), ('avg', np.mean), ('max', np.max)):
    stats.register(stat_label, reducer)

# Reference accuracies: each classifier is fitted on the full training split
# and scored on the test split, using the same random_state as the GP runs.
baseline_models = (
    ('Base DecisionTreeClassifier accuracy: {}', DecisionTreeClassifier),
    ('Base RandomForestClassifier accuracy: {}', RandomForestClassifier),
    ('Base GradientBoostingClassifier accuracy: {}', GradientBoostingClassifier),
)
for report_template, estimator_cls in baseline_models:
    clf = estimator_cls(random_state=34092)
    clf.fit(X_train, y_train)
    print(report_template.format(clf.score(X_test, y_test)))

print('')

# Evolution hyper-parameters: crossover probability, mutation probability,
# number of generations, and whether to print the per-generation log.
cxpb = 0.5
mutpb = 0.5
ngen = 50
verbose = True

# algorithms.eaSimple is the library implementation of the generational loop
# that used to be spelled out by hand here: evaluate individuals with invalid
# fitness, update the hall of fame, record statistics, then for each
# generation select -> varAnd -> re-evaluate -> replace the population in
# place. It returns the final population and the logbook.
population, logbook = algorithms.eaSimple(
    population, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen,
    stats=stats, halloffame=halloffame, verbose=verbose)

# Show the best individual found, as an expression string.
str(halloffame[0])

Base DecisionTreeClassifier accuracy: 0.869179600887
Base RandomForestClassifier accuracy: 0.946784922395
Base GradientBoostingClassifier accuracy: 0.953436807095

gen	nevals	std     	min  	avg     	max     
0  	100   	0.379036	1e-20	0.394612	0.875831
1  	85    	0.337387	1e-20	0.584945	0.878049
2  	76    	0.300443	1e-20	0.709135	0.873614
3  	73    	0.311377	1e-20	0.707761	0.873614
4  	75    	0.251842	1e-20	0.774656	0.875831
5  	73    	0.268957	1e-20	0.754745	0.880266
6  	71    	0.21039 	1e-20	0.80051 	0.882483
7  	78    	0.209782	1e-20	0.811175	0.882483
8  	77    	0.125856	1e-20	0.852106	0.891353
9  	71    	0.132058	1e-20	0.843259	0.891353
10 	76    	0.0994663	1e-20	0.855277	0.891353

In [None]:
# Build an ensemble from the final population: each individual selects a
# subset of the training rows, a decision tree is fitted on that subset, and
# its test-set predictions are collected. Individuals that select no rows are
# skipped.
forest_predictions = []

# `population` is the list evolved in the first cell (the name `pop` was never
# defined in this notebook and failed on a fresh kernel).
for individual in population:
    func = toolbox.compile(expr=individual)
    subsample = np.array([func(*record) for record in X_train])

    train_subset = X_train[subsample]
    if train_subset.shape[0] == 0:
        continue

    clf = DecisionTreeClassifier(random_state=34092)
    clf.fit(train_subset, y_train[subsample])
    predictions = clf.predict(X_test)
    forest_predictions.append(predictions)

In [None]:
from collections import Counter
from sklearn.metrics import accuracy_score

# Majority vote: for every test instance take the label predicted most often
# across the per-individual trees.
y_pred = np.array([
    Counter(instance_forest_predictions).most_common(1)[0][0]
    for instance_forest_predictions in zip(*forest_predictions)
])

# accuracy_score returns the fraction of matching labels as a float; the
# manual `np.sum(...) / len(...)` relied on `/` being true division, which is
# not the case for integers under Python 2.
accuracy_score(y_test, y_pred)