In [79]:
def mydeap(mungedtrain):
    """Evolve a genetic-programming classifier with DEAP and return the best tree.

    Parameters
    ----------
    mungedtrain : DataFrame
        Numeric training frame. Columns at positions 2..9 are used as the
        eight model inputs and the 'Survived' column as the target.

    Returns
    -------
    (func2, hof)
        ``func2`` is the compiled callable for the best individual
        (``hof[0]``); ``hof`` is the hall of fame holding up to 10 individuals.

    Side effects: seeds the global ``random`` module with 318, creates the
    ``FitnessMin`` / ``Individual`` classes in ``deap.creator``, and prints
    the evolution log and the hall of fame.
    """
    
    import operator
    import math
    import random
    
    import numpy
    
    from deap import algorithms
    from deap import base
    from deap import creator
    from deap import tools
    from deap import gp
    
    # Feature rows (eight positional columns) and matching target labels.
    inputs = mungedtrain.iloc[:,2:10].values.tolist()
    outputs = mungedtrain['Survived'].values.tolist()
    
    # Division primitive that returns 1 instead of raising on a zero divisor.
    def protectedDiv(left, right):
        try:
            return left / right
        except ZeroDivisionError:
            return 1
    
    pset = gp.PrimitiveSet("MAIN", 8) # eight inputs, one per feature column
    pset.addPrimitive(operator.add, 2)
    pset.addPrimitive(operator.sub, 2)
    pset.addPrimitive(operator.mul, 2)
    pset.addPrimitive(protectedDiv, 2)
    pset.addPrimitive(operator.neg, 1)
    pset.addPrimitive(math.cos, 1)
    pset.addPrimitive(math.sin, 1)
    pset.addPrimitive(max, 2)
    pset.addPrimitive(min, 2) # add more?
    #pset.addEphemeralConstant("rand101", lambda: random.uniform(-10,10)) # adjust?
    # Rename the default ARG0..ARG7 terminals to x1..x8.
    pset.renameArguments(ARG0='x1')
    pset.renameArguments(ARG1='x2')
    pset.renameArguments(ARG2='x3')
    pset.renameArguments(ARG3='x4')
    pset.renameArguments(ARG4='x5')
    pset.renameArguments(ARG5='x6')
    pset.renameArguments(ARG6='x7')
    pset.renameArguments(ARG7='x8')

    
    # NOTE: despite the name "FitnessMin", the weight is +1.0, so the fitness
    # (an accuracy) is maximised.
    creator.create("FitnessMin", base.Fitness, weights=(1.0,))
    creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin)
    
    toolbox = base.Toolbox()
    toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=3) # initial trees of depth 1..3
    toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("compile", gp.compile, pset=pset)
    
    def evalSymbReg(individual):
        # Transform the tree expression in a callable function
        func = toolbox.compile(expr=individual)
        # Accuracy: fraction of rows where round(1 - sigmoid(f(x))) equals the
        # label. Returned as a one-element tuple, as DEAP fitnesses are tuples.
        return sum(round(1.-(1./(1.+numpy.exp(-func(*in_))))) == out for in_, out in zip(inputs, outputs))/len(mungedtrain),
    
    toolbox.register("evaluate", evalSymbReg)
    toolbox.register("select", tools.selTournament, tournsize=3)
    toolbox.register("mate", gp.cxOnePoint)
    toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
    toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)
    
    # Limit the height of trees produced by crossover and mutation to 17.
    toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))
    toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=17))
    
    
    
    
    # Fixed seed so the run is repeatable.
    random.seed(318)
    
    pop = toolbox.population(n=400) # population of 400 individuals
    hof = tools.HallOfFame(10)
    
    # Per-generation statistics on both fitness values and tree size (len).
    stats_fit = tools.Statistics(lambda ind: ind.fitness.values)
    stats_size = tools.Statistics(len)
    mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)
    mstats.register("avg", numpy.mean)
    mstats.register("std", numpy.std)
    mstats.register("min", numpy.min)
    mstats.register("max", numpy.max)
    
    # Positional arguments 0.5, 0.2, 50 are presumably eaSimple's crossover
    # probability, mutation probability and generation count - confirm
    # against the DEAP documentation.
    pop, log = algorithms.eaSimple(pop, toolbox, 0.5, 0.2, 50, stats=mstats,
                                   halloffame=hof, verbose=True) # verbose prints the log per generation
    print(hof)
    # Compile the top hall-of-fame entry into a plain Python callable.
    func2 =toolbox.compile(expr=hof[0])
    return func2, hof

In [80]:
import numpy as np
import pandas as pd

def Outputs(data):
    """Convert raw scores into hard 0/1 predictions.

    Evaluates ``1 - 1/(1 + exp(-data))`` element-wise and rounds the result
    with ``np.round``, so positive scores map to 0 and negative scores to 1.

    Parameters
    ----------
    data : scalar or array-like accepted by ``np.exp``
        Raw outputs of the evolved function.

    Returns
    -------
    Rounded values with the same shape as ``data``.
    """
    logistic = 1. / (1. + np.exp(-data))
    complement = 1. - logistic
    return np.round(complement)

In [81]:
def _encodeKeepingUnmapped(keys, codes, fallback):
    """Look each key up in ``codes``; where no code exists keep ``fallback``."""
    encoded = keys.map(codes)
    return encoded.where(encoded.notna(), fallback)


def MungeData(data):
    """Return an all-float copy of a raw passenger frame, ready for the GP model.

    The input frame is left untouched, so the function can be called
    repeatedly on the same frame (the earlier in-place version raised
    ``KeyError`` on a second call because 'Ticket'/'Name' were already gone,
    and relied on chained ``fillna(..., inplace=True)`` which does nothing
    under pandas Copy-on-Write).

    Transformations:
      * 'Ticket' and 'Name' are dropped.
      * 'Sex' becomes 1 for 'male' and 0 for anything else (including missing).
      * 'Cabin' is encoded by its first character: A..G -> 1..7, T -> 8;
        missing cabins become 0; any other value is kept as is.
      * 'Embarked' is encoded C -> 1, Q -> 2, S -> 3; missing becomes 0;
        any other value is kept as is.
      * every remaining missing value becomes -1.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Ticket', 'Name', 'Sex', 'Cabin', 'Embarked'.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining columns in their original order, cast to
        float.

    Raises
    ------
    KeyError
        If 'Ticket' or 'Name' is absent.
    ValueError
        If a value left after encoding cannot be converted to float.
    """
    cabinCodes = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}
    embarkedCodes = {'C': 1, 'Q': 2, 'S': 3}

    # drop() returns a new frame, so the caller's data is never modified.
    munged = data.drop(['Ticket', 'Name'], axis=1)

    # Sex: a missing value compares unequal to 'male' and therefore becomes 0.
    munged['Sex'] = (munged['Sex'] == 'male').astype(int)

    # Cabin: only the leading deck letter matters; '0' marks a missing cabin
    # and is turned into 0.0 by the final float cast.
    cabin = munged['Cabin'].fillna('0')
    munged['Cabin'] = _encodeKeepingUnmapped(cabin.str[0], cabinCodes, cabin)

    # Embarked: encode the known ports, then fill whatever is still missing.
    embarked = munged['Embarked']
    munged['Embarked'] = _encodeKeepingUnmapped(embarked, embarkedCodes, embarked).fillna(0)

    # Any other gaps are flagged with -1.
    return munged.fillna(-1).astype(float)

In [82]:
# Notebook-level placeholders; bestInd is overwritten with the hall of fame
# when the guarded block below runs and is read by a later cell.
bestInd = 0
pareto = 0  # not used anywhere in the visible cells
if __name__ == "__main__":
    # Load the raw train/test CSVs, forcing 'Age' to float64.
    train = pd.read_csv("./datasets/train.csv", dtype={"Age": np.float64}, )
    test = pd.read_csv("./datasets/test.csv", dtype={"Age": np.float64}, )
    mungedtrain = MungeData(train)
    
    #GP
    # Evolve the classifier: GeneticFunction is the compiled best tree,
    # bestInd the hall of fame returned by mydeap.
    GeneticFunction, bestInd = mydeap(mungedtrain)
    
    #test
    # Re-score the training rows with the evolved function (same eight
    # positional columns mydeap trained on).
    mytrain = mungedtrain.iloc[:,2:10].values.tolist()
    trainPredictions = Outputs(np.array([GeneticFunction(*x) for x in mytrain]))

    # Save predicted vs. actual labels for the training set.
    pdtrain = pd.DataFrame({'PassengerId': mungedtrain.PassengerId.astype(int),
                            'Predicted': trainPredictions.astype(int),
                            'Survived': mungedtrain.Survived.astype(int)})
    pdtrain.to_csv('MYgptrain.csv', index=False)
    from sklearn.metrics import accuracy_score
    print(accuracy_score(mungedtrain.Survived.astype(int),trainPredictions.astype(int)))
    
    mungedtest = MungeData(test)
    # Feature columns sit at 1:9 here rather than 2:10 - presumably because
    # test.csv has no 'Survived' column; confirm against the data files.
    mytest = mungedtest.iloc[:,1:9].values.tolist()
    testPredictions = Outputs(np.array([GeneticFunction(*x) for x in mytest]))

    # Write the test-set predictions keyed by PassengerId.
    pdtest = pd.DataFrame({'PassengerId': mungedtest.PassengerId.astype(int),
                            'Survived': testPredictions.astype(int)})
    pdtest.to_csv('gptest.csv', index=False)





   	      	                        fitness                         	              size             
   	      	--------------------------------------------------------	-------------------------------
gen	nevals	avg     	max     	min     	std     	avg   	max	min	std    
0  	400   	0.541944	0.750842	0.213244	0.111546	5.2175	15 	2  	3.32117
1  	214   	0.598173	0.750842	0.213244	0.0741944	5.43  	17 	1  	3.37344
2  	268   	0.614335	0.786756	0.213244	0.0654604	5.78  	21 	1  	3.44262
3  	250   	0.625783	0.786756	0.230079	0.0774875	6.7175	19 	1  	3.80561
4  	210   	0.646341	0.786756	0.213244	0.0790135	7.7225	19 	1  	4.03429
5  	243   	0.65901 	0.786756	0.213244	0.0847431	8.9   	20 	1  	4.17433
6  	264   	0.653165	0.786756	0.213244	0.0961555	9.2925	24 	1  	4.74626
7  	230   	0.661004	0.786756	0.213244	0.108822 	9.72  	21 	1  	4.66975
8  	240   	0.665567	0.786756	0.213244	0.112182 	10.185	25 	1  	4.72184
9  	251   	0.668547	0.786756	0.213244	0.117734 	10.2975	26 	1  	4.96478
10 	243   	0.685272	

In [83]:
from deap import gp
from deap import base
from deap import creator
from deap import tools
import operator
import math
def protectedDiv(left, right):
    """Divide ``left`` by ``right``, yielding 1 when the division raises
    ``ZeroDivisionError`` so evolved expressions never crash on a zero divisor.
    """
    try:
        quotient = left / right
    except ZeroDivisionError:
        quotient = 1
    return quotient
# Notebook-level primitive set: trees take eight inputs.
pset = gp.PrimitiveSet("MAIN", 8)

# (callable, arity) pairs, registered in this exact order.
for _primitive, _arity in (
    (operator.add, 2),
    (operator.sub, 2),
    (operator.mul, 2),
    (protectedDiv, 2),
    (operator.neg, 1),
    (math.cos, 1),
    (math.sin, 1),
    (max, 2),
    (min, 2),
):
    pset.addPrimitive(_primitive, _arity)
del _primitive, _arity  # keep the notebook namespace free of loop temporaries

# Rename the default ARG0..ARG7 terminals to x1..x8 in a single call.
pset.renameArguments(
    ARG0='x1', ARG1='x2', ARG2='x3', ARG3='x4',
    ARG4='x5', ARG5='x6', ARG6='x7', ARG7='x8',
)

# NOTE: named "FitnessMin" but the weight is +1.0, i.e. the value is maximised.
creator.create("FitnessMin", base.Fitness, weights=(1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin)

# Toolbox with tree generation, individual/population factories and compile.
toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=3)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)
    
def evalSymbReg(individual):
    """Score an individual by its classification accuracy on ``mungedtrain``.

    The previous notebook-level version referenced ``numpy``, ``inputs`` and
    ``outputs``, none of which are bound at notebook scope in the visible
    cells (they exist only inside ``mydeap``), so calling it raised
    ``NameError``. This version uses the notebook's ``np`` alias and derives
    the feature rows and labels from the global ``mungedtrain`` frame.

    Parameters
    ----------
    individual : tree expression accepted by ``toolbox.compile``.

    Returns
    -------
    tuple
        One-element tuple holding the fraction of rows for which
        ``round(1 - 1/(1 + exp(-f(row))))`` equals the 'Survived' label.
    """
    # Transform the tree expression in a callable function
    func = toolbox.compile(expr=individual)
    # Same column selection mydeap uses: eight positional features + target.
    inputs = mungedtrain.iloc[:, 2:10].values.tolist()
    outputs = mungedtrain['Survived'].values.tolist()
    # Evaluate the accuracy
    hits = sum(round(1. - (1. / (1. + np.exp(-func(*in_))))) == out
               for in_, out in zip(inputs, outputs))
    return hits / len(mungedtrain),
    
# Evaluation, selection and variation operators.
toolbox.register("evaluate", evalSymbReg)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

# Cap the height of offspring from both variation operators at 17; each
# operator gets its own staticLimit decorator, "mate" first.
for _variation in ("mate", "mutate"):
    toolbox.decorate(
        _variation,
        gp.staticLimit(key=operator.attrgetter("height"), max_value=17),
    )
del _variation  # keep the notebook namespace free of loop temporaries

In [84]:
def getSubtreeSlices(ind):
    """Return ``ind.searchSubtree(i)`` for every position ``i`` in ``ind``.

    The result is a list with one entry per node, in node order.
    """
    return [ind.searchSubtree(position) for position in range(len(ind))]

In [85]:
def subtreeGenerator(subtreeSlices, tree=None):
    """Compile every subtree of a tree into its own callable.

    Parameters
    ----------
    subtreeSlices : sequence of slice
        One slice per subtree, as produced by ``getSubtreeSlices``.
    tree : sequence of nodes, optional
        The tree the slices index into. When omitted, the notebook-level
        global ``treeBestInd`` is used, which keeps existing one-argument
        calls working; pass it explicitly to avoid depending on that hidden
        state.

    Returns
    -------
    list
        ``toolbox.compile`` results, one per slice, in the same order.
    """
    if tree is None:
        # Legacy behaviour: rely on the global assigned by the driver loop.
        tree = treeBestInd
    return [
        toolbox.compile(expr=gp.PrimitiveTree(tree[subtreeSlice]))
        for subtreeSlice in subtreeSlices
    ]

In [86]:
def _isConstantOutput(values):
    """True when ``values`` is non-empty and every entry equals the first."""
    return len(values) > 0 and bool(np.all(values == values[0]))


def indvBloatIdentifier(indSubtreeFs, slices, inputs=None):
    """Flag the subtrees of an individual that count as bloat.

    A subtree is flagged when
      * its output is the same constant on every input row (all of its
        descendants are then flagged as well), or
      * its output equals, row for row, the output of one of its direct
        children, i.e. its root node changes nothing.

    The constant check used to be dead code: the ``notConst`` flag started
    as True and was only ever set to True, so no subtree was ever reported
    as constant. It is now evaluated for real.

    Parameters
    ----------
    indSubtreeFs : sequence of callables
        Compiled subtrees, one per node, in node order.
    slices : sequence of slice
        Slice of each subtree within the full tree, parallel to
        ``indSubtreeFs``.
    inputs : sequence of rows, optional
        Argument rows the callables are evaluated on. When omitted, the rows
        are loaded from ``./datasets/train.csv`` through ``MungeData``
        (columns 2..9), as before.

    Returns
    -------
    (float, list of bool)
        Percentage of subtrees that are not bloat, and the per-subtree flags.

    Raises
    ------
    ZeroDivisionError
        If ``indSubtreeFs`` is empty.
    """
    if inputs is None:
        train = pd.read_csv("./datasets/train.csv", dtype={"Age": np.float64}, )
        inputs = MungeData(train).iloc[:, 2:10].values.tolist()

    # Output vector of every subtree over all rows.
    sols = [np.array([subtreeF(*row) for row in inputs]) for subtreeF in indSubtreeFs]

    isBloat = [None] * len(sols)
    alreadyBloat = set()
    for i, values in enumerate(sols):
        if i in alreadyBloat:
            # Descendant of a constant subtree: already flagged.
            continue

        isBloat[i] = _isConstantOutput(values)
        if isBloat[i]:
            # Everything nested inside a constant subtree is bloat too.
            for j in range(i + 1, len(slices)):
                if slices[j].start > slices[i].start and slices[j].stop <= slices[i].stop:
                    alreadyBloat.add(j)
                    isBloat[j] = True

        # Walk the direct children of node i: the first child starts right
        # after the root, each following child starts where the previous
        # one stops. If a child already produces the node's output, the
        # root of this subtree is redundant.
        nextChildStart = slices[i].start + 1
        for j in range(i + 1, len(slices)):
            if nextChildStart == slices[i].stop:
                break
            if slices[j].start == nextChildStart:
                nextChildStart = slices[j].stop
                if np.array_equal(values, sols[j]):
                    isBloat[i] = True

    numNonBloat = sum(1 for flag in isBloat if not flag)
    return (numNonBloat / len(isBloat)) * 100, isBloat

In [93]:
# Report the bloat score and the flagged subtrees for each hall-of-fame
# individual. At most 10 are shown; the bound follows the hall of fame's
# actual size so a run that kept fewer individuals does not raise IndexError.
for k in range(min(10, len(bestInd))):
    print("* *")
    print(str(bestInd[k]))
    print(" ")
    # Rebuild the tree from its string form against the notebook-level pset.
    # treeBestInd must stay a global: subtreeGenerator reads it.
    treeBestInd = gp.PrimitiveTree.from_string(string=str(bestInd[k]), pset=pset)
    subtrees = getSubtreeSlices(treeBestInd)
    listOfPrimSubTF = subtreeGenerator(subtrees)
    score, bloatArr = indvBloatIdentifier(listOfPrimSubTF, subtrees)
    print("Score:", score, "percent")
    print(" ")
    for j, subtreeSlice in enumerate(subtrees):
        if bloatArr[j]:
            # Only flagged subtrees need to be rebuilt for printing.
            primSubTree = gp.PrimitiveTree(treeBestInd[subtreeSlice])
            print(j, ": ", str(primSubTree))
            print(" ")
    print("* *")


* *
sin(sub(x2, protectedDiv(protectedDiv(x1, min(cos(cos(x6)), x4)), protectedDiv(protectedDiv(min(min(sub(neg(cos(protectedDiv(min(min(protectedDiv(x2, mul(sin(protectedDiv(x7, x2)), cos(x4))), x7), x4), min(neg(x8), x2)))), sub(x8, x7)), x7), x4), cos(protectedDiv(x5, x1))), x2))))
 
Score: 95.65217391304348 percent
 
14 :  min(sub(neg(cos(protectedDiv(min(min(protectedDiv(x2, mul(sin(protectedDiv(x7, x2)), cos(x4))), x7), x4), min(neg(x8), x2)))), sub(x8, x7)), x7)
 
32 :  min(neg(x8), x2)
 
* *
* *
sin(sub(x2, protectedDiv(protectedDiv(x1, min(cos(cos(x6)), x4)), protectedDiv(protectedDiv(min(min(sub(neg(cos(protectedDiv(min(min(protectedDiv(x2, mul(sin(protectedDiv(x7, x2)), cos(x4))), x7), x4), min(min(sub(neg(x8), sub(x8, x7)), x7), x2)))), sub(x8, x7)), x7), x4), cos(protectedDiv(x5, x1))), x2))))
 
Score: 96.15384615384616 percent
 
14 :  min(sub(neg(cos(protectedDiv(min(min(protectedDiv(x2, mul(sin(protectedDiv(x7, x2)), cos(x4))), x7), x4), min(min(sub(neg(x8), sub(x8, x7))