In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from deap import creator, base, tools, algorithms
import sys




In [2]:
def avg(l):
    """
    Arithmetic mean of the elements of `l`.

    `l` must support both `sum()` and `len()`; the division is always a
    float division. An empty `l` raises ZeroDivisionError.
    """
    total = sum(l)
    count = float(len(l))
    return total / count

In [3]:
def getFitness(individual, X, y):
    """
    Fitness of a feature subset encoded as a list of 0/1 genes.

    Gene i == 0 means "drop the i-th column of X". The remaining columns are
    one-hot encoded with `pd.get_dummies` and scored with a 5-fold
    cross-validated LogisticRegression.

    Returns a 1-tuple: the mean cross-validation score, or `(0,)` when every
    gene is 0 (nothing selected, so nothing to train on).
    """
    # Guard: an individual made only of zeros selects no feature at all.
    if individual.count(0) == len(individual):
        return (0,)

    # Positions of the genes that switch a column off.
    dropped = [pos for pos, gene in enumerate(individual) if gene == 0]

    # Remove the switched-off columns, then one-hot encode what is left.
    X_subset = pd.get_dummies(X.drop(X.columns[dropped], axis=1))

    # Score the subset with the classifier.
    clf = LogisticRegression()
    fold_scores = cross_val_score(clf, X_subset, y, cv=5)
    return (avg(fold_scores),)

In [4]:
def geneticAlgorithm(X, y, n_population, n_generation):
    """
    Search for a good feature subset with DEAP's `eaSimple` algorithm.

    Every individual is a list of 0/1 genes, one gene per column of X, and is
    scored by `getFitness(individual, X=X, y=y)` (maximised).

    Parameters
    ----------
    X : object exposing `.columns`; its length sets the number of genes.
        X is forwarded unchanged to `getFitness`.
    y : labels, forwarded unchanged to `getFitness`.
    n_population : int
        Size of the initial population.
    n_generation : int
        Number of generations passed to `eaSimple` as `ngen`.

    Returns
    -------
    tools.HallOfFame
        Hall of fame sized `n_population * n_generation`, filled by
        `eaSimple`.
    """
    # creator.create() defines classes on the shared `creator` module. Calling
    # it again on every invocation (e.g. when a notebook cell is re-run)
    # overwrites them and triggers a DEAP warning, so only create them once.
    # NOTE(review): this reuses any pre-existing creator.FitnessMax /
    # creator.Individual; confirm nothing else defines them differently.
    if not hasattr(creator, "FitnessMax"):
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    if not hasattr(creator, "Individual"):
        creator.create("Individual", list, fitness=creator.FitnessMax)

    # create toolbox
    toolbox = base.Toolbox()
    # a gene is a random bit: 1 keeps the matching column, 0 drops it
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat,
                     creator.Individual, toolbox.attr_bool, len(X.columns))
    toolbox.register("population", tools.initRepeat, list,
                     toolbox.individual)
    toolbox.register("evaluate", getFitness, X=X, y=y)
    toolbox.register("mate", tools.cxOnePoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # initialize parameters
    pop = toolbox.population(n=n_population)

    hof = tools.HallOfFame(n_population * n_generation)
    # per-generation statistics over the fitness tuples
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    # genetic algorithm (verbose=True prints the per-generation log table)
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                   ngen=n_generation, stats=stats, halloffame=hof,
                                   verbose=True)
    print(" hall of fame :", hof.maxsize)

    # return hall of fame
    return hof

In [5]:
def bestIndividual(hof, X, y):
    """
    Select the individual with the highest fitness from a hall of fame.

    Parameters
    ----------
    hof : iterable of individuals. Each individual is a sequence of 0/1
        genes and exposes `fitness.values`, whose first element is the score.
    X : object whose iteration yields the feature names (`list(X)`), in the
        same order as the genes.
    y : unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        `(fitness_values, individual, header)` where `header` lists the
        feature names whose gene equals 1. On ties the first individual
        encountered wins.

    Raises
    ------
    ValueError
        If `hof` contains no individual.
    """
    candidates = list(hof)
    if not candidates:
        raise ValueError("hall of fame is empty; no individual to select")

    # max() returns the first maximal element, so ties resolve to the earliest
    # entry. Unlike a "strictly greater than 0.0" scan, this also works when
    # every score is 0.
    best = max(candidates, key=lambda ind: ind.fitness.values[0])

    # Materialise the feature names once instead of once per gene.
    columns = list(X)
    header = [columns[i] for i, gene in enumerate(best) if gene == 1]
    return best.fitness.values, best, header

In [6]:
def getArguments():
    """
    Read the run configuration from the command line.

    `sys.argv[1]` is the dataframe path. Population and generation counts are
    taken from `sys.argv[2]` and `sys.argv[3]` only when exactly three
    arguments follow the program name; in every other case the defaults
    (population 10, generations 2) are used.

    Returns `(dfPath, pop, gen)`.
    """
    argv = sys.argv
    dfPath = argv[1]

    # Anything other than "path pop gen" falls back to the defaults.
    if len(argv) != 4:
        return dfPath, 10, 2

    pop = int(argv[2])
    gen = int(argv[3])
    return dfPath, pop, gen

In [7]:

# # get dataframe path, population number and generation number from command-line argument
# n_pop = 20
# n_gen = 6
# # read dataframe from csv
# df = pd.read_csv('datasets/nuclear.csv', sep=',')

# # encode labels column to numbers
# le = LabelEncoder()
# le.fit(df.iloc[:, -1])
# y = le.transform(df.iloc[:, -1]) # label
# y_test = y[:20]
# X = df.iloc[:, :-1] # data

# # get accuracy with all features
# individual = [1 for i in range(len(X.columns))] # true column (feature)
# print("Accuracy with all features: \t" +
#       str(getFitness(individual, X, y)) + "\n")

# # apply genetic algorithm
# hof = geneticAlgorithm(X, y, n_pop, n_gen)

# # select the best individual
# accuracy, individual, header = bestIndividual(hof, X, y)
# print('Best Accuracy: \t' + str(accuracy))
# print('Number of Features in Subset: \t' + str(individual.count(1)))
# print('Individual: \t\t' + str(individual))
# print('Feature Subset\t: ' + str(header))

# print('\n\ncreating a new classifier with the result')

# # read dataframe from csv one more time
# df = pd.read_csv('datasets/nuclear.csv', sep=',')

# # with feature subset
# X = df[header]

# clf = LogisticRegression()

# scores = cross_val_score(clf, X, y, cv=5)
# print("Accuracy with Feature Subset: \t" + str(avg(scores)) + "\n")

In [8]:
# Imports needed by this cell, gathered in one place (none removed: later
# cells may rely on them).
from scipy.stats import randint
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import (RandomizedSearchCV, cross_val_predict,
                                     train_test_split)
from sklearn.tree import DecisionTreeClassifier

# One seed for the split, the genetic algorithm and the hyper-parameter search
# so that a fresh "Restart & Run All" gives repeatable numbers.
SEED = 0
n_pop = 100
n_gen = 20

# read dataframe from csv
df = pd.read_csv('datasets/iris.csv',)

# encode the label (last column) as integers
le = LabelEncoder()
le.fit(df.iloc[:, -1])
y = le.transform(df.iloc[:, -1]) # label
print(le.classes_)
print(y)
# features: everything except the first column and 'Species'
# NOTE(review): presumably the first column is a row id - confirm against the csv
X = df.drop([df.columns[0], 'Species'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=SEED)

# get accuracy with all features
individual = [1 for i in range(len(x_train.columns))] # true column (feature)
print("Accuracy with all features: \t" +
      str(getFitness(individual, x_train, y_train)) + "\n")


# apply genetic algorithm
# geneticAlgorithm builds its individuals with random.randint, so seed `random`
random.seed(SEED)
hof = geneticAlgorithm(x_train, y_train , n_pop, n_gen)

# select the best individual
accuracy, individual, header = bestIndividual(hof, x_train, y_train)
print('Best Accuracy: \t' + str(accuracy))
print('Number of Features in Subset: \t' + str(individual.count(1)))
print('Individual: \t\t' + str(individual))
print('Feature Subset\t: ' + str(header))
assert len(header) > 0, "genetic algorithm selected no feature"

print('\n\ncreating a new classifier with the result')

# Restrict BOTH splits to the selected features. Previously only x_train was
# narrowed, so predicting on x_test used a different set of columns than the
# model was fitted on whenever the GA dropped a feature.
x_train = x_train[header]
x_test = x_test[header]

clf = LogisticRegression()

scores = cross_val_score(clf, x_train, y_train, cv=5)
print("Accuracy with Feature Subset: \t" + str(avg(scores)) + "\n")

# Setup the parameters and distributions to sample from: param_dist
# scipy's randint(low, high) excludes `high`; the +1 lets the search reach
# len(header) and keeps the range non-empty when one feature is selected.
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, len(header) + 1),
              "min_samples_leaf": randint(1, len(header) + 1),
              "criterion": ["gini", "entropy"]}

# Instantiate the RandomizedSearchCV object: tree_cv
tree_cv = RandomizedSearchCV(DecisionTreeClassifier(random_state=SEED),
                             param_dist, cv=5, random_state=SEED)
tree_cv2 = RandomizedSearchCV(DecisionTreeClassifier(random_state=SEED),
                              param_dist, cv=5, random_state=SEED)
# Fit it to the data
tree_cv.fit(x_train, y_train)
predicted = tree_cv.predict(x_test)
# Print the tuned parameters and score

print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))


# model = LogisticRegression()
# model.fit(x_train, y_train)
# predicted = model.predict(x_test)


# accuracy_score expects (y_true, y_pred)
scores = accuracy_score(y_test, predicted)
print("Test acc : {}".format(scores))

# nested cross-validation of the randomized search on the training split
score = cross_val_score(tree_cv2, x_train, y_train, cv=5)


['setosa' 'versicolor' 'virginica']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
Accuracy with all features: 	(0.94422514619883047,)

xx
gen	nevals	avg     	min	max     
0  	100   	0.779441	0  	0.944225
1  	53    	0.90346 	0.700658	0.944225
2  	65    	0.932829	0.767544	0.944225
3  	67    	0.94389 	0.910673	0.944225
4  	64    	0.942107	0.900146	0.944225
5  	57    	0.943554	0.910673	0.944225
6  	43    	0.943554	0.910673	0.944225
7  	58    	0.943554	0.910673	0.944225
8  	63    	0.93981 	0.700658	0.944225
9  	62    	0.941017	0.822734	0.944225
10 	54    	0.943554	0.910673	0.944225
11 	62    	0.942126	0.900146	0.944225
12 	58    	0.943113	0.900146	0.944225
13 	69    	0.942673	0.900146	0.944225
14 	65    	0.944225	0.944225	0.944225
15 	