In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from deap import creator, base, tools, algorithms
from sklearn.metrics import f1_score
import sys
import re


In [2]:
def avg(l):
    """
    Returns the average between list elements
    """
    return (sum(l)/float(len(l)))

In [3]:
def getFitness(individual, X, y): ## ds
    """
    Feature subset fitness function
    """

    if(individual.count(0) != len(individual)):
        # get index with value 0
        cols = [index for index in range(
            len(individual)) if individual[index] == 0]

        # get features subset
        X_parsed = X.drop(X.columns[cols], axis=1)
        X_subset = pd.get_dummies(X_parsed)

        # apply classification algorithm
        clf = LogisticRegression()

        return (avg(cross_val_score(clf, X_subset, y, cv=5)),)
    else:
        return(0,)

In [4]:
def geneticAlgorithm(X, y, n_population, n_generation):
    """
    Deap global variables
    Initialize variables to use eaSimple
    """
    # create individual
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    # create toolbox
    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat,
                     creator.Individual, toolbox.attr_bool, len(X.columns))
    toolbox.register("population", tools.initRepeat, list,
                     toolbox.individual)
    toolbox.register("evaluate", getFitness, X=X, y=y)
    toolbox.register("mate", tools.cxOnePoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # initialize parameters
    pop = toolbox.population(n=n_population)

    hof = tools.HallOfFame(n_population * n_generation)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)
    
    print("xx")
    # genetic algorithm
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2,
                                   ngen=n_generation, stats=stats, halloffame=hof,
                                   verbose=True)
    print(" hall of frame :", hof.maxsize)

    # return hall of fame
    return hof

In [5]:
def bestIndividual(hof, X, y):
    """
    Get the best individual
    """
    maxAccurcy = 0.0
    for individual in hof:
        print(type(individual.fitness.values[0]))
        if(individual.fitness.values[0] > maxAccurcy):
            maxAccurcy = individual.fitness.values[0]
            _individual = individual

    _individualHeader = [list(X)[i] for i in range(
        len(_individual)) if _individual[i] == 1]
    return _individual.fitness.values, _individual, _individualHeader

In [6]:
def getArguments():
    """
    Get argumments from command-line
    If pass only dataframe path, pop and gen will be default
    """
    dfPath = sys.argv[1]
    if(len(sys.argv) == 4):
        pop = int(sys.argv[2])
        gen = int(sys.argv[3])
    else:
        pop = 10
        gen = 2
    return dfPath, pop, gen

In [7]:

# # get dataframe path, population number and generation number from command-line argument
# n_pop = 20
# n_gen = 6
# # read dataframe from csv
# df = pd.read_csv('datasets/nuclear.csv', sep=',')

# # encode labels column to numbers
# le = LabelEncoder()
# le.fit(df.iloc[:, -1])
# y = le.transform(df.iloc[:, -1]) # label
# y_test = y[:20]
# X = df.iloc[:, :-1] # data

# # get accuracy with all features
# individual = [1 for i in range(len(X.columns))] # true column (feature)
# print("Accuracy with all features: \t" +
#       str(getFitness(individual, X, y)) + "\n")

# # apply genetic algorithm
# hof = geneticAlgorithm(X, y, n_pop, n_gen)

# # select the best individual
# accuracy, individual, header = bestIndividual(hof, X, y)
# print('Best Accuracy: \t' + str(accuracy))
# print('Number of Features in Subset: \t' + str(individual.count(1)))
# print('Individual: \t\t' + str(individual))
# print('Feature Subset\t: ' + str(header))

# print('\n\ncreating a new classifier with the result')

# # read dataframe from csv one more time
# df = pd.read_csv('datasets/nuclear.csv', sep=',')

# # with feature subset
# X = df[header]

# clf = LogisticRegression()

# scores = cross_val_score(clf, X, y, cv=5)
# print("Accuracy with Feature Subset: \t" + str(avg(scores)) + "\n")

In [8]:
def wine():
    df_red = pd.read_csv('../cso_cnn/datasets/winequality_red.csv')
    df_white = pd.read_csv('../cso_cnn/datasets/winequality_white.csv')
    df_red['color'] = "R"
    df_white['color'] = "W"
    df = pd.concat([df_red, df_white])
    
    print(df.size)
    le = LabelEncoder()
    le.fit(df.iloc[:, -1])
    y = le.transform(df.iloc[:, -1]) # label
    X = df.drop([df.columns[0], 'color'], axis=1)
    return X, y

In [9]:
def titanic():
    train = pd.read_csv('../cso_cnn/datasets/titanic_train.csv')
    test = pd.read_csv('../cso_cnn/datasets/titanic_test.csv')
    
    full_data = [train, test]
    
    PassengerId = test['PassengerId']
    # Some features of my own that I have added in
    # Gives the length of the name
    train['Name_length'] = train['Name'].apply(len)
    test['Name_length'] = test['Name'].apply(len)
    # Feature that tells whether a passenger had a cabin on the Titanic
    train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
    test['Has_Cabin'] = test["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

    # Feature engineering steps taken from Sina
    # Create new feature FamilySize as a combination of SibSp and Parch
    for dataset in full_data:
        dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    # Create new feature IsAlone from FamilySize
    for dataset in full_data:
        dataset['IsAlone'] = 0
        dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
    # Remove all NULLS in the Embarked column
    for dataset in full_data:
        dataset['Embarked'] = dataset['Embarked'].fillna('S')
    # Remove all NULLS in the Fare column and create a new feature CategoricalFare
    for dataset in full_data:
        dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
    train['CategoricalFare'] = pd.qcut(train['Fare'], 4)
    # Create a New feature CategoricalAge
    for dataset in full_data:
        age_avg = dataset['Age'].mean()
        age_std = dataset['Age'].std()
        age_null_count = dataset['Age'].isnull().sum()
        age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
        dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
        dataset['Age'] = dataset['Age'].astype(int)
    train['CategoricalAge'] = pd.cut(train['Age'], 5)
    # Define function to extract titles from passenger names
    def get_title(name):
        title_search = re.search(' ([A-Za-z]+)\.', name)
        # If the title exists, extract and return it.
        if title_search:
            return title_search.group(1)
        return ""
    # Create a new feature Title, containing the titles of passenger names
    for dataset in full_data:
        dataset['Title'] = dataset['Name'].apply(get_title)
    # Group all non-common titles into one single grouping "Rare"
    for dataset in full_data:
        dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

        dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

    for dataset in full_data:
        # Mapping Sex
        dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

        # Mapping titles
        title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
        dataset['Title'] = dataset['Title'].map(title_mapping)
        dataset['Title'] = dataset['Title'].fillna(0)

        # Mapping Embarked
        dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

        # Mapping Fare
        dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
        dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
        dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
        dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
        dataset['Fare'] = dataset['Fare'].astype(int)

        # Mapping Age
        dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
        dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
        dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
        dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
        dataset.loc[ dataset['Age'] > 64, 'Age'] = 4 ;
    # Feature selection
    drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
    train = train.drop(drop_elements, axis = 1)
    train = train.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)
    test  = test.drop(drop_elements, axis = 1)
    le = LabelEncoder()
    le.fit(train.iloc[:, 1])
    y = le.transform(train.iloc[:, 1])
    X = train.drop([train.columns[0], 'Survived'], axis=1)
    print(X.size)
    print(y.size)
    return X, y

In [10]:
n_pop = 100
n_gen = 20
# read dataframe from csv
df = pd.read_csv('datasets/iris.csv',)
le = LabelEncoder()
le.fit(df.iloc[:, -1])
y = le.transform(df.iloc[:, -1]) # label
print(le.classes_)
print(y)
X = df.drop([df.columns[0], 'Species'], axis=1)

X,y = titanic()
from sklearn.model_selection import train_test_split

x_train,x_test,y_train, y_test = train_test_split(X , y , test_size=0.4 , random_state=0)

# get accuracy with all features
individual = [1 for i in range(len(x_train.columns))] # true column (feature)
print("Accuracy with all features: \t" +
      str(getFitness(individual, x_train, y_train)) + "\n")


# apply genetic algorithm
hof = geneticAlgorithm(x_train, y_train , n_pop, n_gen)

# select the best individual
accuracy, individual, header = bestIndividual(hof, x_train, y_train)
print('Best Accuracy: \t' + str(accuracy))
print('Number of Features in Subset: \t' + str(individual.count(1)))
print('Individual: \t\t' + str(individual))
print('Feature Subset\t: ' + str(header))

print('\n\ncreating a new classifier with the result')

# read dataframe from csv one more time
# df = pd.read_csv('datasets/iris.csv', sep=',')

# with feature subset
x_train = x_train[header]
x_test = x_test[header]
clf = DecisionTreeClassifier()

scores = cross_val_score(clf, x_train, y_train, cv=5)
print("Accuracy with Feature Subset: \t" + str(avg(scores)) + "\n")

# Import necessary modules
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

# Setup the parameters and distributions to sample from: param_dist
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, len(header)),
              "min_samples_leaf": randint(1, len(header)),
              "criterion": ["gini", "entropy"]}

# Instantiate the RandomizedSearchCV object: tree_cv
tree_cv = RandomizedSearchCV(DecisionTreeClassifier(), param_dist, cv=5)
tree_cv2 = RandomizedSearchCV(DecisionTreeClassifier(), param_dist, cv=5)
# Fit it to the data
tree_cv.fit(x_train, y_train)
predicted = tree_cv.predict(x_test)
# Print the tuned parameters and score

print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
print("Best score is {}".format(tree_cv.best_score_))


# model = LogisticRegression()
# model.fit(x_train, y_train)
# predicted = model.predict(x_test)


from sklearn.metrics import f1_score, accuracy_score
scores = f1_score(predicted, y_test, average='micro')
print("Test acc : {}".format(scores))

from sklearn.model_selection import cross_val_predict
score = cross_val_score(tree_cv2, x_train, y_train, cv=5)




['setosa' 'versicolor' 'virginica']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


9801
891
Accuracy with all features: 	(0.9495425786109596,)

xx
gen	nevals	avg     	min	max    
0  	100   	0.797534	0  	0.95522
1  	71    	0.882596	0.589852	0.95522
2  	47    	0.92739 	0.543066	0.962681
3  	65    	0.939931	0.756299	0.962681
4  	62    	0.947206	0.745048	0.962681
5  	63    	0.953515	0.906494	0.966333
6  	62    	0.957833	0.913937	0.966333
7  	59    	0.959226	0.730283	0.966333
8  	63    	0.962542	0.906512	0.966333
9  	64    	0.958427	0.73217 	0.966333
10 	65    	0.962887	0.883783	0.966333
11 	70    	0.963693	0.881914	0.966333
12 	49    	0.964737	0.904573	0.966333
13 	51    	0.959666	0.733884	0.966333
14 	56    	0.964667	0.925066	0.966333
15 	53    	0.962566	0.810964	0.966333
16 	60    	0.962326	0.805356	0.966333
17 	64    	0.957905	0.737762	0.966333
18 	50    	0.962754	0.737762	0.966333
19 	61    	0.963485	0.805356	0.966333
20 	61    	0.961741	0.735892	0.966333
 hall of frame : 2000
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float