In [None]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Genetic Algorithm (GA) for Machine Learning (ML)
Source: https://datascienceplus.com/genetic-algorithm-in-machine-learning-using-python/

#### GA limitations
- Not well suited to simple problems for which derivative information is available
- The fitness value must be computed frequently, which is computationally expensive for some problems
- Stochastic, i.e., there is no guarantee that the resulting solution is optimal
- No guarantee of convergence to the optimal solution if the algorithm is not implemented properly

In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot
%matplotlib inline

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#### Data

In [None]:
# Load the breast cancer dataset: features go into a DataFrame, targets into `label`
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
label = cancer.target

# Split the data into training and testing sets (30% held out for testing);
# the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df, label, test_size=0.30, random_state=101
)

## ML modeling with Logistic Regression

In [None]:
# Switch between the two baseline configurations:
#   True  -> max_iter=5000 (author's note: no warnings, better convergence of
#            Logistic Regression, but a weaker GA effect below)
#   False -> max_iter=100 with all Python warnings silenced (author's note: many
#            warnings, worse convergence, but the GA effect is easier to see)
warning = False

if not warning:
    # Silence every warning for the rest of the session, not only this cell
    import warnings
    warnings.filterwarnings('ignore')

logmodel = LogisticRegression(max_iter=5000 if warning else 100)

# Fit the baseline on all features
logmodel.fit(X_train, y_train)

# Score the baseline on the held-out split
predictions = logmodel.predict(X_test)
print("Accuracy = " + str(accuracy_score(y_test, predictions)))

## Genetic Algorithm (GA)

In [None]:
#defining various steps required for the genetic algorithm
def initilization_of_population(size, n_feat):
    """Create the initial population of random feature masks.

    Each chromosome is a boolean numpy array of length ``n_feat`` in which
    ``int(0.3 * n_feat)`` positions are False (feature dropped) and the rest
    are True (feature kept); the positions are shuffled with ``np.random``.

    Args:
        size: Number of chromosomes to create.
        n_feat: Length of every chromosome (number of features).

    Returns:
        A list of ``size`` boolean numpy arrays.
    """
    population = []
    for _ in range(size):
        # Builtin ``bool`` instead of ``np.bool``: the numpy alias was removed
        # in NumPy 1.24 and raises AttributeError there.
        chromosome = np.ones(n_feat, dtype=bool)
        chromosome[:int(0.3 * n_feat)] = False
        np.random.shuffle(chromosome)
        population.append(chromosome)
    return population


# use logistic regression as the fitness score function
def fitness_score(population):
    """Score every chromosome and return scores and chromosomes, best first.

    For each chromosome the module-level ``logmodel`` is refit on the columns
    of the module-level ``X_train`` selected by the mask, and its accuracy on
    the same columns of ``X_test`` is used as the fitness value.

    Args:
        population: Sequence of equally long boolean masks over the columns.

    Returns:
        A tuple ``(scores, chromosomes)`` of two lists, both ordered by
        descending accuracy.
    """
    accuracies = []
    for mask in population:
        # Note: this refits the shared global model on every call.
        logmodel.fit(X_train.iloc[:, mask], y_train)
        predicted = logmodel.predict(X_test.iloc[:, mask])
        accuracies.append(accuracy_score(y_test, predicted))

    accuracies = np.array(accuracies)
    chromosomes = np.array(population)
    # argsort is ascending; reverse the index order to get best-first.
    order = np.argsort(accuracies)[::-1]
    return list(accuracies[order]), list(chromosomes[order, :])


def selection(pop_after_fit, n_parents):
    """Return the first ``n_parents`` chromosomes of ``pop_after_fit``.

    When the input comes from ``fitness_score`` (sorted by descending score)
    these are the fittest chromosomes. The returned list is new, but its
    elements are the same objects as in the input. Raises ``IndexError`` if
    ``n_parents`` exceeds the population size.
    """
    return [pop_after_fit[rank] for rank in range(n_parents)]


def crossover(pop_after_sel, start=3, stop=7):
    """Produce one child per parent by swapping in a gene segment.

    Child ``i`` is a copy of parent ``i`` whose genes ``[start:stop]`` are
    replaced by the same segment of parent ``(i + 1) % len(pop_after_sel)``.

    The input list and its chromosomes are left untouched. Every chromosome
    in the result is an independent copy, so later in-place edits of one
    entry cannot leak into another entry or into the caller's data.

    Args:
        pop_after_sel: List of parent chromosomes (numpy arrays or lists).
        start: First gene index of the exchanged segment (default 3).
        stop: Index one past the last exchanged gene (default 7).

    Returns:
        A new list holding copies of all parents followed by all children,
        i.e. twice as many chromosomes as the input.
    """
    n_parents = len(pop_after_sel)
    parents = [parent.copy() for parent in pop_after_sel]
    children = []
    for i, parent in enumerate(pop_after_sel):
        partner = pop_after_sel[(i + 1) % n_parents]
        # Copy first: assigning into the parent itself would overwrite it and
        # corrupt the segment that the previous/next pairing reads.
        child = parent.copy()
        child[start:stop] = partner[start:stop]
        children.append(child)
    return parents + children


def mutation(pop_after_cross, mutation_rate):
    """Flip each gene of each chromosome with probability ``mutation_rate``.

    Works on copies, so the chromosomes passed in are not modified and a
    chromosome object that appears more than once in the input is mutated
    independently for each occurrence.

    Args:
        pop_after_cross: List of chromosomes (boolean numpy arrays or lists).
        mutation_rate: Per-gene flip probability; one ``random.random()``
            draw is compared against it for every gene.

    Returns:
        A new list with one mutated copy per input chromosome.
    """
    population_nextgen = []
    for chromosome in pop_after_cross:
        mutated = chromosome.copy()
        for j in range(len(mutated)):
            if random.random() < mutation_rate:
                mutated[j] = not mutated[j]
        population_nextgen.append(mutated)
    return population_nextgen


def generations(size,
                n_feat,
                n_parents,
                mutation_rate,
                n_gen,
                X_train,
                X_test, 
                y_train, 
                y_test):
    """Run the genetic algorithm and record the best chromosome per generation.

    Each generation is scored with ``fitness_score``, the top ``n_parents``
    are kept by ``selection``, extended by ``crossover`` and then passed
    through ``mutation`` to form the next generation.

    Args:
        size: Number of chromosomes in the initial population.
        n_feat: Number of genes (features) per chromosome.
        n_parents: Number of top chromosomes kept each generation.
        mutation_rate: Per-gene flip probability passed to ``mutation``.
        n_gen: Number of generations to run.
        X_train, X_test, y_train, y_test: Accepted for interface
            compatibility but not used here; ``fitness_score`` reads the
            module-level variables of the same names.

    Returns:
        A tuple ``(best_chromo, best_score)``: two lists of length ``n_gen``
        with the best chromosome of each generation and its score.
    """
    best_chromo = []
    best_score = []
    population_nextgen = initilization_of_population(size, n_feat)
    for i in range(n_gen):
        scores, pop_after_fit = fitness_score(population_nextgen)
        print(i, scores[:2])

        # Snapshot the winner BEFORE the genetic operators run: they may
        # modify chromosomes in place, which would otherwise leave a mask
        # here that no longer matches the recorded score.
        best_chromo.append(pop_after_fit[0].copy())
        best_score.append(scores[0])

        pop_after_sel = selection(pop_after_fit, n_parents)
        pop_after_cross = crossover(pop_after_sel)
        population_nextgen = mutation(pop_after_cross, mutation_rate)
    return best_chromo, best_score

## Optimizing the ML model with GA
**Note: this cell is long-running.**

In [None]:
# Seed both random generators the GA draws from (`np.random` for the initial
# shuffle, `random` for mutation) so that a rerun reproduces the same search.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# n_feat is taken from the data rather than hard-coded so that the chromosome
# length always matches the number of feature columns.
chromo, score = generations(size=200,
                            n_feat=X_train.shape[1],
                            n_parents=100,
                            mutation_rate=0.10,
                            n_gen=38,
                            X_train=X_train,
                            X_test=X_test,
                            y_train=y_train,
                            y_test=y_test)

In [None]:
# Use the generation with the highest recorded score rather than simply the
# last one: the search is stochastic, so the final generation need not be the best.
# Caveat: fitness is measured on X_test, so this accuracy is an optimistic estimate.
best_gen = int(np.argmax(score))
best_mask = chromo[best_gen]

logmodel.fit(X_train.iloc[:, best_mask], y_train)
predictions = logmodel.predict(X_test.iloc[:, best_mask])
print("Accuracy score after GA is= " + str(accuracy_score(y_test, predictions)))

# Other sources
- `pip install geneticalgorithm` — https://pypi.org/project/geneticalgorithm/