## Genetic Algorithm (Dynamic Ensemble)

### Notes 
#### V1 
* random binary change mutation operation.
* single-point crossover operation.  

#### V2 
*  bit-flip mutation operation.
*  two-point crossover operation.


#### V3 
* Optimize for diversity (not only accuracy).

In [18]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.3-py3-none-any.whl.metadata (5.2 kB)
Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3
Note: you may need to restart the kernel to use updated packages.


In [23]:
import warnings
# Installed before the remaining imports, as in the original cell order.
warnings.filterwarnings('ignore')

from collections import namedtuple
from functools import partial
from random import choices, randint, randrange, random, seed
from typing import List, Callable, Tuple

import pandas as pd
from scipy.io import arff

from sklearn import datasets
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from deslib.des.knora_e import KNORAE
from deslib.des.des_p import DESP
from deslib.des.knop import KNOP

# Do not truncate columns when pandas displays a wide DataFrame.
pd.set_option('display.max_columns', None)   

In [2]:
from ucimlrepo import fetch_ucirepo

# Download three UCI Machine Learning Repository datasets by their numeric
# ids, in order: 19 (car evaluation), 22 (chess king-rook vs king-pawn) and
# 60 (liver disorders).
car_evaluation, chess_king_rook_vs_king_pawn, liver_disorders = (
    fetch_ucirepo(id=uci_id) for uci_id in (19, 22, 60)
)

In [13]:
# Take the feature and target frames of the liver-disorders dataset.
X, y = liver_disorders.data.features, liver_disorders.data.targets

# Distinct values of the `drinks` target column (shown as the cell output).
y["drinks"].unique()

array([ 0. ,  0.5,  1. ,  2. ,  3. ,  4. ,  5. ,  6. ,  7. ,  8. ,  9. ,
       10. , 12. , 16. , 20. , 15. ])

In [3]:
def side_by_side(*objs, **kwds):
    """Print the ``repr`` of each object in adjacent text columns.

    Parameters
    ----------
    *objs
        Objects to show; each object's ``repr`` is split into lines and
        becomes one column.
    **kwds
        Only ``space`` is read: the number of blanks between columns
        (defaults to 4).

    Returns
    -------
    None
        The columns are written to stdout, followed by one empty line.
    """
    from pandas.io.formats.printing import adjoin

    gap = kwds.get('space', 4)
    columns = []
    for obj in objs:
        columns.append(repr(obj).split('\n'))

    print(adjoin(gap, *columns))
    print()

In [24]:
### Data 
# Toy datasets loaded through sklearn.datasets.
iris = datasets.load_iris()
breast_cancer = datasets.load_breast_cancer() 
wine = datasets.load_wine()
# Local files; both paths are relative, so they resolve against the
# notebook's current working directory.
credit_card = pd.read_excel("datasets/CreditCardClients.xls")
# NOTE(review): scipy's loadarff presumably returns a (data, meta) pair rather
# than a DataFrame — confirm before using `phishing` as a table.
phishing = arff.loadarff('datasets/PhishingData.arff')  

In [6]:
def _bunch_to_frame(bunch) -> pd.DataFrame:
    """Return the bunch's feature matrix as a DataFrame with a ``target`` column.

    Column names come from ``bunch.feature_names``; ``bunch.target`` is
    appended as the last column.
    """
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['target'] = bunch.target
    return frame


# One frame per toy dataset: features first, class label last.
iris_df = _bunch_to_frame(iris)
breast_cancer_df = _bunch_to_frame(breast_cancer)
wine_df = _bunch_to_frame(wine)

In [7]:
# Show the wine frame (features plus `target`) as the cell output.
wine_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [36]:
# --- Dataset selection -------------------------------------------------------
# Exactly one (X, y) pair is active; the alternatives stay commented out so a
# different dataset can be swapped in.
# NOTE: the encoding loop below iterates `y.columns`, so a Series target (as in
# the *_df and credit-card alternatives) needs `.to_frame()` first.

# X = breast_cancer_df[breast_cancer.feature_names]
# y = breast_cancer_df.target

# X = iris_df[iris.feature_names]
# y = iris_df.target

# X = wine_df[wine.feature_names]
# y = wine_df.target

# X = credit_card.drop(['default payment next month'], axis=1)
# y = credit_card['default payment next month']

# X = chess_king_rook_vs_king_pawn.data.features
# y = chess_king_rook_vs_king_pawn.data.targets

# Active dataset: UCI car evaluation (pandas DataFrames). Work on copies so the
# encoding below does not overwrite the frames held by the fetched dataset
# object; re-running this cell then always starts from the raw values.
X = car_evaluation.data.features.copy()
y = car_evaluation.data.targets.copy()

# Replace every column by integer codes. The same encoder object is refitted
# per column, so after the loops `le` holds the classes of the last target
# column only.
le = LabelEncoder()

for col in X.columns:
    X[col] = le.fit_transform(X[col])

for col in y.columns:
    y[col] = le.fit_transform(y[col])

# --- Splits ------------------------------------------------------------------
# 70 % / 30 % stratified train/test split, then 20 % of the training part is
# set aside as the dynamic-selection set (X_dsel, y_dsel).
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.30, random_state=42)
X_train, X_dsel, y_train, y_dsel = train_test_split(X_train, y_train, stratify=y_train, test_size=0.20, random_state=42)

In [37]:
# Show the feature frame selected (and label-encoded) in the previous cell.
X

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3,3,0,0,2,1
1,3,3,0,0,2,2
2,3,3,0,0,2,0
3,3,3,0,0,1,1
4,3,3,0,0,1,2
...,...,...,...,...,...,...
1723,1,1,3,2,1,2
1724,1,1,3,2,1,0
1725,1,1,3,2,0,1
1726,1,1,3,2,0,2


In [38]:
# Per-column missing-value counts printed next to per-column non-null counts.
side_by_side(wine_df.isnull().sum(), wine_df.count())

alcohol                         0    alcohol                         178
malic_acid                      0    malic_acid                      178
ash                             0    ash                             178
alcalinity_of_ash               0    alcalinity_of_ash               178
magnesium                       0    magnesium                       178
total_phenols                   0    total_phenols                   178
flavanoids                      0    flavanoids                      178
nonflavanoid_phenols            0    nonflavanoid_phenols            178
proanthocyanins                 0    proanthocyanins                 178
color_intensity                 0    color_intensity                 178
hue                             0    hue                             178
od280/od315_of_diluted_wines    0    od280/od315_of_diluted_wines    178
proline                         0    proline                         178
target                          0    target        

In [39]:
# --- Type aliases used by the genetic algorithm --------------------------------
# A genome is a list of ints, one entry per classifier in `pool`
# (an entry equal to 1 selects that classifier).
Genome = List[int] 
# A population is a list of genomes.
Population = List[Genome] 
# Maps a genome to its score.
FitnessFunc = Callable[[Genome], float]
# Builds the initial population; takes no arguments.
PopulateFunc = Callable[[], Population]
# Picks two parent genomes from a population, given a fitness function.
SelectionFunc = Callable[[Population, FitnessFunc], Tuple[Genome, Genome]]
# Combines two parents into two offspring.
CrossoverFunc = Callable[[Genome, Genome], Tuple[Genome, Genome]] 
# Produces a (possibly) altered genome.
MutationFunc = Callable[[Genome], Genome]
# Pairs a short display name with an estimator instance.
BaseClassifier = namedtuple('BaseClassifier', ['name', 'model'])  

# Candidate base classifiers. Genome position i refers to pool[i], so the
# order of this list defines the meaning of every genome.
pool = [
    BaseClassifier('XGB', XGBClassifier(random_state=42)), 
    BaseClassifier('XGB2', XGBClassifier(learning_rate=0.1, n_estimators=250, random_state=42)),  
    BaseClassifier('RF', RandomForestClassifier(random_state=42)), 
    BaseClassifier('RF2', RandomForestClassifier(n_estimators=230, random_state=42)), 
    BaseClassifier('LR', LogisticRegression(random_state=42)), 
    BaseClassifier('SVC', SVC(random_state=42)), 
    BaseClassifier('DT', DecisionTreeClassifier(max_depth=10, random_state=42)),
    BaseClassifier('DT2', DecisionTreeClassifier(max_depth=5, random_state=42)),
    BaseClassifier('LGBM', LGBMClassifier(random_state=42)), 
    BaseClassifier('LGBM2', LGBMClassifier(learning_rate=0.02, n_estimators=250, random_state=42)), 
    BaseClassifier('KNN', KNeighborsClassifier()), 
    # Currently disabled candidates:
    # BaseClassifier('GB', GradientBoostingClassifier(random_state=42)), 
    # BaseClassifier('AB', AdaBoostClassifier(random_state=42)),
]



def genome_to_things(genome: Genome, pool: List[BaseClassifier]) -> List[str]:
    """Return the names of the pool classifiers that `genome` selects.

    Parameters
    ----------
    genome
        Selection mask; position ``i`` refers to ``pool[i]`` and a value of
        1 means "selected".
    pool
        Candidate classifiers, in the order the genome positions refer to.

    Returns
    -------
    List[str]
        The ``name`` of every selected classifier, in pool order.

    Raises
    ------
    IndexError
        If `genome` has fewer entries than `pool`.
    """
    return [
        classifier.name
        for position, classifier in enumerate(pool)
        if genome[position] == 1
    ]



def generate_genome(length: int) -> Genome:
    """Draw a random genome of `length` entries, each either 0 or 1.

    Entries are sampled independently and with replacement via
    ``random.choices``.
    """
    alleles = [0, 1]
    return choices(alleles, k=length)


def generate_population(size: int, genome_length: int) -> Population:
    """Create `size` random genomes, each `genome_length` entries long.

    Every genome is produced by a separate call to ``generate_genome``.
    """
    population = []
    for _ in range(size):
        population.append(generate_genome(genome_length))
    return population


def fitness(genome: Genome, pool: List[BaseClassifier]) -> float:
    """Score the classifier subset encoded by `genome`.

    The models whose genome entry equals 1 are handed to a DESP
    dynamic-selection model, which is fitted on the notebook-level
    ``(X_dsel, y_dsel)`` split and scored on ``(X_test, y_test)``.

    Parameters
    ----------
    genome
        Selection mask; position ``i`` refers to ``pool[i]``.
    pool
        Candidate classifiers. Their models are passed to DESP as they are,
        so they are expected to have been fitted beforehand.

    Returns
    -------
    float
        ``0.0`` when fewer than two models are selected (nothing to choose
        between dynamically, and an empty selection cannot form an ensemble);
        otherwise the value of the DESP model's ``score`` on the test split.

    Raises
    ------
    IndexError
        If `genome` selects a position beyond the end of `pool`.
    """
    models = [pool[i].model for i, val in enumerate(genome) if val == 1]

    # Degenerate selections (no model or a single model) get the lowest score
    # instead of being passed to DESP.
    if len(models) < 2:
        return 0.0

    des_model = DESP(models)
    des_model.fit(X_dsel, y_dsel)

    # NOTE(review): the search is guided by the score on (X_test, y_test), so
    # the reported "best score" is selected on the same data it is measured
    # on. A separate validation split would give an unbiased estimate.
    return des_model.score(X_test, y_test)


def selection_pair(population: Population, fitness_func: FitnessFunc) -> Population:
    """Pick two parents by fitness-proportionate (roulette-wheel) sampling.

    Parameters
    ----------
    population
        Genomes to choose from.
    fitness_func
        Called once per genome; its value is used as the sampling weight.

    Returns
    -------
    Population
        A list of two genomes drawn with replacement, so the same genome may
        appear twice. The entries are the population's own objects, not copies.

    Notes
    -----
    When the weights do not add up to a positive number (for example every
    genome scored 0), weighted sampling is impossible and ``random.choices``
    would raise ``ValueError``. In that case both parents are drawn
    uniformly instead.
    """
    weights = [fitness_func(genome) for genome in population]

    if sum(weights) <= 0:
        # No usable fitness signal: fall back to uniform sampling.
        return choices(population=population, k=2)

    return choices(population=population, weights=weights, k=2)


def single_point_crossover(a: Genome, b: Genome) -> Tuple[Genome, Genome]:
    """Swap the tails of two equal-length genomes at one random cut point.

    Parameters
    ----------
    a, b
        Parent genomes of identical length.

    Returns
    -------
    Tuple[Genome, Genome]
        ``(a[:p] + b[p:], b[:p] + a[p:])`` for a cut ``p`` drawn uniformly
        from ``1 .. len(a) - 1``. Genomes shorter than two entries cannot be
        cut; the parents themselves are then returned unchanged.

    Raises
    ------
    ValueError
        If the parents differ in length.
    """
    size = len(a)
    if size != len(b):
        raise ValueError("Genome a and b must be the same length")

    # Fewer than two genes leaves no interior position to cut at.
    if size < 2:
        return a, b

    cut = randint(1, size - 1)
    head_a, tail_a = a[:cut], a[cut:]
    head_b, tail_b = b[:cut], b[cut:]
    return head_a + tail_b, head_b + tail_a


def mutation(genome: Genome, num: int = 1, probability: float = 0.5) -> Genome:
    """Return a copy of `genome` with up to `num` randomly chosen bits flipped.

    Parameters
    ----------
    genome
        Genome to mutate. It is left untouched; the result is a new list, so
        genomes shared with the current population are never altered.
    num
        Number of mutation attempts. Each attempt picks one position at
        random (the same position may be picked more than once).
    probability
        Chance that an attempt actually flips the picked bit.

    Returns
    -------
    Genome
        The mutated copy. An empty genome yields an empty list.
    """
    mutated = list(genome)

    # Nothing to pick a position from; randrange(0) would raise.
    if not mutated:
        return mutated

    for _ in range(num):
        index = randrange(len(mutated))
        # One random draw per attempt, flipping unless it exceeds `probability`.
        if not random() > probability:
            mutated[index] = abs(mutated[index] - 1)

    return mutated

In [40]:
# Fit every candidate model once on the training split; the genetic search
# afterwards only combines these already-fitted models.
for base_classifier in pool:
    base_classifier.model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000233 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21
[LightGBM] [Info] Number of data points in the train set: 967, number of used features: 6
[LightGBM] [Info] Start training from score -1.503560
[LightGBM] [Info] Start training from score -3.236612
[LightGBM] [Info] Start training from score -0.355051
[LightGBM] [Info] Start training from score -3.290680
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21
[LightGBM] [Info] Number of data points in the train set: 967, number of used features: 6
[LightGBM] [Info] Start training from score -1.503560
[LightGBM] [Info] Start training from score -3.236612
[LightGBM] [Info] Start training from score -0.355051
[LightGBM] [Info] Start training from score -3.290680


In [41]:
def run_evolution(
        populate_func: PopulateFunc,
        fitness_func: FitnessFunc,
        fitness_limit: float,
        selection_func: SelectionFunc = selection_pair,
        crossover_func: CrossoverFunc = single_point_crossover,
        mutation_func: MutationFunc = mutation,
        generation_limit: int = 100) \
        -> Tuple[Population, int]:
    """Run an elitist genetic algorithm and return the final ranked population.

    Each generation the population is ranked by fitness, the best genome is
    printed, the two best genomes are carried over unchanged, and the rest of
    the next generation is filled with mutated crossover offspring of parents
    picked by `selection_func`.

    Parameters
    ----------
    populate_func
        Builds the initial population.
    fitness_func
        Scores a genome; higher is better. Scores are memoised per genome for
        the duration of the run, so the function is assumed to return the
        same value whenever it is given the same genome.
    fitness_limit
        The search stops as soon as the best genome reaches this score.
    selection_func, crossover_func, mutation_func
        Genetic operators (parent selection, recombination, mutation).
    generation_limit
        Maximum number of generations to evaluate.

    Returns
    -------
    Tuple[Population, int]
        The last population sorted best-first, and the index of the last
        generation that was evaluated (0 when `generation_limit` is 0).
    """
    scores = {}

    def cached_fitness(genome: Genome) -> float:
        # Scoring is expensive and the same genomes recur across sorting,
        # selection and later generations, so each one is scored only once.
        key = tuple(genome)
        if key not in scores:
            scores[key] = fitness_func(genome)
        return scores[key]

    population = populate_func()
    generation = 0

    for generation in range(generation_limit):
        population = sorted(population, key=cached_fitness, reverse=True)
        best_score = cached_fitness(population[0])

        # Progress report; the names are resolved against the notebook-level `pool`.
        print(genome_to_things(population[0], pool))
        print(f"Best Score: {best_score:.3f}")

        if best_score >= fitness_limit:
            return population, generation

        # Elitism: keep copies of the two best genomes so that later in-place
        # changes to parents or offspring cannot alter them.
        next_generation = [list(elite) for elite in population[0:2]]

        for _ in range(len(population) // 2 - 1):
            parents = selection_func(population, cached_fitness)
            offspring_a, offspring_b = crossover_func(parents[0], parents[1])
            offspring_a = mutation_func(offspring_a)
            offspring_b = mutation_func(offspring_b)
            next_generation += [offspring_a, offspring_b]

        population = next_generation

    # Rank the final generation too, so population[0] is the best genome found
    # even when the last batch of offspring improved on the elites.
    population = sorted(population, key=cached_fitness, reverse=True)
    return population, generation

In [42]:
def genome_to_string(genome: Genome) -> str:
    """Render a genome as one compact string, e.g. ``[1, 0, 1]`` -> ``"101"``."""
    return "".join(str(gene) for gene in genome)

In [44]:
# Seed Python's `random` module so that the initial population, parent
# selection, crossover points and mutations are the same on every run.
seed(42)

# Search for a classifier subset: 10 genomes with one bit per pool entry, for
# at most 30 generations, stopping early on a perfect score.
population, generations = run_evolution(
    populate_func=partial(generate_population, size=10, genome_length=len(pool)),
    fitness_func=partial(fitness, pool=pool),
    fitness_limit=1.0,
    generation_limit=30,
)

# `generations` is the zero-based index of the last generation evaluated.
print(f"Number of generation: {generations}")
print(f"best solution: {genome_to_things(population[0], pool)}")

['XGB2', 'RF', 'LR', 'SVC', 'LGBM', 'LGBM2']
Best Score: 0.975
['XGB2', 'RF', 'LR', 'SVC', 'LGBM', 'LGBM2']
Best Score: 0.975
['XGB2', 'RF', 'LR', 'SVC', 'LGBM', 'LGBM2']
Best Score: 0.975
['XGB2', 'RF', 'LR', 'SVC', 'LGBM', 'LGBM2']
Best Score: 0.975
['XGB2', 'RF', 'LR', 'SVC', 'LGBM', 'LGBM2']
Best Score: 0.975
['XGB2', 'RF', 'LR', 'SVC', 'LGBM', 'LGBM2']
Best Score: 0.975
['XGB', 'XGB2', 'SVC', 'DT', 'LGBM', 'LGBM2']
Best Score: 0.977
['XGB', 'XGB2', 'SVC', 'DT', 'LGBM', 'LGBM2']
Best Score: 0.977
['XGB', 'XGB2', 'SVC', 'DT', 'LGBM', 'LGBM2']
Best Score: 0.977
['XGB', 'XGB2', 'SVC', 'DT', 'LGBM', 'LGBM2']
Best Score: 0.977
['XGB2', 'SVC', 'LGBM', 'LGBM2']
Best Score: 0.979
['XGB2', 'SVC', 'LGBM', 'LGBM2']
Best Score: 0.979
['XGB2', 'SVC', 'LGBM', 'LGBM2']
Best Score: 0.979
['XGB2', 'SVC', 'LGBM', 'LGBM2']
Best Score: 0.979
['XGB2', 'SVC', 'LGBM', 'LGBM2']
Best Score: 0.979
['XGB2', 'LGBM', 'LGBM2']
Best Score: 0.981
['XGB2', 'LGBM', 'LGBM2']
Best Score: 0.981
['XGB2', 'LGBM', 'LGBM2