In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
from tqdm import tqdm 

from collections import namedtuple
from random import choices, randint, randrange, random 
from typing import List, Callable, Tuple 
from functools import partial

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier

from sklearn import datasets 
from deslib.des.knora_e import KNORAE 
from deslib.des.des_p import DESP
from deslib.des.knop import KNOP
  

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder

from scipy.io import arff

pd.set_option('display.max_columns', None)   

In [2]:
# One pool entry: an estimator (`model`) paired with the list of column names
# (`features_set`) that estimator is restricted to.
BaseClassifier = namedtuple('BaseClassifier', ['model', 'features_set'])

In [3]:
import itertools 
from copy import deepcopy 

def generate_featureset_combination(features_set):
    """Build every non-empty union of the given feature groups.

    Each element of ``features_set`` is one group of feature names. For every
    subset size from 1 up to the number of groups, all combinations of groups
    are taken (in ``itertools.combinations`` order, smallest subsets first) and
    the members of each combination are concatenated into one flat list.

    Prints the number of merged lists and returns them as a list of lists.
    The input groups themselves are not modified.
    """
    group_count = len(features_set)

    # Every way of choosing 1..group_count groups, streamed lazily.
    chosen_groups = itertools.chain.from_iterable(
        itertools.combinations(features_set, size)
        for size in range(1, group_count + 1)
    )

    # Flatten each chosen tuple of groups into a single feature list.
    merged_set_combinations = [
        list(itertools.chain.from_iterable(groups)) for groups in chosen_groups
    ]

    print(f"Total combinations of feature sets: {len(merged_set_combinations)}")

    return merged_set_combinations


def generate_pool(featureset_combinations, models):
    """Create one pool entry for every (model, feature list) pairing.

    Entries are ordered model-major: all feature lists for the first model,
    then all for the second, and so on. Each entry receives its own deep copy
    of the model, so no two entries share an estimator instance.

    Prints the pool size and returns the pool as a list of ``BaseClassifier``.
    """
    pool = [
        BaseClassifier(deepcopy(estimator), columns)
        for estimator in models
        for columns in featureset_combinations
    ]

    print(f"Pool size: {len(pool)}")

    return pool

In [4]:
from sklearn.metrics import accuracy_score  
from des.knorau import * 
from mlxtend.classifier import EnsembleVoteClassifier 
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline  

# Type aliases shared by the genetic-algorithm helpers in this cell.

# A genome is a list of ints; generate_genome() fills it with 0/1 values.
Genome = List[int] 
# A population is a list of genomes.
Population = List[Genome] 
# Scores one genome.
FitnessFunc = Callable[[Genome], float]
# Builds the initial population; takes no arguments.
PopulateFunc = Callable[[], Population]
# Picks two parent genomes given the population and a fitness function.
SelectionFunc = Callable[[Population, FitnessFunc], Tuple[Genome, Genome]]
# Combines two parent genomes into two offspring genomes.
CrossoverFunc = Callable[[Genome, Genome], Tuple[Genome, Genome]] 
# Maps a genome to a (possibly altered) genome.
MutationFunc = Callable[[Genome], Genome]


def generate_genome(length: int) -> Genome:
    """Return a random genome of ``length`` genes, each drawn from {0, 1}."""
    alleles = [0, 1]
    genome = choices(alleles, k=length)
    return genome


def generate_population(size: int, genome_length: int) -> Population:
    """Create ``size`` independently generated genomes of ``genome_length`` genes."""
    population = []
    for _ in range(size):
        population.append(generate_genome(genome_length))
    return population


# def fitness(genome: Genome, pool: [BaseClassifier]) -> float: 
#     print(genome)
#     # Initialize the DES model
#     models       = [pool[i].model for i, val in enumerate(genome) if val == 1] 
#     feature_sets = [pool[i].features_set for i, val in enumerate(genome) if val == 1] 
    
#     if len(models) <= 1: 
#         return 0 
    
#     knorau = KNORAU(models, feature_sets, k=7) 
    
#     knorau.fit(X_dsel, y_dsel)

#     preds = knorau.predict(X_test)   

#     result = accuracy_score(y_test, preds)

#     return result


def fitness(genome: Genome, pool: List[BaseClassifier],
            X_fit=None, y_fit=None, X_eval=None, y_eval=None) -> float:
    """Score a genome by the accuracy of the soft-voting ensemble it selects.

    Gene ``i`` equal to 1 selects ``pool[i]``. Every selected entry becomes a
    pipeline that first keeps only that entry's ``features_set`` columns and
    then applies its ``model``; the pipelines are combined in a soft-voting
    ``EnsembleVoteClassifier``, fitted, and scored with ``accuracy_score``.

    Args:
        genome: 0/1 list; positions set to 1 index into ``pool``.
        pool: candidate ``BaseClassifier`` entries.
        X_fit, y_fit: data the ensemble is fitted on. When omitted, the
            notebook-level ``X_train`` / ``y_train`` are used.
        X_eval, y_eval: data the ensemble is scored on. When omitted, the
            notebook-level ``X_test`` / ``y_test`` are used.

    Returns:
        Accuracy on the evaluation data, or ``0.0`` when the genome selects
        fewer than two classifiers (no ensemble is built in that case).

    Raises:
        IndexError: if a gene equal to 1 sits at an index beyond ``pool``.

    Note:
        With the defaults, a search that maximises this value is tuned against
        the test split, so the best score it reports is an optimistic
        estimate. Pass a separate validation split through ``X_eval`` /
        ``y_eval`` (for instance via ``functools.partial``) to keep the test
        data untouched.
    """
    # Single pass over the genome; indexing (not zip) keeps the IndexError for
    # genomes that point past the end of the pool.
    selected = [pool[i] for i, gene in enumerate(genome) if gene == 1]

    # An ensemble needs at least two members.
    if len(selected) <= 1:
        return 0.0

    # Fall back to the notebook globals only when no data was passed in.
    X_fit = X_train if X_fit is None else X_fit
    y_fit = y_train if y_fit is None else y_fit
    X_eval = X_test if X_eval is None else X_eval
    y_eval = y_test if y_eval is None else y_eval

    pipelines = [
        make_pipeline(ColumnSelector(cols=member.features_set), member.model)
        for member in selected
    ]

    ensemble = EnsembleVoteClassifier(clfs=pipelines, voting='soft')
    ensemble.fit(X_fit, y_fit)

    predictions = ensemble.predict(X_eval)

    return accuracy_score(y_eval, predictions)


def selection_pair(population: Population, fitness_func: FitnessFunc) -> Population:
    """Draw two parent genomes, with replacement, in proportion to fitness.

    Args:
        population: genomes to choose from.
        fitness_func: maps a genome to its score; called once per genome.

    Returns:
        A list of two genomes taken from ``population`` (the same genome may
        appear twice).

    When the scores do not add up to a positive total (for example every
    genome scores 0), weighted sampling is impossible, so the pair is drawn
    uniformly at random instead of letting ``random.choices`` raise.
    """
    weights = [fitness_func(genome) for genome in population]

    # random.choices raises ValueError when the total weight is not positive;
    # passing None makes it sample uniformly.
    if sum(weights) <= 0:
        weights = None

    return choices(
        population=population,
        weights=weights,
        k=2
    )


def single_point_crossover(a: Genome, b: Genome) -> Tuple[Genome, Genome]:
    """Swap the tails of two equal-length genomes at a random cut point.

    Args:
        a, b: parent genomes of identical length.

    Returns:
        Two new genomes: ``a``'s head joined to ``b``'s tail, and ``b``'s head
        joined to ``a``'s tail. The cut point is drawn from
        ``1 .. len(a) - 1`` so both parents always contribute.

    Raises:
        ValueError: if the parents differ in length.

    The returned genomes are always fresh objects, never the parents
    themselves, so a caller may mutate them in place without touching the
    parents.
    """
    if len(a) != len(b):
        raise ValueError("Genome a and b must be the same length")

    length = len(a)
    if length < 2:
        # No interior cut point exists; return copies rather than the parents
        # so in-place mutation of the offspring cannot alter the population.
        return a[:], b[:]

    cut = randint(1, length - 1)
    return a[:cut] + b[cut:], b[:cut] + a[cut:]


def mutation(genome: Genome, num: int = 1, probability: float = 0.5) -> Genome:
    """Flip randomly chosen genes of ``genome`` in place and return it.

    ``num`` positions are drawn independently, so the same position may be
    drawn more than once. A drawn position is left unchanged when a fresh
    ``random()`` sample exceeds ``probability``; otherwise its value ``v`` is
    replaced by ``abs(v - 1)`` (0 becomes 1 and 1 becomes 0).

    The list passed in is modified and is also the object returned.
    """
    for _ in range(num):
        position = randrange(len(genome))
        if random() > probability:
            continue  # this draw keeps the gene as it is
        genome[position] = abs(genome[position] - 1)

    return genome


def run_evolution(
        populate_func: PopulateFunc,
        fitness_func: FitnessFunc,
        fitness_limit: float,
        selection_func: SelectionFunc = selection_pair,
        crossover_func: CrossoverFunc = single_point_crossover,
        mutation_func: MutationFunc = mutation,
        generation_limit: int = 100) \
        -> Tuple[Population, int]:
    """Run a generational genetic algorithm with two-genome elitism.

    Each generation the population is ranked by fitness (best first), the best
    genome and its score are printed, and the loop stops early once that score
    reaches ``fitness_limit``. Otherwise the two best genomes are carried over
    unchanged and the rest of the next generation is filled with mutated
    offspring of parent pairs picked by ``selection_func``.

    Args:
        populate_func: builds the initial population.
        fitness_func: scores a genome. It is assumed to return the same value
            for the same genome: results are memoised per genome content for
            the duration of the run, so each distinct genome is scored once.
        fitness_limit: score at or above which the search stops.
        selection_func: picks two parents from ``(population, fitness_func)``.
        crossover_func: turns two parents into two offspring.
        mutation_func: applied to each offspring.
        generation_limit: maximum number of generations to run.

    Returns:
        ``(population, i)`` where ``i`` is the zero-based index of the last
        generation started (``0`` when ``generation_limit`` is not positive).
        After an early stop the population is sorted best-first. If the limit
        is exhausted, the population is the newest generation: the two elites
        first, followed by offspring that have not been ranked.

    Note:
        Each generation holds ``2 * (len(population) // 2)`` genomes, so an
        odd-sized population loses one genome after the first generation.
    """
    score_cache = {}

    def cached_fitness(genome: Genome) -> float:
        # Keyed by content, so a genome mutated in place gets a fresh score.
        key = tuple(genome)
        if key not in score_cache:
            score_cache[key] = fitness_func(genome)
        return score_cache[key]

    population = populate_func()

    # Defined up front so the return below works even if the loop never runs.
    i = 0

    for i in range(generation_limit):
        population = sorted(population, key=cached_fitness, reverse=True)

        # Report the score from the fitness function that was passed in, not
        # from notebook-level globals.
        best_score = cached_fitness(population[0])
        print(f"Genome {population[0]}")
        print(f"Best Score: {best_score:.3f}")

        if best_score >= fitness_limit:
            break

        # Elitism: the two best genomes survive unchanged.
        next_generation = population[0:2]

        for _ in range(len(population) // 2 - 1):
            parents = selection_func(population, cached_fitness)
            offspring_a, offspring_b = crossover_func(parents[0], parents[1])
            offspring_a = mutation_func(offspring_a)
            offspring_b = mutation_func(offspring_b)
            next_generation += [offspring_a, offspring_b]

        population = next_generation

    return population, i

In [5]:
# Load the four CSV tables from datasets/adni; they are joined on their shared
# RID column in the next cell.
# NOTE(review): "cogniteive" in the third path looks like a typo of
# "cognitive" — confirm the file name on disk before changing it.
mriDF            = pd.read_csv("datasets/adni/mri_statistics.csv") 
assessmentDF     = pd.read_csv("datasets/adni/assessment_statistics.csv")
cognitiveScoreDF = pd.read_csv("datasets/adni/cogniteive_score_statistics.csv") 
baselineDF       = pd.read_csv("datasets/adni/Baseline_final.csv")  

In [6]:
# Inner-join the four tables on RID, so only RIDs present in every table remain.
completeDataDF = (
    cognitiveScoreDF
    .merge(assessmentDF, on="RID", how="inner")
    .merge(mriDF, on="RID", how="inner")
    .merge(baselineDF, on="RID", how="inner")
)

print("Dataset shape: {}".format(completeDataDF.shape))

Dataset shape: (1371, 342)


In [7]:
TARGET_COLUMN = "DX"

# Integer codes for the categorical columns. Values missing from a mapping
# become NaN after .map().
mapping          = {"AD": 0, "sMCI": 1, "CN": 2, "pMCI": 3}
gender_mapping   = {'Male': 1, 'Female': 0}
marriage_mapping = {'Married': 1, 'Widowed': 2, 'Divorced': 3, 'Never married': 0}

# Drop the RID identifier and encode the three categorical columns in one chain.
dataset = (
    completeDataDF
    .drop(columns=['RID'])
    .assign(
        DX=lambda frame: frame['DX'].map(mapping),
        PTGENDER=lambda frame: frame['PTGENDER'].map(gender_mapping),
        PTMARRY=lambda frame: frame['PTMARRY'].map(marriage_mapping),
    )
)

X = dataset.drop(columns=[TARGET_COLUMN])
y = dataset[TARGET_COLUMN]

# First split: hold out 30% of the rows as the test set (stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)
# Second split: carve 20% of the remaining training rows into X_dsel / y_dsel.
X_train, X_dsel, y_train, y_dsel = train_test_split(
    X_train, y_train, test_size=0.20, stratify=y_train, random_state=42
)

In [8]:
### define the columns of each modalities 
cs_columns     = cognitiveScoreDF.drop(['RID'], axis=1).columns.tolist()
nt_columns     = assessmentDF.drop(['RID'], axis=1).columns.tolist() 
mri_columns    = mriDF.drop(['RID'], axis=1).columns.tolist() 
static_columns = baselineDF.drop(['RID', 'DX'], axis=1).columns.tolist()  

In [9]:
# Base estimators; generate_pool() deep-copies each one once per feature-set
# combination.
pool_classifiers = [
    RandomForestClassifier(random_state=45), 
    XGBClassifier(random_state=45),  
    # LGBMClassifier(verbose_eval = -1, random_state=45),
    KNeighborsClassifier(), 
    DecisionTreeClassifier(random_state=45) 
    
]

# One list of column names per data source: cognitive scores, assessments,
# MRI statistics and baseline (static) variables.
feature_set = [
    cs_columns, 
    nt_columns, 
    mri_columns, 
    static_columns
]

In [10]:
# Expand the modality column lists into every non-empty union of them, then
# pair each union with a copy of every base estimator.
feature_sets = generate_featureset_combination(feature_set) 
pool = generate_pool(feature_sets, pool_classifiers)

Total combinations of feature sets: 15
Pool size: 60


In [11]:
# for base_classifier in tqdm(pool):  
#     base_classifier.model.fit(X_train[base_classifier.features_set], y_train) 

In [12]:
from random import seed

# Seed the stdlib RNG that drives population creation, parent selection,
# crossover and mutation, so that re-running the notebook from a fresh kernel
# repeats the same search instead of a different random one each time.
seed(42)

# Search over 0/1 masks of the pool: 10 genomes per generation, at most 30
# generations, stopping early once a genome scores at least 0.9.
population, generations = run_evolution(
    populate_func=partial(
        generate_population, size=10, genome_length=len(pool)
    ),
    fitness_func=partial(
        fitness, pool=pool
    ),
    fitness_limit=0.9,
    generation_limit=30
)

print(f"Number of generation: {generations}")

Genome [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0]
Best Score: 0.881
Genome [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0]
Best Score: 0.881
Genome [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0]
Best Score: 0.881
Genome [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0]
Best Score: 0.881
Genome [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 