* NACC data documentation: https://files.alz.washington.edu/documentation/uds3-rdd.pdf 

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
from tqdm import tqdm 

from collections import namedtuple
from random import choices, randint, randrange, random 
from typing import List, Callable, Tuple 
from functools import partial

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier

from sklearn import datasets 
from deslib.des.knora_e import KNORAE 
from deslib.des.des_p import DESP
from deslib.des.knop import KNOP
  

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import (accuracy_score)

from scipy.io import arff

pd.set_option('display.max_columns', None)   

In [2]:
from diversity import * 

In [3]:
def side_by_side(*objs, **kwds):
    from pandas.io.formats.printing import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    print (adjoin(space, *reprs))
    print()
    return

In [4]:
BaseClassifier = namedtuple('BaseClassifier', ['name', 'model', 'features_set'])

In [5]:
import itertools 
from copy import deepcopy 


def generate_featureset_combination(features_set): 
    # Generate combinations of sets
    set_combinations = []
    for r in range(1, len(features_set) + 1):
        set_combinations.extend(itertools.combinations(list(features_set.keys()), r)) 

    # Merge combinations of sets
    merged_set_combinations = {} 
    for combination in set_combinations:
        merged_features = [] 
        name = ""
        for features in combination: 
            merged_features += features_set[features]
            name += f"{features}+" 

        merged_set_combinations[name[:-1]] = merged_features
    
    print(f"Total combinations of feature sets: {len(merged_set_combinations)}")

    return merged_set_combinations 
    

def generate_pool(featureset_combinations, models):     
    pool = [] 

    for model in list(models.keys()): 
        for feature_set in list(featureset_combinations.keys()): 
            pool.append(BaseClassifier(f"{model} - ({feature_set})", deepcopy(models[model]), featureset_combinations[feature_set])) 


    print(f"Pool size: {len(pool)}") 

    return pool  


def select_subpool(X_dsel: pd.DataFrame, y_dsel: pd.Series, pool: BaseClassifier) -> pd.DataFrame: 
    # TODO 
    # Accuracy of each model 
    # Diversity of each model with each other model 

    prediction_dict = {} 
    accuracy_dict   = {} 

    for bc in pool: 
        preds = bc.model.predict(X_dsel[bc.features_set]) 
        prediction_dict[bc.name] = preds
        accuracy_dict[bc.name]   = accuracy_score(y_dsel, prediction_dict[bc.name]) 

    # Calculate diversity 
    df_diversity = pd.DataFrame(columns=['model', 'score'] + list(prediction_dict.keys())) 
    
    for main_model in list(prediction_dict.keys()): 
        row_dict = {"model": main_model, "score": accuracy_dict[main_model]} 
        
        for second_model in list(prediction_dict.keys()): 
            row_dict[second_model] = disagreement_measure(y_dsel, prediction_dict[main_model], prediction_dict[second_model]) 

        df_diversity = pd.concat([df_diversity, pd.DataFrame([row_dict])], ignore_index=True) 
            

    return df_diversity 


def measure_model_diversity(df_diversity: pd.DataFrame, threshold): 
    model_names = df_diversity.model.tolist() 
    diversity_score_dict = {}

    for main_model in model_names: 
        diversity_score_dict[main_model] = 0 
        for second_model in model_names:  
            if df_diversity[df_diversity.model == main_model][second_model].iloc[0] < threshold: 
                diversity_score_dict[main_model] += 1 

    return diversity_score_dict 


def get_survivals(df_diversity: pd.DataFrame, threshold): 
    model_names = df_diversity.sort_values(by=['score'], ascending=False).model.tolist() 
    short_list = [] 

    for main_model in model_names: 
        for second_model in model_names:  
            if df_diversity[df_diversity.model == main_model][second_model].iloc[0] < threshold:  
                model_names.remove(second_model) 
        short_list.append(main_model) 
    return short_list

In [6]:
from sklearn.metrics import accuracy_score  
from des.knorau import * 
from mlxtend.classifier import EnsembleVoteClassifier 
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline  

Genome = List[int] 
Population = List[Genome] 
FitnessFunc = Callable[[Genome], float]
PopulateFunc = Callable[[], Population]
SelectionFunc = Callable[[Population, FitnessFunc], Tuple[Genome, Genome]]
CrossoverFunc = Callable[[Genome, Genome], Tuple[Genome, Genome]] 
MutationFunc = Callable[[Genome], Genome]


def generate_genome(length: int) -> Genome: 
    return choices([0, 1], k=length) 


def generate_population(size: int, genome_length: int) -> Population: 
    return [generate_genome(genome_length) for _ in range(size)] 


# def fitness(genome: Genome, pool: [BaseClassifier]) -> float: 
#     print(genome)
#     # Initialize the DES model
#     models       = [pool[i].model for i, val in enumerate(genome) if val == 1] 
#     feature_sets = [pool[i].features_set for i, val in enumerate(genome) if val == 1] 
    
#     if len(models) <= 1: 
#         return 0 
    
#     knorau = KNORAU(models, feature_sets, k=7) 
    
#     knorau.fit(X_dsel, y_dsel)

#     preds = knorau.predict(X_test)   

#     result = accuracy_score(y_test, preds)

#     return result


def fitness(genome: Genome, pool: [BaseClassifier]) -> float: 
    # print(genome)
    # Initialize the DES model
    models       = [pool[i].model for i, val in enumerate(genome) if val == 1] 
    feature_sets = [pool[i].features_set for i, val in enumerate(genome) if val == 1] 
    
    if len(models) <= 1: 
        return 0 
    
    pipelines = [] 
    
    for i in range(len(models)): 
        pipe = make_pipeline(ColumnSelector(cols=feature_sets[i]), models[i]) 
        pipelines.append(pipe)
    
    sclf = EnsembleVoteClassifier(clfs=pipelines, voting='soft') 
        
    sclf.fit(X_train, y_train) 

    preds = sclf.predict(X_test)   

    result = accuracy_score(y_test, preds)

    return result


def selection_pair(population: Population, fitness_func: FitnessFunc) -> Population: 
    return choices(
        population=population, 
        weights=[fitness_func(genome) for genome in population], 
        k=2
    )


def single_point_crossover(a: Genome, b: Genome) -> Tuple[Genome, Genome]: 
    if len(a) != len(b): 
        raise ValueError("Genome a and b must be the same length")


    length = len(a) 
    if length < 2: 
        return a, b 

    p = randint(1, length - 1) 
    return a[0:p] + b[p:], b[0:p] + a[p:] 


def mutation(genome: Genome, num: int = 1, probability: float = 0.5) -> Genome: 
    for _ in range(num): 
        index = randrange(len(genome)) 
        genome[index] = genome[index] if random() > probability else abs(genome[index] - 1)

    return genome 


def run_evolution(
        populate_func: PopulateFunc,
        fitness_func: FitnessFunc,
        fitness_limit: int,
        selection_func: SelectionFunc = selection_pair,
        crossover_func: CrossoverFunc = single_point_crossover,
        mutation_func: MutationFunc = mutation,
        generation_limit: int = 100) \
        -> Tuple[Population, int]:
    population = populate_func()

    for i in range(generation_limit):
        population = sorted(population, key=lambda genome: fitness_func(genome), reverse=True)
        print(f"Genome {population[0]}")
        print(f"Best Score: {fitness(population[0], pool):.3f}") 

        if fitness_func(population[0]) >= fitness_limit:
            break

        next_generation = population[0:2]

        for j in range(int(len(population) / 2) - 1):
            parents = selection_func(population, fitness_func)
            offspring_a, offspring_b = crossover_func(parents[0], parents[1])
            offspring_a = mutation_func(offspring_a)
            offspring_b = mutation_func(offspring_b)
            next_generation += [offspring_a, offspring_b]

        population = next_generation
        # print(genome_to_things(population[0], pool))
        # print(f"Best Score: {fitness(population[0], pool):.3f}")

    return population, i

### ADNI 

In [7]:
mriDF            = pd.read_csv("datasets/adni/mri_statistics.csv") 
assessmentDF     = pd.read_csv("datasets/adni/assessment_statistics.csv")
cognitiveScoreDF = pd.read_csv("datasets/adni/cogniteive_score_statistics.csv") 
baselineDF       = pd.read_csv("datasets/adni/Baseline_final.csv")  

In [8]:
completeDataDF = pd.merge(cognitiveScoreDF, assessmentDF, on="RID", how="inner")  
completeDataDF = pd.merge(completeDataDF, mriDF, on="RID", how="inner")  
completeDataDF = pd.merge(completeDataDF, baselineDF, on="RID", how="inner")  

print("Dataset shape: {}".format(completeDataDF.shape)) 

Dataset shape: (1371, 342)


In [9]:
TARGET_COLUMN = "DX"  

mapping          = {"AD": 0, "sMCI": 1, "CN": 2, "pMCI": 3}   
gender_mapping   = {'Male': 1, 'Female': 0}
marriage_mapping = {'Married': 1, 'Widowed': 2, 'Divorced': 3, 'Never married': 0}

dataset = completeDataDF.drop(['RID'], axis=1) 

dataset['DX'] = dataset['DX'].map(mapping)  
dataset['PTGENDER'] = dataset['PTGENDER'].map(gender_mapping) 
dataset['PTMARRY'] = dataset['PTMARRY'].map(marriage_mapping)  

X = dataset.drop([TARGET_COLUMN], axis=1)
y = dataset[TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.30, random_state=42) 
X_train, X_dsel, y_train, y_dsel = train_test_split(X_train, y_train, stratify=y_train, test_size=0.20, random_state=42)  

In [10]:
### define the columns of each modalities 
cs_columns     = cognitiveScoreDF.drop(['RID'], axis=1).columns.tolist()
nt_columns     = assessmentDF.drop(['RID'], axis=1).columns.tolist() 
mri_columns    = mriDF.drop(['RID'], axis=1).columns.tolist() 
static_columns = baselineDF.drop(['RID', 'DX'], axis=1).columns.tolist()  

In [13]:
pool_classifiers = {
    "RF": RandomForestClassifier(random_state=45), 
    "XGB": XGBClassifier(random_state=45),  
    # "LGBM": LGBMClassifier(verbose_eval = -1, random_state=45),
    "KNN": KNeighborsClassifier(), 
    "DT": DecisionTreeClassifier(random_state=45) 
    
}

feature_set = {
    "CS": cs_columns, 
    "NT": nt_columns, 
    "MRI": mri_columns, 
    "Static": static_columns
}

In [14]:
feature_sets = generate_featureset_combination(feature_set) 
pool = generate_pool(feature_sets, pool_classifiers)

Total combinations of feature sets: 15
Pool size: 60


In [15]:
feature_sets.keys()

dict_keys(['CS', 'NT', 'MRI', 'Static', 'CS+NT', 'CS+MRI', 'CS+Static', 'NT+MRI', 'NT+Static', 'MRI+Static', 'CS+NT+MRI', 'CS+NT+Static', 'CS+MRI+Static', 'NT+MRI+Static', 'CS+NT+MRI+Static'])

In [16]:
for base_classifier in tqdm(pool):  
    base_classifier.model.fit(X_train[base_classifier.features_set], y_train) 

100%|███████████████████████████████████████████| 60/60 [00:10<00:00,  5.77it/s]


In [17]:
df = select_subpool(X_dsel, y_dsel.to_numpy(), pool)

In [18]:
df.head(30)

Unnamed: 0,model,score,RF - (CS),RF - (NT),RF - (MRI),RF - (Static),RF - (CS+NT),RF - (CS+MRI),RF - (CS+Static),RF - (NT+MRI),RF - (NT+Static),RF - (MRI+Static),RF - (CS+NT+MRI),RF - (CS+NT+Static),RF - (CS+MRI+Static),RF - (NT+MRI+Static),RF - (CS+NT+MRI+Static),XGB - (CS),XGB - (NT),XGB - (MRI),XGB - (Static),XGB - (CS+NT),XGB - (CS+MRI),XGB - (CS+Static),XGB - (NT+MRI),XGB - (NT+Static),XGB - (MRI+Static),XGB - (CS+NT+MRI),XGB - (CS+NT+Static),XGB - (CS+MRI+Static),XGB - (NT+MRI+Static),XGB - (CS+NT+MRI+Static),KNN - (CS),KNN - (NT),KNN - (MRI),KNN - (Static),KNN - (CS+NT),KNN - (CS+MRI),KNN - (CS+Static),KNN - (NT+MRI),KNN - (NT+Static),KNN - (MRI+Static),KNN - (CS+NT+MRI),KNN - (CS+NT+Static),KNN - (CS+MRI+Static),KNN - (NT+MRI+Static),KNN - (CS+NT+MRI+Static),DT - (CS),DT - (NT),DT - (MRI),DT - (Static),DT - (CS+NT),DT - (CS+MRI),DT - (CS+Static),DT - (NT+MRI),DT - (NT+Static),DT - (MRI+Static),DT - (CS+NT+MRI),DT - (CS+NT+Static),DT - (CS+MRI+Static),DT - (NT+MRI+Static),DT - (CS+NT+MRI+Static)
0,RF - (CS),0.828125,0.0,0.197917,0.458333,0.276042,0.083333,0.078125,0.067708,0.203125,0.21875,0.416667,0.104167,0.114583,0.083333,0.213542,0.09375,0.0625,0.208333,0.442708,0.322917,0.083333,0.057292,0.067708,0.21875,0.213542,0.380208,0.098958,0.088542,0.083333,0.208333,0.098958,0.203125,0.3125,0.546875,0.411458,0.229167,0.546875,0.208333,0.546875,0.307292,0.546875,0.546875,0.234375,0.546875,0.546875,0.546875,0.15625,0.260417,0.598958,0.479167,0.166667,0.177083,0.145833,0.276042,0.25,0.442708,0.177083,0.15625,0.177083,0.234375,0.161458
1,RF - (NT),0.807292,0.197917,0.0,0.416667,0.317708,0.135417,0.192708,0.192708,0.057292,0.0625,0.375,0.104167,0.125,0.177083,0.067708,0.125,0.21875,0.104167,0.380208,0.354167,0.15625,0.171875,0.182292,0.083333,0.088542,0.369792,0.140625,0.161458,0.15625,0.083333,0.151042,0.255208,0.291667,0.494792,0.442708,0.25,0.494792,0.25,0.494792,0.296875,0.494792,0.494792,0.265625,0.494792,0.494792,0.494792,0.260417,0.1875,0.546875,0.46875,0.229167,0.229167,0.229167,0.192708,0.15625,0.453125,0.208333,0.229167,0.260417,0.203125,0.213542
2,RF - (MRI),0.494792,0.458333,0.416667,0.0,0.390625,0.416667,0.411458,0.442708,0.369792,0.416667,0.125,0.427083,0.4375,0.416667,0.380208,0.416667,0.4375,0.416667,0.213542,0.40625,0.4375,0.442708,0.432292,0.40625,0.390625,0.296875,0.442708,0.453125,0.4375,0.385417,0.432292,0.432292,0.385417,0.369792,0.390625,0.385417,0.369792,0.427083,0.369792,0.369792,0.369792,0.369792,0.390625,0.369792,0.369792,0.369792,0.458333,0.447917,0.338542,0.447917,0.4375,0.447917,0.447917,0.453125,0.4375,0.359375,0.458333,0.4375,0.479167,0.432292,0.442708
3,RF - (Static),0.635417,0.276042,0.317708,0.390625,0.0,0.265625,0.270833,0.270833,0.291667,0.317708,0.328125,0.286458,0.296875,0.286458,0.302083,0.276042,0.296875,0.286458,0.427083,0.171875,0.307292,0.291667,0.28125,0.317708,0.291667,0.28125,0.302083,0.291667,0.265625,0.307292,0.302083,0.3125,0.380208,0.479167,0.302083,0.328125,0.479167,0.286458,0.479167,0.375,0.479167,0.479167,0.333333,0.479167,0.479167,0.479167,0.286458,0.390625,0.541667,0.317708,0.296875,0.328125,0.255208,0.364583,0.369792,0.385417,0.317708,0.286458,0.307292,0.333333,0.3125
4,RF - (CS+NT),0.859375,0.083333,0.135417,0.416667,0.265625,0.0,0.098958,0.067708,0.130208,0.135417,0.375,0.052083,0.0625,0.09375,0.151042,0.041667,0.125,0.135417,0.411458,0.333333,0.083333,0.088542,0.088542,0.166667,0.130208,0.359375,0.078125,0.078125,0.083333,0.145833,0.078125,0.182292,0.291667,0.536458,0.421875,0.21875,0.536458,0.1875,0.536458,0.286458,0.536458,0.536458,0.234375,0.536458,0.536458,0.536458,0.1875,0.260417,0.588542,0.479167,0.166667,0.197917,0.166667,0.244792,0.229167,0.421875,0.145833,0.145833,0.208333,0.223958,0.151042
5,RF - (CS+MRI),0.822917,0.078125,0.192708,0.411458,0.270833,0.098958,0.0,0.083333,0.197917,0.223958,0.369792,0.098958,0.119792,0.046875,0.197917,0.078125,0.109375,0.171875,0.40625,0.307292,0.109375,0.072917,0.083333,0.203125,0.1875,0.34375,0.083333,0.104167,0.088542,0.182292,0.083333,0.208333,0.307292,0.53125,0.416667,0.203125,0.53125,0.213542,0.53125,0.302083,0.53125,0.53125,0.208333,0.53125,0.53125,0.53125,0.151042,0.244792,0.572917,0.432292,0.192708,0.192708,0.140625,0.270833,0.255208,0.40625,0.171875,0.161458,0.203125,0.260417,0.166667
6,RF - (CS+Static),0.84375,0.067708,0.192708,0.442708,0.270833,0.067708,0.083333,0.0,0.197917,0.192708,0.401042,0.109375,0.098958,0.088542,0.208333,0.088542,0.088542,0.171875,0.447917,0.317708,0.078125,0.072917,0.052083,0.203125,0.166667,0.364583,0.09375,0.083333,0.067708,0.182292,0.09375,0.197917,0.328125,0.541667,0.416667,0.213542,0.541667,0.203125,0.541667,0.322917,0.541667,0.541667,0.21875,0.541667,0.541667,0.541667,0.171875,0.286458,0.59375,0.453125,0.130208,0.203125,0.151042,0.291667,0.265625,0.40625,0.151042,0.140625,0.203125,0.239583,0.145833
7,RF - (NT+MRI),0.78125,0.203125,0.057292,0.369792,0.291667,0.130208,0.197917,0.197917,0.0,0.078125,0.328125,0.130208,0.151042,0.171875,0.03125,0.119792,0.213542,0.119792,0.364583,0.348958,0.171875,0.1875,0.1875,0.109375,0.09375,0.333333,0.166667,0.177083,0.182292,0.109375,0.166667,0.239583,0.276042,0.479167,0.395833,0.223958,0.479167,0.213542,0.479167,0.28125,0.479167,0.479167,0.239583,0.479167,0.479167,0.479167,0.255208,0.192708,0.541667,0.442708,0.244792,0.234375,0.234375,0.1875,0.182292,0.447917,0.234375,0.244792,0.255208,0.208333,0.229167
8,RF - (NT+Static),0.796875,0.21875,0.0625,0.416667,0.317708,0.135417,0.223958,0.192708,0.078125,0.0,0.375,0.145833,0.125,0.208333,0.067708,0.15625,0.239583,0.083333,0.432292,0.364583,0.166667,0.203125,0.203125,0.145833,0.057292,0.359375,0.182292,0.161458,0.1875,0.114583,0.192708,0.244792,0.291667,0.494792,0.442708,0.260417,0.494792,0.239583,0.494792,0.296875,0.494792,0.494792,0.276042,0.494792,0.494792,0.494792,0.270833,0.1875,0.557292,0.46875,0.21875,0.239583,0.239583,0.192708,0.177083,0.432292,0.21875,0.21875,0.260417,0.192708,0.234375
9,RF - (MRI+Static),0.536458,0.416667,0.375,0.125,0.328125,0.375,0.369792,0.401042,0.328125,0.375,0.0,0.385417,0.395833,0.375,0.338542,0.375,0.395833,0.375,0.255208,0.34375,0.395833,0.401042,0.390625,0.364583,0.348958,0.223958,0.401042,0.411458,0.395833,0.34375,0.390625,0.390625,0.375,0.411458,0.359375,0.364583,0.411458,0.385417,0.411458,0.359375,0.411458,0.411458,0.369792,0.411458,0.411458,0.411458,0.416667,0.375,0.359375,0.40625,0.40625,0.40625,0.416667,0.411458,0.375,0.307292,0.40625,0.395833,0.4375,0.380208,0.390625


In [31]:
# fig, ax = plt.subplots(figsize=(10, 10))
# sns.heatmap(df.drop(['score'], axis=1).set_index(['model']), linewidth=.5)

In [21]:
# measure_model_diversity(df, 0.05)
short_list = get_survivals(df, 0.07) 

print(f"Short list size: {len(short_list)}")

short_pool = [] 

for bc in pool: 
    if bc.name in short_list: 
        short_pool.append(bc)

Short list size: 23


In [None]:
population, generations = run_evolution(
    populate_func=partial(
        generate_population, size=10, genome_length=len(short_pool)
    ), 
    fitness_func=partial(
        fitness, pool=short_pool 
    ), 
    fitness_limit=0.9, 
    generation_limit=30 
)

print(f"Number of generation: {generations}")

Genome [1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1]
Best Score: 0.879
Genome [1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1]
Best Score: 0.879


### Alzheimer NACC 

In [6]:
nacc_dataset = pd.read_csv("datasets/nacc/investigator_nacc64.csv")

In [7]:
demographics = ['BIRTHMO', 'BIRTHYR', 'SEX', 'HISPANIC', 'HISPOR', 'RACE', 'RACESEC', 'PRIMLANG', 'EDUC', 'MARISTAT', 
                'NACCLIVS', 'INDEPEND', 'RESIDENC', 'HANDED', 'NACCAGE', 'NACCAGEB', 'NACCNIHR']


physical = ['HEIGHT', 'WEIGHT', 'NACCBMI', 'BPSYS', 'BPDIAS', 'HRATE', 'VISION', 'VISCORR', 'VISWCORR', 'HEARING', 'HEARAID', 'HEARWAID']


medications = ['ANYMEDS', 'DRUG1', 'DRUG2', 'DRUG3', 'DRUG4', 'DRUG5', 'DRUG6', 'DRUG7', 'DRUG8', 'DRUG9', 'DRUG10', 
               'DRUG11', 'DRUG12', 'DRUG13', 'DRUG14', 'DRUG15', 'DRUG16', 'DRUG17', 'DRUG18', 'DRUG19', 'DRUG20', 
               'DRUG21', 'DRUG22', 'DRUG23', 'DRUG24', 'DRUG25', 'DRUG26', 'DRUG27', 'DRUG28', 'DRUG29', 'DRUG30', 
               'DRUG31', 'DRUG32', 'DRUG33', 'DRUG34', 'DRUG35', 'DRUG36', 'DRUG37', 'DRUG38', 'DRUG39', 'DRUG40', 
               'NACCAMD', 'NACCAHTN', 'NACCHTNC', 'NACCACEI', 'NACCAAAS', 'NACCBETA', 'NACCCCBS', 'NACCDIUR', 'NACCVASD', 
               'NACCANGI', 'NACCLIPL', 'NACCNSD', 'NACCAC', 'NACCADEP', 'NACCAPSY', 'NACCAANX', 'NACCADMD', 'NACCPDMD', 
               'NACCEMD', 'NACCEPMD', 'NACCDBMD']


health_history = ['TOBAC30', 'TOBAC100', 'SMOKYRS', 'PACKSPER', 'QUITSMOK', 'CVHATT', 'CVAFIB', 'CVANGIO', 
                  'CVBYPASS', 'CVPACE', 'CVCHF', 'CVOTHR', 'CBSTROKE', 'NACCSTYR', 'CBTIA', 'NACCTIYR', 
                  'PD', 'PDOTHR', 'SEIZURES', 'TRAUMBRF', 'TRAUMEXT', 'TRAUMCHR', 'NCOTHR', 
                  'DIABETES', 'HYPERTEN', 'HYPERCHO', 'B12DEF', 'THYROID', 'INCONTU', 'INCONTF', 'ALCOHOL', 
                  'ABUSOTHR', 'DEP2YRS', 'DEPOTHR', 'PSYCDIS', 'NACCTBI']

neuropsychiatric_inventory_questionnaire = ['NPIQINF', 'DEL', 'DELSEV', 'HALL', 'HALLSEV', 'AGIT', 'AGITSEV', 'DEPD', 'DEPDSEV', 
                                            'ANX', 'ANXSEV', 'ELAT', 'ELATSEV', 'APA', 'APASEV', 'DISN', 'DISNSEV', 'IRR', 'IRRSEV', 
                                            'MOT', 'MOTSEV', 'NITE', 'NITESEV', 'APP', 'APPSEV']

geriatric_depression_scale = ['NOGDS', 'SATIS', 'DROPACT', 'EMPTY', 'BORED', 'SPIRITS', 'AFRAID', 'HAPPY', 'HELPLESS', 'STAYHOME', 
                              'MEMPROB', 'WONDRFUL', 'WRTHLESS', 'ENERGY', 'HOPELESS', 'BETTER', 'NACCGDS']

all_columns = demographics + physical + health_history + neuropsychiatric_inventory_questionnaire + geriatric_depression_scale

In [8]:
TARGET = 'CDRGLOB' # Global Clinical Dementia Rating 
VISIT_ORDER = 1  # NACCVNUM 

dataset = nacc_dataset[nacc_dataset.NACCVNUM == VISIT_ORDER] 
all_columns.append(TARGET)

dataset[dataset[TARGET].isin([1. ,  0. ,  2. ,  3.])] 
dataset[TARGET] = dataset[TARGET].astype(int)

dataset.fillna(0, inplace=True) 

X = dataset[all_columns]
y = dataset[TARGET]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.30, random_state=42) 
X_train, X_dsel, y_train, y_dsel = train_test_split(X_train, y_train, stratify=y_train, test_size=0.25, random_state=42)   

In [9]:
side_by_side(X_test.isnull().sum(), X_test.count())  

BIRTHMO     0                BIRTHMO     15078        
BIRTHYR     0                BIRTHYR     15078        
SEX         0                SEX         15078        
HISPANIC    0                HISPANIC    15078        
HISPOR      0                HISPOR      15078        
           ..                            ...          
ENERGY      0                ENERGY      15078        
HOPELESS    0                HOPELESS    15078        
BETTER      0                BETTER      15078        
NACCGDS     0                NACCGDS     15078        
CDRGLOB     0                CDRGLOB     15078        
Length: 108, dtype: int64    Length: 108, dtype: int64



In [10]:
pool_classifiers = [
    RandomForestClassifier(random_state=45), 
    XGBClassifier(random_state=45),  
    # LGBMClassifier(verbose_eval = -1, random_state=45),
    # KNeighborsClassifier(), 
    DecisionTreeClassifier(random_state=45) 
    
]

feature_set = [
    demographics, 
    physical, 
    health_history, 
    neuropsychiatric_inventory_questionnaire, 
    geriatric_depression_scale
]

In [11]:
feature_sets = generate_featureset_combination(feature_set) 
pool = generate_pool(feature_sets, pool_classifiers)

Total combinations of feature sets: 31
Pool size: 93


In [None]:
population, generations = run_evolution(
    populate_func=partial(
        generate_population, size=10, genome_length=len(pool)
    ), 
    fitness_func=partial(
        fitness, pool=pool 
    ), 
    fitness_limit=0.9, 
    generation_limit=30 
)

print(f"Number of generation: {generations}")

Genome [0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0]
Best Score: 0.813
Genome [0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1]
Best Score: 0.815
Genome [0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1]
Best Score: 0.815
Genome [0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,