# Exploration of Dimension Reduction
<span style="font-weight:bold; font-size:17pt; color:#666666;">Genetic Algorithm for feature selection</span>
<hr>

In [2]:
%pylab inline
%autosave 25

import h5py
import random
import pandas as pd
import pickle

from abc import ABC, abstractmethod
from deap import base
from deap import creator
from deap import tools
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

Populating the interactive namespace from numpy and matplotlib


Autosaving every 25 seconds


## Data location

Change this when you get a new data set.

In [3]:
data_loc = '../data/FDA-COVID19_files_v1.0/'

## GA code

Reference material:

* [deap](https://deap.readthedocs.io/en/master/)
* [deap: Knapsack problem](https://deap.readthedocs.io/en/master/examples/ga_knapsack.html)

Things to tune:

* **target_ratio** = 0.1 <- percentage of features to keep

other parameters are adjusted on an exploration vs. stability continuum:

* **pop_size** = 50 <- (int) +explore

* **INDPB** = 0.05 <- (probability float) +explore

* **TOURNSIZE** = 3 <- (int) +stable

* **CXPB** = 0.5 <- (probability float) +explore

* **MUTPB** = 0.2 (probability float) +explore

In [4]:
# TODO: this likely should be tuned to the data
model = RandomForestClassifier()


def mask_value(mask, features, labels):
    """Returns the mean roc_auc_score of a random forest model trained with
    the indicated subset of features."""
    features_tmp = features[:, mask == 1]
    np.random.seed(42)
    scores = []
    mini_batches_generator = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
    try:
        for training_index, validation_index in mini_batches_generator.split(features_tmp, labels):
            training_features = features_tmp[training_index]
            training_labels = np.ravel(labels[training_index])
            validation_features = features_tmp[validation_index]
            validation_labels = np.ravel(labels[validation_index])
            model.fit(training_features, training_labels)
            predictions = model.predict_proba(validation_features)[:, 1]
            scores.append(roc_auc_score(validation_labels, predictions))
        return np.mean(scores)
    except ValueError:
        return 0


def mask_opt_function(mask, features, labels):
    """The function being maximized by genetic algorithm.
     It attempts to balance the output of mask_value with the feature dimension."""
    feature_dim = np.sum(mask)
    model_auc = mask_value(mask, features, labels)
    return model_auc ** 2 / (1 + feature_dim),


def genetic_algorithm(features, labels, num_gens):
    _, num_features = np.shape(features)
    np.random.seed(42)
    print_freq = 10
    target_ratio = 0.1
    pop_size = 50
    INDPB = 0.05
    TOURNSIZE = 3
    CXPB = 0.5
    MUTPB = 0.2

    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", np.ndarray, typecode='b', fitness=creator.FitnessMax)
    toolbox = base.Toolbox()
    toolbox.register("attr_bool", np.random.choice, 2, p=[1 - target_ratio, target_ratio])
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, num_features)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("evaluate", mask_opt_function, features=features, labels=labels)
    toolbox.register("mate", tools.cxOnePoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=INDPB)
    toolbox.register("select", tools.selTournament, tournsize=TOURNSIZE)

    max_record = []
    pop = toolbox.population(n=pop_size)
    fitnesses = list(map(toolbox.evaluate, pop))
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit
    gen = 0
    while gen < num_gens:
        # Select the next generation individuals
        offspring = toolbox.select(pop, len(pop))
        # Clone the selected individuals
        offspring = list(map(toolbox.clone, offspring))
        # Apply crossover and mutation on the offspring
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if np.random.random() < CXPB:
                toolbox.mate(child1, child2)
                del child1.fitness.values
                del child2.fitness.values
        for mutant in offspring:
            if np.random.random() < MUTPB:
                toolbox.mutate(mutant)
                del mutant.fitness.values
        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        pop[:] = offspring
        scores = [toolbox.evaluate(ind) for ind in pop]
        if (gen % print_freq == 0) or (gen == num_gens - 1):
            best_mask = pop[np.argmax(scores)]
            max_score = np.round(np.max(scores), 3)
            print('Best ROC-AUC: {},'
                  ' Feature Dimension: {}'.format(max_score, np.sum(best_mask)))
            max_record.append([max_score, int(np.sum(best_mask))])
        gen += 1
    return max_record, np.array(pop[np.argmax(scores)])

In [5]:
class BaseFeatureSelectionModel(ABC):
    @abstractmethod
    def fit(self, features, labels, params):
        pass

    @abstractmethod
    def transform(self, features, num_keep_features):
        pass

    @abstractmethod
    def save(self, location):
        pass

    @abstractmethod
    def load(self, location):
        pass

class GeneticFeatureSelection(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.feature_mask = None
        self.opt_record = None

    def fit(self, features, labels, params):
        num_gens = params.get('num_gens', 100)

        self.opt_record, self.feature_mask = genetic_algorithm(features, labels, num_gens)

    def transform(self, features):
        return features[:, self.feature_mask == 1]

    def save(self, location, serialized_feature_selector=None):
        pickle_out = open(location,"wb")
        pickle.dump(self, pickle_out)
        pickle_out.close()

    def load(self, location):
        serialized_fs = open(location, 'rb')
        fs = pickle.load(serialized_fs)
        self.opt_record = fs.opt_record
        self.feature_mask = fs.feature_mask

## Test using dummy inputs

In [6]:
# synthetic data
n_samples, n_dims = 1000, 100
input_array = np.random.sample(size=(n_samples, n_dims))
activity_labels = np.random.choice(2, size=n_samples, p=(0.95, 0.05))

# Random Forest Method: usage example
model_location = data_loc+'genetic_alg.mdl'

feature_selector = GeneticFeatureSelection()
feature_selector.fit(input_array, activity_labels, {'num_gens': 1})
feature_selector.save(model_location)

feature_selector2 = GeneticFeatureSelection()
feature_selector2.load(model_location)
reduced_features = feature_selector2.transform(input_array)
print(reduced_features)

# basic assertions on shape and number of results
results_shape = reduced_features.shape
assert(len(results_shape) is 2)
assert(results_shape[0] == 1000)
assert(results_shape[1] > 1)



Best ROC-AUC: 0.074, Feature Dimension: 3
[[0.21680072 0.7164187  0.61530106]
 [0.33611435 0.4118338  0.46209917]
 [0.36317541 0.66106757 0.98827968]
 ...
 [0.48051528 0.23755224 0.88459333]
 [0.36934608 0.61523007 0.19875022]
 [0.63152305 0.10875524 0.37207128]]


## Load the real cid-pid data

In [6]:
store = pd.HDFStore(data_loc + 'sampled_data.h5')
df_features = pd.DataFrame(store['df' ])
store.close()
print('rows: {:,}, columns: {:,}'.format(len(df_features), len(df_features.columns)))

rows: 22,172, columns: 17,076


## Run data against GA

In [24]:
# split data into train and test sets
X_train = df_features.copy()
X_train.drop(columns=['cid', 'pid', 'activity'], inplace=True)
input_array = X_train.values
del X_train
labels = df_features['activity'].values

feature_selector = GeneticFeatureSelection()

In [25]:
%%time

feature_selector.fit(input_array, labels, {'num_gens': 1})

Best ROC-AUC: 0.0, Feature Dimension: 1638
CPU times: user 17min 11s, sys: 1min 47s, total: 18min 59s
Wall time: 18min 59s


In [41]:
selected_features = np.array(df_features.columns.tolist())[3:][feature_selector.feature_mask == 1]
selected_features

array(['KEY', '*KP', '*EE', ..., '4071', '4075', '4091'], dtype='<U14')