In [1]:
from feature_selection import GeneSift

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## GeneSift Evaluation

To start with, let's evaluate GeneSift's performance.

In [12]:
# Prepare data...

# Read in the data (ignoring the 'Unnamed: 32' feature, and removing the id).
data = pd.read_csv(r'breast_cancer_data.csv').drop(columns=['id', 'Unnamed: 32'])

# Convert diagnoses into binary output (0 or 1).
data = pd.get_dummies(data, 'diagnosis').drop(columns=['diagnosis_B']).rename({'diagnosis_M' : 'diagnosis'}, axis=1)

# Seperate labels from predictors
y = data['diagnosis']
X = data.drop(columns=['diagnosis'])

# Split the data into training and testing sets (arbitrarily an 80-20% split).
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=0)

# EA parameters...
# These are literally the first values we've tried. They seem to work fine.
# Hence, there is plenty of room for experimentation!

# Population size (i.e. the number of candidate solutions in each generation).
pop_size = 100
# Length of each candidate solution  (i.e. the number of features.)
candidate_length = len(X.columns)
# Limit on the number of generations to prevent excessive computation.
gen_limit = 250
# Size of the mating pool (must be even and smaller than pop_size).
pool_size = 50
# Size of the tournament for tournament selection (must be smaller than pool_size).
tournament_size = 20
# Crossover rate.
crossover_rate = 0.9
# Mutation rate.
mutation_rate = 0.2
# Threshold for improvement (used to decide when to terminate early).
improve_threshold = 0.0001

# Instantiate a GeneSift selector.
geneSift = GeneSift(pop_size, candidate_length, gen_limit, pool_size, tournament_size, crossover_rate, mutation_rate, improve_threshold)

# Establish the data.
geneSift.establish_data(X_train, y_train)

# Find the optimal features.
selection = geneSift.find_optimal_features()

# Report the final fitness.
print('Final fitness:', geneSift.fitness_function(selection))

Final fitness: 0.989010989010989


Which features are selected?

In [13]:
counts = [0] * candidate_length
algorithm_runs = 50

for i in range(algorithm_runs):
    selection = geneSift.find_optimal_features()

    for j in range(candidate_length):
        counts[j] += selection[j]

probabilities = [count / algorithm_runs for count in counts]

In [14]:
print(probabilities)

[0.48, 0.72, 0.38, 0.34, 0.62, 0.48, 0.62, 0.24, 0.48, 0.6, 0.82, 0.44, 0.48, 0.44, 0.58, 0.26, 0.54, 0.46, 0.5, 0.7, 0.56, 0.1, 0.5, 0.44, 0.9, 0.46, 0.58, 0.38, 0.58, 0.3]


In [15]:
interpretations = []
for prob in probabilities:
    if prob < 0.2:
        interpretations.append('Almost never')
    elif prob < 0.4:
        interpretations.append('Infrequently')
    elif prob <= 0.6:
        interpretations.append('Pretty much at random')
    elif prob < 0.8:
        interpretations.append('Frequently')
    else:
        interpretations.append('Almost always')

print(interpretations)

['Pretty much at random', 'Frequently', 'Infrequently', 'Infrequently', 'Frequently', 'Pretty much at random', 'Frequently', 'Infrequently', 'Pretty much at random', 'Pretty much at random', 'Almost always', 'Pretty much at random', 'Pretty much at random', 'Pretty much at random', 'Pretty much at random', 'Infrequently', 'Pretty much at random', 'Pretty much at random', 'Pretty much at random', 'Frequently', 'Pretty much at random', 'Almost never', 'Pretty much at random', 'Pretty much at random', 'Almost always', 'Pretty much at random', 'Pretty much at random', 'Infrequently', 'Pretty much at random', 'Infrequently']


In [16]:
print('Almost never count:', len([i for i in interpretations if i == 'Almost never']))
print('Infrequently count:', len([i for i in interpretations if i == 'Infrequently']))
print('Pretty much at random count:', len([i for i in interpretations if i == 'Pretty much at random']))
print('Frequently count:', len([i for i in interpretations if i == 'Frequently']))
print('Almost always count:', len([i for i in interpretations if i == 'Almost always']))

Almost never count: 1
Infrequently count: 6
Pretty much at random count: 17
Frequently count: 4
Almost always count: 2


## Comparison to Other Algorithms

In [17]:
def evaluation(X, y, selection):
    # Cast the candidate solution to a boolean array.
    selected_features = [bool(x) for x in selection]

    # Our X and y are the selected features and the diagnosis.
    X = X[X.columns[selected_features]]

    # Split the data into training and testing sets (arbitrarily an 80-20% split).
    X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=0)

    # Normlaise the data for numerical stability.
    # We normalise after splitting to prevent data leakage.
    ss_train = StandardScaler()
    X_train = ss_train.fit_transform(X_train)

    ss_test = StandardScaler()
    X_test = ss_test.fit_transform(X_test)

    # Define and train a logistic regression model.
    model = LogisticRegression()

    model.fit(X_train, y_train)

    # Determine the accuracy of the model (and hence the fitness of the candidate solution).
    y_pred = model.predict(X_test)
    return accuracy_score(y_true=y_test, y_pred=y_pred)

In [20]:
# GeneSift selection's fitness...
selection = geneSift.find_optimal_features()
geneSift_fitness = evaluation(X, y, selection)
print("Fitness of GeneSift feature selection:", geneSift_fitness)

Fitness of GeneSift feature selection: 0.9824561403508771


In [21]:
# Low variance feature selection...
from sklearn.feature_selection import VarianceThreshold

p = 0.8

selection_lv = VarianceThreshold(threshold=p * (1 - p))
selection_lv.fit_transform(X_train, y_train)
selected_features = selection_lv.get_feature_names_out()

# Convert this selection into a boolean array.
low_variance_selection = []
for feature in X.columns:
    if feature in selected_features:
        low_variance_selection.append(1)
    else:
        low_variance_selection.append(0)

low_variance_fitness = evaluation(X, y, low_variance_selection)
print("Fitness of low variance feature selection:", low_variance_fitness) 

print("We improve on low variance feature selection by {}%.".format(
    round(((geneSift_fitness / low_variance_fitness) - 1) * 100, 3)
))

Fitness of low variance feature selection: 0.9473684210526315
We improve on low variance feature selection by 3.704%.


In [23]:
# Univariate feature selection...
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

k = len([1 for i in range(len(selection)) if selection[i] == 0])
print('Using k =', k)

# Make the selection.
selection_u = SelectKBest(f_classif, k=k).fit_transform(X, y)

univariate_selection = []
for i in range(len(X.columns)):
    column = data.iloc[:,i].values
    
    if column in selection_u.transpose():
        univariate_selection.append(1)
    else:
        univariate_selection.append(0)

univariate_fitness = evaluation(X, y, univariate_selection)
print("Fitness of low variance feature selection:", univariate_fitness) 

print("We improve on low variance feature selection by {}%.".format(
    round(((geneSift_fitness / univariate_fitness) - 1) * 100, 3)
))

Using k = 14
Fitness of low variance feature selection: 0.9473684210526315
We improve on low variance feature selection by 3.704%.


In [24]:
# Recursive Feature Elimination (RFE)...
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE

n = len([1 for i in range(len(selection)) if selection[i] == 0])
print('Using n =', k)

# Make the selection.
selection_rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=n)
selection_rfe.fit_transform(X, y)
selected_features = selection_rfe.get_feature_names_out()

# Convert this selection into a boolean array.
rfe_selection = []
for feature in X.columns:
    if feature in selected_features:
        rfe_selection.append(1)
    else:
        rfe_selection.append(0)

rfe_fitness = evaluation(X, y, rfe_selection)
print("Fitness of RFE:", rfe_fitness) 

print("We improve on RFE by {}%.".format(
    round(((geneSift_fitness / rfe_fitness) - 1) * 100, 3)
))

Using n = 14
Fitness of RFE: 0.9649122807017544
We improve on RFE by 1.818%.
