# FASTENER Usage Example
This example demonstrates how to use the FASTENER algorithm for feature selection on a dataset. `scikit-learn`'s breast cancer dataset is used in this demonstration. FASTENER returns a Pareto front of relevant feature sets, where we try to optimize the $F_1$ score for a particular number of features.

The following steps are shown in the example:
* data preparation
* implementation of the feature evaluation function `eval_fun`, which is responsible for calculating the information gain of a particular feature
* setting up FASTENER (including a basic description of the options)
* running the FASTENER loop
* reading the FASTENER results

In [9]:
# import dataset
from sklearn.datasets import load_breast_cancer

# import preprocessing tools
from sklearn import preprocessing
import numpy as np
import pandas as pd

# import learning/evaluation
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# typing
from typing import Dict, List, Callable, Any, Tuple, Optional, \
    Counter as CounterType, Set

# FASTENER specific imports
from src.random_utils import shuffle
from src import random_utils
from src.item import Item, EvalItem, Result, Population, flatten_population, FitnessFunction, \
    Genes, EvalItem, RandomFlipMutationStrategy, RandomEveryoneWithEveryone, \
    IntersectionMating, UnionMating, IntersectionMatingWithInformationGain, \
    IntersectionMatingWithWeightedRandomInformationGain, UnevaluatedPopulation, \
    MatingStrategy, MutationStrategy, MatingSelectionStrategy
from src import fastener

## Data preparation 

In [None]:
# Load the breast cancer dataset as pandas objects.
# `as_frame=True` requires scikit-learn 0.23+.
cancer = load_breast_cancer(as_frame=True)
X_df, y_df = cancer.data, cancer.target

# Basic, order-preserving split: the first 80% of the rows form the training
# set and the remaining rows form the test set.
n_sample = X_df.shape[0]
# NOTE: despite its name, `n_test` is the split index, i.e. the number of
# training rows.
n_test = int(0.8 * n_sample)

# Convert to NumPy once and slice the result at the split index.
all_labels = y_df.to_numpy()
labels_train, labels_test = all_labels[:n_test], all_labels[n_test:]

all_features = X_df.to_numpy()
XX_train, XX_test = all_features[:n_test, :], all_features[n_test:, :]

## Evaluation function 

In [None]:
def eval_fun(model: Any, genes: "Genes", shuffle_indices: Optional[List[int]] = None) -> "Result":
    """Score a fitted model on the test split restricted to the selected genes.

    Args:
        model: Object exposing ``predict``; it is called on the selected
            columns of ``XX_test``.
        genes: Column selector applied to ``XX_test`` (used as
            ``XX_test[:, genes]``).
        shuffle_indices: Optional column positions *within the selected
            columns* (not within the full feature matrix) whose values are
            shuffled before prediction. ``None`` or an empty list leaves the
            data untouched.

    Returns:
        A ``Result`` wrapping the F1 score of the predictions against
        ``labels_test``.
    """
    selected = XX_test[:, genes]
    if shuffle_indices:
        # Work on a copy so the shuffling never touches the shared test data.
        selected = selected.copy()
        for column in shuffle_indices:
            # presumably shuffles the column view in place -- see src.random_utils
            shuffle(selected[:, column])
    predictions = model.predict(selected)
    return Result(f1_score(labels_test, predictions))

## Setting configuration parameters 

In [None]:
# One gene per feature column of the training matrix.
number_of_genes = XX_train.shape[1]
# The model class itself (not an instance) is handed to the optimizer.
general_model = DecisionTreeClassifier
# Output folder name must be changed every time the algorithm is run.
output_folder_name = "output"

# To start the algorithm, initial_genes or initial_population must be provided.
initial_genes = [[0]]

# Mating selection strategy (RandomEveryoneWithEveryone, NoMating) combined with a
# mating strategy (UnionMating, IntersectionMating, IntersectionMatingWithInformationGain,
# IntersectionMatingWithWeightedRandomInformationGain).
mating_strategy = IntersectionMatingWithWeightedRandomInformationGain()
mating = RandomEveryoneWithEveryone(pool_size=3, mating_strategy=mating_strategy)

# Random mutation (probability of gene mutating: 1 / number_of_genes).
mutation_probability = 1 / number_of_genes
mutation = RandomFlipMutationStrategy(mutation_probability)

# Run configuration: output location, random seed and Pareto-reset interval.
optimizer_config = fastener.Config(
    output_folder=output_folder_name,
    random_seed=2020,
    reset_to_pareto_rounds=5,
)

entropy_optimizer = fastener.EntropyOptimizer(
    general_model,
    XX_train,
    labels_train,
    eval_fun,
    number_of_genes,
    mating,
    mutation,
    initial_genes=initial_genes,
    config=optimizer_config,
)

## Running the algorithm

In [None]:
# Run the FASTENER optimisation loop on the optimizer configured above.
# NOTE(review): presumably this writes the per-generation logs that the
# "Reading results" cell loads from log/<output_folder_name>/ -- confirm.
entropy_optimizer.mainloop()

## Reading results

In [None]:
# Read the log from the last generation.
# NOTE(review): assumes the run wrote a log for generation 1000 -- adjust the
# file name if the optimizer ran for a different number of rounds.
# NOTE(review): pd.read_pickle unpickles arbitrary objects; only load logs
# produced by your own runs.
# The result is deliberately NOT bound to the name `object`, which would shadow
# the Python builtin for the rest of the notebook session.
last_generation = pd.read_pickle(f'log/{output_folder_name}/generation_1000.pickle')

# list of best-scoring EvalItem objects for each number of features
best = list(last_generation.front.values())

# The labels are identical for every feature set, so convert them once.
y = y_df.values.astype(float)

for item in best:
    # names of best features
    selected_features = X_df.iloc[:, item.genes].columns.tolist()

    X = X_df[selected_features].values.astype(float)

    # evaluates each set of features with 10-fold cross validation
    # (cross_val_score's default scoring is used)
    model = DecisionTreeClassifier()
    cvs = cross_val_score(model, X, y, cv=10)
    print("Features:", selected_features)
    print("Accuracy: ", cvs.mean(), " stdev: ", cvs.std(), "\n")