In [1]:
import warnings

from pandas.core.indexing import check_dict_or_set_indexers

warnings.filterwarnings("ignore")

from fss_funcs import fss
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
import numpy as np


In [2]:
# Load the outlier-filtered dataset from disk.
df = pd.read_csv("csv/outlier_filtered.csv")

# The response column is set aside; every remaining column is a candidate feature.
response_var = 'Diabetes_012'
features = df.columns.tolist()
features.remove(response_var)

print(f"{features} {response_var}")
# Show the first row as the cell's rich output (a quick sanity check of the columns).
df.head(1)

['BMI_q_normal', 'MentHlth', 'PhysHlth_q_uniform', 'GenHlth_q_uniform', 'Age_q_uniform', 'Education_coxbox', 'Income_q_uniform', 'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex'] Diabetes_012


Unnamed: 0,BMI_q_normal,MentHlth,PhysHlth_q_uniform,GenHlth_q_uniform,Age_q_uniform,Education_coxbox,Income_q_uniform,Diabetes_012,HighBP,HighChol,...,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,DiffWalk,Sex
0,1.60221,1.998592,0.891892,1.0,0.581582,-1.109347,0.117117,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [3]:
# Search-space configuration consumed by the selection cells of this notebook.
# The bare triple-quoted strings are reference notes only: they are evaluated
# and discarded, and have no effect on the run.

# Values iterated as the `start` argument of fss.
starting_point = ['all', 'none']

# Values iterated as the `direction` argument of fss.
search_organization = [
    'backward', 'forward'  # , 'stepwise', 'metaheuristic'
]

"""
Univariate:
    Parametric methods:
        Discrete predictors:
            Mutual information Blanco et al. (2005)
            Gain ratio Hall and Smith (1998)
            Symmetrical uncertainty Hall (1999)
            Chi-squared Forman (2003)
            Odds ratio Mladenic and Grobelnik (1999)
            Bi-normal separation Forman (2003)
        Continuous predictors:
            t-test family Jafari and Azuaje (2006)
            ANOVA Jafari and Azuaje (2006)
    Model-free methods:
        Threshold number of misclassification (TNoM) Ben-dor et al. (2000)
        P-metric Slonim et al. (2000)
        Mann-Whitney test Thomas et al. (2001)
        Kruskal-Wallis test Lan and Vucetic (2011)
        Between-groups to within-groups sum of squares Dudoit et al. (2002)
        Scores based on estimating density functions Inza et al. (2004)
Multivariate:
    RELIEF Kira and Rendell (1992)
    Correlation-based feature selection Hall (1999)
    Conditional mutual information Fleuret (2004)
"""
# Sub-evaluator names used by the univariate filter cell.
uni_filter_options = ["f_classif", "mutual_information"]

# Sub-evaluator names used by the multivariate filter cell.
multi_filter_options = ['multivariate_cmi']

"""
Deterministic heuristics:
    Sequential feature selection Fu (1968)
    Sequential forward feature selection Fu (1968)
    Sequential backward elimination Marill and Green (1963)
    Greedy hill climbing John et al. (1994)
    Best first Xu et al. (1988)
    Plus-L-Minus-r algorithm Stearns (1976)
    Floating search selection Pudil et al. (1994)
    Tabu search Zhang and Sun (2002)
    Branch and bound Lawler and Wood (1966)
Non-deterministic heuristics:
    Single-solution metaheuristics:
        Simulated annealing Doak (1992)
        Las Vegas algorithm Liu and Motoda (1998)
        Greedy randomized adaptive search procedure Bermejo et al. (2011)
        Variable neighborhood search Garcia-Torres et al. (2005)
    Population-based metaheuristics:
        Scatter search Garcia-Lopez et al. (2006)
        Ant colony optimization Al-An (2005)
        Particle swarm optimization Lin et al. (2008)
        Evolutionary algorithms:
            Genetic algorithms Siedlecki and Sklansky (1989)
            Estimation of distribution algorithms Inza et al. (2000)
            Differential evolution Khushaba et al. (2008)
            Genetic programming Muni et al. (2004)
            Evolution strategies Vatolkin et al. (2009)
"""
# Sub-evaluator names used by the wrapper cells.
wrapper_options = [
    'sequential_feature_selection',
]

# Values iterated as the `end` argument of fss.
# NOTE(review): the numeric suffix presumably is the number of features to keep — confirm in fss_funcs.
stopping_criterion = [
    # 'performance_plateau',
    'limit_fea_8', 'limit_fea_10', 'limit_fea_12', 'limit_fea_14', 'limit_fea_16'
]

# Estimators for wrapper-style selection; every one is seeded with random_state=1.
# NOTE(review): the wrapper cells in this notebook build an identical list inline
# rather than reading this one — confirm whether this list is still needed.
wrapper_models = [
    LogisticRegression(random_state=1, max_iter=1000),
    MLPClassifier(solver='adam', alpha=1e-5,
                  hidden_layer_sizes=(32, 32), random_state=1, max_iter=1000, early_stopping=True),
    AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=1),
]

In [4]:
# Wrapper-based selection starting from the full feature set.
# Emits one CSV-style line per (stopping criterion, model) with a 0/1 flag per feature.
evaluator = "wrapper_filter"
feature_header = str(features)[1:-1].replace("'", "")
print(f"start,end,direction,subevaluator,model,{feature_header}")
# NOTE(review): this 20% sample is never handed to fss below, which still
# receives the full `df` — confirm which of the two was intended.
subdf = df.sample(frac=0.2, random_state=1)

for subevaluator in wrapper_options:
    start = "all"
    for direction in search_organization:
        # 'none' cannot go backward and 'all' cannot go forward: skip those pairings.
        if (start, direction) in (('none', 'backward'), ('all', 'forward')):
            continue
        for end in stopping_criterion:
            # Fresh estimator instances are built for every stopping criterion.
            candidate_models = [
                LogisticRegression(random_state=1, max_iter=1000),
                MLPClassifier(solver='adam', alpha=1e-5,
                              hidden_layer_sizes=(32, 32), random_state=1, max_iter=1000, early_stopping=True),
                AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=1),
            ]
            for model in candidate_models:
                # Progress marker without newline; the result line overwrites it via '\r'.
                print(f"Evaluating {model.__class__}", end="")

                fs = fss(start, end, direction, subevaluator, model=model, df=df, features=features,
                         target=response_var)
                # Results with at most one element are not reported.
                if len(fs) <= 1:
                    continue

                # Compact, comma-free model label so it fits in a single CSV field.
                model_label = str(model)
                for old, new in (
                    ("alpha=1e-05, early_stopping=True, ", ""),
                    (", random_state=1", ""),
                    (" ", ""),
                    ("\n", ""),
                    (",", "|"),
                ):
                    model_label = model_label.replace(old, new)

                selection = np.array2string(fs.astype(int), separator=',')[1:-1]
                print(f"\r{start},{end},{direction},{evaluator},{model_label},{selection}")

start,end,direction,subevaluator,model,BMI_q_normal, MentHlth, PhysHlth_q_uniform, GenHlth_q_uniform, Age_q_uniform, Education_coxbox, Income_q_uniform, HighBP, HighChol, CholCheck, Smoker, Stroke, HeartDiseaseorAttack, PhysActivity, Fruits, Veggies, HvyAlcoholConsump, AnyHealthcare, NoDocbcCost, DiffWalk, Sex
all,limit_fea_8,backward,wrapper_filter,LogisticRegression(max_iter=1000),1,0,0,1,1,0,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0
all,limit_fea_8,backward,wrapper_filter,MLPClassifier(hidden_layer_sizes=(32|32)|max_iter=1000),1,0,0,1,1,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,0
all,limit_fea_8,backward,wrapper_filter,AdaBoostClassifier(random_state=1),1,0,0,1,1,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,1
all,limit_fea_10,backward,wrapper_filter,LogisticRegression(max_iter=1000),1,0,0,1,1,0,1,1,1,0,0,0,1,0,0,0,1,0,0,1,1
all,limit_fea_10,backward,wrapper_filter,MLPClassifier(hidden_layer_sizes=(32|32)|max_iter=1000),1,0,0,1,1,0,0,1,0,1,1,0,1,1,0,0,1,1,0,0,0
all,limit_fea_10,backward,wrapper_filter,AdaBoostClassifier(r

KeyboardInterrupt: 

In [4]:
# Wrapper-based selection starting from the empty feature set.
# NOTE(review): the output recorded under this cell shows start 'None' with
# direction 'backward', which this code cannot produce (the 'none'/'backward'
# pairing is skipped below) — it looks stale; re-run the cell to refresh it.
evaluator = "wrapper_filter"
header_features = str(features)[1:-1].replace("'", "")
print(f"start,end,direction,subevaluator,model,{header_features}")
# NOTE(review): `subdf` is computed but fss below is still given the full `df` — confirm intent.
subdf = df.sample(frac=0.2, random_state=1)

for subevaluator in wrapper_options:
    start = "none"
    for direction in search_organization:
        # Only pairings other than none/backward and all/forward are evaluated.
        pairing_allowed = (
            (start != 'none' or direction != 'backward')
            and (start != 'all' or direction != 'forward')
        )
        if not pairing_allowed:
            continue
        for end in stopping_criterion:
            # The estimator list is rebuilt for each stopping criterion.
            estimators = [
                LogisticRegression(random_state=1, max_iter=1000),
                MLPClassifier(solver='adam', alpha=1e-5,
                              hidden_layer_sizes=(32, 32), random_state=1, max_iter=1000, early_stopping=True),
                AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=1),
            ]
            for model in estimators:
                # Inline progress text, later overwritten by the '\r'-prefixed result line.
                print(f"Evaluating {model.__class__}", end="")

                fs = fss(start, end, direction, subevaluator, model=model, df=df, features=features,
                         target=response_var)
                if len(fs) > 1:
                    # Strip noisy parameters and commas so the repr is a single CSV field.
                    label = str(model).replace("alpha=1e-05, early_stopping=True, ", "")
                    label = label.replace(", random_state=1", "")
                    label = label.replace(" ", "").replace("\n", "")
                    label = label.replace(",", "|")
                    flags = np.array2string(fs.astype(int), separator=',')[1:-1]
                    print(f"\r{start},{end},{direction},{evaluator},{label},{flags}")

start,end,direction,subevaluator,model,BMI_q_normal, MentHlth, PhysHlth_q_uniform, GenHlth_q_uniform, Age_q_uniform, Education_coxbox, Income_q_uniform, HighBP, HighChol, CholCheck, Smoker, Stroke, HeartDiseaseorAttack, PhysActivity, Fruits, Veggies, HvyAlcoholConsump, AnyHealthcare, NoDocbcCost, DiffWalk, Sex
None,limit_fea_8,backward,wrapper_filter,LogisticRegression(max_iter=1000),1,0,0,1,1,0,1,1,1,0,0,0,1,0,0,0,1,0,0,0,0
None,limit_fea_8,backward,wrapper_filter,MLPClassifier(hidden_layer_sizes=(32|32)|max_iter=1000),1,0,0,1,1,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,0
None,limit_fea_8,backward,wrapper_filter,AdaBoostClassifier(random_state=1),1,0,0,1,1,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,1
None,limit_fea_10,backward,wrapper_filter,LogisticRegression(max_iter=1000),1,0,0,1,1,0,1,1,1,0,0,0,1,0,0,0,1,0,0,1,1
None,limit_fea_10,backward,wrapper_filter,MLPClassifier(hidden_layer_sizes=(32|32)|max_iter=1000),1,0,0,1,1,0,0,1,0,1,1,0,1,1,0,0,1,1,0,0,0
None,limit_fea_10,backward,wrapper_filter,AdaBoostClassi

In [4]:

# Multivariate filter selection over every allowed start/direction pairing.
# Emits one CSV-style line per run with a 0/1 flag per feature.
evaluator = "multi_filter"
feature_header = str(features)[1:-1].replace("'", "")
print(f"start,end,direction,subevaluator,model,{feature_header}")

for start in starting_point:
    for direction in search_organization:
        # Skip the none/backward and all/forward pairings.
        if (start, direction) in (('none', 'backward'), ('all', 'forward')):
            continue

        for end in stopping_criterion:
            for subevaluator in multi_filter_options:
                fs = fss(start, end, direction, subevaluator, df=df, features=features, target=response_var)
                # Results with at most one element are not reported.
                if len(fs) <= 1:
                    continue
                selection = np.array2string(fs.astype(int), separator=',')[1:-1]
                print(f"{start},{end},{direction},{evaluator},{subevaluator},{selection}")

start,end,direction,subevaluator,model,BMI_q_normal, MentHlth, PhysHlth_q_uniform, GenHlth_q_uniform, Age_q_uniform, Education_coxbox, Income_q_uniform, HighBP, HighChol, CholCheck, Smoker, Stroke, HeartDiseaseorAttack, PhysActivity, Fruits, Veggies, HvyAlcoholConsump, AnyHealthcare, NoDocbcCost, DiffWalk, Sex
all,limit_fea_8,backward,multi_filter,multivariate_cmi,1,0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0
all,limit_fea_10,backward,multi_filter,multivariate_cmi,1,0,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0,0,0,1,0
all,limit_fea_12,backward,multi_filter,multivariate_cmi,1,0,1,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,1,0
all,limit_fea_14,backward,multi_filter,multivariate_cmi,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,1,0
all,limit_fea_16,backward,multi_filter,multivariate_cmi,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,1,0
none,limit_fea_8,forward,multi_filter,multivariate_cmi,1,0,0,1,1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,1,0
none,limit_fea_10,forward,multi_filter,multivariate_cmi,1,0,0,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,0,1,0
none

In [7]:
# Univariate filter selection: one CSV-style line per (start, end, direction, scorer).
evaluator = "uni_filter"
header_features = str(features)[1:-1].replace("'", "")
print(f"start,end,direction,subevaluator,model,{header_features}")

for start in starting_point:
    for direction in search_organization:
        # Evaluate every pairing except none/backward and all/forward.
        pairing_allowed = (
            (start != 'none' or direction != 'backward')
            and (start != 'all' or direction != 'forward')
        )
        if not pairing_allowed:
            continue

        for end in stopping_criterion:
            for subevaluator in uni_filter_options:
                fs = fss(start, end, direction, subevaluator, df=df, features=features, target=response_var)
                if len(fs) > 1:
                    # Render the selection as comma-separated integers without the brackets.
                    flags = np.array2string(fs.astype(int), separator=',')[1:-1]
                    print(f"{start},{end},{direction},{evaluator},{subevaluator},{flags}")

start,end,direction,subevaluator,model,BMI_q_normal, MentHlth, PhysHlth_q_uniform, GenHlth_q_uniform, Age_q_uniform, Education_coxbox, Income_q_uniform, HighBP, HighChol, CholCheck, Smoker, Stroke, HeartDiseaseorAttack, PhysActivity, Fruits, Veggies, HvyAlcoholConsump, AnyHealthcare, NoDocbcCost, DiffWalk, Sex
none,limit_fea_8,forward,uni_filter,f_classif,1,0,0,1,1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,1,0
none,limit_fea_8,forward,uni_filter,mutual_information,0,0,0,1,0,0,0,1,1,1,0,0,0,1,1,1,0,1,0,0,0
none,limit_fea_10,forward,uni_filter,f_classif,1,0,1,1,1,1,1,1,1,0,0,0,1,0,0,0,0,0,0,1,0
none,limit_fea_10,forward,uni_filter,mutual_information,1,0,0,1,0,0,1,1,1,1,0,0,0,1,1,1,0,1,0,0,0
none,limit_fea_12,forward,uni_filter,f_classif,1,0,1,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0,0,1,0
none,limit_fea_12,forward,uni_filter,mutual_information,1,0,0,1,1,1,1,1,1,1,0,0,0,1,1,1,0,1,0,0,0
none,limit_fea_14,forward,uni_filter,f_classif,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,1,0
none,limit_fea_14,forward,uni_filter,mut

In [6]:
# Load the collected feature-subset-selection results.
# NOTE(review): presumably assembled from the lines printed by the selection
# cells, whose header labels the evaluator-family field 'subevaluator' — confirm.
fss_stats = pd.read_csv("./csv/fss.csv")

# One frame per evaluator family. Each is an explicit copy so that a later
# in-place edit of one frame neither triggers SettingWithCopyWarning nor
# depends on pandas' view-versus-copy behaviour.
uni_df = fss_stats[fss_stats['subevaluator'] == "uni_filter"].copy()
multi_df = fss_stats[fss_stats['subevaluator'] == "multi_filter"].copy()
wrapper_df = fss_stats[fss_stats['subevaluator'] == "wrapper_filter"].copy()

In [7]:
# Tiebreaker: weight the f_classif rows' selection flags by 1.05 so that, when
# rows are averaged per stopping criterion, a feature picked only by f_classif
# ranks above one picked only by another scorer.
#
# uni_df is rebuilt from fss_stats here so that re-running this cell does not
# compound the weight. The feature columns (everything from position 5 on) are
# cast to float first, because writing 1.05-scaled values into integer columns
# relies on implicit upcasting, which recent pandas versions deprecate.
# NOTE(review): assumes the feature columns are numeric 0/1 flags — confirm against fss.csv.
tiebreak_cols = fss_stats.columns[5:]
uni_df = fss_stats.loc[fss_stats['subevaluator'] == "uni_filter"].astype(
    {col: float for col in tiebreak_cols}
)
uni_df.loc[uni_df['model'] == 'f_classif', tiebreak_cols] *= 1.05

In [8]:
# For each subset size K, average the univariate rows recorded for that size
# and report the 0-based positions of the K highest-scoring feature columns.
print("Features kept in univariate feature selection")
for k in (8, 10, 12, 14, 16):
    rows_at_k = uni_df[uni_df['end'] == 'limit_fea_' + str(k)].iloc[:, 5:]
    top_columns = rows_at_k.mean(axis=0).nlargest(k).index
    # Subtract 5 to turn a frame column position into a feature position.
    kept = sorted(uni_df.columns.get_loc(name) - 5 for name in top_columns)
    print(f"K={k:02d} -> {kept}")

Features kept in univariate feature selection
K=08 -> [0, 3, 4, 6, 7, 8, 12, 19]
K=10 -> [0, 2, 3, 4, 5, 6, 7, 8, 12, 19]
K=12 -> [0, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 19]
K=14 -> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 19]
K=16 -> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 19]


In [9]:
# For each subset size K, average the multivariate rows recorded for that size
# and report the 0-based positions of the K highest-scoring feature columns.
print("Features kept in multivariate feature selection")
for k in (8, 10, 12, 14, 16):
    rows_at_k = multi_df[multi_df['end'] == 'limit_fea_' + str(k)].iloc[:, 5:]
    top_columns = rows_at_k.mean(axis=0).nlargest(k).index
    # Subtract 5 to turn a frame column position into a feature position.
    kept = sorted(multi_df.columns.get_loc(name) - 5 for name in top_columns)
    print(f"K={k:02d} -> {kept}")

Features kept in multivariate feature selection
K=08 -> [0, 2, 3, 4, 6, 7, 8, 19]
K=10 -> [0, 2, 3, 4, 5, 6, 7, 8, 12, 19]
K=12 -> [0, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 19]
K=14 -> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 19]
K=16 -> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 19]


In [10]:
# For each subset size K, average the wrapper rows recorded for that size
# and report the 0-based positions of the K highest-scoring feature columns.
print("Features kept in wrapper feature selection")
for k in (8, 10, 12, 14, 16):
    rows_at_k = wrapper_df[wrapper_df['end'] == 'limit_fea_' + str(k)].iloc[:, 5:]
    top_columns = rows_at_k.mean(axis=0).nlargest(k).index
    # Subtract 5 to turn a frame column position into a feature position.
    kept = sorted(wrapper_df.columns.get_loc(name) - 5 for name in top_columns)
    print(f"K={k:02d} -> {kept}")

Features kept in wrapper feature selection
K=08 -> [0, 3, 4, 6, 7, 8, 12, 16]
K=10 -> [0, 3, 4, 6, 7, 8, 9, 12, 16, 20]
K=12 -> [0, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17, 19]
K=14 -> [0, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 19]
K=16 -> [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 19]


In [2]:
# Scratch notes kept as bare list literals; nothing here is assigned or used,
# and only the final expression is displayed as the cell's output.
# NOTE(review): the numbers look like 0-based feature positions read off the
# K=16 lines of the three summary cells — confirm before relying on them.
[17] # Wrapper
[16] # Multivariate
[15] # Univariate

# NOTE(review): presumably the positions selected by none of the three families at K=16 — confirm.
[18,14,20] # None chose

# NOTE(review): purpose of this final value is not evident from the notebook — confirm or remove.
[15]