In [7]:
import warnings

from sympy.solvers.diophantine.diophantine import Linear

warnings.filterwarnings("ignore")

from fss_funcs import fss
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from skopt import BayesSearchCV 
from skopt.space import Real, Categorical, Integer
import wittgenstein as lw
from sklearn.model_selection import train_test_split
import time
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import ComplementNB
from sklearn.preprocessing import MinMaxScaler
from io import StringIO
rng = np.random.RandomState(42)

In [2]:
df = pd.read_csv("csv/outlier_filtered.csv")

response_var = 'Diabetes_012'
features = list(df.columns)
features.remove(response_var)

print(features, response_var)
# Pretty-print using tabulate
df.head(1)

['BMI_q_normal', 'MentHlth', 'PhysHlth_q_uniform', 'GenHlth_q_uniform', 'Age_q_uniform', 'Education_coxbox', 'Income_q_uniform', 'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex'] Diabetes_012


Unnamed: 0,BMI_q_normal,MentHlth,PhysHlth_q_uniform,GenHlth_q_uniform,Age_q_uniform,Education_coxbox,Income_q_uniform,Diabetes_012,HighBP,HighChol,...,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,DiffWalk,Sex
0,1.60221,1.998592,0.891892,1.0,0.581582,-1.109347,0.117117,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [3]:
models = {
    'Logistic regression': BayesSearchCV(
        LogisticRegression(solver='newton-cholesky'),
        {
            'penalty': Categorical(['l2', None]),
            'C': Real(0.001, 100, prior='log-uniform'),
            'class_weight': Categorical(['balanced', None])
        },
        random_state=rng,
        scoring='f1_macro',
        n_jobs=-1,
        n_points=5,
        n_iter=50
    ),
    
    # Discriminant Analysis
    'Discriminant analysis (not svd)': BayesSearchCV(
        LinearDiscriminantAnalysis(),
        {
            'solver': Categorical(['lsqr', 'eigen']),
            'shrinkage': Real(0.0001, 1.0, prior='uniform'),
            'tol': Real(1e-5, 1.0, prior='uniform'),
        },
        random_state=rng,
        scoring='f1_macro',
        n_jobs=-1,
        n_points=4,
        n_iter=50
    ),
    # Discriminant Analysis
    'Discriminant analysis (svd)': BayesSearchCV(
        LinearDiscriminantAnalysis(store_covariance=True),
        {
            'tol': Real(1e-5, 1.0, prior='uniform'),
        },
        random_state=rng,
        scoring='f1_macro',
        n_jobs=-1,
        n_points=4,
        n_iter=50
    ),
    'ComplementNB': BayesSearchCV(
        ComplementNB(),
        {
            'alpha': Real(1e-5, 10.0, prior='log-uniform'),  # Smoothing parameter
            'force_alpha': Categorical([True, False]),      # Whether to force the alpha value unchanged
            'fit_prior': Categorical([True, False]),        # Whether to fit the prior probabilities
            'norm': Categorical([True, False]),             # Whether to perform a second normalization of weights
        },
        random_state=rng,
        scoring='f1_macro',
        n_jobs=-1,
        n_points=4,
        n_iter=50
    )
}

In [4]:
X, y = df[features], df[response_var]

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.9,
                                                    random_state=rng,
                                                    stratify=y)

fea_subsets = {
    8:[0, 3, 4, 6, 7, 8, 12, 16],
    10:[0, 3, 4, 6, 7, 8, 9, 12, 16, 20],
    12:[0, 3, 4, 6, 7, 8, 9, 11, 12, 16, 17, 19],
    14:[0, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 19],
    16:[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 16, 17, 1]
}


In [5]:
# Start tracking total runtime
def run_models(elapse=0, skip=[]):
    start_total = time.time()
    
    y = y_train
    it = 0
    
    for model in models.keys():
        for k in fea_subsets.keys():
            if it <= elapse or model in skip:
                print(f"Skipping it={it} with {model};{k};")
                it += 1
                continue 
                
            # Start tracking time for this iteration
            start_time = time.time()
            
            # Define the feature subset and input data
            fea_subset = np.array(features)[fea_subsets[k]]
            X = X_train[fea_subset]
            
            # Fit the model
            bayes_search = models[model]
            
            
            print(f"\rFitting {model}...                   ", end="")
            if model == 'ComplementNB':
                X_scaled = MinMaxScaler().fit_transform(X)
                bayes_search.fit(X_scaled, y)  # Fit the model on the transformed data
            else:
                bayes_search.fit(X, y) 
            print(f"\rFitting {model} completed            ", end="")
            
            
            # Get best parameters and score
            best_params = bayes_search.best_params_
            score = bayes_search.score(X_test[fea_subset], y_test)
            
            # Calculate elapsed time for this iteration
            elapsed_time = time.time() - start_time
            
            # Print results along with time taken
            print(f"\r{it};{model};{k};{best_params};{score:01.04f};{elapsed_time:06.02f}")
            it += 1
    
    # Calculate and print total runtime
    total_time = time.time() - start_total
    print(f"Total runtime: {total_time:.2f} seconds")

run_models(-1)

0;Logistic regression;8;OrderedDict({'C': 0.009275062636581224, 'class_weight': 'balanced', 'penalty': 'l2'});0.4428;073.66
1;Logistic regression;10;OrderedDict({'C': 0.019884322615837377, 'class_weight': 'balanced', 'penalty': 'l2'});0.4421;073.19
2;Logistic regression;12;OrderedDict({'C': 0.0014661424746571484, 'class_weight': 'balanced', 'penalty': 'l2'});0.4420;081.25
3;Logistic regression;14;OrderedDict({'C': 0.0049931659015543745, 'class_weight': 'balanced', 'penalty': 'l2'});0.4424;097.71
4;Logistic regression;16;OrderedDict({'C': 0.008986806532830008, 'class_weight': 'balanced', 'penalty': 'l2'});0.4434;133.19
5;Discriminant analysis (not svd);8;OrderedDict({'shrinkage': 0.7753069475939367, 'solver': 'eigen', 'tol': 1.0});0.4343;038.70
6;Discriminant analysis (not svd);10;OrderedDict({'shrinkage': 0.7516415380327516, 'solver': 'lsqr', 'tol': 1e-05});0.4365;037.12
7;Discriminant analysis (not svd);12;OrderedDict({'shrinkage': 0.7709731236105747, 'solver': 'lsqr', 'tol': 1e-05});

In [18]:
output = """
it;model;num fea;best parameters;score;elapsed time 
0;Logistic regression;8;OrderedDict({'C': 0.009275062636581224, 'class_weight': 'balanced', 'penalty': 'l2'});0.4428;073.66
1;Logistic regression;10;OrderedDict({'C': 0.019884322615837377, 'class_weight': 'balanced', 'penalty': 'l2'});0.4421;073.19
2;Logistic regression;12;OrderedDict({'C': 0.0014661424746571484, 'class_weight': 'balanced', 'penalty': 'l2'});0.4420;081.25
3;Logistic regression;14;OrderedDict({'C': 0.0049931659015543745, 'class_weight': 'balanced', 'penalty': 'l2'});0.4424;097.71
4;Logistic regression;16;OrderedDict({'C': 0.008986806532830008, 'class_weight': 'balanced', 'penalty': 'l2'});0.4434;133.19
5;Discriminant analysis (not svd);8;OrderedDict({'shrinkage': 0.7753069475939367, 'solver': 'eigen', 'tol': 1.0});0.4343;038.70
6;Discriminant analysis (not svd);10;OrderedDict({'shrinkage': 0.7516415380327516, 'solver': 'lsqr', 'tol': 1e-05});0.4365;037.12
7;Discriminant analysis (not svd);12;OrderedDict({'shrinkage': 0.7709731236105747, 'solver': 'lsqr', 'tol': 1e-05});0.4437;026.60
8;Discriminant analysis (not svd);14;OrderedDict({'shrinkage': 0.7385508959093038, 'solver': 'lsqr', 'tol': 1e-05});0.4399;033.46
9;Discriminant analysis (not svd);16;OrderedDict({'shrinkage': 0.7820175034407315, 'solver': 'lsqr', 'tol': 1e-05});0.4283;048.99
10;Discriminant analysis (svd);8;OrderedDict({'tol': 0.8787898063269084});0.3933;029.79
11;Discriminant analysis (svd);10;OrderedDict({'tol': 0.8007791561388778});0.3934;044.66
12;Discriminant analysis (svd);12;OrderedDict({'tol': 0.881514287541885});0.4077;036.27
13;Discriminant analysis (svd);14;OrderedDict({'tol': 0.8682414474458714});0.4080;051.64
14;Discriminant analysis (svd);16;OrderedDict({'tol': 0.8804868604416194});0.3986;050.19
15;ComplementNB;8;OrderedDict({'alpha': 9.83097052187787, 'fit_prior': True, 'force_alpha': False, 'norm': False});0.3544;019.80
16;ComplementNB;10;OrderedDict({'alpha': 10.0, 'fit_prior': False, 'force_alpha': False, 'norm': False});0.3820;020.02
17;ComplementNB;12;OrderedDict({'alpha': 9.728726870982992, 'fit_prior': True, 'force_alpha': True, 'norm': True});0.3965;020.74
18;ComplementNB;14;OrderedDict({'alpha': 10.0, 'fit_prior': True, 'force_alpha': True, 'norm': True});0.4114;021.65
19;ComplementNB;16;OrderedDict({'alpha': 9.955213218146183, 'fit_prior': True, 'force_alpha': True, 'norm': True});0.4058;020.61
"""

stats = pd.read_csv(StringIO(output), delimiter=";")
print(stats.columns)
best_models = stats.loc[stats.groupby('model')['score'].idxmax()]
best_models

Index(['it', 'model', 'num fea', 'best parameters', 'score', 'elapsed time '], dtype='object')


Unnamed: 0,it,model,num fea,best parameters,score,elapsed time
18,18,ComplementNB,14,"OrderedDict({'alpha': 10.0, 'fit_prior': True,...",0.4114,21.65
7,7,Discriminant analysis (not svd),12,"OrderedDict({'shrinkage': 0.7709731236105747, ...",0.4437,26.6
13,13,Discriminant analysis (svd),14,OrderedDict({'tol': 0.8682414474458714}),0.408,51.64
4,4,Logistic regression,16,"OrderedDict({'C': 0.008986806532830008, 'class...",0.4434,133.19


In [40]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import ast

print("model,accuracy,accuracy 0,accuracy 1,accuracy 2,f1_macro,precision,recall")
for index, row in best_models.iterrows():
    model_name = row['model']
    kwargs = row['best parameters'].replace("OrderedDict(", '').replace("})", '}')
    kwargs = ast.literal_eval(kwargs)
    if model_name == 'Logistic regression': 
        model = LogisticRegression(solver='newton-cholesky', **kwargs)
    elif model_name == 'Discriminant analysis (not svd)': 
        model = LinearDiscriminantAnalysis(**kwargs)
    elif model_name == 'Discriminant analysis (svd)': 
        model = LinearDiscriminantAnalysis(store_covariance=True, **kwargs)
    elif model_name == 'ComplementNB': 
        model = ComplementNB(**kwargs)
    else:
        raise ValueError("Unknown model")
    
    if model_name == 'ComplementNB':
        X_scaled = MinMaxScaler().fit_transform(X_train)
        model.fit(X_scaled, y_train)  # Fit the model on the transformed data
    else:
        model.fit(X_train, y_train) 
        
    y_pred = model.predict(X_test)
    
    # Calculate the metrics
    accuracy = accuracy_score(y_test, y_pred)
    
    matrix = confusion_matrix(y_test, y_pred)
    accuracy_per_class = matrix.diagonal()/matrix.sum(axis=1)
    f1_macro = f1_score(y_test, y_pred, average='macro')
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    
    # Print the metrics
    print(f"{model_name},{accuracy:.03f},{",".join(f"{x:.3f}" for x in accuracy_per_class)},{f1_macro:.03f},{precision:.03f},{recall:.03f}")
    

model,accuracy,accuracy 0,accuracy 1,accuracy 2,f1_macro,precision,recall
ComplementNB,0.743,0.812,0.102,0.406,0.419,0.417,0.440
Discriminant analysis (not svd),0.820,0.902,0.000,0.430,0.438,0.432,0.444
Discriminant analysis (svd),0.845,0.968,0.000,0.211,0.403,0.455,0.393
Logistic regression,0.694,0.707,0.207,0.677,0.443,0.443,0.530


In [41]:
output = """model,accuracy,accuracy 0,accuracy 1,accuracy 2,f1_macro,precision,recall
ComplementNB,0.743,0.812,0.102,0.406,0.419,0.417,0.440
Discriminant analysis (not svd),0.820,0.902,0.000,0.430,0.438,0.432,0.444
Discriminant analysis (svd),0.845,0.968,0.000,0.211,0.403,0.455,0.393
Logistic regression,0.694,0.707,0.207,0.677,0.443,0.443,0.530"""
stats = pd.read_csv(StringIO(output), delimiter=",")
stats

Unnamed: 0,model,accuracy,accuracy 0,accuracy 1,accuracy 2,f1_macro,precision,recall
0,ComplementNB,0.743,0.812,0.102,0.406,0.419,0.417,0.44
1,Discriminant analysis (not svd),0.82,0.902,0.0,0.43,0.438,0.432,0.444
2,Discriminant analysis (svd),0.845,0.968,0.0,0.211,0.403,0.455,0.393
3,Logistic regression,0.694,0.707,0.207,0.677,0.443,0.443,0.53
