In [6]:
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.base import BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm

In [7]:
def get_pipeline(classifier):
    """Wrap ``classifier`` in the shared preprocessing pipeline.

    The pipeline has two steps:

    * ``'transform'`` -- a ``ColumnTransformer`` that one-hot encodes the
      categorical columns (``handle_unknown='ignore'``) and standardizes the
      numerical columns with ``StandardScaler``.
    * ``'classifier'`` -- the estimator passed in, used as-is.

    Parameters
    ----------
    classifier : estimator
        Final step of the pipeline; it receives the transformed feature matrix.

    Returns
    -------
    sklearn.pipeline.Pipeline
        An unfitted pipeline.
    """
    one_hot_columns = [
        'sex',
        'chest-pain',
        'fasting-blood-sugar',
        'electrocardiographic',
        'angina',
        'thal',
    ]
    # NOTE(review): 'slope' and 'major-vessels' are scaled as numeric here;
    # confirm that treating them as numeric rather than categorical is intended.
    scaled_columns = [
        'age',
        'rest-bp',
        'serum-chol',
        'max-heart-rate',
        'oldpeak',
        'slope',
        'major-vessels',
    ]

    preprocessing = ColumnTransformer(transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), one_hot_columns),
        ('scaling', StandardScaler(), scaled_columns),
    ])
    return Pipeline(steps=[
        ('transform', preprocessing),
        ('classifier', classifier),
    ])


def get_cross_val_predictions(X, y, pipeline, n_splits, random_state):
    """Return out-of-fold predictions from shuffled stratified k-fold CV.

    For every fold the pipeline is re-fitted on the training part and used to
    predict the held-out part, so each sample is predicted exactly once by a
    model that did not see it during fitting.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    y : array-like
        Targets, one per row of ``X`` (pandas Series, NumPy array or list).
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so after the call it holds the fit from the last fold.
    n_splits : int
        Number of folds.
    random_state : int or None
        Seed controlling the fold shuffling.

    Returns
    -------
    numpy.ndarray
        Float array of length ``X.shape[0]`` holding the predicted label of
        every sample, in the original row order.
    """
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # cv.split yields *positional* indices. Indexing a pandas Series with
    # ``y[indices]`` is label-based, which silently picks the wrong rows (or
    # raises KeyError) when the index is not the default RangeIndex, so the
    # targets are materialized as an array and indexed by position instead.
    y_values = np.asarray(y)

    result = np.zeros((X.shape[0],))
    for train_index, test_index in cv.split(X, y):
        pipeline.fit(X.iloc[train_index], y_values[train_index])
        result[test_index] = pipeline.predict(X.iloc[test_index])
    return result


def get_perf_metrics(y_true, y_pred):
    """Compute a dictionary of binary-classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (the per-class metrics use ``pos_label`` 0 and 1).
    y_pred : array-like
        Predicted labels, same length and order as ``y_true``.

    Returns
    -------
    dict
        Keys: ``accuracy``, ``precision0``, ``recall0``, ``precision1``,
        ``recall1``, ``balanced_accuracy``, ``f1_score0``, ``f1_score1``,
        ``average_f1_score``, ``fowlkes_mallows0``, ``fowlkes_mallows1``,
        ``markedness``, ``mcc``, ``jaccard_index``, ``cohens_kappa``.

    Notes
    -----
    ``jaccard_index`` is the agreement-based ratio
    ``matches / (len(y_true) + len(y_pred) - matches)``; it is not
    ``sklearn.metrics.jaccard_score``.
    """
    # Work on plain arrays so the element-wise comparison below is positional.
    # Comparing two pandas Series aligns them by index label, which raises (or
    # compares the wrong pairs) when the indices differ.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    precision0 = metrics.precision_score(y_true, y_pred, pos_label=0, zero_division=0.0)
    recall0 = metrics.recall_score(y_true, y_pred, pos_label=0, zero_division=0.0)
    precision1 = metrics.precision_score(y_true, y_pred, pos_label=1, zero_division=0.0)
    recall1 = metrics.recall_score(y_true, y_pred, pos_label=1, zero_division=0.0)

    f1_score0 = metrics.f1_score(y_true, y_pred, pos_label=0, zero_division=0.0)
    f1_score1 = metrics.f1_score(y_true, y_pred, pos_label=1, zero_division=0.0)

    # Number of samples on which prediction and truth agree; shared by the
    # accuracy and the agreement-based Jaccard index.
    n_matches = np.sum(y_true == y_pred)
    accuracy = n_matches / len(y_true)
    jaccard_index = n_matches / (len(y_true) + len(y_pred) - n_matches)

    return {
        'accuracy': accuracy,
        'precision0': precision0,
        'recall0': recall0,
        'precision1': precision1,
        'recall1': recall1,
        # Mean of the two per-class recalls.
        'balanced_accuracy': (recall0 + recall1) / 2,
        'f1_score0': f1_score0,
        'f1_score1': f1_score1,
        'average_f1_score': (f1_score0 + f1_score1) / 2,
        # Geometric mean of precision and recall, per class.
        'fowlkes_mallows0': np.sqrt(precision0 * recall0),
        'fowlkes_mallows1': np.sqrt(precision1 * recall1),
        # Sum of both per-class precisions minus one; lies in [-1, 1].
        'markedness': precision0 + precision1 - 1,
        'mcc': metrics.matthews_corrcoef(y_true, y_pred),
        'jaccard_index': jaccard_index,
        'cohens_kappa': metrics.cohen_kappa_score(y_true, y_pred)
    }


def validate_models(models, X, y, n_splits, random_state, log=False, log_names=None, fake_models=None):
    """Cross-validate every model and tabulate the resulting metrics.

    Parameters
    ----------
    models : sequence of estimators
        Each one is wrapped with ``get_pipeline`` and evaluated through
        ``get_cross_val_predictions``.
    X, y
        Features and targets forwarded to the cross-validation helper.
    n_splits : int
        Number of cross-validation folds.
    random_state : int or None
        Seed forwarded to the cross-validation helper.
    log : bool, default False
        When true, iterate through a ``tqdm`` progress bar.
    log_names : sequence of str, optional
        Progress-bar descriptions, indexed by model position. Only used when
        ``log`` is true.
    fake_models : iterable of array-like, optional
        Ready-made prediction vectors that are scored against ``y`` directly,
        without any fitting. Their rows come after the real models.

    Returns
    -------
    pandas.DataFrame
        One row of metrics per model, followed by one row per fake model.
    """
    progress = tqdm(models) if log else models
    show_names = log and log_names is not None

    records = []
    for position, estimator in enumerate(progress):
        if show_names:
            progress.set_description(log_names[position])
        out_of_fold = get_cross_val_predictions(
            X, y, get_pipeline(estimator), n_splits, random_state
        )
        records.append(get_perf_metrics(y, out_of_fold))

    if fake_models is not None:
        records.extend(get_perf_metrics(y, fixed_pred) for fixed_pred in fake_models)

    return pd.DataFrame.from_records(records)


class ThresholdedModel(BaseEstimator):
    """Binary classifier that applies a custom cut-off to a model's probabilities.

    Parameters
    ----------
    model : estimator
        Wrapped estimator; must provide ``fit`` and ``predict_proba``.
    threshold : float
        A sample is labelled 1 when its probability in column 1 of
        ``predict_proba`` is strictly greater than this value, otherwise 0.
    """

    def __init__(self, model, threshold):
        self.model = model
        self.threshold = threshold

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``.

        Returning ``self`` follows the scikit-learn estimator convention and
        allows chaining such as ``est.fit(X, y).predict(X)``.
        """
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return 0/1 labels obtained by thresholding the wrapped model's probabilities."""
        # NOTE(review): assumes column 1 of predict_proba is the probability of
        # label 1 (true when the training labels are exactly {0, 1}) -- confirm.
        proba_predictions = self.model.predict_proba(X)[:, 1]
        return np.where(proba_predictions > self.threshold, 1, 0)

def _build_candidate_models(random_state):
    """Return parallel lists ``(names, models)`` of the classifiers to benchmark."""
    names = []
    models = []

    # Decision thresholds tried for every probability-based model.
    threshold_range = [0.2, 0.5, 0.8]

    def add_model(name, model):
        names.append(name)
        models.append(model)

    for max_depth in [1, 2, 4, 8, None]:
        add_model(
            f'DecisionTreeClassifier(max_depth={max_depth})',
            DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
        )

    add_model('BernoulliNB', BernoulliNB())

    for threshold in threshold_range:
        for hidden_layer_sizes in [(1,), (10,), (100,), (200,), (10, 10)]:
            add_model(
                f'MLPClassifier(sizes={hidden_layer_sizes}), thr={threshold}',
                ThresholdedModel(
                    MLPClassifier(
                        hidden_layer_sizes=hidden_layer_sizes,
                        max_iter=1000,
                        early_stopping=True,
                        n_iter_no_change=100,
                        random_state=random_state,
                    ),
                    threshold,
                )
            )

    for k in [1, 2, 3, 5, 7, 9]:
        add_model(
            f'KNeighborsClassifier(k={k})',
            KNeighborsClassifier(n_neighbors=k)
        )

    for threshold in threshold_range:
        for C in [0.01, 0.1, 1, 2, 4, 8]:
            # The label says "LinearSVC" but the estimator is SVC with a linear
            # kernel; the label is kept because it is written to the results.
            add_model(
                f'LinearSVC(C={C}), thr={threshold}',
                ThresholdedModel(
                    SVC(kernel='linear', C=C, probability=True, random_state=random_state),
                    threshold,
                )
            )

    for threshold in threshold_range:
        for degree in [2, 3, 4]:
            for gamma in [0.001, 0.01, 0.1]:
                add_model(
                    f'PolySVC(degree={degree}, gamma={gamma}), thr={threshold}',
                    ThresholdedModel(
                        SVC(kernel='poly', degree=degree, gamma=gamma, probability=True, random_state=random_state),
                        threshold,
                    )
                )

    for threshold in threshold_range:
        for C in [0.01, 0.1, 1, 2, 4, 8]:
            add_model(
                f'RBFSVM(C={C}), thr={threshold}',
                ThresholdedModel(
                    SVC(kernel='rbf', C=C, probability=True, random_state=random_state),
                    threshold,
                )
            )

    for threshold in threshold_range:
        for max_depth in [2, 8, None]:
            for n_estimators in [8, 32, 128]:
                add_model(
                    f'RF(max_depth={max_depth}, n={n_estimators}), thr={threshold}',
                    ThresholdedModel(
                        RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state),
                        threshold,
                    )
                )

    for threshold in threshold_range:
        for num_leaves in [4, 8, 32]:
            for n_estimators in [8, 32, 128]:
                add_model(
                    f'LGBM(num_leaves={num_leaves}, n={n_estimators}), thr={threshold}',
                    ThresholdedModel(
                        lgb.LGBMClassifier(num_leaves=num_leaves, n_estimators=n_estimators, verbosity=-1, random_state=random_state),
                        threshold,
                    )
                )

    return names, models


def _build_reference_predictions(y, random_state):
    """Return parallel lists ``(names, predictions)`` of fixed baseline label vectors."""
    counts = np.bincount(y)
    y_majority = np.full(y.shape, np.argmax(counts))

    # A local RandomState yields the same draws as np.random.seed(random_state)
    # followed by np.random.binomial, without reseeding the global NumPy RNG.
    rng = np.random.RandomState(random_state)

    names = [
        'OptimalClassifier',
        'PessimalClassifier',
        'MajorityClassifier',
        'MinorityClassifier',
        'RandomClassifier',
    ]
    predictions = [
        y,                                          # every label correct
        (y + 1) % 2,                                # every label flipped
        y_majority,                                 # always the most frequent label
        (y_majority + 1) % 2,                       # always the other label
        rng.binomial(n=1, p=0.5, size=y.shape),     # fair coin flips
    ]
    return names, predictions


def get_performance_metrics(random_state=42, data_path='datasets/statlog_heart.csv', n_splits=10):
    """Benchmark all candidate classifiers and return one metrics row per classifier.

    Parameters
    ----------
    random_state : int or None, default 42
        Seed passed to every estimator that accepts one, to the fold shuffling
        and to the random baseline. The global NumPy RNG is not modified.
    data_path : str or path-like, default 'datasets/statlog_heart.csv'
        CSV file containing the feature columns and a ``'target'`` column.
    n_splits : int, default 10
        Number of stratified cross-validation folds.

    Returns
    -------
    pandas.DataFrame
        Column ``'name'`` followed by the metric columns. Rows for the trained
        models come first, followed by the five reference rows (optimal,
        pessimal, majority, minority, random). ``markedness``, ``mcc`` and
        ``cohens_kappa`` are rescaled with ``(value + 1) / 2``.
    """
    stdf = pd.read_csv(data_path)
    y = stdf['target']
    X = stdf.drop(columns=['target'])

    model_names, models = _build_candidate_models(random_state)
    reference_names, reference_predictions = _build_reference_predictions(y, random_state)
    names = model_names + reference_names

    df = validate_models(
        models=models,
        X=X,
        y=y,
        n_splits=n_splits,
        random_state=random_state,
        log=True,
        log_names=names,
        fake_models=reference_predictions
    )

    df.insert(0, column='name', value=names)

    # These three metrics live in [-1, 1]; map them onto [0, 1] so they are on
    # the same scale as the remaining columns.
    for column in ('markedness', 'mcc', 'cohens_kappa'):
        df[column] = (df[column] + 1) / 2

    return df

In [8]:
df = get_performance_metrics(random_state=42)

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists first (e.g. on a fresh checkout).
results_path = Path('results/performance_metrics.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(results_path, index=False)

  0%|          | 0/144 [00:00<?, ?it/s]

In [9]:
# Preview the first five rows of the metrics table.
df.head(n=5)

Unnamed: 0,name,accuracy,precision0,recall0,precision1,recall1,balanced_accuracy,f1_score0,f1_score1,average_f1_score,fowlkes_mallows0,fowlkes_mallows1,markedness,mcc,jaccard_index,cohens_kappa
0,DecisionTreeClassifier(max_depth=1),0.703704,0.733333,0.733333,0.666667,0.666667,0.7,0.733333,0.666667,0.7,0.733333,0.666667,0.7,0.7,0.542857,0.7
1,DecisionTreeClassifier(max_depth=2),0.692593,0.698225,0.786667,0.683168,0.575,0.680833,0.739812,0.624434,0.682123,0.741128,0.626755,0.690697,0.685699,0.529745,0.683743
2,DecisionTreeClassifier(max_depth=4),0.774074,0.763314,0.86,0.792079,0.666667,0.763333,0.808777,0.723982,0.76638,0.810216,0.726672,0.777696,0.77042,0.63142,0.76757
3,DecisionTreeClassifier(max_depth=8),0.751852,0.778523,0.773333,0.719008,0.725,0.749167,0.77592,0.721992,0.748956,0.775924,0.721998,0.748766,0.748966,0.602374,0.748959
4,DecisionTreeClassifier(max_depth=None),0.751852,0.778523,0.773333,0.719008,0.725,0.749167,0.77592,0.721992,0.748956,0.775924,0.721998,0.748766,0.748966,0.602374,0.748959
