
# Model Comparison

In [None]:
import sys
sys.path.append('./../src/')

import os
import utils
import ioutil
import feature_selection

import numpy as np
import pandas as pd

from model_comparison import model_comparison
from model_selection import nested_cross_val

from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

**Time budget**
* 18 filters × 25 discr combos = 450 experiments
* Each experiment comprises:
    - M models (2/3)
    - 6 feature selection algs 
    - (20/30/40) repetitions
    - 4 folds

- C1: 20 CPUs
- C2: 8 CPUs

In [6]:
# Number of sequential runs each machine has to complete: its share of the
# total workload divided by its CPU count.
# Per model: 18 filters * 25 discr combos * 6 selectors * 20 repetitions * 4 folds.
runs_per_model = 18 * 25 * 6 * 20 * 4

nruns_comp1 = 3 * runs_per_model / 20  # 3 models spread over 20 CPUs
nruns_comp2 = 2 * runs_per_model / 8   # 2 models spread over 8 CPUs

nruns_comp1, nruns_comp2

(48600.0, 81000.0)

In [7]:
# Amount of time at disposal (min): 4 days of 24 hours.
avail_time = 4 * 24 * 60

# Required number of runs per minute on each machine.
# NOTE: despite the `min_per_*` names, these values are runs / minute.
min_per_C1runs, min_per_C2runs = (
    nruns / avail_time for nruns in (nruns_comp1, nruns_comp2)
)
min_per_C1runs, min_per_C2runs

(8.4375, 14.0625)

In [3]:
# Setup:
# K is handed to the feature selectors as `k`, CV is forwarded to
# `model_comparison`, and SEED fixes all random states.
K = 20
CV = 4
SEED = 0

# Class priors per endpoint; each pair sums to 1.0.
PFS_PRIORS = [0.677, 0.323]
LRC_PRIORS = [0.753, 0.247]

# Scoring function shared by feature selection and model comparison.
SCORE = roc_auc_score

# Hyperparameter grids referenced by `hparams`.
MAX_ITER = [800]
N_ESTIMATORS = [20, 50, 100, 200, 500, 1000]
LEARNINGR_RATE = [0.001, 0.05, 0.2, 0.6, 1, 3]
TOL = [0.0001, 0.001, 0.01, 0.1, 0.3, 0.7, 1]
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
PENALTY = ['l1', 'l2']
CLASS_WEIGHT = ['balanced']

NameError: name 'roc_auc_score' is not defined

In [None]:
# Number of experiments; one random state is drawn per experiment.
n_experiments = 20

# Seed the global NumPy generator so the drawn states are reproducible.
np.random.seed(SEED)
random_states = np.random.randint(low=0, high=1000, size=n_experiments)

In [None]:
# TODO: Hash out the estimators that are excluded from this experiment.
# Maps a short label (also the key into `hparams`) to the estimator class.
estimators = dict(
    # NB: Reports colinear variables.
    lda=LinearDiscriminantAnalysis,
    logreg=LogisticRegression,
    # NB: warnings.warn('Y residual constant at iteration %s' % k)
    pls=PLSRegression,
    # adaboost=AdaBoostClassifier,
    gnb=GaussianNB,
    svc=SVC,
)
# Split of the work between the two machines:
# C2 = PLS + GNB
# C1 = rest

In [None]:
# Balanced logistic regressions that differ only in the penalty; both are
# used as the model of the forward floating feature selectors.
logreg_l1, logreg_l2 = (
    LogisticRegression(
        penalty=penalty, class_weight='balanced', random_state=SEED,
        solver='liblinear'
    )
    for penalty in ('l1', 'l2')
)

# Balanced random forest used by the permutation importance selector.
rf_model = RandomForestClassifier(
    n_estimators=50, class_weight='balanced', random_state=SEED
)

In [None]:
# Hyperparameter grids, keyed by the same labels as `estimators`. Grids for
# labels that are not in `estimators` (e.g. 'adaboost', 'lin_svc') are kept
# so those estimators can be switched back on.
# NOTE: the priors used below are the PFS priors.
hparams = {
    'lda': {
        # NOTE: n_components determined in model selection
        'n_components': [None], 'tol': TOL, 'priors': [PFS_PRIORS],
    },
    'logreg': {
        'C': C, 'solver': ['liblinear'], 'penalty': PENALTY,
        'class_weight': CLASS_WEIGHT, 'max_iter': MAX_ITER
    },
    'pls': {
        # NOTE: n_components determined in model selection
        'n_components': [None], 'tol': TOL,
    },
    'adaboost': {
        'base_estimator': [logreg_l2],
        'learning_rate': LEARNINGR_RATE, 'n_estimators': N_ESTIMATORS,
    },
    'svc': {
        'kernel': ['rbf'], 'C': C,
        'gamma': [0.0001, 0.001, 0.01, 0.1, 0.3, 0.7, 1],
        # `cache_size` only sets the kernel cache memory (MB) and does not
        # change the fitted model, so it is fixed rather than searched.
        # `degree` is dropped because the RBF kernel ignores it. Searching
        # over both multiplied the grid by 8 with identical candidates.
        'cache_size': [500],
        'class_weight': CLASS_WEIGHT
    },
    'lin_svc': {
        'C': C, 'class_weight': CLASS_WEIGHT, 'penalty': PENALTY,
        'dual': [False], 'tol': TOL,
    },
    'gnb': {'priors': [PFS_PRIORS]},
}

In [None]:
# Feature selection routines, keyed by the label that also indexes
# `selector_params`.
selectors = dict(
    # Wrapper methods:
    ff_logregl1=feature_selection.forward_floating,
    ff_logregl2=feature_selection.forward_floating,
    rf_permut_imp=feature_selection.permutation_importance,
    # Filter methods:
    var_thresh=feature_selection.variance_threshold,
    relieff=feature_selection.relieff,
    mutual_info=feature_selection.mutual_info,
)
# Keyword arguments for each selector above.
selector_params = dict(
    ff_logregl1=dict(model=logreg_l1, k=K, cv=2, scoring=SCORE),
    ff_logregl2=dict(model=logreg_l2, k=K, cv=2, scoring=SCORE),
    rf_permut_imp=dict(model=rf_model, thresh=0.0, nreps=1),
    var_thresh=dict(alpha=0.05),
    relieff=dict(k=K, n_neighbors=20),
    mutual_info=dict(n_neighbors=20, thresh=0.05),
)

In [None]:
# Model selection routine handed to `model_comparison` as its first argument.
selection_scheme = nested_cross_val

In [None]:
# Read the PFS and LRC target tables; the first CSV column is the index.
df_y_pfs = pd.read_csv('./../../data/to_analysis/target_pfs.csv', index_col=0)
df_y_lrc = pd.read_csv('./../../data/to_analysis/target_lrc.csv', index_col=0)

# Drop singleton dimensions from the underlying arrays.
y_pfs = np.squeeze(df_y_pfs.values)
y_lrc = np.squeeze(df_y_lrc.values)

In [None]:
# Folder scanned for feature sets (the target CSVs are read from here too).
ref_feature_dir = './../../data/to_analysis'
# Folders named for the PFS and LRC model comparison results
# (presumably where the outputs are written - confirm).
ref_results_pfs_dir = './../../data/outputs/model_comparison_pfs'
ref_results_lrc_dir = './../../data/outputs/model_comparison_lrc'

In [None]:
# Entries of the feature folder as returned by `utils.listdir`; each one is
# later joined onto `ref_feature_dir`. Preview the first three.
dirnames = utils.listdir(ref_feature_dir)
dirnames[:3]

In [None]:
# Select subsets of filters to run with ensuring completed runs
# per interesting image types (see Alise work for selecting)
#
# For every entry in `dirnames`, each CSV feature set found below it is run
# through the PFS model comparison and its results are written to disk.
for dirname in dirnames:

    file_paths = ioutil.relative_paths(
        os.path.join(ref_feature_dir, dirname), target_format='.csv'
    )
    # Results of each entry go to a matching folder below the PFS output
    # directory; create it up front so a run is not lost at write time.
    pfs_results_dir = os.path.join(ref_results_pfs_dir, dirname)
    os.makedirs(pfs_results_dir, exist_ok=True)

    for path_to_file in file_paths:

        X = pd.read_csv(path_to_file, index_col=0).values

        # The destination was previously never assigned, so the write below
        # raised a NameError after the comparison had already been computed.
        # NOTE(review): assumes `write_final_results` takes a file path and
        # that feature file names are unique within one entry - confirm.
        path_pfs_results = os.path.join(
            pfs_results_dir, os.path.basename(path_to_file)
        )

        # NOTE: PFS
        pfs_results = model_comparison(
            selection_scheme, X, y_pfs, estimators, hparams, selectors,
            selector_params, random_states, CV, score_func=SCORE
        )
        # Write results for each analyzed data set of current filter and
        # discr combo.
        ioutil.write_final_results(path_pfs_results, pfs_results)