# Choosing a classification model

## Import libraries

In [2]:
import os
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

# Local modules
import utilities.utilities as pu

# Logging
import mlflow

## General constants

In [3]:
RANDOM_STATE = 1

## Useful functions

In [4]:
def cross_validate(X, y, model, params={}):
    # Cross validate models
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
    skf.get_n_splits(X, y)
    cv_recalls = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        ml = model(**params).fit(X_train, y_train)
        y_probs = ml.predict_proba(X_test)[:,1]
        recall = pu.model_metrics(y_probs, y_test)
        
        cv_recalls.append(recall)

    mean_recall = np.mean(cv_recalls)
    return mean_recall

@ignore_warnings(category=ConvergenceWarning)
def test_features(model, params={}):
    # train and test with different feature_sets
    recalls = []
    best_recall = 0

    for feature_set in range(len(feature_sets)):
        # Subset data
        X = data[:,feature_sets[feature_set]]
        
        # Cross validate models
        mean_recall = cross_validate(X, y, model)

        # Logging
        with mlflow.start_run(run_name=str(model)):
            mlflow.log_params(params)
            mlflow.log_param('feature_id', feature_set)
            mlflow.log_metric('mean_recall', mean_recall)
        
        recalls.append(mean_recall)

        if mean_recall > best_recall:
            best_recall = mean_recall
            best_features = feature_set

    try:
        best_features # If not initilized, no good models found 

        X, X_eval= data[:,feature_sets[best_features]], data_eval[:,feature_sets[best_features]]
        
        ml = model(**params).fit(X, y)
        y_probs = ml.predict_proba(X_eval)[:,1]
        eval_recall = pu.model_metrics(y_probs, y_eval)

        results = {'model': ml, 
                   'feature_set': best_features,
                   'eval_recall': eval_recall,
                   'cv_recalls': recalls}
        return results
    except:
        print("No good models")

## Data loading

In [5]:
# Load data
data_path = os.path.realpath('./data')
data, y, data_eval, y_eval, feature_names = pu.load_aug_data(data_path)
feature_sets, feature_sets_names = pu.variable_sets()

## Random Forest

In [6]:
from utilities.random_forest import RandomForestClassifier
results_rf = test_features(RandomForestClassifier)
results_rf

{'model': RandomForestClassifier(),
 'feature_set': 0,
 'eval_recall': 0.4957983193277311,
 'cv_recalls': [0.7586206896551725,
  0.48850574712643674,
  0.5833333333333333,
  0.7068965517241379,
  0.7270114942528735,
  0.5057471264367815,
  0.6925287356321839]}

## LogisticRegression

In [7]:
from sklearn.linear_model import LogisticRegression

params={'solver':'newton-cg', 
        'max_iter':1000, 
        'random_state':0, 
        'fit_intercept':False, 
        'penalty':'l2'}

results_lr = test_features(LogisticRegression, params)
results_lr

{'model': LogisticRegression(fit_intercept=False, max_iter=1000, random_state=0,
                    solver='newton-cg'),
 'feature_set': 3,
 'eval_recall': 0.4327731092436975,
 'cv_recalls': [0.7442528735632185,
  0.6005747126436781,
  0.5804597701149425,
  0.7471264367816092,
  0.7471264367816092,
  0.6293103448275862,
  0.7385057471264368]}

In [8]:
feature_sets_names[0]

('useful_num_vars',)

We see that random forest with default parameters and the '0' feature set (useful_num_vars) does better than the Logistic regression method