### AdaBoost Backward-Stepwise Selection

Steps taken:
1. Preprocess features with non-linear transformations
2. Tune the classifier (AdaBoost in this notebook) using IC50 <= 10 as the positive-class threshold (need to try new values)
3. Use the tuned model for backward selection, running the per-feature evaluations in parallel for speed
    * Set benchmark using log loss or roc with prob output from model
    * Use CV=sum(y_class) for all performance calculations
    * If removal of feature improved model performance, add to removals list
    * Group features for removals by correlation groups, only select the worst from each group each iteration
4. Analyze output of new features with confusion matrix

In [3]:
%load_ext autoreload
%autoreload
import pandas as pd
import numpy as np
from scipy.stats import skewtest
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Imputer
from sklearn.metrics import confusion_matrix, log_loss
from custom_functions import *
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
import pickle
from par_support import par_backward_stepwise
from time import time

In [4]:
# Names of the non-linear feature transformations known to this notebook
# (presumably the ones applied by add_transformations — confirm against custom_functions).
avail_transformations = [
    "log",
    "log2",
    "log10",
    "cubert",
    "sqrt",
    "exp",
    "exp2",
    "cube",
    "sq",
]

In [5]:
# Activity cutoff used to binarise the target: values <= this become class 1.
IC50_THRESHOLD = 10

# Load in full dataset
full_x, full_y = load_full_dataset()
# Preprocess variables
full_x = preprocess_variables(full_x)
# Extract list of available columns
full_columns = full_x.columns
print("Loading in compound dataset....")
# Read in compound dataset
compound_x, compound_y = load_compound_dataset()
# Preprocess
compound_x = preprocess_variables(compound_x)
# Find intersecting features
avail_columns = compound_x.columns.intersection(full_columns)
# Select features on subset; explicit copies so nothing below writes back
# into compound_x / compound_y (keeps this cell safe to re-run)
x_data = compound_x.loc[:, avail_columns].copy()
y_data = compound_y.copy()
print("Adding non-linear features to compound dataset....")
# Add all transformations on compound data. The float column list is
# evaluated once, before the loop starts rebinding x_data.
float_features = x_data.columns[x_data.dtypes == 'float64']
for feature in float_features:
    x_data = add_transformations(x_data, feature)
# Treat +/-inf as missing, then drop every column that contains a missing
# value (non-in-place, so x_data is simply rebound to the cleaned frame)
x_data = x_data.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
assert not x_data.isna().any().any(), "Unexpected nulls found"
# Create binary variable: 1 when the target is at or below the threshold
y_class = np.squeeze([int(y_val <= IC50_THRESHOLD) for y_val in y_data])

Adding Akt1_decoys_padel.csv....
Adding AmpC_decoys_padel.csv....
Adding cp3a4_decoys_padel.csv....
Adding cxcr4_decoys_padel.csv....
Adding gcr_decoys_padel.csv....
Adding HIVpr_decoys_padel.csv....
Adding HIVrt_decoys_padel.csv....
Adding Kif11_decoys_padel.csv....
Loading in compound dataset....
Adding non-linear features to compound dataset....


### AdaBoost

In [6]:
from sklearn.ensemble import AdaBoostClassifier

# Classifier used for every cross-validation run below; the seed is fixed so
# repeated runs build the same ensemble.
model = AdaBoostClassifier(
    n_estimators=5,
    learning_rate=0.075,
    random_state=0,
)

In [8]:
# Cross-validated outputs on the full feature set: hard class labels first,
# then class probabilities. The fold count equals the number of positive samples.
predict, predict_proba = (
    cross_val_predict(model, x_data, y_class, cv=sum(y_class), method=output_kind)
    for output_kind in ("predict", "predict_proba")
)

# labels=[1, 0] puts the positive class first so the matrix matches the legend
print(confusion_matrix(y_class, predict, labels=[1, 0]))
print(np.array([["TP", "FN"],
                ["FP", "TN"]]))

[[ 7  4]
 [ 4 32]]
[['TP' 'FN']
 ['FP' 'TN']]


In [9]:
# Starting score that every candidate removal must beat.
# Computed as the mean per-fold score from cross_val_score, i.e. the same
# procedure the selection loop uses when it refreshes the benchmark, so the
# first iteration compares like with like (a pooled roc_auc_score over all
# cross-validated predictions is a different quantity).
benchmark = np.mean(cross_val_score(model, x_data, y_class,
                                    scoring=make_scorer(roc_auc_score),
                                    cv=sum(y_class)))

### Use parallel loop to calculate each features removal

In [None]:
# Direction of the metric: True means a larger score is an improvement.
higher_is_better = True
# Scorer object passed to the cross-validation helpers (roc_auc_score wrapped
# with make_scorer's default settings).
scoring_function = make_scorer(roc_auc_score)

In [103]:
# Backward-stepwise selection.
# Each pass scores the removal of every remaining feature in parallel, keeps
# the removals that beat the current benchmark, groups those candidates by
# mutual correlation, drops one feature per group, then refreshes the benchmark.

# Absolute pairwise correlation above which two removal candidates are grouped.
CORR_THRESHOLD = .7

# x_data is the feature matrix built by the loading cell (the same frame the
# other cross-validation cells use).
features_in = list(x_data.columns)
n_jobs = 4
print("Starting benchmark: %s" % str(benchmark))
while True:  # Exits via break once no removal beats the benchmark
    start = time()
    # One chunk of candidate features per worker; every worker gets the
    # current feature matrix. Empty chunks (fewer features than workers)
    # are skipped.
    results_par = Parallel(n_jobs=n_jobs)(
        delayed(par_backward_stepwise)(features, x_data[features_in], y_class, model, scoring_function)
        for features in np.array_split(features_in, n_jobs)
        if len(features))
    stop = time()
    print("iteration time: %s" % str(stop - start))

    # Merge the per-worker {feature: score} dicts into one lookup
    return_dict = dict()
    for worker_scores in results_par:
        return_dict.update(worker_scores)

    # A feature is a removal candidate when its score is strictly better
    # than the benchmark in the direction given by higher_is_better.
    # No candidates means there is nothing left to remove.
    if higher_is_better:
        potential_removals = {feat: score
                              for feat, score in return_dict.items()
                              if score > benchmark}
    else:
        potential_removals = {feat: score
                              for feat, score in return_dict.items()
                              if score < benchmark}
    if len(potential_removals) == 0:
        print("nothing to remove")
        break

    # Determine correlated groupings, only remove one from each group if correlated
    df = x_data[list(potential_removals.keys())]
    corr_matrix = df.corr()
    # Keep only the strict lower triangle so each pair is looked at once
    corr_matrix.loc[:, :] = np.tril(corr_matrix, k=-1)

    already_in = set()
    corr_result = []
    for col in corr_matrix:
        correlated = corr_matrix[col][np.abs(corr_matrix[col]) > CORR_THRESHOLD].index.tolist()
        if correlated and col not in already_in:
            already_in.update(set(correlated))
            correlated.append(col)
            corr_result.append(correlated)
        elif col not in already_in:
            already_in.update(set([col]))
            corr_result.append([col])

    # For each set of correlated features...
    # Sort so the best post-removal score is last and remove only that feature
    final_removal = []
    for corr_list in corr_result:
        score_sorted = sorted(corr_list, key=lambda feat: return_dict[feat],
                              reverse=not higher_is_better)
        print("remove: %s" % return_dict[score_sorted[-1]])
        print("keep: %s" % return_dict[score_sorted[0]])
        final_removal.append(score_sorted[-1])

    # Drop the chosen features while preserving the original column order
    # (a set difference would reorder the features differently on every run)
    to_remove = set(final_removal)
    remaining = [feat for feat in features_in if feat not in to_remove]
    if not remaining:
        # Cross-validating on zero columns would fail; keep the current set
        print("all remaining features were selected for removal; stopping")
        break
    print("%s features removed" % str(len(features_in) - len(remaining)))
    features_in = remaining

    # Set benchmark
    benchmark = np.mean(cross_val_score(model, x_data[features_in], y_class,
                                        scoring=scoring_function,
                                        cv=sum(y_class), n_jobs=n_jobs))
    print("new benchmark: %s" % str(benchmark))
print("final score: %s" % str(benchmark))

Starting benchmark: 0.7121212121212122
iteration time: 1173.6409928798676
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7575757575757577
keep: 0.7575757575757577
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7575757575757577
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121212121212
remove: 0.7462121212121212
keep: 0.7462121

In [104]:
# Expose the feature list left by the selection loop under a descriptive name.
# This binds the same list object as features_in; it is not a copy.
selected_features = features_in

In [106]:
# Analyze CV prediction performance
predict = cross_val_predict(
    model, x_data[selected_features], y_class, cv=sum(y_class), method="predict")

print(confusion_matrix(y_class, predict, labels=[1, 0]))
print(np.array([["TP", "FN"], ["FP", "TN"]]))

ValueError: Classification metrics can't handle a mix of binary and continuous-multioutput targets

In [2]:
# TODO try with adaboost