### SVM Backward-Stepwise Selection

Steps taken:
1. Preprocess features with non-linear transformations
2. Tune an SVC using an IC50 <= 10 threshold to define the classes (other threshold values still need to be tried)
3. Use the tuned model for backward selection, running the per-feature evaluations in parallel for speed
    * Set the benchmark using log loss or ROC AUC with the probability output from the SVC
    * Use CV=sum(y_class) folds for all loss calculations
    * If removing a feature improves the loss, add it to the removals list
    * Remove up to 10% of the available features per pass, starting with the lowest loss
4. Analyze the output of the new feature set with a confusion matrix

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skewtest
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Imputer
from sklearn.metrics import confusion_matrix, log_loss
from custom_functions import *
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
import pickle
from par_support import par_backward_stepwise
from time import time

In [None]:
# Names of the non-linear transformations considered for feature expansion.
# NOTE(review): nothing in the visible cells reads this list; presumably it
# mirrors what add_transformations() applies - confirm against custom_functions.
avail_transformations = [
    "log",
    "log2",
    "log10",
    "cubert",
    "sqrt",
    "exp",
    "exp2",
    "cube",
    "sq",
]

In [None]:
# --- Full dataset: in this cell it only supplies its post-processing column set ---
full_x, full_y = load_full_dataset()
full_x = preprocess_variables(full_x)
full_columns = full_x.columns

# --- Compound dataset: the data that is modelled in the cells below ---
print("Loading in compound dataset....")
compound_x, compound_y = load_compound_dataset()
compound_x = preprocess_variables(compound_x)

# Keep only the compound features that also exist in the full dataset.
avail_columns = compound_x.columns.intersection(full_columns)
x_data = compound_x.loc[:, avail_columns]
y_data = compound_y.copy()

print("Adding non-linear features to compound dataset....")
# Snapshot the float64 columns before looping: x_data is rebound on every
# pass, so the columns added along the way must not be iterated themselves.
float_columns = x_data.columns[x_data.dtypes == 'float64']
for column in float_columns:
    x_data = add_transformations(x_data, column)

# Treat +/-inf as missing, then drop every column that contains a missing
# value (e.g. a transformation that was not valid for that feature's range).
x_data.replace([np.inf, -np.inf], np.nan, inplace=True)
x_data.dropna(axis=1, inplace=True)
assert not sum(x_data.isna().sum()), "Unexpected nulls found"

# Binary target: 1 when the response value is <= 10, otherwise 0.
y_class = np.squeeze([int(y_val <= 10) for y_val in y_data])

### Selection using: SVC

In [None]:
# Score using log_loss and roc (compare results)
# Standardise the features and keep the original row/column labels so that
# later cells can still select features by name.
scaler = StandardScaler()
x_train = pd.DataFrame(
    data=scaler.fit_transform(x_data),
    columns=x_data.columns,
    index=x_data.index,
)

# Search space for tuning the SVC.
model = SVC(random_state=0, probability=True)
params = dict(
    kernel=["linear", "poly", "rbf", "sigmoid"],
    C=np.arange(0.05, 1.05, .05),
    class_weight=[None, "balanced"],
)

# cv=sum(y_class): the number of folds equals the number of positive samples.
# NOTE(review): make_scorer(roc_auc_score) with default arguments scores the
# hard labels from predict(), not the probability output - confirm this is the
# intended metric (the built-in "roc_auc" scorer uses continuous scores).
grid = GridSearchCV(
    model,
    param_grid=params,
    scoring=make_scorer(roc_auc_score),
    cv=sum(y_class),
    n_jobs=7,
)
grid.fit(x_train, y_class)
print(grid.best_params_)

### Set benchmark

In [None]:
# Model used for the rest of the notebook; the hyper-parameters are fixed here
# (presumably the best combination reported by the grid search - confirm).
model = SVC(
    kernel="sigmoid",
    C=0.95,
    class_weight="balanced",
    probability=True,
    random_state=0,
)

# Benchmark = mean cross-validated score on the full feature set, using the
# same scorer and fold count as the grid search.
fold_scores = cross_val_score(
    model,
    x_train,
    y_class,
    scoring=make_scorer(roc_auc_score),
    cv=sum(y_class),
    n_jobs=7,
)
benchmark = np.mean(fold_scores)

### Use a parallel loop to calculate the effect of each feature's removal

In [None]:
# Backward stepwise selection.
#
# Each pass scores the model with every remaining feature left out in turn
# (in parallel), then drops the features whose removal did not hurt the
# benchmark - at most 10% of the remaining features per pass - and recomputes
# the benchmark on the survivors. The loop ends when no removal matches or
# beats the benchmark. The surviving features are left in `features_in`.
features_in = list(x_train.columns)

# True for scores that should be maximised (ROC AUC); set to False when the
# worker returns a loss such as log loss.
higher_is_better = True
# Worker count shared by the parallel scoring pass and the CV benchmark.
n_workers = 7

# Stop once a single feature is left: removing it would leave nothing to fit.
while len(features_in) > 1:
    start = time()
    # Score removals against the *current* feature set so the results are
    # comparable with the benchmark, which is computed on the same columns.
    # Assumes par_backward_stepwise drops each listed feature from the frame
    # it receives and returns {feature: score} - TODO confirm in par_support.
    x_current = x_train[features_in]
    # Never create more chunks than features, so no worker gets an empty chunk.
    n_chunks = min(n_workers, len(features_in))
    results_par = Parallel(n_jobs=n_workers)(
        delayed(par_backward_stepwise)(features, x_current, y_class, model)
        for features in np.array_split(features_in, n_chunks))
    stop = time()
    print(stop - start)

    # Merge the per-chunk {feature: score} dicts into a single lookup.
    return_dict = dict()
    for chunk_scores in results_par:
        return_dict.update(chunk_scores)

    # A feature is a removal candidate when the score obtained without it is
    # at least as good as the current benchmark.
    if higher_is_better:
        potential_removals = {feat: score
                              for feat, score in return_dict.items()
                              if score >= benchmark}
    else:
        potential_removals = {feat: score
                              for feat, score in return_dict.items()
                              if score <= benchmark}
    if not potential_removals:
        print("nothing to remove")
        break

    # Number of features to keep: drop no more than 10% of the features per
    # pass and never more than the number of candidates.
    filter_to = max(len(return_dict) - len(potential_removals), int(len(return_dict) * .90))
    # Rank the best removal candidates first, then discard that leading slice.
    # (Keeping the head of this ranking would retain exactly the features
    # that should be removed and throw away the most useful ones.)
    ranked = sorted(return_dict, key=return_dict.get, reverse=higher_is_better)
    n_remove = len(ranked) - filter_to
    features_in = ranked[n_remove:]

    # Re-establish the benchmark on the reduced feature set.
    benchmark = np.mean(cross_val_score(model, x_train[features_in], y_class,
                                        scoring=make_scorer(roc_auc_score),
                                        cv=sum(y_class), n_jobs=n_workers))
print("final score: %s" % str(benchmark))

In [None]:
# Analyze CV prediction performance on the features that survived selection.
# The selection loop leaves its result in `features_in`; bind it to the name
# used below, which was otherwise never defined.
selected_features = list(features_in)

# Evaluate on the standardised matrix (x_train): the model was tuned and
# benchmarked on scaled features, so the raw x_data would not be comparable.
predict = cross_val_predict(
    model, x_train[selected_features], y_class, cv=sum(y_class), method="predict")

# labels=[1, 0] puts the positive class first so the matrix lines up with the
# TP/FN/FP/TN legend printed underneath it.
print(confusion_matrix(y_class, predict, labels=[1, 0]))
print(np.array([["TP", "FN"], ["FP", "TN"]]))

In [2]:
# TODO try with adaboost