### AdaBoost Feature Importance for Selection

Process steps:
* Build full dataset with decoys and compounds
* Build compound-only dataset
* Preprocess categorical with one-hot-encoding
* Find intersecting columns and filter compound-only to these
* Add non-linear transformations to compound-only dataset
* Create binary target with IC50 <= 10 as 1, else 0
* Using compound-only df, fine-tune random forest with GridSearchCV (CV = count of positive class)
* Extract feature importance and remove any with importance of 0
* Create important features in full dataset

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skewtest
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Imputer
from sklearn.metrics import confusion_matrix, log_loss
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score, make_scorer, fbeta_score
from custom_functions import *

In [2]:
avail_transformations = ["log", "log2", "log10", "cubert", 
                         "sqrt", "exp", "exp2", "cube", "sq"]

* Build full-dataset
* Build compound-dataset
* Preprocess data adding one-hot-encoded features for both
* Find intersecting columns
* Add non-linear transformations and drop na's

In [3]:
# Load in full dataset
selleck = pd.read_csv("data/Imputed_Selleck_filtered_padel_corrected.csv")
full_x = selleck.drop("Name", axis=1)
# Preprocess variables
# full_x = preprocess_variables(full_x)
# Extract list of available columns
full_columns = full_x.columns
print("Loading in compound dataset....")
# Read in compound dataset
compound_x, compound_y = load_compound_dataset()
# Preprocess
compound_x = preprocess_variables(compound_x)
# Find intersecting features
avail_columns = compound_x.columns.intersection(full_columns)
# Select features on subset
x_data = compound_x.loc[:, avail_columns]
y_data = compound_y.copy()
print("Adding non-linear features to compound dataset....")
# Add all transformations on compound data
for feature in x_data.columns[x_data.dtypes == 'float64']:
    x_data = add_transformations(x_data, feature)
    full_x = add_transformations(full_x, feature)
# Drop any new columns with NaN due to improper transformation
x_data.replace([np.inf, -np.inf], np.nan, inplace=True)
x_data.dropna(axis=1, inplace=True)
assert not sum(x_data.isna().sum()), "Unexpected nulls found"

Loading in compound dataset....
Adding non-linear features to compound dataset....


KeyboardInterrupt: 

In [None]:
# TODO ensure that columns match from x_data and full_x

In [44]:
# Create binary variable
split_val = 10

if split_val == 10:
    split_name = "10"
    y_class = np.squeeze([int(y_val <= 10) for y_val in y_data])
elif split_val == 25:
    split_name = "25"
    y_class = np.squeeze([int(y_val <= 25) for y_val in y_data])
elif split_val == 40:
    split_name = "40"
    y_class = np.squeeze([int(y_val <= 40) for y_val in y_data])

### fbeta score

In [63]:
cross_val_beta = fbeta_score(y_class, predict, beta=.5)
print("Cross val fbeta: " + str(cross_val_beta))

Cross val fbeta: 0.42857142857142855


#### Loading Selleck and creating test prediction

In [64]:
# Train on full set
best_rf_model.fit(x_data, y_class)
pred = best_rf_model.predict()
# Create test prediction for Selleck
selleck_out = selleck.Name.to_frame()
selleck_out["rand_f_p10_fbet_0_428"] = pred

ValueError: Length of values does not match length of index

### AdaBoostClassifier

In [17]:

# How well does AdaBoost predict potency?
print("Tuning AdaBoost on compound dataset....")
model = AdaBoostClassifier(random_state=0)
params = {"n_estimators": [35, 40, 45],
          "learning_rate": [0.05, 0.075]}
grid = GridSearchCV(estimator=model, param_grid=params, cv=5, n_jobs=3, scoring=make_scorer(roc_auc_score))

grid.fit(x_data, y_class)
print(grid.best_params_)
best_model = grid.best_estimator_

Tuning AdaBoost on compound dataset....
{'learning_rate': 0.05, 'n_estimators': 40}


In [18]:
best_model = AdaBoostClassifier(random_state=0, learning_rate=0.075, n_estimators=5)
# Best model from other analysis

In [19]:
# Analyze CV prediction performance
predict = cross_val_predict(
    best_model, x_data, y_class, cv=sum(y_class), method="predict")

predict_proba = cross_val_predict(
    best_model, x_data, y_class, cv=sum(y_class), method="predict_proba")

print(confusion_matrix(y_class, predict, labels=[1, 0]))
print(np.array([["TP", "FN"], ["FP", "TN"]]))


[[ 7  4]
 [ 4 32]]
[['TP' 'FN']
 ['FP' 'TN']]


In [None]:
# Train on full set

# Add AdaBoost est to Selleck test df


In [20]:
roc_auc_score(y_class, predict)

0.7626262626262625

In [42]:
# Analyze CV Performance
print(pd.DataFrame(
    {"IC50": y_data, 
     "y_class": y_class, 
     "Prediction": predict, 
     "Proba": predict_proba[:,1]})[
    ["IC50", "y_class", "Prediction", "Proba"]])

       IC50  y_class  Prediction     Proba
0     0.036        1           1  0.999236
1    10.000        1           0  0.068899
2    50.000        0           0  0.208487
3    50.000        0           0  0.123817
4    50.000        0           0  0.123817
5     8.000        1           1  0.998017
6    50.000        0           0  0.123817
7    50.000        0           0  0.068899
8    35.000        0           0  0.068899
9    50.000        0           0  0.068899
10   45.000        0           0  0.068899
11   45.000        0           0  0.092583
12   40.000        0           0  0.092583
13   50.000        0           0  0.092583
14   50.000        0           0  0.092583
15   50.000        0           0  0.061057
16   50.000        0           0  0.061057
17   25.000        0           0  0.061057
18   50.000        0           1  0.993514
19   50.000        0           0  0.062513
20   15.000        0           1  1.000000
21    1.700        1           0  0.061057
22   10.000

In [10]:
# Analyze train prediction performance
best_model.fit(x_data, y_class)
predict = best_model.predict(x_data)
print(confusion_matrix(y_class, predict, labels=[1, 0]))
print(np.array([["TP", "FN"], ["FP", "TN"]]))

[[11  0]
 [ 0 36]]
[['TP' 'FN']
 ['FP' 'TN']]


Analyze feature importance

In [11]:
best_model.fit(x_data, y_class)
feat_importance = best_model.feature_importances_
best_features = [
    (f, i) for i, f in sorted(zip(feat_importance, x_data.columns), 
                              reverse=True) if i != 0]
feat_df = pd.DataFrame(best_features)
feat_df.columns = ["Feature", "Importance"]
feat_df.head(20)

Unnamed: 0,Feature,Importance
0,MATS5p_sq,0.175
1,MATS5m,0.15
2,ATSC6v,0.125
3,ATSC6v_sq,0.1
4,MATS5p_cube,0.075
5,MATS5m_exp2,0.075
6,MATS5m_exp,0.075
7,MATS5m_cube,0.075
8,MATS5p_exp2,0.025
9,MATS5p_exp,0.025


Descriptors Info
http://www.talete.mi.it/products/dragon_molecular_descriptor_list.pdf