In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.read_csv('conventional_data_unique.csv', index_col='CID')
y_ = data['consensus_act']

X_all = data.drop(['consensus_act', 'gmin'], axis = 1)
X_all.replace([np.inf, -np.inf], np.nan, inplace=True) #replace all infinite values with NaN

X_all = X_all.dropna(axis=1, how='all') #Drop any column that has all NaN
X_all.fillna(value = X_all.mean(axis=0), inplace=True) #Impute mean in place on NaN

#X_all.loc[:, X_all.isnull().any()] See which column still had NaN after prepocessing. gmin was removed
print('Shape of X_all before removing low variance features:', X_all.shape)

#0.16 =(.8 * (1 - .8)) that is the threshold for features that are constant in 80% of the instances
feat = VarianceThreshold(threshold=0.16)
feat.fit_transform(X_all)
X_all = X_all[X_all.columns[feat.get_support(indices=True)]] #retain column names
print('Shape of X after removing low variance features:', X_all.shape)


  interactivity=interactivity, compiler=compiler, result=result)


Shape of X_all before removing low variance features: (7665, 17967)
Shape of X after removing low variance features: (7665, 2544)


In [3]:
n_splits=5
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=23)

#Outer CV
for fold, (train_idx, val_idx) in enumerate(kfold.split(X_all, y_)):
    print("Fold " + str(fold+1) + " of " + str(n_splits))
    
    xtrain, xtest = X_all.iloc[train_idx], X_all.iloc[val_idx]
    y_train, y_test = y_.values[train_idx], y_.values[val_idx]
    
    #y_train = pd.get_dummies(y_train)
    #y_test = pd.get_dummies(y_test)
    
    sc = StandardScaler().fit(xtrain)
    X_train = pd.DataFrame(sc.transform(xtrain), columns = X_all.columns)
    X_test = pd.DataFrame(sc.transform(xtest), columns = X_all.columns)
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    #-----------------------------------------------------------------------------
    param_dist = {'n_estimators':[50, 100, 200], "max_depth": [3, None], 
              "max_features": ["auto", "log2", None, 0.8, 0.5, 0.2], 
              "min_samples_leaf": [0.5, 1, 5, 10, 25], "criterion": ["gini", "entropy"]}

    rf = RandomForestClassifier() #Change model
    
    #Inner CV with gridsearch
    rf_clf = GridSearchCV(rf, param_grid=param_dist, scoring='f1_micro', cv=StratifiedKFold(10), n_jobs=9)
    rf_clf.fit(X_train, y_train)

    print('CV Best score: ', rf_clf.best_score_)
    print('CV Best Params: ', rf_clf.best_params_)

    new_rf_model = RandomForestClassifier(**rf_clf.best_params_)
    new_rf_model.fit(X_train, y_train)
    #-------------------------------------------------------------------------------
    X_train_cols = X_train.columns.values #feature names

    imp = pd.concat([pd.DataFrame(new_rf_model.feature_importances_), pd.DataFrame(X_train_cols)], axis = 1)
    imp.columns = ['_Imp', '_Features']
    imp = imp.sort_values('_Imp', ascending=False)
    imp_filename = 'Unique_Conventional_RF_feat_importance' + str(fold+1) + '.csv'
    imp.to_csv(imp_filename)
    
    #Save te model
    pickle.dump(new_rf_model, open('Unique_ConvRF_' + str(fold+1) + '.pkl', 'wb'))
    #loaded_model = pickle.load(open('convRF.pkl', 'rb'))
    
    #----------------------------------------------------------------------------------
    y_pred = new_rf_model.predict(X_test)

    #labels = ['active_agonist', 'active_antagonist', 'inactive', 'inconclusive']
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print ("Precision:", metrics.precision_score(y_test, y_pred, average="macro"))
    print ("Recall:", metrics.recall_score(y_test, y_pred, average="macro"))
    print ("f1_score:", metrics.f1_score(y_test, y_pred, average="macro"))
    #------------------------------------------------
    y_test_ = pd.get_dummies(y_test)
    y_proba = new_rf_model.predict_proba(X_test)
    print('AUROC:', metrics.roc_auc_score(y_test_, y_proba, average="macro"))
    print('AUPRC:', metrics.average_precision_score(y_test_, y_proba, average="macro"))
    #------------------------------------------------


Fold 1 of 5
(6130, 2544) (1535, 2544) (6130,) (1535,)
CV Best score:  0.751386623165
CV Best Params:  {'max_depth': None, 'criterion': 'gini', 'min_samples_leaf': 5, 'n_estimators': 200, 'max_features': 'auto'}
[[ 19   0   3   6]
 [  0   2   7   8]
 [  1   1 889 103]
 [  2   1 228 265]]
                   precision    recall  f1-score   support

   active_agonist       0.86      0.68      0.76        28
active_antagonist       0.50      0.12      0.19        17
         inactive       0.79      0.89      0.84       994
     inconclusive       0.69      0.53      0.60       496

      avg / total       0.76      0.77      0.75      1535

Precision: 0.711543379225
Recall: 0.556214719532
f1_score: 0.598101166446
AUROC: 0.845726857517
AUPRC: 0.642755942984
Fold 2 of 5
(6131, 2544) (1534, 2544) (6131,) (1534,)
CV Best score:  0.746370901974
CV Best Params:  {'max_depth': None, 'criterion': 'gini', 'min_samples_leaf': 10, 'n_estimators': 200, 'max_features': 0.2}
[[ 20   0   8   0]
 [  1   0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(6133, 2544) (1532, 2544) (6133,) (1532,)
CV Best score:  0.750692972444
CV Best Params:  {'max_depth': None, 'criterion': 'gini', 'min_samples_leaf': 1, 'n_estimators': 200, 'max_features': 'auto'}
[[ 17   0   6   5]
 [  1   0   5  10]
 [  0   1 878 114]
 [  3   0 216 276]]
                   precision    recall  f1-score   support

   active_agonist       0.81      0.61      0.69        28
active_antagonist       0.00      0.00      0.00        16
         inactive       0.79      0.88      0.84       993
     inconclusive       0.68      0.56      0.61       495

      avg / total       0.75      0.76      0.75      1532

Precision: 0.571393856688
Recall: 0.512226984999
f1_score: 0.5360496229
AUROC: 0.856725000619
AUPRC: 0.599072952356
Fold 4 of 5
(6133, 2544) (1532, 2544) (6133,) (1532,)
CV Best score:  0.746779716289
CV Best Params:  {'max_depth': None, 'criterion': 'gini', 'min_samples_leaf': 5, 'n_estimators': 100, 'max_features': 'auto'}
[[ 20   0   6   2]
 [  0   0   4  12]
 [

In [None]:
#CORRECT/INCORRECT PREDICTION DATAFRAME

In [14]:
data = pd.read_csv('conventional_data_unique.csv', index_col= 'CID')
y_ = data['consensus_act']

X_all = data.drop(['consensus_act','gmin'], axis = 1)
X_all.replace([np.inf, -np.inf], np.nan, inplace=True) #replace all infinite values with NaN

X_all = X_all.dropna(axis=1, how='all') #Drop any column that has all NaN
X_all.fillna(value = X_all.mean(axis=0), inplace=True) #Impute mean in place on NaN

#X_all.loc[:, X_all.isnull().any()] See which column still had NaN after prepocessing. gmin was removed
print('Shape of X_all before removing low variance features:', X_all.shape)

#0.16 =(.8 * (1 - .8)) that is the threshold for features that are constant in 80% of the instances
feat = VarianceThreshold(threshold=0.16)
feat.fit_transform(X_all)
X_all = X_all[X_all.columns[feat.get_support(indices=True)]] #retain column names
print('Shape of X after removing low variance features:', X_all.shape)

  interactivity=interactivity, compiler=compiler, result=result)


Shape of X_all before removing low variance features: (7665, 17967)
Shape of X after removing low variance features: (7665, 2544)


In [15]:
Predictions_DF = pd.DataFrame()
n_splits=5
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=23)

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_all, y_)):
    print("Fold " + str(fold+1) + " of " + str(n_splits))
    
    xtrain, xtest = X_all.iloc[train_idx], X_all.iloc[val_idx]
    y_train, y_test = y_.values[train_idx], y_.values[val_idx]
    
    sc = StandardScaler().fit(xtrain)
    X_train = pd.DataFrame(sc.transform(xtrain), columns = X_all.columns, index=xtrain.index)
    X_test = pd.DataFrame(sc.transform(xtest), columns = X_all.columns, index=xtest.index)
    
    #Save model
    #pickle.dump(new_rf_model, open('ConvRF_' + str(fold+1) + '.pkl', 'wb'))
    #load saved model
    loaded_RFmodel = pickle.load(open('Unique_ConvRF_' + str(fold+1) + '.pkl', 'rb'))
    
    y_pred = loaded_RFmodel.predict(X_test)
    #y_proba = loaded_RFmodel.predict_proba(X_test)
    
    compare = pd.DataFrame(np.hstack((y_test.reshape(y_test.shape[0],1),
                                  y_pred.reshape(y_pred.shape[0],1))),
                                  columns = ['consensus_act', 'predicted_act'], index=X_test.index)
    
    compare['score'] = np.where(compare['consensus_act'] == compare['predicted_act'], 'correct', 'incorrect')
    #compare['score'] = compare.apply(lambda x: 'correct' if x['consensus_act'] == x['predicted_act'] else 'incorrect', axis=1)
    compare.to_csv('Unique_RF_Predictions_Run'+str(fold+1) + '.csv')
    
    Predictions_DF = Predictions_DF.append(compare)
Predictions_DF.to_csv('Unique_RF_Predictions.csv')

Fold 1 of 5
Fold 2 of 5
Fold 3 of 5
Fold 4 of 5
Fold 5 of 5
