In [1]:
# Report on the raw table and prune columns/rows that carry no information.
def data_clean(data):
    """Print a short report about ``data`` and return a pruned copy.

    Steps, in order:
      1. print the shape of the incoming table;
      2. print every non-numeric column so the reader can spot stray
         text inside what should be numeric feature columns;
      3. keep only the columns that contain at least one value different
         from 0 (NaN compares as "different from 0", so all-NaN columns
         survive this step and are removed by the next one);
      4. drop columns, then rows, that are entirely NaN;
      5. print the shape of the resulting table.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table to be cleaned.

    Returns
    -------
    pandas.DataFrame
        The table without all-zero columns, all-NaN columns and all-NaN rows.
    """
    print("Data shape before cleaning:" + str(np.shape(data)))

    # Shown so that columns needing a dtype fix can be identified by eye.
    print("Now it will print only those columns with non-numeric values")
    non_numeric_cols = data.select_dtypes(exclude=[np.number])
    print(non_numeric_cols)

    # True for every column holding at least one entry that is not 0.
    has_nonzero_entry = data.ne(0).any(axis=0)
    cleaned = data.loc[:, has_nonzero_entry]

    # Columns first, then rows, that contain nothing but NaN.
    cleaned = cleaned.dropna(axis=1, how='all').dropna(axis=0, how='all')

    print("Data shape after cleaning:" + str(np.shape(cleaned)))
    return cleaned

In [2]:
#This function imputes the missing values of every feature with that feature's (column) mean
def data_impute(data):
    """Split ``data`` into features, labels and names, and mean-impute the features.

    Each missing value is replaced by the mean of its own feature column.
    (The previous implementation passed ``axis=1`` to the imputer, which
    fills along rows, i.e. with the mean of unrelated features of the same
    molecule.) The imputation is done with pandas so that the column names
    and the row index of ``data`` are kept; later steps rely on both to
    re-attach names/labels and to report selected feature names.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a ``NAME`` column, an ``ACTIVITY`` column and numeric
        feature columns.

    Returns
    -------
    Imputed_Data_input : pandas.DataFrame
        Feature columns without missing values, same index as ``data``.
    data_labels : pandas.Series
        The ``ACTIVITY`` column.
    data_names : pandas.Series
        The ``NAME`` column.
    """
    #Separating out the NAME and ACTIVITY columns because they are not features.
    data_input = data.drop(['ACTIVITY', 'NAME'], axis=1)
    data_labels = data.ACTIVITY
    data_names = data.NAME

    # A feature with no observed value has no mean to impute with; drop it
    # instead of carrying an all-NaN column into normalisation.
    data_input = data_input.dropna(axis=1, how='all')

    #Imputing the missing values with the per-feature (column) mean.
    Imputed_Data_input = data_input.fillna(data_input.mean())
    print("Data shape after imputation:" + str(np.shape(Imputed_Data_input)))
    return Imputed_Data_input, data_labels, data_names

In [3]:
#This function z-score normalizes the features and re-attaches names and labels
def data_norm(Imputed_Data_input,data_labels,data_names):
    """Z-score normalise every feature column and put names/labels back in front.

    Parameters
    ----------
    Imputed_Data_input : pandas.DataFrame
        Numeric feature table.
    data_labels : pandas.Series
        Label column to re-attach.
    data_names : pandas.Series
        Name column to re-attach.

    Returns
    -------
    pandas.DataFrame
        ``data_names``, ``data_labels`` and the normalised features
        concatenated column-wise (pandas aligns the three on their index).
    """
    #Calculating the mean and STD of every feature column
    feature_mean = Imputed_Data_input.mean()
    feature_std = Imputed_Data_input.std()

    # A constant feature has std == 0, which would turn the whole column
    # into NaN (0/0). Dividing by 1 instead maps it to all zeros.
    feature_std = feature_std.replace(0, 1)

    #z-score normalizing the whole input data
    Imputed_Data_input_norm = (Imputed_Data_input - feature_mean) / feature_std

    #Adding names and labels to the data again
    frames = [data_names, data_labels, Imputed_Data_input_norm]
    full_data_norm = pd.concat(frames, axis=1)

    return full_data_norm

In [4]:
#This function gives train-test-split
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection
# provides the same train_test_split and is what the rest of this notebook imports.
from sklearn.model_selection import train_test_split as sk_train_test_split
def data_split(full_data_norm, test_size):
    """Split the normalised table into train(+CV) and test parts.

    Parameters
    ----------
    full_data_norm : pandas.DataFrame
        Table with ``NAME``, ``ACTIVITY`` and the feature columns.
    test_size : float or int
        Forwarded unchanged to scikit-learn's ``train_test_split``.

    Returns
    -------
    train_cv_x, test_x, train_cv_y, test_y
        Feature frames and ``ACTIVITY`` series for the two parts. The split
        uses a fixed ``random_state`` (55) so it is the same on every run.
    """
    # NAME and ACTIVITY are not model inputs.
    full_data_norm_input = full_data_norm.drop(['ACTIVITY', 'NAME'], axis=1)
    target_attribute = full_data_norm['ACTIVITY']
    # We call the train set train_cv because part of it is used for cross-validation
    train_cv_x, test_x, train_cv_y, test_y = sk_train_test_split(
        full_data_norm_input, target_attribute, test_size=test_size, random_state=55)
    return train_cv_x, test_x, train_cv_y, test_y




In [5]:
#Builds the (feature selection -> neural network) pipeline whose drop_out and
#threshold are tuned elsewhere; no cross-validation happens inside this function.
def hybrid_model_opt():
    """Return an unfitted two-step ``Pipeline``: ``'fs'`` then ``'clf'``.

    ``'fs'`` is the tree-based feature selector defined below and ``'clf'``
    is a ``KerasClassifier`` wrapping ``nn_model_opt``. The two steps talk to
    each other through the module-level global ``xx``: ``fs.transform``
    writes the number of selected features into it and ``nn_model_opt``
    reads it to size the network input. ``xx`` must therefore exist at
    module level and ``transform`` must run before the network is built.
    """
    class fs(TransformerMixin, BaseEstimator):
        """Feature selector: extra-trees importances thresholded by ``SelectFromModel``."""

        def __init__(self, n_estimators=1000, threshold='1.7*mean'):
            # Selector built in fit(); None until then.
            self.ss=None
            self.n_estimators = n_estimators
            # Output of the most recent transform() call.
            self.x_new = None     
            # Passed as SelectFromModel's ``threshold`` in fit().
            self. threshold= threshold      

        def fit(self, X, y):
            """Fit the extra-trees model on (X, y) and wrap it in a prefit selector."""
            m = ExtraTreesClassifier(n_estimators=self.n_estimators,  random_state=0)
            m.fit(X,y)
            self.ss = SelectFromModel(m, threshold=self. threshold , prefit=True)
            return self

        def transform(self, X):
            """Return the selected columns of X and publish their count in global ``xx``."""
            self.x_new=self.ss.transform(X)
            # Side effect: nn_model_opt reads this global to size its input layer.
            global xx
            xx=self.x_new.shape[1]
            return self.x_new


    def nn_model_opt(dropout_rate=0.5,init_mode='uniform', activation='relu'):
        """Build and compile the network used as the pipeline's classifier.

        Only ``dropout_rate`` is used by the body; ``init_mode`` and
        ``activation`` are accepted but the layers below hard-code
        'he_normal' and 'relu'/'sigmoid'.
        """

        #n_x_new=xx # this is the number of features selected for current iteration
        # Seeds NumPy's global RNG on every build.
        # NOTE(review): whether this also fixes the Keras weight initialisation
        # depends on the backend -- confirm.
        np.random.seed(200000)
        model_opt = Sequential()
        # First hidden layer is as wide as the number of selected features (global xx).
        model_opt.add(Dense(xx,input_dim=xx ,kernel_initializer='he_normal', activation='relu'))
        model_opt.add(Dense(10, kernel_initializer='he_normal', activation='relu'))
        model_opt.add(Dropout(dropout_rate))
        model_opt.add(Dense(1,kernel_initializer='he_normal', activation='sigmoid'))

        model_opt.compile(loss='binary_crossentropy',optimizer='adam', metrics=['binary_crossentropy'])

        return model_opt


    # scikit-learn compatible wrapper; nn_model_opt is called to build a fresh model.
    clf=KerasClassifier(build_fn=nn_model_opt, epochs=250, batch_size=3000, verbose=-1)

    # Step names 'fs' and 'clf' are the prefixes used in the search grid
    # ('fs__threshold', 'clf__dropout_rate').
    hybrid_model = Pipeline([('fs', fs()),('clf', clf)])
    
    return hybrid_model

In [6]:

#Getting feature importances of all the features using the extra-trees classifier only

def feature_imp(train_cv_x,train_cv_y, n_estimators=1000, random_state=0):
    """Fit an extra-trees classifier and return its feature importances.

    The defaults (1000 trees, ``random_state=0``) match the forest used by
    the ``fs`` selector inside ``hybrid_model_opt``, so the threshold tuned
    there is applied to importances produced the same way, and repeated
    runs give the same importances.

    Parameters
    ----------
    train_cv_x : array-like or pandas.DataFrame
        Training features.
    train_cv_y : array-like or pandas.Series
        Training labels.
    n_estimators : int, optional
        Number of trees in the forest.
    random_state : int or None, optional
        Seed for the forest; ``None`` restores the previous unseeded behaviour.

    Returns
    -------
    importances : numpy.ndarray
        ``feature_importances_`` of the fitted forest, one value per column.
    m : ExtraTreesClassifier
        The fitted forest.
    """
    m = ExtraTreesClassifier(n_estimators=n_estimators, random_state=random_state)
    m.fit(train_cv_x, train_cv_y)
    importances = m.feature_importances_

    return importances, m

In [7]:
def selected_feature_names(m, thr, train_cv_x):
    """Return the column labels of ``train_cv_x`` that pass threshold ``thr``.

    Parameters
    ----------
    m : fitted estimator
        Already-fitted model handed to ``SelectFromModel``.
    thr : str or float
        Threshold forwarded to ``SelectFromModel``.
    train_cv_x : pandas.DataFrame
        Frame whose columns correspond to the features ``m`` was fitted on.

    Returns
    -------
    pandas.DataFrame
        Single-column frame holding the labels of the retained columns.
    """
    # prefit=True: reuse the fitted estimator as-is, no refit.
    selector = SelectFromModel(m, threshold=thr, prefit=True)
    keep_mask = selector.get_support()
    return pd.DataFrame(train_cv_x.columns[keep_mask])


In [8]:
def train_test_feature_based_selection(feature_name,train_cv_x,train_cv_y,test_x,test_y ):
    """Restrict train and test features to the selected columns and return arrays.

    Parameters
    ----------
    feature_name : pandas.DataFrame
        Frame whose first column lists the labels of the selected features
        (the format returned by ``selected_feature_names``).
    train_cv_x, test_x : pandas.DataFrame
        Feature frames to be filtered.
    train_cv_y, test_y : pandas.Series
        Matching labels.

    Returns
    -------
    train_selected_x, train_selected_y, test_selected_x, test_selected_y : numpy.ndarray
        Filtered features and labels as NumPy arrays.
    """
    # The selected labels are the first column of feature_name; the previous
    # transpose/rename detour (and its discarded reindex call) is not needed.
    selected = pd.Index(feature_name.iloc[:, 0])

    train_selected_x = train_cv_x[train_cv_x.columns.intersection(selected)]
    test_selected_x = test_x[test_x.columns.intersection(selected)]

    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    train_selected_x = train_selected_x.values
    test_selected_x = test_selected_x.values
    train_selected_y = train_cv_y.values
    test_selected_y = test_y.values

    return train_selected_x, train_selected_y, test_selected_x, test_selected_y

In [9]:
def model_nn_final(train_selected_x, train_selected_y, test_selected_x, test_selected_y, x, drop_out):
    """Train one network on the selected features, score it, save it, return its predictions.

    Parameters
    ----------
    train_selected_x, train_selected_y : numpy.ndarray
        Training features and labels.
    test_selected_x, test_selected_y : numpy.ndarray
        Test features and labels; the labels are only used for the printed AUROC.
    x : any
        Tag for this model; ``str(x)`` prefixes the two files written.
    drop_out : float
        Rate of the single Dropout layer.

    Returns
    -------
    pred_test
        Output of ``model.predict`` on ``test_selected_x``.

    Side effects: prints the test AUROC and writes ``<x>_model.json``
    (architecture) and ``<x>_model.h5`` (weights) to the working directory.
    """
    n_features = train_selected_x.shape[1]

    model_final = Sequential()
    # Input-width hidden layer -> 10 units -> dropout -> single sigmoid output.
    layer_stack = [
        Dense(n_features, input_dim=n_features, kernel_initializer='he_normal', activation='sigmoid'),
        Dense(10, kernel_initializer='he_normal', activation='sigmoid'),
        Dropout(drop_out),
        Dense(1, kernel_initializer='he_normal', activation='sigmoid'),
    ]
    for layer in layer_stack:
        model_final.add(layer)

    model_final.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_crossentropy'])

    # NumPy's global RNG is seeded after the model is built and before fitting.
    np.random.seed(7000)

    model_final.fit(train_selected_x, train_selected_y, epochs=250, batch_size=1064)

    pred_test = model_final.predict(test_selected_x)
    auc_test = roc_auc_score(test_selected_y, pred_test)
    print("AUROC_test: " + str(auc_test))
    print("  ")

    # Architecture as JSON, weights as HDF5, both prefixed with str(x).
    file_prefix = str(x)
    with open(file_prefix + "_model.json", "w") as json_file:
        json_file.write(model_final.to_json())
    model_final.save_weights(file_prefix + "_model.h5")
    print("Saved model to disk")
    print("  ")

    return pred_test


##  1) Loading all packages needed 

In [10]:
from keras.callbacks import ModelCheckpoint
from keras import backend as K
from keras import optimizers
from keras.layers import Dense
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from pandas import ExcelFile
from pandas import ExcelWriter
from PIL import Image
from scipy import ndimage
from scipy.stats import randint as sp_randint
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn import datasets
from sklearn import metrics
from sklearn import pipeline
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from tensorflow.python.framework import ops
import h5py
import keras
import matplotlib.pyplot as plt
import numpy as np
import openpyxl
import pandas as pd
import scipy
import tensorflow as tf
import xlsxwriter
%load_ext autoreload
%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 2) Loading the data

The "NAME" column holds the name of each molecule. The "ACTIVITY" column holds the activity label of the molecule. The remaining columns are the features.

In [11]:
# Read the full table from the Excel workbook (path is relative to the working directory).
data = pd.read_excel(r'full_data.xlsx')
# Bare last expression: the notebook renders the frame as this cell's output.
data

Unnamed: 0,NAME,ACTIVITY,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,...,AMW,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb
0,AUTOGEN_convert_out (2)_1,0,0,0.2537,0.064364,110.2356,75.264134,0,0,68,...,6.033560,61.425548,2.047518,8.138095,8.138095,0.000000,2493,57,6.513,164
1,AUTOGEN_convert_out (2)_2,0,0,0.2108,0.044437,17.2257,12.651551,0,0,13,...,6.849821,10.679177,1.779863,7.606551,4.695141,2.911411,32,3,0.963,20
2,AUTOGEN_convert_out (2)_3,1,1,0.7472,0.558308,28.9038,27.008758,10,11,22,...,9.910580,32.083784,2.005237,15.860677,9.717629,6.143049,403,25,1.756,82
3,AUTOGEN_convert_out (2)_4,1,0,0.3352,0.112359,30.8427,45.002309,14,16,34,...,10.000291,43.894493,2.090214,7.707102,5.093812,0.000000,828,41,5.223,120
4,AUTOGEN_convert_out (2)_5,1,0,-0.0834,0.006956,25.7047,24.616344,6,6,21,...,8.384277,26.306407,2.023570,14.009842,2.401987,11.607855,249,15,-0.527,66
5,AUTOGEN_convert_out (2)_6,0,0,-1.1305,1.278030,26.2871,26.247516,5,5,24,...,7.003277,23.247109,1.937259,7.867033,7.867033,0.000000,219,11,0.459,54
6,AUTOGEN_convert_out (2)_7,1,0,-0.9040,0.817216,5.8412,29.023137,13,15,23,...,7.959496,29.303536,2.093110,5.715301,3.168065,2.547236,274,21,3.033,78
7,AUTOGEN_convert_out (2)_8,1,0,1.2840,1.648656,11.0042,37.495102,14,16,30,...,6.870318,33.215152,2.075947,0.000000,0.000000,0.000000,396,26,8.460,88
8,AUTOGEN_convert_out (2)_9,1,0,0.4283,0.183441,32.4598,45.421895,19,22,37,...,7.814399,45.778673,2.080849,15.241572,0.000000,15.241572,1007,39,3.736,124
9,AUTOGEN_convert_out (2)_10,1,0,0.5254,0.276045,33.4047,17.413551,0,0,17,...,8.532279,18.497693,1.849769,15.925007,7.281943,8.643064,125,12,-0.266,38


## 3) Cleaning the data
Removing the columns that are entirely zero or entirely NaN, and the rows that are entirely NaN. Other cleaning steps can be added here as required. Executing this function also displays the columns that hold non-numeric values. If non-numeric values appear in columns that should be numeric features, they should be treated before going further. The data shape is printed before and after cleaning.

In [12]:
#Cleaning the data (see data_clean: drops all-zero/all-NaN columns and all-NaN rows)
# NOTE: this rebinds `data`, so re-running the cell cleans the already-cleaned frame.
data= data_clean(data)

Data shape before cleaning:(6500, 1446)
Now it will print only those columns with non-numeric values
                            NAME
0      AUTOGEN_convert_out (2)_1
1      AUTOGEN_convert_out (2)_2
2      AUTOGEN_convert_out (2)_3
3      AUTOGEN_convert_out (2)_4
4      AUTOGEN_convert_out (2)_5
5      AUTOGEN_convert_out (2)_6
6      AUTOGEN_convert_out (2)_7
7      AUTOGEN_convert_out (2)_8
8      AUTOGEN_convert_out (2)_9
9     AUTOGEN_convert_out (2)_10
10    AUTOGEN_convert_out (2)_11
11    AUTOGEN_convert_out (2)_12
12    AUTOGEN_convert_out (2)_13
13    AUTOGEN_convert_out (2)_14
14    AUTOGEN_convert_out (2)_15
15    AUTOGEN_convert_out (2)_16
16    AUTOGEN_convert_out (2)_17
17    AUTOGEN_convert_out (2)_18
18    AUTOGEN_convert_out (2)_19
19    AUTOGEN_convert_out (2)_20
20    AUTOGEN_convert_out (2)_21
21    AUTOGEN_convert_out (2)_22
22    AUTOGEN_convert_out (2)_23
23    AUTOGEN_convert_out (2)_24
24    AUTOGEN_convert_out (2)_25
25    AUTOGEN_convert_out (2)_26
26    AU

## 4) Imputing the missing data
Imputing the missing values in each feature column with the mean of that feature.

In [13]:
#Imputing the missing values; also splits off the ACTIVITY labels and the NAME column
Imputed_Data_input, data_labels, data_names=data_impute(data)

(6500, 1251)
Data shape after imputation:(6500, 1251)


## 5) Normalizing the data
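Z-score normalizing each feature column: the column mean is subtracted and the result is divided by the column standard deviation. The names and labels are then attached to the normalized features again.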

In [14]:
#Normalizing the features (z-score) and re-attaching the names and labels
full_data_norm=data_norm(Imputed_Data_input, data_labels, data_names)

## 6) Splitting the data


In [15]:
#Splitting the data into train and test
# test_size is forwarded by data_split to scikit-learn's train_test_split.
test_size=0.30
# The train part is named train_cv because it is also used for cross-validation below.
train_cv_x, test_x, train_cv_y, test_y=data_split(full_data_norm, test_size)

## 7) Hybrid Model optimization
Currently, only two hyperparameters are optimized: the dropout rate (drop_out) and the feature-selection threshold. The search can be extended as required. Candidates are drawn by random search and each one is scored with 3-fold cross-validation.

In [16]:
xx=0  #Module-level hand-off: fs.transform writes the number of selected features here and nn_model_opt reads it
hybrid_model=hybrid_model_opt() #building the hybrid model (feature selection + neural network) for optimization

#Defining the two parameters of the hybrid model to be optimized using random cv search
#The keys are '<pipeline step>__<parameter>'.
param_grid= {'fs__threshold': ['0.08*mean','0.09*mean','0.10*mean','0.2*mean','0.3*mean','0.4*mean','0.5*mean','0.6*mean','0.7*mean','0.8*mean','0.9*mean','1*mean','1.1*mean','1.2*mean','1.3*mean','1.4*mean','1.5*mean','1.6*mean','1.7*mean','1.8*mean','1.9*mean','2.0*mean','2.1*mean','2.2*mean','2.3*mean'],
'clf__dropout_rate': [0.1, 0.2, 0.3, 0.4, 0.5,0.6,0.7,0.8,0.9]}

#Random CV search
#n_iter is the number of parameter combinations sampled; with n_iter = 1 a single
#candidate is drawn and evaluated. Raise it to actually search the grid.
#random_state fixes which candidate(s) are drawn, so a rerun evaluates the same ones.
grid = RandomizedSearchCV(estimator=hybrid_model, param_distributions=param_grid,n_iter = 1,scoring='roc_auc',cv = 3 , n_jobs=1, random_state=0)
opt_result = grid.fit(train_cv_x, train_cv_y)

#Printing the optimization results: best candidate first, then every evaluated candidate
print("Best: %f using %s" % (opt_result.best_score_, opt_result.best_params_))
means = opt_result.cv_results_['mean_test_score']
stds = opt_result.cv_results_['std_test_score']
params = opt_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
    

  if np.issubdtype(mask.dtype, np.int):


Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

  if np.issubdtype(mask.dtype, np.int):


Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

  if np.issubdtype(mask.dtype, np.int):


Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

  if np.issubdtype(mask.dtype, np.int):


Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

## 8) Gini_importances

In [17]:
#getting the importances of all the features, plus the fitted extra-trees model `m`
#that the next step thresholds
importances, m =feature_imp(train_cv_x,train_cv_y)

## 9) Features names

In [18]:
#getting the names of the features selected by applying the best threshold found by the search to `m`
feature_name=selected_feature_names(m, opt_result.best_params_["fs__threshold"], train_cv_x)

## 10) Saving the gini-importance and selected features names

In [19]:
#saving gini-importance of all the features
#The with-block writes and closes the workbook on exit, also when to_excel raises;
#it replaces writer.save(), which newer pandas versions no longer provide.
with pd.ExcelWriter('importances.xlsx',engine='xlsxwriter') as writer:
    pd.DataFrame(importances).to_excel(writer,sheet_name='importances')


#Saving the names of the features selected on the basis of the optimized threshold
with pd.ExcelWriter('feature_name.xlsx',engine='xlsxwriter') as writer:
    pd.DataFrame(feature_name).to_excel(writer,sheet_name='feature_name')

## 11) Features selection in train and test 

In [20]:
#Keeping only the selected feature columns in train and test; the four results are NumPy arrays
train_selected_x, train_selected_y, test_selected_x, test_selected_y=train_test_feature_based_selection(feature_name,train_cv_x,train_cv_y,test_x,test_y )

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


## 12) Saving the test on the basis of selected features columns

In [21]:
#Saving the selected test set (features and labels on separate sheets of one workbook)
#The with-block writes and closes the workbook on exit, also when to_excel raises;
#it replaces writer.save(), which newer pandas versions no longer provide.
with pd.ExcelWriter('test_selected.xlsx',engine='xlsxwriter') as writer:
    pd.DataFrame(test_selected_x).to_excel(writer,sheet_name='test_selected_x')
    pd.DataFrame(test_selected_y).to_excel(writer,sheet_name='test_selected_y')
  
    
    

## 13) Final prediction based on ensembling.

This will also save every model of the ensemble to disk: the architecture as a JSON file and the weights as an HDF5 file.

In [22]:
    
    
# At this point the search has produced the best threshold and dropout rate, and the
# train and test sets contain only the features selected with that threshold.


ensemb=4 #Number of models trained and averaged
pred_test=[] #Test predictions of each individual model
pred_test_final=np.zeros((test_selected_x.shape[0],1)) # Running sum of the predictions; becomes their average below

#One model per iteration; model_nn_final saves each one to disk under the prefix str(x)
for x in range(ensemb):
    pred_test.append(model_nn_final(train_selected_x, train_selected_y, test_selected_x, test_selected_y, x, opt_result.best_params_["clf__dropout_rate"]))
    pred_test_final=pred_test[x]+pred_test_final


#ensemble averaging: divide the summed predictions by the number of models
pred_test_final=pred_test_final/ensemb


#Final test AUROC of the averaged prediction (area under the ROC curve, not accuracy)
auc_test_final = roc_auc_score(test_selected_y, pred_test_final)
print(auc_test_final)
    

IndentationError: expected an indented block (<ipython-input-22-2a79bbcd853d>, line 13)