# RNN for the original whole dataset

In [1]:
# Import of required libraries and functions from 'make_dataset' script
import os
from make_dataset import Discotope_Dataset
import numpy as np
import torch
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, roc_auc_score
from tqdm import tqdm
from IPython.display import clear_output

# Importing the style package
from matplotlib import style

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Import tensorflow and keras
import tensorflow as tf
from tensorflow import keras

In [3]:
# Fix TensorFlow's global random seed so that TensorFlow's random operations are repeatable between runs
tf.random.set_seed(1234)

In [4]:
def data_load(name_set, data_dir, separate=False):
    
    '''
    Function to load training, validation or test data from the directory where the whole dataset is stored
        - 'name_set': text variable with the type of data set to load ('train', 'test', 'valid')
        - 'data_dir': directory where the data set is stored (e.g. '../Data/'); the trailing separator is optional
        - 'separate': boolean; when True the PDB (solved) and AF2 arrays are returned separately as well
    
    Output (separate=False):
        - 'X_all': all observations from both solved and predicted structures all together
        - 'y_all': all labels from both solved and predicted structures all together
        - 'N_solved': number of observations from solved structures (in case of wanting to unmerge the 'all' array)
        - 'N_af2': number of observations from predicted structures (in case of wanting to unmerge the 'all' array)
    
    Output (separate=True):
        - 'X_all', 'y_all', 'X_solved', 'y_solved', 'X_af2', 'y_af2'
    
    Notes:
        - An entry is dropped from BOTH sets when the 'rsa' column of its SOLVED 'df_stats' contains NaN,
          so the solved and AF2 arrays stay aligned entry by entry.
        - The first matching sub-directory (in 'os.listdir' order) is read as the AF2 set and the second one
          as the solved set.
    '''
    
    # Local import kept inside the function, as the notebook does not import 'pathlib' at the top
    import pathlib
    
    # On Windows, 'PosixPath' is temporarily aliased to 'WindowsPath' while loading the '.pt' files
    # (presumably because they contain pickled POSIX paths -- TODO confirm). The alias is only
    # applied on Windows and is always undone afterwards, so the rest of the session is not left
    # with a patched 'pathlib'.
    original_posix_path = pathlib.PosixPath
    if os.name == 'nt':
        pathlib.PosixPath = pathlib.WindowsPath
    
    try:
        # 're' module to use regex for filtering the specific directories according to the type of data set to load
        R = re.compile('.*' + name_set + '.*')
        filtered = [folder for folder in os.listdir(data_dir) if R.match(folder)]
        
        # Loading the data set for solved structures and AlphaFold2 predicted structures
        path_af2 = os.path.join(data_dir, filtered[0], 'dataset.pt')
        path_solved = os.path.join(data_dir, filtered[1], 'dataset.pt')
        set_af2 = torch.load(path_af2)
        set_solved = torch.load(path_solved)
        
        # Collect features and targets entry by entry; the NaN check is evaluated once per entry
        X_parts_solved, y_parts_solved = [], []
        X_parts_af2, y_parts_af2 = [], []
        for i in range(len(set_solved)):
            sample_solved = set_solved[i]
            if sample_solved['df_stats']['rsa'].isna().any():
                continue
            sample_af2 = set_af2[i]
            X_parts_solved.append(sample_solved["X_arr"])
            y_parts_solved.append(sample_solved["y_arr"])
            X_parts_af2.append(sample_af2["X_arr"])
            y_parts_af2.append(sample_af2["y_arr"])
    finally:
        pathlib.PosixPath = original_posix_path
    
    # Stack all features and targets to one big array per subtype
    X_set_solved = np.concatenate(X_parts_solved)
    y_set_solved = np.concatenate(y_parts_solved)
    X_set_af2 = np.concatenate(X_parts_af2)
    y_set_af2 = np.concatenate(y_parts_af2)
    
    # Number of observations for each subtype of data set
    N_set_solved = X_set_solved.shape[0]
    N_set_af2 = X_set_af2.shape[0]
    
    # Stack all features and targets from solved and predicted structures into only one big array
    X_set_all = np.concatenate((X_set_solved, X_set_af2), axis=0)
    y_set_all = np.concatenate((y_set_solved, y_set_af2), axis=0)
    
    if separate:
        return(X_set_all, y_set_all, X_set_solved, y_set_solved, X_set_af2, y_set_af2)
    else:
        return(X_set_all, y_set_all, N_set_solved, N_set_af2)

In [5]:
def results_df_stats_creation(name_set, data_dir):
    
    '''
    Function to store the stats information from each pdb of the original dataset
        - 'name_set': text variable with the type of data set to load ('train', 'test', 'valid')
        - 'data_dir': directory where the data set is stored (e.g. '../Data/'); the trailing separator is optional
    
    Output:
        - 'stats_dfs_solved': list with one 'df_stats' dataframe per solved structure
        - 'stats_dfs_af2': list with one 'df_stats' dataframe per AF2 structure
    
    Notes:
        - An entry is skipped in BOTH lists when the 'rsa' column of its SOLVED dataframe contains NaN.
        - A boolean 'epitope' column (taken from 'y_arr') is added to every returned dataframe; the
          column is written into the loaded dataframe object itself, not into a copy.
    '''
    
    # Local import kept inside the function, as the notebook does not import 'pathlib' at the top
    import pathlib
    
    # On Windows, 'PosixPath' is temporarily aliased to 'WindowsPath' while loading the '.pt' files
    # (presumably because they contain pickled POSIX paths -- TODO confirm). The alias is only
    # applied on Windows and is always undone afterwards.
    original_posix_path = pathlib.PosixPath
    if os.name == 'nt':
        pathlib.PosixPath = pathlib.WindowsPath
    
    try:
        # 're' module to use regex for filtering the specific directories according to the type of data set to load
        R = re.compile('.*' + name_set + '.*')
        filtered = [folder for folder in os.listdir(data_dir) if R.match(folder)]
        
        # Loading the data set for solved structures and AlphaFold2 predicted structures
        # (first match is read as AF2, second match as solved)
        path_af2 = os.path.join(data_dir, filtered[0], 'dataset.pt')
        path_solved = os.path.join(data_dir, filtered[1], 'dataset.pt')
        set_af2 = torch.load(path_af2)
        set_solved = torch.load(path_solved)
        
        # Stats dataframe creation for SOLVED and AF2 structures (removing 'NaN' entries according to the PDB set)
        stats_dfs_solved = []
        stats_dfs_af2 = []
        for i in range(len(set_solved)):
            sample = set_solved[i]
            sample_af2 = set_af2[i]
            df_sample = sample['df_stats']
            df_sample_af2 = sample_af2['df_stats']
            
            # Keep only entries without NaN in the solved RSA column and add the epitope column
            if not df_sample['rsa'].isna().any():
                df_sample['epitope'] = sample['y_arr'].astype(bool)
                df_sample_af2['epitope'] = sample_af2['y_arr'].astype(bool)
                stats_dfs_solved.append(df_sample)
                stats_dfs_af2.append(df_sample_af2)
    finally:
        pathlib.PosixPath = original_posix_path
    
    return(stats_dfs_solved, stats_dfs_af2)

In [6]:
def dataframe_load(name_set, data_dir):
    
    '''
    Function to load the training, validation or test data sets from the directory where the whole dataset is stored.
    The loaded objects are returned untouched (no NaN filtering, no stacking), to keep the original data and description
        - 'name_set': text variable with the type of data set to load ('train', 'test', 'valid')
        - 'data_dir': directory where the data set is stored (e.g. '../Data/'); the trailing separator is optional
    
    Output:
        - 'set_af2': object loaded from the first matching sub-directory (AF2 predicted structures)
        - 'set_solved': object loaded from the second matching sub-directory (PDB solved structures)
    '''
    
    # Local import kept inside the function, as the notebook does not import 'pathlib' at the top
    import pathlib
    
    # On Windows, 'PosixPath' is temporarily aliased to 'WindowsPath' while loading the '.pt' files
    # (presumably because they contain pickled POSIX paths -- TODO confirm). The alias is only
    # applied on Windows and is always undone afterwards.
    original_posix_path = pathlib.PosixPath
    if os.name == 'nt':
        pathlib.PosixPath = pathlib.WindowsPath
    
    try:
        # 're' module to use regex for filtering the specific directories according to the type of data set to load
        R = re.compile('.*' + name_set + '.*')
        filtered = [folder for folder in os.listdir(data_dir) if R.match(folder)]
        
        # Loading the data set for solved structures and AlphaFold2 predicted structures
        path_af2 = os.path.join(data_dir, filtered[0], 'dataset.pt')
        path_solved = os.path.join(data_dir, filtered[1], 'dataset.pt')
        set_af2 = torch.load(path_af2)
        set_solved = torch.load(path_solved)
    finally:
        pathlib.PosixPath = original_posix_path
    
    return(set_af2, set_solved)

In [7]:
def remove_NaN(data, y):
    
    '''
    Drop every observation that has a NaN in any feature or in its label
        - 'data': 2-D numpy array with the (train, valid, test) features
        - 'y': numpy array with the matching labels
    
    Output:
        - 'X_noNaN': feature rows without NaN values
        - 'y_noNaN': labels of the rows that were kept
    '''
    
    n_features = data.shape[1]
    
    # Put the labels next to the features as one extra (last) column
    stacked = np.concatenate((data, y.reshape(-1, 1)), axis=1)
    
    # Rows to keep: those where no column is NaN
    keep = ~np.isnan(stacked).any(axis=1)
    clean = stacked[keep, :]
    
    # Split the cleaned array back into features and labels
    return(clean[:, 0:n_features], clean[:, -1])

In [8]:
def normalize_train(X):
    
    '''
    Function to standardize EVERY column of 'X' (zero mean, unit variance) with sklearn's StandardScaler
        - 'X': data to standardize
    
    Output:
        - 'X_scaled': standardized array returned by the scaler
    
    Notes:
        - An earlier version only scaled columns 532 and 533 (described in this notebook as pLDDT and length);
          the current code scales all columns.
        - The fitted scaler is not returned, so the same transformation cannot be re-applied to
          validation/test data from this function alone.
    '''
    
    # 'fit_transform' fits and transforms in one pass, so no separate 'fit' call is needed.
    # With its default settings StandardScaler works on a copy, so no manual copy of 'X' is made first.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return(X_scaled)

In [9]:
def Z_transform_train(X, cols_to_select):
    
    '''
    Function to standardize (Z-transform) the selected columns of the training set
        - 'X': 2-D numpy array with the training data (left untouched)
        - 'cols_to_select': list of column indices to standardize; the other columns are copied as they are
    
    Output:
        - 'X_scaled': copy of 'X' with the selected columns standardized
        - 'X_mean': mean of each selected column (to be reused for validation/test data)
        - 'X_std': standard deviation of each selected column, with zeros replaced by 1
    '''
    
    # Obtain the mean and standard deviation for each selected feature on the array
    X_mean = np.mean(X[:, cols_to_select], axis=0)
    X_std = np.std(X[:, cols_to_select], axis=0)
    
    # Constant columns have sd = 0 (e.g. the pLDDT column when using only the PDB set);
    # dividing by 1 instead avoids a division by zero
    X_std[X_std == 0] = 1
    
    # Work on a copy so that the caller's array is not overwritten (re-running the cell would
    # otherwise standardize already standardized data). Non-float input is promoted to float64
    # so that the scaled values are not truncated.
    out_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
    X_scaled = X.astype(out_dtype, copy=True)
    
    # Z-transform (standardization)
    X_scaled[:, cols_to_select] = (X[:, cols_to_select] - X_mean)/X_std
    return(X_scaled, X_mean, X_std)

In [10]:
def Z_transform_valid(X, mean_train, sd_train, cols_to_select):
    
    '''
    Function to standardize (Z-transform) the selected columns of a validation/test set
    using statistics that were computed on the training set
        - 'X': 2-D numpy array with the data to standardize (left untouched)
        - 'mean_train': mean of each selected column in the training set
        - 'sd_train': standard deviation of each selected column in the training set
        - 'cols_to_select': list of column indices to standardize; the other columns are copied as they are
    
    Output:
        - 'X_scaled': copy of 'X' with the selected columns standardized
    '''
    
    # Work on a copy so that the caller's array is not overwritten (re-running the cell would
    # otherwise standardize already standardized data). Non-float input is promoted to float64
    # so that the scaled values are not truncated.
    out_dtype = X.dtype if np.issubdtype(X.dtype, np.floating) else np.float64
    X_scaled = X.astype(out_dtype, copy=True)
    
    # Z-transform (standardization)
    X_scaled[:, cols_to_select] = (X[:, cols_to_select] - mean_train)/sd_train
    return(X_scaled)

In [11]:
def class_weight_calculator(y_train):
        
    '''
    Balanced class weights for an unbalanced label vector
        - 'y_train': training labels (contains 0 and 1)
    
    Output:
        - dictionary {position: weight}, where 'position' is the index of the class in the sorted
          unique labels (for 0/1 labels the position is the label itself)
    '''
    
    # Classes present in the labels, in sorted order
    labels = np.unique(y_train)
    
    # One weight per class, computed with the sklearn 'balanced' heuristic
    weights = compute_class_weight('balanced', classes=labels, y=y_train)
    
    # Dictionary keyed by the position of each class
    return({position: weight for position, weight in enumerate(weights)})

In [12]:
# Neuronal network model with one layer
def nn_model(train_data, y_train, valid_data, y_valid, act_fun, loss_fun, h, alpha, 
             class_weight_fn, drop_rate, batch = None, balancing=True, epochs=100, learning_rate=0.0001):
    
    '''
    Function to create and train/validate the feed-forward neural network with only 1 hidden layer
        -'train_data': X train standardized
        -'y_train': training labels
        -'valid_data': X validation standardized (only used to monitor the validation metrics)
        -'y_valid': validation labels
        -'act_fun': activation function of the hidden layer
        -'loss_fun': loss function (also reported as an extra metric)
        -'h': number of hidden units
        -'alpha': L2 regularization value applied to the hidden layer kernel
        -'class_weight_fn': function that receives 'y_train' and returns a {class: weight} dictionary
        -'drop_rate': dropout rate applied after the hidden layer
        -'batch': batch_size (baseline is None, but can be changed by adding an integer as new value)
        -'balancing': True/False argument for incorporating balancing in classes
        -'epochs': number of training epochs (default 100)
        -'learning_rate': learning rate of the Adam optimizer (default 0.0001)
    
    Output:
        -'model': neural network model trained
        -'history': attributes obtained during fitting the model
    '''
    
    # The TensorFlow seed is reset on every call so each model starts from the same seed
    tf.random.set_seed(1234)
    
    # All layers/metrics come from the same 'keras' module used for 'keras.Sequential'
    # (no mix of the standalone 'keras' package with 'tensorflow.keras', and no local name
    # 'metrics' hiding the sklearn 'metrics' module imported by the notebook)
    model = keras.Sequential()
    # Hidden layer with 'h' units; the input size is taken from the number of columns in train_data
    model.add(keras.layers.Dense(h, activation=act_fun, input_shape=train_data.shape[1:],
                                 kernel_regularizer=keras.regularizers.l2(alpha)))
    model.add(keras.layers.Dropout(drop_rate))
    # Output layer with sigmoid activation (better for binary classification)
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt, loss=loss_fun, metrics=
                  ['accuracy', keras.metrics.Precision(), keras.metrics.Recall(), keras.metrics.AUC(), loss_fun])
    
    # Class weights are only needed (and therefore only computed) when balancing is requested
    fit_options = {}
    if balancing:
        class_weight = class_weight_fn(y_train)
        
        # Normalization of the class_weight to sum 1
        tot = class_weight[0] + class_weight[1]
        class_weight[0] = class_weight[0]/tot
        class_weight[1] = class_weight[1]/tot
        fit_options['class_weight'] = class_weight
    
    history = model.fit(train_data, y_train, epochs = epochs, batch_size=batch, verbose=0, 
                        validation_data = (valid_data, y_valid), **fit_options)
    
    return(model, history)

In [13]:
def loss_plot(history, h_units, l2, drop):
    
    '''
    Function to plot the training and validation loss curves of a fitted model and save them as a PNG file
        - 'history': object returned by 'model.fit'; its 'history' dictionary must hold 'loss' and 'val_loss'
        - 'h_units': number of hidden units (only used in the title and in the file name)
        - 'l2': L2 regularization value (only used in the title and in the file name)
        - 'drop': dropout rate (only used in the title and in the file name)
    
    The figure is written to the current working directory and closed afterwards; nothing is returned.
    '''
    
    # A dedicated figure is used so the curves are never drawn on top of a figure left open by another cell
    fig, ax = plt.subplots()
    try:
        ax.plot(history.history['loss'], label = 'train')
        ax.plot(history.history['val_loss'], label='validation')
        ax.set_title('Model loss when trying ' + str(h_units) + ' hunits,\n' + str(l2) + ' reg, and ' + str(drop) + ' dropout')
        ax.set_ylabel('Loss')
        ax.set_xlabel('Epoch')
        ax.legend(loc='best')
        
        fig.savefig('Loss - ' + str(h_units) + ' hunits, ' + str(l2) + ' reg, and ' + str(drop) + ' drop.png')
    finally:
        # Always release the figure, even if saving fails (this function is called many times in a loop)
        plt.close(fig)

In [14]:
def ROC_AUC(y_true, y_hat):
    
    '''
    Area under the ROC curve
        - 'y_true': y original values
        - 'y_hat': y predicted values (scores/probabilities)
        
    Output:
        - AUC value of the ROC curve
    '''
    
    # The third element returned by 'roc_curve' (the thresholds) is not needed
    false_pos_rate, true_pos_rate = metrics.roc_curve(y_true, y_hat)[:2]
    return(metrics.auc(false_pos_rate, true_pos_rate))

In [15]:
def PR_AUC(y_true, y_hat):
    
    '''
    Area under the precision-recall curve
        - 'y_true': y original values
        - 'y_hat': y predicted values (scores/probabilities)
        
    Output:
        - AUC value of the precision-recall curve (recall on the x axis, precision on the y axis)
    '''
    
    # The third element returned by 'precision_recall_curve' (the thresholds) is not needed
    pr_curve = metrics.precision_recall_curve(y_true, y_hat)
    precision_values, recall_values = pr_curve[0], pr_curve[1]
    return(metrics.auc(recall_values, precision_values))

In [16]:
# Epitope rank percentile score
# https://github.com/Magnushhoie/discotope3/blob/main/src/models/mlscripts.py#L55

def get_percentile_score_arr(scores: np.array, epitopes: np.array):
    
    """Mean predicted epitope rank percentile score.

    The mean score of the epitope positions (y_true == 1) is computed first; the
    returned value is the fraction of all scores (y_hat) lying strictly below that mean.
    """
    is_epitope = epitopes.astype(bool)
    assert is_epitope.dtype == "bool"

    mean_epitope_score = scores[is_epitope].mean()
    fraction_below = (scores < mean_epitope_score).mean()

    return fraction_below

In [17]:
# Data loading for training, validation, and test data sets (needs a couple of minutes)
# The dataset location is defined once so it only has to be changed in one place
DATA_DIR = '../Data/'

# Each call returns the merged (PDB + AF2) arrays followed by the PDB-only and AF2-only arrays
X_train, y_train, X_train_PDB, y_train_PDB, X_train_af2, y_train_af2 = data_load(name_set='train', data_dir=DATA_DIR, separate = True)
X_valid, y_valid, X_valid_PDB, y_valid_PDB, X_valid_af2, y_valid_af2 = data_load(name_set='valid', data_dir=DATA_DIR, separate = True)
X_test, y_test, X_test_PDB, y_test_PDB, X_test_af2, y_test_af2 = data_load(name_set='test', data_dir=DATA_DIR, separate = True)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

((446835, 536), (446835,), (115978, 536), (115978,), (11882, 536), (11882,))

In [18]:
## NOT NEEDED: ALREADY REMOVED WHEN LOADING THE DATA

# # Data manipulation to remove all PDB entries with NaN values in the RSA feature (535)
# X_train, y_train = remove_NaN(data=X_train, y=y_train)
# X_valid, y_valid = remove_NaN(data=X_valid, y=y_valid)
# X_test, y_test = remove_NaN(data=X_test, y=y_test)

# X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

In [19]:
# Standardization of the merged (PDB + AF2) sets
# Columns 0-511 plus 532, 533 and 535 are standardized; the mean/sd are taken from the
# training set only and then reused for the validation and test sets
cols = [*range(512), 532, 533, 535]
X_train_sc, mean_X_train, sd_X_train = Z_transform_train(X_train, cols_to_select=cols)
X_valid_sc = Z_transform_valid(X_valid, mean_X_train, sd_X_train, cols_to_select=cols)
X_test_sc = Z_transform_valid(X_test, mean_X_train, sd_X_train, cols_to_select=cols)

X_train_sc.shape, X_valid_sc.shape, X_test_sc.shape

((446835, 536), (115978, 536), (11882, 536))

In [20]:
# Standardization of the PDB-only sets
# The mean/sd come from the PDB-only training set and are reused for PDB validation and test
# NOTE(review): the grid-search loop fits its models on 'X_train_sc' (merged-set statistics) but
# scores them on these PDB arrays, which are scaled with PDB-only statistics -- confirm this is intended
cols = [*range(512), 532, 533, 535]
X_train_PDB_sc, mean_X_train_PDB, sd_X_train_PDB = Z_transform_train(X_train_PDB, cols_to_select=cols)
X_valid_PDB_sc = Z_transform_valid(X_valid_PDB, mean_X_train_PDB, sd_X_train_PDB, cols_to_select=cols)
X_test_PDB_sc = Z_transform_valid(X_test_PDB, mean_X_train_PDB, sd_X_train_PDB, cols_to_select=cols)

X_train_PDB_sc.shape, X_valid_PDB_sc.shape, X_test_PDB_sc.shape

((223346, 536), (57989, 536), (5941, 536))

In [21]:
# Standardization of the AF2-only sets
# The mean/sd come from the AF2-only training set and are reused for AF2 validation and test
# NOTE(review): the grid-search loop fits its models on 'X_train_sc' (merged-set statistics) but
# scores them on these AF2 arrays, which are scaled with AF2-only statistics -- confirm this is intended
cols = [*range(512), 532, 533, 535]
X_train_af2_sc, mean_X_train_af2, sd_X_train_af2 = Z_transform_train(X_train_af2, cols_to_select=cols)
X_valid_af2_sc = Z_transform_valid(X_valid_af2, mean_X_train_af2, sd_X_train_af2, cols_to_select=cols)
X_test_af2_sc = Z_transform_valid(X_test_af2, mean_X_train_af2, sd_X_train_af2, cols_to_select=cols)

X_train_af2_sc.shape, X_valid_af2_sc.shape, X_test_af2_sc.shape

((223489, 536), (57989, 536), (5941, 536))

In [22]:
# Class imbalance in the training labels
# (unpacking into two names requires that np.bincount finds exactly the labels 0 and 1)
zero, one = np.bincount(y_train.astype(int))
total = zero + one
print("Class count (train):\n\n      Total: {}\n      Epitope label: {} ({:.2f}% of total)".format(total, one, 100*one/total))

Class count (train):

      Total: 446835
      Epitope label: 40938 (9.16% of total)


In [23]:
# Class imbalance in the validation labels
# (unpacking into two names requires that np.bincount finds exactly the labels 0 and 1)
zero, one = np.bincount(y_valid.astype(int))
total = zero + one
print("Class count (validation):\n\n      Total: {}\n      Epitope label: {} ({:.2f}% of total)".format(total, one, 100*one/total))

Class count (validation):

      Total: 115978
      Epitope label: 9578 (8.26% of total)


In [24]:
# Class imbalance in the test labels
# (unpacking into two names requires that np.bincount finds exactly the labels 0 and 1)
zero, one = np.bincount(y_test.astype(int))
total = zero + one
print("Class count (test):\n\n      Total: {}\n      Epitope label: {} ({:.2f}% of total)".format(total, one, 100*one/total))

Class count (test):

      Total: 11882
      Epitope label: 712 (5.99% of total)


In [25]:
# Balanced class weights for the training labels (function defined earlier in the notebook)
class_weight = class_weight_calculator(y_train)

# Rescale both weights by their sum so that they add up to 1
tot = class_weight[0] + class_weight[1]
class_weight.update({0: class_weight[0]/tot, 1: class_weight[1]/tot})

print("Class weight implemented")
class_weight

Class weight implemented


{0: 0.09161771123569103, 1: 0.9083822887643089}

## Loop trying different possible combinations of parameters

In [29]:
# Parameters definition (fixed settings and the grids explored by the loop below)
loss = 'binary_crossentropy'  # loss function name handed to nn_model
act = 'relu'  # activation of the hidden layer
l2_value = [1e-4, 1e-3, 1e-2, 1e-1, 1]  # L2 regularization values to try
batch_size = 128
class_balancing = False  # passed as 'balancing' to nn_model: class weights are NOT used during fitting
drop_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]  # dropout rates to try

# Hidden units to try
h_units = [1, 2, 5, 10]

In [27]:
# Text file with results
file_name = 'AUC_hunits_hyperparams.txt'

# The file is opened in append mode so results from earlier runs are kept. The header is only
# written while the file is still empty: re-running this cell no longer inserts a second header
# line in the middle of the results (which would be read back as a data row afterwards).
with open(file_name, "a") as file:
    if os.path.getsize(file_name) == 0:
        file.write('H_units' + '\t' + 'L2_reg' + '\t' + 'dropout_rate' + '\t' + 'AUC_valid' + '\t' + 'AUC_valid_PDB' + '\t' + 'AUC_valid_af2' + '\t' + 'AUC_test' + '\t' + 'AUC_test_PDB' + '\t' + 'AUC_test_af2' + '\n')

In [30]:
# Manual loop for parameter combinations (every hidden-unit / L2 / dropout combination is trained once)
# NOTE(review): the models are fitted on 'X_train_sc', but the PDB/AF2 subsets scored below were
# standardized with their own subset statistics -- confirm that this difference is intended.
# NOTE(review): test-set AUCs are stored next to the validation AUCs; selecting hyperparameters
# by the test columns would make the test estimate optimistic.
for hunit in h_units:
    
    for l2_reg in l2_value:
        
        for drop in drop_rate:
    
            # Release the global Keras state kept from the previous model; many models are built
            # in this loop and that state otherwise keeps growing in memory
            keras.backend.clear_session()
    
            # Training and validation of the model
            model, history = nn_model(train_data=X_train_sc, y_train=y_train, valid_data=X_valid_sc, y_valid=y_valid, 
                                      act_fun=act, loss_fun=loss, h=hunit, alpha=l2_reg, class_weight_fn=class_weight_calculator,
                                      drop_rate = drop, batch = batch_size, balancing = class_balancing)    

            # Obtaining probabilities values (merged PDB + AF2 sets)
            y_pred_valid_prob = model.predict(X_valid_sc, verbose=0)
            y_pred_test_prob = model.predict(X_test_sc, verbose=0)

            # Obtaining probabilities values (PDB only)
            y_pred_valid_PDB_prob = model.predict(X_valid_PDB_sc, verbose=0)
            y_pred_test_PDB_prob = model.predict(X_test_PDB_sc, verbose=0)

            # Obtaining probabilities values (AF2 only)
            y_pred_valid_af2_prob = model.predict(X_valid_af2_sc, verbose=0)
            y_pred_test_af2_prob = model.predict(X_test_af2_sc, verbose=0)

            # ROC-AUC value (valid)
            ROC_AUC_valid = ROC_AUC(y_valid, y_pred_valid_prob)
            ROC_AUC_valid_PDB = ROC_AUC(y_valid_PDB, y_pred_valid_PDB_prob)
            ROC_AUC_valid_af2 = ROC_AUC(y_valid_af2, y_pred_valid_af2_prob)
            # ROC-AUC value (test)
            ROC_AUC_test = ROC_AUC(y_test, y_pred_test_prob)
            ROC_AUC_test_PDB = ROC_AUC(y_test_PDB, y_pred_test_PDB_prob)
            ROC_AUC_test_af2 = ROC_AUC(y_test_af2, y_pred_test_af2_prob)

            # Text file with results: one line is appended per combination, so finished
            # combinations are kept even if the loop is interrupted later
            word_h = str(hunit) + '_unit'
            with open(file_name, "a") as file:
                file.write(word_h + '\t' + str(l2_reg) + '\t' + str(drop) + '\t' +
                           str(ROC_AUC_valid) + '\t' + str(ROC_AUC_valid_PDB) + '\t' + str(ROC_AUC_valid_af2) + 
                           '\t' + str(ROC_AUC_test) + '\t' + str(ROC_AUC_test_PDB) + '\t' + str(ROC_AUC_test_af2) + '\n')

            # Save training/validation loss plot
            loss_plot(history, hunit, l2_reg, drop)

            # Print round
            print("Finished try with", str(hunit), "hidden units,", str(l2_reg), "L2 reg, and ", str(drop), "dropout")

Finished try with 5 hidden units, 1 L2 reg, and  0.0 dropout
Finished try with 5 hidden units, 1 L2 reg, and  0.1 dropout
Finished try with 5 hidden units, 1 L2 reg, and  0.2 dropout
Finished try with 5 hidden units, 1 L2 reg, and  0.3 dropout
Finished try with 5 hidden units, 1 L2 reg, and  0.4 dropout
Finished try with 5 hidden units, 1 L2 reg, and  0.5 dropout
Finished try with 10 hidden units, 1 L2 reg, and  0.0 dropout
Finished try with 10 hidden units, 1 L2 reg, and  0.1 dropout
Finished try with 10 hidden units, 1 L2 reg, and  0.2 dropout
Finished try with 10 hidden units, 1 L2 reg, and  0.3 dropout
Finished try with 10 hidden units, 1 L2 reg, and  0.4 dropout
Finished try with 10 hidden units, 1 L2 reg, and  0.5 dropout


In [35]:
# Load the tab-separated results file written by the grid search
df = pd.read_table(file_name)

# Rank the combinations from highest to lowest validation AUC and show the top 20
AUCval_df = df.sort_values('AUC_valid', ascending=False)
AUCval_df.head(20)

Unnamed: 0,H_units,L2_reg,dropout_rate,AUC_valid,AUC_valid_PDB,AUC_valid_af2,AUC_test,AUC_test_PDB,AUC_test_af2
100,10_unit,0.01,0.4,0.794642,0.802523,0.786897,0.780097,0.785078,0.775623
89,10_unit,0.0001,0.5,0.793855,0.799442,0.789058,0.774642,0.780682,0.769971
87,10_unit,0.0001,0.3,0.792883,0.798456,0.787688,0.77273,0.781825,0.763755
95,10_unit,0.001,0.5,0.791244,0.798602,0.783384,0.783574,0.790344,0.776481
93,10_unit,0.001,0.3,0.790875,0.79617,0.785691,0.774602,0.784307,0.76666
94,10_unit,0.001,0.4,0.790723,0.796635,0.785012,0.771631,0.782724,0.760727
88,10_unit,0.0001,0.4,0.79042,0.796818,0.783857,0.777153,0.783009,0.772751
99,10_unit,0.01,0.3,0.790375,0.796361,0.783967,0.772114,0.781452,0.7647
101,10_unit,0.01,0.5,0.789449,0.797259,0.782122,0.781314,0.787629,0.776113
97,10_unit,0.01,0.1,0.789414,0.794942,0.782269,0.763114,0.775033,0.752789


In [36]:
# Sort the DataFrame by the maximum value of AUC (test) and show the 20 best combinations
# NOTE(review): this ranking uses the test set; choosing hyperparameters from it would bias the test estimate
AUCtest_df = df.sort_values(by='AUC_test', ascending=False)
AUCtest_df.head(20)

Unnamed: 0,H_units,L2_reg,dropout_rate,AUC_valid,AUC_valid_PDB,AUC_valid_af2,AUC_test,AUC_test_PDB,AUC_test_af2
48,2_unit,0.1,0.0,0.782424,0.792101,0.772664,0.787002,0.79125,0.782949
41,2_unit,0.001,0.5,0.780724,0.788246,0.774098,0.785832,0.789533,0.780712
71,5_unit,0.001,0.5,0.787927,0.795865,0.780231,0.785527,0.788068,0.784689
40,2_unit,0.001,0.4,0.781407,0.788887,0.774123,0.785388,0.788346,0.780716
6,1_unit,0.001,0.0,0.784163,0.794051,0.774353,0.785303,0.789975,0.781275
12,1_unit,0.01,0.0,0.783704,0.793639,0.773777,0.784845,0.789834,0.780397
74,5_unit,0.01,0.2,0.787058,0.795293,0.779181,0.784023,0.790563,0.779422
43,2_unit,0.01,0.1,0.783479,0.791273,0.775401,0.78384,0.789226,0.779383
76,5_unit,0.01,0.4,0.788135,0.796903,0.779455,0.783808,0.78962,0.778691
95,10_unit,0.001,0.5,0.791244,0.798602,0.783384,0.783574,0.790344,0.776481


Feature extraction from the linear model: check the 20 weights with the highest absolute values

- Also save all the weights before and after training the linear neural network, and plot their histograms