# RNN for the original whole dataset

In [1]:
# Import of required libraries and functions from 'make_dataset' script
import os
from make_dataset import Discotope_Dataset
import numpy as np
import torch
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, roc_auc_score
from tqdm import tqdm
from IPython.display import clear_output

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Import tensorflow and keras
import tensorflow as tf
from tensorflow import keras

In [3]:
# Fix TensorFlow's global random seed so that runs of this notebook are repeatable
SEED = 1234
tf.random.set_seed(SEED)

In [4]:
def data_load(name_set, data_dir, separate=False):
    
    '''
    Function to load training, validation or test data from the directory where the whole dataset is stored
        - 'name_set': text variable with the type of data set to load ('train', 'test', 'valid')
        - 'data_dir': directory where the data set is stored (in my case is '../Data/')
        - 'separate': boolean; when True the solved (PDB) and AF2 sets are also returned separately
    
    Proteins whose SOLVED 'df_stats' frame has any NaN in the 'rsa' column are dropped from
    both the solved and the AF2 set, so the two sets keep the same proteins.
    
    Output (separate=False):
        - 'X_all': all observations from both solved and predicted structures all together
        - 'y_all': all labels from both solved and predicted structures all together
        - 'N_solved': number of observations from solved structures (to unmerge the 'all' array)
        - 'N_af2': number of observations from predicted structures (to unmerge the 'all' array)
    
    Output (separate=True):
        - 'X_all', 'y_all', 'X_solved', 'y_solved', 'X_af2', 'y_af2'
    
    Raises:
        - ValueError when fewer than two sub-directories of 'data_dir' match 'name_set'
    '''
    
    import pathlib
    
    # os.listdir returns entries in arbitrary order, so sort the matches to make the
    # "first = AF2, second = solved" convention deterministic.
    # NOTE(review): assumes the AF2 folder name sorts before the solved one - confirm.
    R = re.compile('.*' + name_set + '.*')
    filtered = sorted((folder for folder in os.listdir(data_dir) if R.match(folder)), key=str.lower)
    if len(filtered) < 2:
        raise ValueError("Expected two '{}' folders (AF2 and solved) in '{}', found: {}".format(name_set, data_dir, filtered))
    
    path_af2 = os.path.join(data_dir, filtered[0], 'dataset.pt')
    path_solved = os.path.join(data_dir, filtered[1], 'dataset.pt')
    
    # Presumably the saved datasets contain PosixPath objects, which cannot be rebuilt on
    # Windows. Alias the class only on Windows and only while loading, then restore it.
    posix_path_cls = pathlib.PosixPath
    if os.name == 'nt':
        pathlib.PosixPath = pathlib.WindowsPath
    try:
        # NOTE: torch.load unpickles arbitrary objects; only load dataset files you trust
        set_af2 = torch.load(path_af2)
        set_solved = torch.load(path_solved)
    finally:
        pathlib.PosixPath = posix_path_cls
    
    # Indices of the proteins to keep, evaluated once (the filter always follows the SOLVED set)
    keep = [i for i in range(len(set_solved)) if not set_solved[i]['df_stats']['rsa'].isna().any()]
    
    # Stack all features and targets to one big array per structure type
    X_set_solved = np.concatenate([set_solved[i]["X_arr"] for i in keep])
    y_set_solved = np.concatenate([set_solved[i]["y_arr"] for i in keep])
    X_set_af2 = np.concatenate([set_af2[i]["X_arr"] for i in keep])
    y_set_af2 = np.concatenate([set_af2[i]["y_arr"] for i in keep])
    
    # Stack solved and predicted structures into one big array (solved rows first)
    X_set_all = np.concatenate((X_set_solved, X_set_af2), axis=0)
    y_set_all = np.concatenate((y_set_solved, y_set_af2), axis=0)
    
    if separate:
        return(X_set_all, y_set_all, X_set_solved, y_set_solved, X_set_af2, y_set_af2)
    
    # Number of observations for each subtype of data set
    return(X_set_all, y_set_all, X_set_solved.shape[0], X_set_af2.shape[0])

In [5]:
def results_df_stats_creation(name_set, data_dir):
    
    '''
    Function to collect the per-protein stats dataframes of the original dataset
        - 'name_set': text variable with the type of data set to load ('train', 'test', 'valid')
        - 'data_dir': directory where the data set is stored (in my case is '../Data/')
    
    Proteins whose SOLVED 'df_stats' frame has any NaN in the 'rsa' column are skipped in
    both lists. Every returned frame gets an extra boolean 'epitope' column built from 'y_arr'.
    
    Output:
        - 'stats_dfs_solved': list with one dataframe per kept solved structure
        - 'stats_dfs_af2': list with one dataframe per kept AF2 structure (same order)
    
    Raises:
        - ValueError when fewer than two sub-directories of 'data_dir' match 'name_set'
    '''
    
    import pathlib
    
    # os.listdir returns entries in arbitrary order, so sort the matches to make the
    # "first = AF2, second = solved" convention deterministic.
    # NOTE(review): assumes the AF2 folder name sorts before the solved one - confirm.
    R = re.compile('.*' + name_set + '.*')
    filtered = sorted((folder for folder in os.listdir(data_dir) if R.match(folder)), key=str.lower)
    if len(filtered) < 2:
        raise ValueError("Expected two '{}' folders (AF2 and solved) in '{}', found: {}".format(name_set, data_dir, filtered))
    
    path_af2 = os.path.join(data_dir, filtered[0], 'dataset.pt')
    path_solved = os.path.join(data_dir, filtered[1], 'dataset.pt')
    
    # Presumably the saved datasets contain PosixPath objects, which cannot be rebuilt on
    # Windows. Alias the class only on Windows and only while loading, then restore it.
    posix_path_cls = pathlib.PosixPath
    if os.name == 'nt':
        pathlib.PosixPath = pathlib.WindowsPath
    try:
        # NOTE: torch.load unpickles arbitrary objects; only load dataset files you trust
        set_af2 = torch.load(path_af2)
        set_solved = torch.load(path_solved)
    finally:
        pathlib.PosixPath = posix_path_cls
    
    stats_dfs_solved = []
    stats_dfs_af2 = []
    for i in range(len(set_solved)):
        sample = set_solved[i]
        df_sample = sample['df_stats']
        
        # Skip proteins with missing RSA values (decided on the solved structure only)
        if df_sample['rsa'].isna().any():
            continue
        
        sample_af2 = set_af2[i]
        # .assign returns a new frame, so the loaded dataset entries are not modified
        stats_dfs_solved.append(df_sample.assign(epitope=sample['y_arr'].astype(bool)))
        stats_dfs_af2.append(sample_af2['df_stats'].assign(epitope=sample_af2['y_arr'].astype(bool)))
    
    return(stats_dfs_solved, stats_dfs_af2)

In [6]:
def dataframe_load(name_set, data_dir):
    
    '''
    Function to load the stored training, validation or test data sets as they were saved,
    without any filtering or stacking
        - 'name_set': text variable with the type of data set to load ('train', 'test', 'valid')
        - 'data_dir': directory where the data set is stored (in my case is '../Data/')
    
    Output:
        - 'set_af2': object loaded from the AF2 predicted structures 'dataset.pt'
        - 'set_solved': object loaded from the PDB solved structures 'dataset.pt'
    
    Raises:
        - ValueError when fewer than two sub-directories of 'data_dir' match 'name_set'
    '''
    
    import pathlib
    
    # os.listdir returns entries in arbitrary order, so sort the matches to make the
    # "first = AF2, second = solved" convention deterministic.
    # NOTE(review): assumes the AF2 folder name sorts before the solved one - confirm.
    R = re.compile('.*' + name_set + '.*')
    filtered = sorted((folder for folder in os.listdir(data_dir) if R.match(folder)), key=str.lower)
    if len(filtered) < 2:
        raise ValueError("Expected two '{}' folders (AF2 and solved) in '{}', found: {}".format(name_set, data_dir, filtered))
    
    path_af2 = os.path.join(data_dir, filtered[0], 'dataset.pt')
    path_solved = os.path.join(data_dir, filtered[1], 'dataset.pt')
    
    # Presumably the saved datasets contain PosixPath objects, which cannot be rebuilt on
    # Windows. Alias the class only on Windows and only while loading, then restore it.
    posix_path_cls = pathlib.PosixPath
    if os.name == 'nt':
        pathlib.PosixPath = pathlib.WindowsPath
    try:
        # NOTE: torch.load unpickles arbitrary objects; only load dataset files you trust
        set_af2 = torch.load(path_af2)
        set_solved = torch.load(path_solved)
    finally:
        pathlib.PosixPath = posix_path_cls
    
    return(set_af2, set_solved)

In [7]:
def remove_NaN(data, y):
    
    '''
    Function to drop every observation that contains a NaN, either in its features or in its label
        - 'data': 2D numpy array with the specific (train, valid, test) data
        - 'y': numpy array with the specific (train, valid, test) labels
    
    Output:
        - 'X_noNaN': feature rows without NaN values
        - 'y_noNaN': labels of the rows that were kept
    '''
    
    # Glue the labels on as a last column so features and labels are filtered together
    stacked = np.concatenate((data, y.reshape(-1, 1)), axis=1)
    
    # Keep only the rows where no entry is NaN
    complete_rows = ~np.isnan(stacked).any(axis=1)
    kept = stacked[complete_rows, :]
    
    # Split the surviving rows back into features and labels
    n_features = data.shape[1]
    return(kept[:, 0:n_features], kept[:, -1])

In [8]:
def normalize_train(X):
    
    '''
    Function to standardize EVERY column of X with sklearn's StandardScaler
        - 'X': data to standardize
    
    Output:
        - 'X_scaled': standardized version of X (X itself is left untouched)
    '''
    
    # StandardScaler (not MinMaxScaler): fit_transform learns the per-column statistics and
    # applies them in a single call, so no separate fit() beforehand is needed. It returns a
    # new array by default, so no manual copy of X is required either.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return(X_scaled)

In [9]:
def Z_transform_train(X, cols_to_select):
    
    '''
    Function to standardize (Z-transform) the selected columns of the training data
        - 'X': training data (2D array); it is NOT modified, a standardized copy is returned
        - 'cols_to_select': indices of the columns to standardize (the others are copied as they are)
    
    Output:
        - 'X_scaled': copy of X with the selected columns standardized
        - 'X_mean': mean of each selected column
        - 'X_std': standard deviation of each selected column (zeros replaced by 1)
    
    Note: the copy costs one extra array of the size of X, but makes the calling cell safe to
    re-run (standardizing in place would standardize the data again on every execution).
    '''
    
    # Work on a floating point copy so the caller's array is preserved and integer input is
    # not truncated when the standardized values are written back
    X = np.asarray(X)
    X_scaled = X.astype(np.result_type(X.dtype, np.float32))
    
    # Obtain the mean and standard deviation for each selected feature
    X_mean = np.mean(X_scaled[:, cols_to_select], axis=0)
    X_std = np.std(X_scaled[:, cols_to_select], axis=0)
    
    # Constant columns have sd = 0 (e.g. pLDDT when using only the PDB set): avoid dividing by zero
    X_std[X_std == 0] = 1
    
    # Z-transform (standardization)
    X_scaled[:, cols_to_select] = (X_scaled[:, cols_to_select] - X_mean)/X_std
    return(X_scaled, X_mean, X_std)

In [10]:
def Z_transform_valid(X, mean_train, sd_train, cols_to_select):
    
    '''
    Function to standardize the selected columns of validation/test data with the
    statistics obtained from the training set
        - 'X': data to standardize (2D array); it is NOT modified, a standardized copy is returned
        - 'mean_train': mean of each selected column, as returned by Z_transform_train
        - 'sd_train': standard deviation of each selected column, as returned by Z_transform_train
        - 'cols_to_select': indices of the columns to standardize (same ones used for training)
    
    Output:
        - 'X_scaled': copy of X with the selected columns standardized
    '''
    
    # Work on a floating point copy so re-running the calling cell does not standardize twice
    X = np.asarray(X)
    X_scaled = X.astype(np.result_type(X.dtype, np.float32))
    
    # Z-transform (standardization)
    X_scaled[:, cols_to_select] = (X_scaled[:, cols_to_select] - mean_train)/sd_train
    return(X_scaled)

In [11]:
def class_weight_calculator(y_train):
        
    '''
    Function to obtain sklearn's 'balanced' weight of every class found in the training labels
        - 'y_train': training labels (contains 0 and 1)
    
    Output:
        - dictionary {position of the class in the sorted unique labels: weight}
    '''
    
    # Classes present in the labels, in sorted order
    labels = np.unique(y_train)
    
    # One 'balanced' weight per class, in the same order as 'labels'
    weights = compute_class_weight('balanced', classes=labels, y=y_train)
    
    # Keys are the positions 0, 1, ... of the classes (which match the labels for 0/1 targets)
    return({position: weight for position, weight in enumerate(weights)})

In [40]:
# Neuronal network model with one layer
def nn1_model(train_data, y_train, act_fun, loss_fun, alpha, class_weight_fn, batch = None, balancing=True, epochs=100, learning_rate=0.0001):
    
    '''
    Function to create and train the feed-forward neuronal network with 1 hidden layer of 1 unit
        -'train_data': X train standardized
        -'y_train': training labels
        -'act_fun': activation function of the hidden layer
        -'loss_fun': loss function (also reported as a metric)
        -'alpha': L2 regularization factor applied to the hidden layer kernel
        -'class_weight_fn': function that returns the weight of each class from the labels
        -'batch': batch_size (baseline is None, but can be changed by adding a integer as new value)
        -'balancing': True/False argument for incorporating balancing in classes
        -'epochs': number of training epochs (default 100)
        -'learning_rate': learning rate of the Adam optimizer (default 0.0001)
    
    Output:
        -'model': neural network model trained
        -'history': attributes obtained during fitting the model
    '''
    
    # Seed right before building the model so the weight initialization is repeatable
    tf.random.set_seed(1234)
    
    # Everything comes from tf.keras (like the optimizer below) instead of mixing it with the
    # standalone 'keras' package, which can be a different implementation in some installs
    layers = tf.keras.layers
    keras_metrics = tf.keras.metrics
    
    model = tf.keras.Sequential()
    # Hidden layer with 1 neuron fed with the train_data.shape[1:] input features
    model.add(layers.Dense(1, activation=act_fun, input_shape=train_data.shape[1:],
                           kernel_regularizer=tf.keras.regularizers.l2(alpha)))
    # Output layer with sigmoid activation (better for binary classification)
    model.add(layers.Dense(1, activation='sigmoid'))
    
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt, loss=loss_fun, metrics=
                  ['accuracy', keras_metrics.Precision(), keras_metrics.Recall(), keras_metrics.AUC(), loss_fun])
    
    fit_options = dict(epochs=epochs, batch_size=batch, verbose=0)
    if balancing:
        # Class weights are only needed (and only computed) when balancing is requested
        class_weight = class_weight_fn(y_train)
        
        # Normalization of the class_weight to sum 1
        tot = class_weight[0] + class_weight[1]
        class_weight[0] = class_weight[0]/tot
        class_weight[1] = class_weight[1]/tot
        fit_options['class_weight'] = class_weight
    
    history = model.fit(train_data, y_train, **fit_options)
    
    return(model, history)

In [33]:
# Neuronal network model with one layer and 10 hidden unit
def nn10_model(train_data, y_train, act_fun, loss_fun, alpha, class_weight_fn, drop_rate, batch = None, balancing=True, epochs=100, learning_rate=0.0001):
    
    '''
    Function to create and train the feed-forward neuronal network with 1 hidden layer of
    10 units followed by dropout
        -'train_data': X train standardized
        -'y_train': training labels
        -'act_fun': activation function of the hidden layer
        -'loss_fun': loss function (also reported as a metric)
        -'alpha': L2 regularization factor applied to the hidden layer kernel
        -'class_weight_fn': function that returns the weight of each class from the labels
        -'drop_rate': rate of the Dropout layer placed after the hidden layer
        -'batch': batch_size (baseline is None, but can be changed by adding a integer as new value)
        -'balancing': True/False argument for incorporating balancing in classes
        -'epochs': number of training epochs (default 100)
        -'learning_rate': learning rate of the Adam optimizer (default 0.0001)
    
    Output:
        -'model': neural network model trained
        -'history': attributes obtained during fitting the model
    '''
    
    # Seed right before building the model so the weight initialization is repeatable
    tf.random.set_seed(1234)
    
    # Everything comes from tf.keras (like the optimizer below) instead of mixing it with the
    # standalone 'keras' package, which can be a different implementation in some installs
    layers = tf.keras.layers
    keras_metrics = tf.keras.metrics
    
    model = tf.keras.Sequential()
    # Hidden layer with 10 neurons fed with the train_data.shape[1:] input features
    model.add(layers.Dense(10, activation=act_fun, input_shape=train_data.shape[1:],
                           kernel_regularizer=tf.keras.regularizers.l2(alpha)))
    model.add(layers.Dropout(drop_rate))
    # Output layer with sigmoid activation (better for binary classification)
    model.add(layers.Dense(1, activation='sigmoid'))
    
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt, loss=loss_fun, metrics=
                  ['accuracy', keras_metrics.Precision(), keras_metrics.Recall(), keras_metrics.AUC(), loss_fun])
    
    fit_options = dict(epochs=epochs, batch_size=batch, verbose=0)
    if balancing:
        # Class weights are only needed (and only computed) when balancing is requested
        class_weight = class_weight_fn(y_train)
        
        # Normalization of the class_weight to sum 1
        tot = class_weight[0] + class_weight[1]
        class_weight[0] = class_weight[0]/tot
        class_weight[1] = class_weight[1]/tot
        fit_options['class_weight'] = class_weight
    
    history = model.fit(train_data, y_train, **fit_options)
    
    return(model, history)

In [13]:
def loss_plot(loss_values):
    
    '''
    Function to draw the training loss curve of a model
        - 'loss_values': array with the loss value of each training iteration
    '''
    
    # Single curve, labelled so it shows up in the legend
    plt.plot(loss_values, label = 'Train')
    
    # Axis names and title
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Model loss')
    
    plt.legend(loc='upper right')
    plt.show()

In [14]:
def ROC_AUC(y_true, y_hat):
    
    '''
    Function to compute the area under the ROC curve
        - 'y_true': y original values
        - 'y_hat': y predicted values (scores/probabilities)
        
    Output:
        - area under the ROC curve
    '''
    
    # roc_curve returns (false positive rates, true positive rates, thresholds)
    roc_points = metrics.roc_curve(y_true, y_hat)
    false_pos_rate, true_pos_rate = roc_points[0], roc_points[1]
    
    return(metrics.auc(false_pos_rate, true_pos_rate))

In [15]:
def PR_AUC(y_true, y_hat):
    
    '''
    Function to compute the area under the precision-recall curve
        - 'y_true': y original values
        - 'y_hat': y predicted values (scores/probabilities)
        
    Output:
        - area under the precision-recall curve
    '''
    
    # precision_recall_curve returns (precision, recall, thresholds); thresholds are not needed
    pr_points = metrics.precision_recall_curve(y_true, y_hat)
    precision_values, recall_values = pr_points[0], pr_points[1]
    
    # Recall goes on the x axis, precision on the y axis
    return(metrics.auc(recall_values, precision_values))

In [16]:
# Epitope rank percentile score
# https://github.com/Magnushhoie/discotope3/blob/main/src/models/mlscripts.py#L55

def get_percentile_score_arr(
    scores: np.array,
    epitopes: np.array,
):
    
    """Return the fraction of all scores (y_hat) that lie strictly below the mean score of the epitope residues (y_true)"""
    # Boolean mask selecting the epitope residues
    is_epitope = epitopes.astype(bool)
    assert is_epitope.dtype == "bool"

    # Mean predicted score over the epitope residues only
    mean_epitope_score = scores[is_epitope].mean()

    # Share of residues scoring strictly lower than that mean
    return (mean_epitope_score > scores).mean()

In [17]:
# Load the training, validation and test sets, each one merged and also split into its
# solved (PDB) and AF2 parts (needs a couple of minutes)
DATA_DIR = '../Data/'

X_train, y_train, X_train_PDB, y_train_PDB, X_train_af2, y_train_af2 = data_load('train', DATA_DIR, separate=True)
X_valid, y_valid, X_valid_PDB, y_valid_PDB, X_valid_af2, y_valid_af2 = data_load('valid', DATA_DIR, separate=True)
X_test, y_test, X_test_PDB, y_test_PDB, X_test_af2, y_test_af2 = data_load('test', DATA_DIR, separate=True)

# Shapes of the merged sets
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

((446835, 536), (446835,), (115978, 536), (115978,), (11882, 536), (11882,))

In [18]:
## NOT NEEDED: ALREADY REMOVED WHEN LOADING THE DATA
# Data manipulation to remove all PDB entries with NaN values in the RSA feature (535)

# X_train, y_train = remove_NaN(data=X_train, y=y_train)
# X_valid, y_valid = remove_NaN(data=X_valid, y=y_valid)
# X_test, y_test = remove_NaN(data=X_test, y=y_test)

# X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, X_test.shape, y_test.shape

In [19]:
# Standardize the merged (PDB + AF2) sets; validation and test reuse the training statistics
cols = [*range(512), 532, 533, 535]

X_train_sc, mean_X_train, sd_X_train = Z_transform_train(X_train, cols_to_select=cols)
X_valid_sc = Z_transform_valid(X_valid, mean_X_train, sd_X_train, cols_to_select=cols)
X_test_sc = Z_transform_valid(X_test, mean_X_train, sd_X_train, cols_to_select=cols)

X_train_sc.shape, X_valid_sc.shape, X_test_sc.shape

((446835, 536), (115978, 536), (11882, 536))

In [20]:
# Standardize the solved (PDB) sets; validation and test reuse the PDB training statistics
cols = [*range(512), 532, 533, 535]

X_train_PDB_sc, mean_X_train_PDB, sd_X_train_PDB = Z_transform_train(X_train_PDB, cols_to_select=cols)
X_valid_PDB_sc = Z_transform_valid(X_valid_PDB, mean_X_train_PDB, sd_X_train_PDB, cols_to_select=cols)
X_test_PDB_sc = Z_transform_valid(X_test_PDB, mean_X_train_PDB, sd_X_train_PDB, cols_to_select=cols)

X_train_PDB_sc.shape, X_valid_PDB_sc.shape, X_test_PDB_sc.shape

((223346, 536), (57989, 536), (5941, 536))

In [21]:
# Standardize the AF2 sets; validation and test reuse the AF2 training statistics
cols = [*range(512), 532, 533, 535]

X_train_af2_sc, mean_X_train_af2, sd_X_train_af2 = Z_transform_train(X_train_af2, cols_to_select=cols)
X_valid_af2_sc = Z_transform_valid(X_valid_af2, mean_X_train_af2, sd_X_train_af2, cols_to_select=cols)
X_test_af2_sc = Z_transform_valid(X_test_af2, mean_X_train_af2, sd_X_train_af2, cols_to_select=cols)

X_train_af2_sc.shape, X_valid_af2_sc.shape, X_test_af2_sc.shape

((223489, 536), (57989, 536), (5941, 536))

In [22]:
# Class imbalance in the training labels: count of each label and share of epitopes
label_counts = np.bincount(y_train.astype(int))
zero, one = label_counts
total = zero + one
print("Class count (train):\n\n      Total: {}\n      Epitope label: {} ({:.2f}% of total)".format(total, one, 100*one/total))

Class count (train):

      Total: 446835
      Epitope label: 40938 (9.16% of total)


In [23]:
# Class imbalance in the validation labels: count of each label and share of epitopes
label_counts = np.bincount(y_valid.astype(int))
zero, one = label_counts
total = zero + one
print("Class count (validation):\n\n      Total: {}\n      Epitope label: {} ({:.2f}% of total)".format(total, one, 100*one/total))

Class count (validation):

      Total: 115978
      Epitope label: 9578 (8.26% of total)


In [24]:
# Class imbalance in the test labels: count of each label and share of epitopes
label_counts = np.bincount(y_test.astype(int))
zero, one = label_counts
total = zero + one
print("Class count (test):\n\n      Total: {}\n      Epitope label: {} ({:.2f}% of total)".format(total, one, 100*one/total))

Class count (test):

      Total: 11882
      Epitope label: 712 (5.99% of total)


In [25]:
# Class weights of the training labels, obtained with the function defined above
class_weight = class_weight_calculator(y_train)

# Rescale the two weights so that they add up to 1
tot = class_weight[0] + class_weight[1]
for label in (0, 1):
    class_weight[label] = class_weight[label]/tot

print("Class weight implemented")
class_weight

Class weight implemented


{0: 0.09161771123569103, 1: 0.9083822887643089}

## Expansion of the feature space

In [26]:
# Function to expand the feature space in the shape of (ROWS, 3·COLUMNS)

def expand_matrix_feature(matrix):
    
    '''
    Function to append to every row the features of the row before and the row after it
        - 'matrix': 2D array (ROWS, COLUMNS), one row per residue in sequence order
    
    Output:
        - array of shape (ROWS, 3·COLUMNS) laid out as [previous row | current row | next row];
          the first row gets zeros as "previous" and the last row gets zeros as "next"
    '''
    
    # Only the number of columns is needed; unpacking also rejects non-2D input
    _, n_cols = matrix.shape

    # One row of zeros stands in for the missing neighbour at each end
    zero_row = np.zeros((1, n_cols))

    # Shift the rows one position down (previous residue) and one position up (next residue)
    previous_rows = np.concatenate([zero_row, matrix[:-1]], axis=0)
    following_rows = np.concatenate([matrix[1:], zero_row], axis=0)

    # Place previous, current and next side by side
    return np.concatenate([previous_rows, matrix, following_rows], axis=1)

In [27]:
# Per-protein stats dataframes (solved and AF2) for the training, validation and test sets
stats_train_solved, stats_train_AF2 = results_df_stats_creation('train', '../Data/')
stats_valid_solved, stats_valid_AF2 = results_df_stats_creation('valid', '../Data/')
stats_test_solved, stats_test_AF2 = results_df_stats_creation('test', '../Data/')

# Number of proteins in each list
(len(stats_train_solved), len(stats_train_AF2),
 len(stats_valid_solved), len(stats_valid_AF2),
 len(stats_test_solved), len(stats_test_AF2))

(1061, 1061, 270, 270, 21, 21)

In [29]:
# Expand the feature space protein by protein (TRAIN SET, solved structures), so that the
# neighbouring rows added to a residue never come from a different protein

n, p = X_train_PDB_sc.shape
X_train_PDB_exp = np.empty((n, 3*p), dtype=np.float32)

length_PDB = np.empty(len(stats_train_solved))
tot_len_PDB = 0
for i, sample_PDB in enumerate(stats_train_solved):
    
    # Length of the current protein, read from the first entry of its 'length' column
    actual_len_PDB = sample_PDB['length'][0]
    length_PDB[i] = actual_len_PDB
    
    # Rows of the current protein: from the running offset up to offset + length
    end_PDB = tot_len_PDB + actual_len_PDB
    X_train_PDB_exp[tot_len_PDB:end_PDB] = expand_matrix_feature(X_train_PDB_sc[tot_len_PDB:end_PDB])
    
    # The next protein starts where this one ends
    tot_len_PDB = end_PDB

print('Nº of proteins in the train set (PDB):', length_PDB.shape[0])
print('\nAre the total length equal to the y_train (PDB)?', tot_len_PDB == len(y_train_PDB))

print('\nOriginal shape of X train PDB matrix: ', X_train_PDB_sc.shape)
print('Shape of the expanded X train PDB matrix: ', X_train_PDB_exp.shape)

Nº of proteins in the train set (PDB): 1061

Are the total length equal to the y_train (PDB)? True

Original shape of X train PDB matrix:  (223346, 536)
Shape of the expanded X train PDB matrix:  (223346, 1608)


In [30]:
# Expand the feature space protein by protein (TRAIN SET, AF2 structures), so that the
# neighbouring rows added to a residue never come from a different protein

n, p = X_train_af2_sc.shape
X_train_af2_exp = np.empty((n, 3*p), dtype=np.float32)

length_af2 = np.empty(len(stats_train_AF2))
tot_len_af2 = 0
# Iterate over the AF2 stats themselves (not over the solved list) so the loop bound
# always matches the list being indexed
for i in range(0, len(stats_train_AF2), 1):
    
    # Length of the current protein, read from the first entry of its 'length' column
    sample_af2 = stats_train_AF2[i]
    actual_len_af2 = sample_af2['length'][0]
    length_af2[i] = actual_len_af2
    
    # Expanding the feature space for each protein
    X_train_af2_exp[tot_len_af2:tot_len_af2 + actual_len_af2] = expand_matrix_feature(X_train_af2_sc[tot_len_af2:tot_len_af2 + actual_len_af2])
    
    # Storing cumulative sum of lengths
    tot_len_af2 = tot_len_af2 + actual_len_af2

print('Nº of proteins in the train set (AF2):', length_af2.shape[0])
print('Are the total length equal to the y_train (AF2)?', tot_len_af2 == len(y_train_af2))

print('\nOriginal shape of X train AF2 matrix: ', X_train_af2_sc.shape)
print('Shape of the expanded X train AF2 matrix: ', X_train_af2_exp.shape)

Nº of proteins in the train set (AF2): 1061
Are the total length equal to the y_train (AF2)? True

Original shape of X train AF2 matrix:  (223489, 536)
Shape of the expanded X train AF2 matrix:  (223489, 1608)


In [31]:
# Expand the feature space protein by protein (VALID SET, solved and AF2 structures), so
# that the neighbouring rows added to a residue never come from a different protein

# Each output buffer is sized from its OWN input matrix: the solved and AF2 sets do not
# necessarily have the same number of rows (they differ in the training set)
n, p = X_valid_PDB_sc.shape
n_af2, p_af2 = X_valid_af2_sc.shape
X_valid_PDB_exp = np.empty((n, 3*p), dtype=np.float32)
X_valid_af2_exp = np.empty((n_af2, 3*p_af2), dtype=np.float32)

length_PDB = np.empty(len(stats_valid_solved))
length_af2 = np.empty(len(stats_valid_AF2))
tot_len_PDB = 0
tot_len_af2 = 0
for i in range(0, len(stats_valid_solved), 1):
    
    # Length of the current protein, read from the first entry of its 'length' column
    sample_PDB = stats_valid_solved[i]
    sample_af2 = stats_valid_AF2[i]
    actual_len_PDB = sample_PDB['length'][0]
    actual_len_af2 = sample_af2['length'][0]
    length_PDB[i] = actual_len_PDB
    length_af2[i] = actual_len_af2
    
    # Expanding the feature space for each protein
    X_valid_PDB_exp[tot_len_PDB:tot_len_PDB + actual_len_PDB] = expand_matrix_feature(X_valid_PDB_sc[tot_len_PDB:tot_len_PDB + actual_len_PDB])
    X_valid_af2_exp[tot_len_af2:tot_len_af2 + actual_len_af2] = expand_matrix_feature(X_valid_af2_sc[tot_len_af2:tot_len_af2 + actual_len_af2])
    
    # Storing cumulative sum of lengths
    tot_len_PDB = tot_len_PDB + actual_len_PDB
    tot_len_af2 = tot_len_af2 + actual_len_af2

print('Nº of proteins in the valid set (PDB and AF2):', length_PDB.shape[0], 'and', length_af2.shape[0])
print('\nAre the total length equal to the y_valid (PDB)?', tot_len_PDB == len(y_valid_PDB))
print('Are the total length equal to the y_valid (AF2)?', tot_len_af2 == len(y_valid_af2))

print('\nOriginal shape of X valid PDB matrix: ', X_valid_PDB_sc.shape)
print('Shape of the expanded X valid PDB matrix: ', X_valid_PDB_exp.shape)

print('\nOriginal shape of X valid AF2 matrix: ', X_valid_af2_sc.shape)
print('Shape of the expanded X valid AF2 matrix: ', X_valid_af2_exp.shape)

Nº of proteins in the valid set (PDB and AF2): 270 and 270

Are the total length equal to the y_valid (PDB)? True
Are the total length equal to the y_valid (AF2)? True

Original shape of X valid PDB matrix:  (57989, 536)
Shape of the expanded X valid PDB matrix:  (57989, 1608)

Original shape of X valid AF2 matrix:  (57989, 536)
Shape of the expanded X valid AF2 matrix:  (57989, 1608)


In [32]:
# Expand the feature space protein by protein (TEST SET, solved and AF2 structures), so
# that the neighbouring rows added to a residue never come from a different protein

# Each output buffer is sized from its OWN input matrix: the solved and AF2 sets do not
# necessarily have the same number of rows (they differ in the training set)
n, p = X_test_PDB_sc.shape
n_af2, p_af2 = X_test_af2_sc.shape
X_test_PDB_exp = np.empty((n, 3*p), dtype=np.float32)
X_test_af2_exp = np.empty((n_af2, 3*p_af2), dtype=np.float32)

length_PDB = np.empty(len(stats_test_solved))
length_af2 = np.empty(len(stats_test_AF2))
tot_len_PDB = 0
tot_len_af2 = 0
for i in range(0, len(stats_test_solved), 1):
    
    # Length of the current protein, read from the first entry of its 'length' column
    sample_PDB = stats_test_solved[i]
    sample_af2 = stats_test_AF2[i]
    actual_len_PDB = sample_PDB['length'][0]
    actual_len_af2 = sample_af2['length'][0]
    length_PDB[i] = actual_len_PDB
    length_af2[i] = actual_len_af2
    
    # Expanding the feature space for each protein
    X_test_PDB_exp[tot_len_PDB:tot_len_PDB + actual_len_PDB] = expand_matrix_feature(X_test_PDB_sc[tot_len_PDB:tot_len_PDB + actual_len_PDB])
    X_test_af2_exp[tot_len_af2:tot_len_af2 + actual_len_af2] = expand_matrix_feature(X_test_af2_sc[tot_len_af2:tot_len_af2 + actual_len_af2])
    
    # Storing cumulative sum of lengths
    tot_len_PDB = tot_len_PDB + actual_len_PDB
    tot_len_af2 = tot_len_af2 + actual_len_af2

print('Nº of proteins in the test set (PDB and AF2):', length_PDB.shape[0], 'and', length_af2.shape[0])
print('\nAre the total length equal to the y_test (PDB)?', tot_len_PDB == len(y_test_PDB))
print('Are the total length equal to the y_test (AF2)?', tot_len_af2 == len(y_test_af2))

print('\nOriginal shape of X test PDB matrix: ', X_test_PDB_sc.shape)
print('Shape of the expanded X test PDB matrix: ', X_test_PDB_exp.shape)

print('\nOriginal shape of X test AF2 matrix: ', X_test_af2_sc.shape)
print('Shape of the expanded X test AF2 matrix: ', X_test_af2_exp.shape)

Nº of proteins in the test set (PDB and AF2): 21 and 21

Are the total length equal to the y_test (PDB)? True
Are the total length equal to the y_test (AF2)? True

Original shape of X test PDB matrix:  (5941, 536)
Shape of the expanded X test PDB matrix:  (5941, 1608)

Original shape of X test AF2 matrix:  (5941, 536)
Shape of the expanded X test AF2 matrix:  (5941, 1608)


In [35]:
# Merge the expanded training features and the labels: solved (PDB) rows first, AF2 rows after
X_train_exp = np.concatenate([X_train_PDB_exp, X_train_af2_exp])
y_train_exp = np.concatenate([y_train_PDB, y_train_af2])

X_train_exp.shape, y_train_exp.shape

((446835, 1608), (446835,))

In [36]:
# Merge the expanded validation features and the labels: solved (PDB) rows first, AF2 rows after
X_valid_exp = np.concatenate([X_valid_PDB_exp, X_valid_af2_exp])
y_valid_exp = np.concatenate([y_valid_PDB, y_valid_af2])

X_valid_exp.shape, y_valid_exp.shape

((115978, 1608), (115978,))

In [37]:
# Merge the expanded test features and the labels: solved (PDB) rows first, AF2 rows after
X_test_exp = np.concatenate([X_test_PDB_exp, X_test_af2_exp])
y_test_exp = np.concatenate([y_test_PDB, y_test_af2])

X_test_exp.shape, y_test_exp.shape

((11882, 1608), (11882,))

## Training with the whole data set (1 hidden unit)

In [38]:
# Parameters definition (passed to nn1_model in the next cell)
# Loss function, also reported as a metric by the model
loss = 'binary_crossentropy'
# Activation function of the hidden layer
act = 'relu'
# L2 regularization factor of the hidden layer kernel
l2_value = 0.0001
# Batch size used while fitting
batch_size = 128
# False: the model is fitted without class weights
class_balancing = False

In [41]:
# Fit the 1-hidden-unit network on the expanded training set
model, history = nn1_model(
    train_data=X_train_exp,
    y_train=y_train_exp,
    act_fun=act,
    loss_fun=loss,
    alpha=l2_value,
    class_weight_fn=class_weight_calculator,
    batch=batch_size,
    balancing=class_balancing,
)

# Predicted probabilities for the whole validation set and for its PDB and AF2 parts
y_pred_valid_exp_prob = model.predict(X_valid_exp)
y_pred_valid_PDB_exp_prob = model.predict(X_valid_PDB_exp)
y_pred_valid_af2_exp_prob = model.predict(X_valid_af2_exp)

# Predicted probabilities for the whole test set and for its PDB and AF2 parts
y_pred_test_exp_prob = model.predict(X_test_exp)
y_pred_test_PDB_exp_prob = model.predict(X_test_PDB_exp)
y_pred_test_af2_exp_prob = model.predict(X_test_af2_exp)



In [46]:
# ROC AUC on the validation predictions: whole set, solved (PDB) part and AF2 part
print('AUC Validation results: \n')
for template, y_true, y_prob in (
    ('Total valid: {}', y_valid_exp, y_pred_valid_exp_prob),
    ('Solved valid: {}', y_valid_PDB, y_pred_valid_PDB_exp_prob),
    ('AF2 valid: {}', y_valid_af2, y_pred_valid_af2_exp_prob),
):
    print(template.format(ROC_AUC(y_true, y_prob)))

# Reference value typed in by hand (not computed in this notebook)
print('\nAUC-valid from original features: 0.782217')

AUC Validation results: 

Total valid: 0.780764285753536
Solved valid: 0.7913879846044428
AF2 valid: 0.770143028274382

AUC-valid from original features: 0.782217


In [47]:
# ROC AUC on the test predictions: whole set, solved (PDB) part and AF2 part
print('AUC Test results: \n')
for template, y_true, y_prob in (
    ('Total test: {}', y_test_exp, y_pred_test_exp_prob),
    ('Solved test: {}', y_test_PDB, y_pred_test_PDB_exp_prob),
    ('AF2 test: {}', y_test_af2, y_pred_test_af2_exp_prob),
):
    print(template.format(ROC_AUC(y_true, y_prob)))

# Reference value typed in by hand (not computed in this notebook)
print('\nAUC-test from original features: 0.789158')

AUC Test results: 

Total test: 0.7736042444147143
Solved test: 0.7737174715580458
AF2 test: 0.7736100912355527

AUC-test from original features: 0.789158


## Training with the whole data set (10 hidden units)

In [48]:
# Parameters definition (passed to nn10_model in the next cell)
# Loss function, also reported as a metric by the model
loss = 'binary_crossentropy'
# Activation function of the hidden layer
act = 'relu'
# L2 regularization factor of the hidden layer kernel
l2_value = 0.01
# Rate of the Dropout layer placed after the hidden layer
drop = 0.4
# Batch size used while fitting
batch_size = 128
# False: the model is fitted without class weights
class_balancing = False

In [49]:
# Fit the 10-hidden-unit network (with dropout) on the expanded training set
model10, history10 = nn10_model(
    train_data=X_train_exp,
    y_train=y_train_exp,
    act_fun=act,
    loss_fun=loss,
    alpha=l2_value,
    class_weight_fn=class_weight_calculator,
    drop_rate=drop,
    batch=batch_size,
    balancing=class_balancing,
)

# Predicted probabilities for the whole validation set and for its PDB and AF2 parts
y_pred_valid_exp_prob = model10.predict(X_valid_exp)
y_pred_valid_PDB_exp_prob = model10.predict(X_valid_PDB_exp)
y_pred_valid_af2_exp_prob = model10.predict(X_valid_af2_exp)

# Predicted probabilities for the whole test set and for its PDB and AF2 parts
y_pred_test_exp_prob = model10.predict(X_test_exp)
y_pred_test_PDB_exp_prob = model10.predict(X_test_PDB_exp)
y_pred_test_af2_exp_prob = model10.predict(X_test_af2_exp)



In [52]:
# ROC AUC on the validation predictions: whole set, solved (PDB) part and AF2 part
print('AUC Validation results: \n')
for template, y_true, y_prob in (
    ('Total valid: {}', y_valid_exp, y_pred_valid_exp_prob),
    ('Solved valid: {}', y_valid_PDB, y_pred_valid_PDB_exp_prob),
    ('AF2 valid: {}', y_valid_af2, y_pred_valid_af2_exp_prob),
):
    print(template.format(ROC_AUC(y_true, y_prob)))

# Reference values typed in by hand (not computed in this notebook)
print('\nAUC-valid from original features: \n')
for reference_line in ('Total valid: 0.794642', 'Solved valid: 0.802523', 'AF2 valid: 0.786897'):
    print(reference_line)

AUC Validation results: 

Total valid: 0.7917316896137294
Solved valid: 0.7984357597376193
AF2 valid: 0.7852283604971919

AUC-valid from original features: 

Total valid: 0.794642
Solved valid: 0.802523
AF2 valid: 0.786897


In [53]:
# ROC AUC on the test predictions: whole set, solved (PDB) part and AF2 part
print('AUC Test results: \n')
for template, y_true, y_prob in (
    ('Total test: {}', y_test_exp, y_pred_test_exp_prob),
    ('Solved test: {}', y_test_PDB, y_pred_test_PDB_exp_prob),
    ('AF2 test: {}', y_test_af2, y_pred_test_af2_exp_prob),
):
    print(template.format(ROC_AUC(y_true, y_prob)))

# Reference values typed in by hand (not computed in this notebook)
print('\nAUC-test from original features: \n')
for reference_line in ('Total test: 0.780097', 'Solved test: 0.785078', 'AF2 test: 0.775623'):
    print(reference_line)

AUC Test results: 

Total test: 0.7793727932966513
Solved test: 0.7836492712220736
AF2 test: 0.7758854475772785

AUC-test from original features: 

Total test: 0.780097
Solved test: 0.785078
AF2 test: 0.775623
