# Training protein engineering models to assess near-true and invisible performance


Case: Protein engineering functional mapping problems used for validating results from the Wednesday Noisy-ML Simulation framework regarding the interplay between noise in measurements, their quality, and modeling performance.

Part of paper submission: The Invisible Performance of Regression Models on Noisy Measurements

Author: Fatma-Elzahraa Eid, Broad Institute of MIT and Harvard


------


## [0] Imports

In [2]:
from __future__ import division
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import io
import time 
import pandas as pd
import numpy as np
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, LSTM 
from keras.callbacks import Callback
from keras.models import load_model

## [1] Inputs 

## A note on data source, availability and license 

The three files used in this training and assessment program (Section: Data files) are from the Fit4Function study: Eid, F.-E., et al. (2022). "Systematic multi-trait AAV capsid engineering for efficient gene delivery." bioRxiv. doi:10.1101/2022.12.22.521680. These files are available in the associated data repository at Zenodo (https://zenodo.org/records/8401253). The license for using these files is included in the same repository.

### Please download the data files as described above to run the training program. Otherwise, you can run the validation analysis directly as the result file from this training program is provided in the 'example results/' directory.

In [2]:
# Data files 
# All data are from the Fit4Function paper
# Paths are relative to the working directory the notebook is run from.

# Read once per training replicate; provides the train / validation /
# in-sample test splits.
training_lib_file = 'data/TrainingLibrary_Allreplicates.csv'
# Read for the independent test, using the same replicate columns as the
# training library.
assessment_lib_file = 'data/AssessmentLibrary_Allreplicates.csv'
# High-quality test set; its 'k48' column is log2-transformed into the target.
testHQ_file = 'data/SharedSet_HQtesting_k48.csv'

# Output file 
# Rewritten in full after every training run, so partial results survive an
# interrupted session.
output_file = 'results/AAV_Training_differentQuality.csv' 

In [3]:
# Training program parameters
# Number of independent resample-and-train runs per training replicate.
repeats = 10

# Maximum number of rows drawn for the training split of each run.
train_size = 20000
# Fit4Function study showed that 5K is enough for training to saturate the model's performance, 
# the 20K is intended to help the lower quality samples reach their full training potential 

# Rows held out for validation (drives early stopping) and for the in-sample
# test; both are drawn before the training split.
# NOTE: 'valdiation_size' is misspelled but is referenced under this exact
# name by the training program, so it must be renamed in both places at once.
valdiation_size = 3500
insample_test_size = 3500 

In [4]:
# Model parameters 
# Unit counts of the first and second LSTM layers; passed to parent_model as
# L1 / L2 (overriding its defaults).
layer1_size = 70
layer2_size = 20 
# Passed to model.fit as batch_size / epochs.
batch_size  = 256
max_epochs = 200 
# Passed to CustomEarlyStopping as 'patience'.
early_stopping_patience = 2

In [5]:
# Meta data: Replicates at each aggregation level
#
# Each list holds the data-file column names available at one aggregation
# level. The k1 / k3 / k6 names follow a regular pattern, so they are derived
# from the four 'Virus<n>_<tag>' base names instead of being typed out.

# Four base names, in the order: Virus1_IT, Virus2_IT, Virus1_KC, Virus2_KC
replicates_k3 = [f'{virus}_{tag}'
                 for tag in ('IT', 'KC')
                 for virus in ('Virus1', 'Virus2')]

# Every base name with suffixes _R1.._R3 (12 names)
replicates_k1 = [f'{base}_R{run}'
                 for base in replicates_k3
                 for run in (1, 2, 3)]

# Every unordered pair of base names, joined with '_' (6 names)
replicates_k6 = [f'{first}_{second}'
                 for position, first in enumerate(replicates_k3)
                 for second in replicates_k3[position + 1:]]

replicates_k12 = ['CodonRep1', 'CodonRep2']

replicates_k24 = ['AllcodonReps']

# Aggregation-level label -> replicate column names. Insertion order is the
# order in which the training program visits the levels.
Replicates = dict(k1=replicates_k1,
                  k3=replicates_k3,
                  k6=replicates_k6,
                  k12=replicates_k12,
                  k24=replicates_k24)


## [2] Auxiliary Functions 

In [3]:
# Hot Encoding function 
def AA_hotencoding(variant):
    
    """
    
    AA_hotencoding takes an amino acid sequence 'variant' of an arbitrary length
    and returns its one-hot encoding 'onehot_encoded' as a nested list with one
    row per residue (i.e. length x 20, not 20 x length).

    Each row has 20 integer entries, all 0 except a single 1 at the index of the
    residue in the alphabet 'ARNDCQEGHILKMFPSTWYV'. An empty sequence yields an
    empty list.

    Raises KeyError if 'variant' contains a character outside that alphabet
    (lookup is case-sensitive).
    
    """
       
    AAs = 'ARNDCQEGHILKMFPSTWYV'
    encoding_length = len(AAs)

    # Map each amino acid letter to its column index
    char_to_int = {c: i for i, c in enumerate(AAs)}

    onehot_encoded = []
    for char in variant:
        # Fresh row per residue so that rows never share storage
        letter = [0] * encoding_length
        letter[char_to_int[char]] = 1
        onehot_encoded.append(letter)

    return onehot_encoded




# Custom early stopping 
class CustomEarlyStopping(Callback):
    
    """
    
    Modified from:
    https://stackoverflow.com/questions/42470604/keras-callback-earlystopping-comparing-training-and-validation-loss
    Distribution of this class is subject to the licenses enforced by stackoverflow.com 
    
    'CustomEarlyStopping' enforces early stop of ML training process when a user-defined condition is met. 

    The condition is evaluated at the end of every epoch on the quantity
    loss / val_loss:

      * if loss / val_loss >  ratio, the epoch is "good" and the wait counter is reset;
      * otherwise the epoch is "bad"; training is stopped on the
        (patience + 1)-th consecutive bad epoch.

    Parameters
    ----------
    ratio : float
        Threshold that loss / val_loss must exceed for training to continue
        unhindered.
    patience : int
        Number of consecutive bad epochs tolerated before stopping.
    verbose : int
        If > 0, a message is printed at the end of training when the stop
        was triggered by this callback at an epoch index > 0.
    restore_best_weights : bool
        Accepted and stored for signature compatibility only. This callback
        never snapshots or restores weights, so the model keeps the weights
        of the last trained epoch regardless of this value.
    
    """
    def __init__(self, ratio=0.0,
                 patience=0, verbose=0, restore_best_weights = True):
        super(CustomEarlyStopping, self).__init__()

        self.ratio = ratio
        self.patience = patience
        self.verbose = verbose
        self.wait = 0
        self.stopped_epoch = 0
        self.monitor_op = np.greater
        # Store what the caller asked for (previously hard-coded to True);
        # nothing in this class acts on it.
        self.restore_best_weights = restore_best_weights

    def on_train_begin(self, logs=None):
        # Allow instances to be re-used: clear all per-run state
        self.wait = 0
        self.stopped_epoch = 0

    def on_epoch_end(self, epoch, logs=None):
        # Local import: the surrounding file does not import 'warnings'
        import warnings

        logs = logs or {}
        current_val = logs.get('val_loss')
        current_train = logs.get('loss')
        if current_val is None or current_train is None:
            # Without both losses the ratio cannot be computed; warn and skip
            # this epoch instead of failing inside np.divide.
            warnings.warn("Early stopping requires 'loss' and 'val_loss' available!",
                          RuntimeWarning)
            return

        # If ratio current_loss / current_val_loss > self.ratio
        if self.monitor_op(np.divide(current_train,current_val),self.ratio):
            self.wait = 0
        else:
            if self.wait >= self.patience:
                self.stopped_epoch = epoch
                self.model.stop_training = True
            self.wait += 1

    def on_train_end(self, logs=None):
        if self.stopped_epoch > 0 and self.verbose > 0:
            print('Epoch %05d: early stopping' % (self.stopped_epoch))

            

            
# Define master fitness learning model 

def parent_model(L1=140, L2=20):
    
    """
    
    parent_model builds and compiles the stacked two-layer LSTM regressor, with
    parameters that work across all functional fitness models in the
    Fit4Function study.

    L1 and L2 are the unit counts of the first and second LSTM layers. The
    network takes inputs of shape (7, 20) per sample, returns one scalar per
    sample from a final Dense layer, and is compiled with mean-squared-error
    loss, the Adam optimizer and mean absolute error as a reported metric.
    
    """
    stack = [
        # First LSTM emits the full sequence so the second LSTM can consume it
        LSTM(L1, return_sequences=True, input_shape=(7, 20)),
        # Second LSTM keeps only its final output
        LSTM(L2, return_sequences=False),
        Dense(units=1),
    ]

    network = Sequential(stack)
    network.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])

    return network


## [3] Preprocessing

In [6]:
#%% Load data 

# Training library 
# Loaded in full here only to preview its first rows; the training program
# re-reads this file one replicate column at a time and rebinds 'df'.
df = pd.read_csv(training_lib_file)
print(df.head(10))

# HQ test data 
# Kept in memory: the next cell filters it and derives the 'target' column.
testHQ =  pd.read_csv(testHQ_file)
print(testHQ.head(10))

        AA  Virus1_IT_R1  Virus1_IT_R2  Virus1_IT_R3  Virus1_KC_R1  \
0  AAACMFE      0.000000      0.000000      0.000000      0.000000   
1  AAAFQEE     41.886001     32.271719     37.650338      7.569387   
2  AAAGCEN      0.000000      0.661654      0.220551      0.000000   
3  AAALKGI      9.436508      8.420268      8.420268     15.553728   
4  AACDVHF      0.000000      0.000000      0.000000      0.428087   
5  AACHVFF      0.000000      0.000000      0.000000      0.000000   
6  AACQNYP      0.000000      0.000000      0.000000      0.000000   
7  AACRLMA      0.000000      0.000000      0.000000      0.000000   
8  AACTGYL      0.162180      0.243270      0.648720      0.072415   
9  AADCFDP      0.390051      0.000000      0.000000      0.000000   

   Virus1_KC_R2  Virus1_KC_R3  Virus2_IT_R1  Virus2_IT_R2  Virus2_IT_R3  ...  \
0      0.000000      0.000000      0.251626      0.629064      0.000000  ...   
1      8.708144      8.574173      1.546353      1.411888      1.7480

In [7]:
#%% Preprocessing high-quality testing data 

# Keep only variants with a strictly positive k48 value so that log2 is
# defined (NaN > 0 is False, so missing values are dropped here as well).
# The explicit .copy() makes the filtered frame independent of the one that
# was loaded, so adding a column below is unambiguous and does not trigger
# pandas' SettingWithCopyWarning.
testHQ = testHQ[testHQ.k48 > 0].copy()
testHQ['target'] = np.log2(testHQ.k48)

# Drop any remaining non-finite targets (e.g. an infinite k48 value)
remove = np.isnan(testHQ.target) |  np.isinf(testHQ.target)
testHQ = testHQ[~remove].reset_index(drop = True)

# Encode 
testHQ_x_seq = testHQ.AA
testHQ_y = testHQ.target
testHQ_x = np.asarray([AA_hotencoding(variant) for variant in testHQ.AA])

testHQ.head(10)

Unnamed: 0,AA,TrainLib,AssessLib,k48,target
0,AAANGCQ,0.344546,0.358417,0.351481,-1.50848
1,AAASQPM,21.31783,9.730699,15.524265,3.956453
2,AADRYKC,0.228552,0.040562,0.134557,-2.893708
3,AAEGAYT,24.458915,11.488455,17.973685,4.167814
4,AAFWGYL,0.044085,0.0,0.022043,-5.503564
5,AAHPTRQ,25.25705,168.039485,96.648267,6.594672
6,AAKYPSM,15.812063,30.733539,23.272801,4.540573
7,AAMHPNP,30.515214,15.272829,22.894022,4.516899
8,AAMVVMM,0.186674,0.075299,0.130986,-2.932514
9,AANFTLD,12.460771,34.935817,23.698294,4.566711


## [4] Training Program 


In [8]:
# Initialization 
# One entry is appended to each of these lists per completed training run;
# together they form the rows of the result file.
k_level, train_replicate_name, repeat_no =  ([] for i in range(3))
R_train, R_validation, R_test_insample, R_test_independent, R_testHQ, times_min = ([] for i in range(6))

Performance_col = ['replicates_count','train_replicate', 'repeat_no', 'time(min)','R_train', 'R_validation', 'R_test_insample',
                   'R_test_independent', 'R_testHQ']


def _load_log2_targets(csv_file, column):
    """Read 'AA' and `column` from csv_file and return the rows with a finite log2 'target'."""
    data = pd.read_csv(csv_file, usecols=['AA', column])
    # Strictly positive values only, so log2 is defined. The explicit copy
    # avoids assigning a new column onto a filtered slice.
    data = data[data[column] > 0].copy()
    data['target'] = np.log2(data[column])
    data = data[np.isfinite(data['target'])]
    return data.reset_index(drop=True)


def _encode(variants, dtype=None):
    """One-hot encode every sequence in `variants` into a single array."""
    return np.asarray([AA_hotencoding(variant) for variant in variants], dtype=dtype)


def _pearson_r(fitted_model, x, y):
    """Pearson correlation between measured `y` and the model predictions for `x`."""
    predicted = np.ravel(fitted_model.predict(x))
    return np.corrcoef(y, predicted)[0][1]


# Create the result directory up front, so that a missing directory fails
# immediately rather than after the first training run has finished.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)


# Iterate over each aggregation level: 
for k, crnt_replicates in Replicates.items():

    # Independent test sets (same k-level, assessment library) do not depend
    # on the training replicate or the repeat, so they are read and encoded
    # once per level instead of once per training run. They are cached as
    # int8 (values are only 0/1) to keep memory low.
    # Note: 'time(min)' therefore no longer includes this preprocessing.
    independent_sets = []
    for crnt_test_rep in crnt_replicates:
        df_test = _load_log2_targets(assessment_lib_file, crnt_test_rep)
        independent_sets.append((_encode(df_test.AA, dtype=np.int8), df_test.target))

    # Iterate over each replicate in the current aggregation level: 
    for crnt_rep in crnt_replicates:

        # Preprocess training data 
        df = _load_log2_targets(training_lib_file, crnt_rep)

        # Repeat training (repeats) times:             
        for crnt_repeat in range(repeats):

            print(f"Currently training: {k} | Replicate {crnt_rep} | Repeat {crnt_repeat}") 

            t = time.time()


            # ---------------- Sample ----------------
            # Three successive shuffles, each followed by taking the leading
            # rows. Positional slicing replaces np.split on a DataFrame, which
            # relies on a deprecated pandas code path. If fewer rows remain
            # than requested, the split is simply smaller.
            # --- validation 
            shuffled = df.sample(frac=1)
            valdiation = shuffled.iloc[:valdiation_size].reset_index(drop=True)
            remain_pool = shuffled.iloc[valdiation_size:]
            validate_y = valdiation.target
            # --- in-sample test 
            shuffled = remain_pool.sample(frac=1)
            test = shuffled.iloc[:insample_test_size].reset_index(drop=True)
            remain_pool = shuffled.iloc[insample_test_size:]
            test_y = test.target
            # --- train 
            train = remain_pool.sample(frac=1).iloc[:train_size].reset_index(drop=True)
            train_y = train.target


            # ---------------- Encode ----------------
            train_x = _encode(train.AA)
            validate_x = _encode(valdiation.AA)
            test_x = _encode(test.AA)


            # ---------------- Train ----------------
            # A fresh model is built for every run; clearing Keras' global
            # state first keeps memory from growing across the many runs.
            keras.backend.clear_session()
            model = parent_model( L1 = layer1_size, L2 = layer2_size)
            model.fit(train_x, train_y,  
                      batch_size = batch_size, 
                      epochs = max_epochs, 
                      validation_data = (validate_x, validate_y),
                      verbose=0,
                      callbacks=[CustomEarlyStopping(ratio=0.90, patience=early_stopping_patience, 
                                                     verbose=0,restore_best_weights = True)])


            # ---------------- Test ----------------
            #-- train / validation / in-sample test performance 
            R_train.append(_pearson_r(model, train_x, train_y))
            R_validation.append(_pearson_r(model, validate_x, validate_y))
            R_test_insample.append(_pearson_r(model, test_x, test_y))

            #-- Independent testing (at same k-level, independent dataset) 
            # One correlation per replicate of the level, in replicate order.
            # The cached arrays are expanded to the dtype of the training
            # inputs so that the model sees the same kind of input as before.
            R_test_independent.append(
                [_pearson_r(model, ind_x.astype(train_x.dtype), ind_y)
                 for ind_x, ind_y in independent_sets])

            #-- Test HQ 
            R_testHQ.append(_pearson_r(model, testHQ_x, testHQ_y))


            # Record 
            k_level.append(k)
            repeat_no.append(crnt_repeat)
            train_replicate_name.append(crnt_rep)
            time_spent = round( (time.time()-t)/60  )
            times_min.append(time_spent)


            # Continuously update result file
            # The whole table is rewritten after every run so that partial
            # results are available if the session is interrupted.
            Performance = zip(k_level, train_replicate_name, repeat_no, times_min, R_train, R_validation, R_test_insample,
                              R_test_independent, R_testHQ)

            outputs = pd.DataFrame(list(Performance), columns = Performance_col) 
            outputs.to_csv(output_file)


Currently training: k1 | Replicate Virus1_IT_R1 | Repeat 0
Currently training: k1 | Replicate Virus1_IT_R1 | Repeat 1
Currently training: k1 | Replicate Virus1_IT_R1 | Repeat 2
Currently training: k1 | Replicate Virus1_IT_R1 | Repeat 3
Currently training: k1 | Replicate Virus1_IT_R1 | Repeat 4
Currently training: k1 | Replicate Virus1_IT_R1 | Repeat 5
Currently training: k1 | Replicate Virus1_IT_R1 | Repeat 6
Currently training: k1 | Replicate Virus1_IT_R1 | Repeat 7
Currently training: k1 | Replicate Virus1_IT_R1 | Repeat 8
Currently training: k1 | Replicate Virus1_IT_R1 | Repeat 9
Currently training: k1 | Replicate Virus1_IT_R2 | Repeat 0
Currently training: k1 | Replicate Virus1_IT_R2 | Repeat 1
Currently training: k1 | Replicate Virus1_IT_R2 | Repeat 2
Currently training: k1 | Replicate Virus1_IT_R2 | Repeat 3
Currently training: k1 | Replicate Virus1_IT_R2 | Repeat 4
Currently training: k1 | Replicate Virus1_IT_R2 | Repeat 5
Currently training: k1 | Replicate Virus1_IT_R2 | Repeat