### Imports

In [3]:
import importlib
from AIBind.import_modules import *
from AIBind import AIBind

In [13]:
# Development aid: re-execute the already-imported AIBind module so that edits
# to its source file are picked up without restarting the kernel.
importlib.reload(AIBind)

<module 'AIBind.AIBind' from '/home/sars-busters-consolidated/GitCode/AIBind/AIBind.py'>

### GPU Settings

In [5]:
# Snapshot of the GPU state: one list element per line of `nvidia-smi` output.
# The raw bytes are decoded (instead of wrapped in str(), which yields the
# "b'...'" repr and forces a split on the escaped "\\n"), so the first and
# last elements no longer carry repr artefacts.
subprocess.check_output(['nvidia-smi']).decode().splitlines()

["b'Thu Jul 15 09:06:07 2021       ",
 '+-----------------------------------------------------------------------------+',
 '| NVIDIA-SMI 418.87.01    Driver Version: 418.87.01    CUDA Version: 10.1     |',
 '|-------------------------------+----------------------+----------------------+',
 '| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |',
 '| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |',
 '|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |',
 '| N/A   75C    P0    34W /  70W |  14653MiB / 15079MiB |      0%      Default |',
 '+-------------------------------+----------------------+----------------------+',
 '|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |',
 '| N/A   77C    P0    33W /  70W |  14906MiB / 15079MiB |      0%      Default |',
 '+-------------------------------+----------------------+----------------------+',
 '|   2  Tesla T4            Off  | 00

In [6]:
# Expose only the GPU with index 3 to CUDA for code run from this kernel.
os.environ.update({"CUDA_VISIBLE_DEVICES": "3"})

### Siamese

#### Read In Train, Validation and Test Datasets

In [7]:
# Load the five pre-computed folds. Each fold contributes one DataFrame to
# every list below, so index i of each list belongs to run i.
split_dir = '/data/sars-busters-consolidated/GitData/VecNet_Unseen_Nodes/'

targets_test, edges_test = [], []
targets_validation, edges_validation = [], []
train_sets = []

# (destination list, file-name stem), in the order the files are read per fold
split_files = [(targets_test, 'test_unseen_nodes_'),
               (edges_test, 'test_unseen_edges_'),
               (targets_validation, 'validation_unseen_nodes_'),
               (edges_validation, 'validation_unseen_edges_'),
               (train_sets, 'train_')]

for run_number in tqdm(range(5)):
    for fold_frames, file_stem in split_files:
        fold_frames.append(pd.read_csv(split_dir + file_stem + str(run_number) + '.csv'))

  0%|          | 0/5 [00:00<?, ?it/s]

#### AIBind Object

In [8]:
# Drug and target node tables. The generic 'Label' column of each file is
# renamed to the identifier column name used for that node type in the rest of
# the notebook.
drugs = (pd.read_csv('/data/sars-busters-consolidated/chemicals/chemicals_01.csv')
           .rename(columns = {'Label' : 'InChiKey'}))

targets = (pd.read_csv('/data/sars-busters-consolidated/targets/amino/amino_01.csv')
             .rename(columns = {'Label' : 'target_aa_code'}))

In [9]:
# Make sure every target sequence that appears in the unseen-node test folds
# has a row in `targets`. Only the 'target_aa_code' column of `targets` is kept.
# NOTE(review): only `targets_test` is consulted here, not `targets_validation`
# -- confirm that validation targets are already present in `targets`.
known_targets = targets[['target_aa_code']]
test_target_codes = set(pd.concat(targets_test)['target_aa_code'])

# Sorted (by string form, so a stray NaN cannot break the comparison) because
# set iteration order is not stable between kernel sessions.
missing_target_codes = sorted(test_target_codes.difference(known_targets['target_aa_code']), key = str)

# The column is named at construction time so this also works when nothing is
# missing (e.g. when the cell is re-run): building an empty frame first and
# then assigning `.columns` raises a length-mismatch ValueError.
targets_to_add = pd.DataFrame({'target_aa_code' : missing_target_codes})

targets = pd.concat([known_targets, targets_to_add])

In [14]:
# Create the AIBind object used for every Siamese step below.
siamese_object = AIBind.AIBind(
    # Interactions: read from file, no in-memory frame supplied
    interactions_location = '/data/sars-busters-consolidated/GitData/interactions/Network_Derived_Negatives.csv',
    interactions = None,
    interaction_y_name = 'Y',

    # Absolute negatives: none supplied
    absolute_negatives_location = None,
    absolute_negatives = None,

    # Drug nodes: in-memory frame, no file location
    drugs_location = None,
    drugs_dataframe = drugs,
    drug_inchi_name = 'InChiKey',
    drug_smile_name = 'SMILE',

    # Target nodes: in-memory frame, no file location
    targets_location = None,
    targets_dataframe = targets,
    target_seq_name = 'target_aa_code',

    # Mol2Vec / ProtVec: neither a location nor a model is supplied
    mol2vec_location = None,
    mol2vec_model = None,
    protvec_location = None,
    protvec_model = None,

    # Per-fold splits loaded earlier in the notebook
    nodes_test = targets_test,
    nodes_validation = targets_validation,
    edges_test = edges_test,
    edges_validation = edges_validation,

    model_out_dir = '/data/sars-busters-consolidated/siamese/KFUD-Final/',
    debug = False,
)

#### Update Drugs and Targets 

In [15]:
# Encode the drugs and targets held by the object (no replacement frames are
# passed in); results are stored on the object rather than returned.
siamese_object.get_simaese_input_format(
    drugs_dataframe = None,
    targets_dataframe = None,
    drug_fingerprint_name = 'fingerprint',
    drug_out_encoding_name = 'padded_encoded_fingerprint',
    target_out_encoding_name = 'target_aa_encoded_padded_int',
    replace_dataframe = True,
    return_dataframes = False,
)

0it [00:00, ?it/s]

Number of drugs skipped :  0


  0%|          | 0/5186 [00:00<?, ?it/s]

  0%|          | 0/5186 [00:00<?, ?it/s]

  0%|          | 0/5186 [00:00<?, ?it/s]

#### Create Train Sets

In [16]:
# Build the per-fold train sets with the data-leak check switched on.
siamese_object.create_train_sets(
    unseen_nodes_flag = False,
    data_leak_check = True,
)

  0%|          | 0/5 [00:00<?, ?it/s]

Set :  0
Train - Test - Validation Overlap For Unseen Targets :  0
Train - Test - Validation Overlap For Unseen Edges :  0
Train Set :  (38125, 3)
Nodes Test :  (6491, 3)
Nodes Val :  (6492, 3)
Edge Test :  (3693, 3)
Edge Val :  (3693, 3)
Positive / Negatative Ratio :  0.5453366300514774

Set :  1
Train - Test - Validation Overlap For Unseen Targets :  0
Train - Test - Validation Overlap For Unseen Edges :  0
Train Set :  (37340, 3)
Nodes Test :  (6935, 3)
Nodes Val :  (6790, 3)
Edge Test :  (3603, 3)
Edge Val :  (3603, 3)
Positive / Negatative Ratio :  0.5286363450280427

Set :  2
Train - Test - Validation Overlap For Unseen Targets :  0
Train - Test - Validation Overlap For Unseen Edges :  0
Train Set :  (37119, 3)
Nodes Test :  (6329, 3)
Nodes Val :  (6184, 3)
Edge Test :  (3785, 3)
Edge Val :  (3785, 3)
Positive / Negatative Ratio :  0.51797325481536

Set :  3
Train - Test - Validation Overlap For Unseen Targets :  0
Train - Test - Validation Overlap For Unseen Edges :  0
Train Set

#### Train Embedder

In [None]:
# Train the Siamese embedder: 7 epochs of 50000 triplets, with the running
# loss reported every 10000 triplets.
siamese_object.train_siamese_embedder(
    model_name = 'siamese_embed',
    version = 0,
    epochs = 7,
    triplets_per_epoch = 50000,
    print_frequency = 10000,
    learning_rate = 0.00001,
    desired_input_dimension = (17000, 1),
    output_dimension = 128,
    interactive = True,
)

  0%|          | 0/5 [00:00<?, ?it/s]



  0%|          | 0/7 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Total Number Of Drugs : 6751
Total Number Of Usable Drugs (Drugs That Have Data On Both Binding & No Binding) : 5045
Total Number Of Targets : 3594
Total Number Of Usable Targets (Targets That Have Data On Both Binding & No Binding) : 2973


  0%|          | 0/2973 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch : 0 | Target Triplet : 0 | Average Loss : 0.5962993
Epoch : 0 | Target Triplet : 10000 | Average Loss : 0.4892436
Epoch : 0 | Target Triplet : 20000 | Average Loss : 0.48613766
Epoch : 0 | Target Triplet : 30000 | Average Loss : 0.48336825
Epoch : 0 | Target Triplet : 40000 | Average Loss : 0.48241845
Epoch Time : 3722.313366651535


0it [00:00, ?it/s]

Total Number Of Drugs : 6751
Total Number Of Usable Drugs (Drugs That Have Data On Both Binding & No Binding) : 5045
Total Number Of Targets : 3594
Total Number Of Usable Targets (Targets That Have Data On Both Binding & No Binding) : 2973


  0%|          | 0/2973 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch : 1 | Target Triplet : 0 | Average Loss : 0.5335914
Epoch : 1 | Target Triplet : 10000 | Average Loss : 0.46119907
Epoch : 1 | Target Triplet : 20000 | Average Loss : 0.4621303
Epoch : 1 | Target Triplet : 30000 | Average Loss : 0.46003363
Epoch : 1 | Target Triplet : 40000 | Average Loss : 0.45719236
Epoch Time : 3702.212413549423


0it [00:00, ?it/s]

Total Number Of Drugs : 6751
Total Number Of Usable Drugs (Drugs That Have Data On Both Binding & No Binding) : 5045
Total Number Of Targets : 3594
Total Number Of Usable Targets (Targets That Have Data On Both Binding & No Binding) : 2973


  0%|          | 0/2973 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch : 2 | Target Triplet : 0 | Average Loss : 0.34592527
Epoch : 2 | Target Triplet : 10000 | Average Loss : 0.42296866
Epoch : 2 | Target Triplet : 20000 | Average Loss : 0.42085376
Epoch : 2 | Target Triplet : 30000 | Average Loss : 0.4154806
Epoch : 2 | Target Triplet : 40000 | Average Loss : 0.40962476
Epoch Time : 3694.881034374237


0it [00:00, ?it/s]

Total Number Of Drugs : 6751
Total Number Of Usable Drugs (Drugs That Have Data On Both Binding & No Binding) : 5045
Total Number Of Targets : 3594
Total Number Of Usable Targets (Targets That Have Data On Both Binding & No Binding) : 2973


  0%|          | 0/2973 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch : 3 | Target Triplet : 0 | Average Loss : 0.76379704
Epoch : 3 | Target Triplet : 10000 | Average Loss : 0.37412432
Epoch : 3 | Target Triplet : 20000 | Average Loss : 0.36932674
Epoch : 3 | Target Triplet : 30000 | Average Loss : 0.36395848
Epoch : 3 | Target Triplet : 40000 | Average Loss : 0.3579379
Epoch Time : 3682.1627264022827


0it [00:00, ?it/s]

Total Number Of Drugs : 6751
Total Number Of Usable Drugs (Drugs That Have Data On Both Binding & No Binding) : 5045
Total Number Of Targets : 3594
Total Number Of Usable Targets (Targets That Have Data On Both Binding & No Binding) : 2973


  0%|          | 0/2973 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch : 4 | Target Triplet : 0 | Average Loss : 1.0579653
Epoch : 4 | Target Triplet : 10000 | Average Loss : 0.3226327
Epoch : 4 | Target Triplet : 20000 | Average Loss : 0.31703585
Epoch : 4 | Target Triplet : 30000 | Average Loss : 0.31383762
Epoch : 4 | Target Triplet : 40000 | Average Loss : 0.3084014
Epoch Time : 3663.602888584137


0it [00:00, ?it/s]

Total Number Of Drugs : 6751
Total Number Of Usable Drugs (Drugs That Have Data On Both Binding & No Binding) : 5045
Total Number Of Targets : 3594
Total Number Of Usable Targets (Targets That Have Data On Both Binding & No Binding) : 2973


  0%|          | 0/2973 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch : 5 | Target Triplet : 0 | Average Loss : 1.1542526
Epoch : 5 | Target Triplet : 10000 | Average Loss : 0.27633554
Epoch : 5 | Target Triplet : 20000 | Average Loss : 0.26859608
Epoch : 5 | Target Triplet : 30000 | Average Loss : 0.2641986
Epoch : 5 | Target Triplet : 40000 | Average Loss : 0.2597817
Epoch Time : 3645.786553621292


0it [00:00, ?it/s]

Total Number Of Drugs : 6751
Total Number Of Usable Drugs (Drugs That Have Data On Both Binding & No Binding) : 5045
Total Number Of Targets : 3594
Total Number Of Usable Targets (Targets That Have Data On Both Binding & No Binding) : 2973


  0%|          | 0/2973 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch : 6 | Target Triplet : 0 | Average Loss : 0.9447793
Epoch : 6 | Target Triplet : 10000 | Average Loss : 0.23826806
Epoch : 6 | Target Triplet : 20000 | Average Loss : 0.22791314
Epoch : 6 | Target Triplet : 30000 | Average Loss : 0.2250398
Epoch : 6 | Target Triplet : 40000 | Average Loss : 0.22233467
Epoch Time : 3633.884115934372
Total Time To Train : 25746.286742448807


  0%|          | 0/7 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Total Number Of Drugs : 6765
Total Number Of Usable Drugs (Drugs That Have Data On Both Binding & No Binding) : 4695
Total Number Of Targets : 3533
Total Number Of Usable Targets (Targets That Have Data On Both Binding & No Binding) : 2897


  0%|          | 0/2897 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch : 0 | Target Triplet : 0 | Average Loss : 0.40996754
Epoch : 0 | Target Triplet : 10000 | Average Loss : 0.4963722
Epoch : 0 | Target Triplet : 20000 | Average Loss : 0.49349818
Epoch : 0 | Target Triplet : 30000 | Average Loss : 0.4906447
Epoch : 0 | Target Triplet : 40000 | Average Loss : 0.48786315
Epoch Time : 3712.1016557216644


0it [00:00, ?it/s]

Total Number Of Drugs : 6765
Total Number Of Usable Drugs (Drugs That Have Data On Both Binding & No Binding) : 4695
Total Number Of Targets : 3533
Total Number Of Usable Targets (Targets That Have Data On Both Binding & No Binding) : 2897


  0%|          | 0/2897 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch : 1 | Target Triplet : 0 | Average Loss : 0.20200992
Epoch : 1 | Target Triplet : 10000 | Average Loss : 0.4705561
Epoch : 1 | Target Triplet : 20000 | Average Loss : 0.46793824
Epoch : 1 | Target Triplet : 30000 | Average Loss : 0.46535704
Epoch : 1 | Target Triplet : 40000 | Average Loss : 0.46048012
Epoch Time : 3716.6999044418335


0it [00:00, ?it/s]

Total Number Of Drugs : 6765
Total Number Of Usable Drugs (Drugs That Have Data On Both Binding & No Binding) : 4695
Total Number Of Targets : 3533
Total Number Of Usable Targets (Targets That Have Data On Both Binding & No Binding) : 2897


  0%|          | 0/2897 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch : 2 | Target Triplet : 0 | Average Loss : 0.4395693
Epoch : 2 | Target Triplet : 10000 | Average Loss : 0.4220299
Epoch : 2 | Target Triplet : 20000 | Average Loss : 0.41514245
Epoch : 2 | Target Triplet : 30000 | Average Loss : 0.40555066
Epoch : 2 | Target Triplet : 40000 | Average Loss : 0.3985644
Epoch Time : 3703.8596143722534


0it [00:00, ?it/s]

Total Number Of Drugs : 6765
Total Number Of Usable Drugs (Drugs That Have Data On Both Binding & No Binding) : 4695
Total Number Of Targets : 3533
Total Number Of Usable Targets (Targets That Have Data On Both Binding & No Binding) : 2897


  0%|          | 0/2897 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch : 3 | Target Triplet : 0 | Average Loss : 0.0
Epoch : 3 | Target Triplet : 10000 | Average Loss : 0.34926072
Epoch : 3 | Target Triplet : 20000 | Average Loss : 0.34438124
Epoch : 3 | Target Triplet : 30000 | Average Loss : 0.33898035
Epoch : 3 | Target Triplet : 40000 | Average Loss : 0.33251932
Epoch Time : 3684.033591747284


0it [00:00, ?it/s]

Total Number Of Drugs : 6765
Total Number Of Usable Drugs (Drugs That Have Data On Both Binding & No Binding) : 4695
Total Number Of Targets : 3533
Total Number Of Usable Targets (Targets That Have Data On Both Binding & No Binding) : 2897


  0%|          | 0/2897 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch : 4 | Target Triplet : 0 | Average Loss : 0.0
Epoch : 4 | Target Triplet : 10000 | Average Loss : 0.2846207


#### Train Decoder

In [None]:
# Full paths of everything stored in the model output directory; handed to the
# decoder-training and test cells as `embedding_model_list`.
# NOTE(review): this picks up every entry in the directory -- confirm it holds
# only the embedding models when this cell is run.
out_dir = '/data/sars-busters-consolidated/siamese/KFUD-Final/'

# os.listdir returns entries in arbitrary order; sorting makes the list (and
# therefore the position of each model in it) reproducible between runs.
embedding_model_list = [out_dir + model for model in sorted(os.listdir(out_dir))]
embedding_model_list

In [None]:
# Train the decoder on top of the saved embedding models listed in
# `embedding_model_list`.
siamese_object.train_siamese_decoder(
    model_name = 'siamese_model',
    version = 0,
    epochs = 30,
    batch_size = 16,
    embedding_model_list = embedding_model_list,

    # Input encodings and output embedding columns for each node type
    drug_input_encoding_name = 'padded_encoded_fingerprint',
    drug_output_embedding_name = 'normalized_embeddings',
    target_input_encoding_name = 'target_aa_encoded_padded_int',
    target_output_embedding_name = 'normalized_embeddings',
    desired_input_dimension = (17000, 1),

    chunk_split_size = 500,
    chunk_test_frequency = 250,
    interactive = True,
)

#### Validation 

In [None]:
# Show the validation results and also write the plot out as HTML.
siamese_object.get_validation_results(
    model_name = None,
    show_plots = True,
    plot_title = 'Validation Results - 5 Fold Cross Validation',
    num_cols = 2,
    plot_height = 1500,
    plot_width = 1500,
    write_plot_to_html = True,
    plot_dir = './',
    plot_name = './Siamese_Unseen_Targets',
)

#### Test Results

In [None]:
# Evaluate on the test sets with no drug/target filtering, using the per-run
# embedding models, and write the plot to disk.
siamese_object.get_test_results(
    model_name = None,
    version_number = None,
    optimal_validation_model = None,
    drug_filter_list = [],
    target_filter_list = [],

    # Plot output
    write_plot_to_disk = True,
    plot_dir = './',
    plot_name = 'f1_curves_siamese',

    # Embedding models and the columns they read from / write to
    per_run_embedding = True,
    embedding_model_list = embedding_model_list,
    drug_input_encoding_name = 'padded_encoded_fingerprint',
    drug_output_embedding_name = 'normalized_embeddings',
    target_input_encoding_name = 'target_aa_encoded_padded_int',
    target_output_embedding_name = 'normalized_embeddings',
    desired_input_dimension = (17000, 1),
)

#### Prediction 

In [50]:
# Build the prediction set: every one of the first `n_prediction_drugs` drugs
# paired with every sequence in the SARS-CoV-2 targets file.
sars_targets = pd.read_csv('/data/External Predictions/SARS Sequences/20201203_Targets_Sequences_SARS_Cov2.csv')

n_prediction_drugs = 500
prediction_drug_keys = list(drugs['InChiKey'])[:n_prediction_drugs]
sars_sequences = list(sars_targets['Sequence'])

# Drug-major order: each drug is repeated once per sequence while the sequence
# list is cycled once per drug. Built in a single pass instead of re-copying
# both lists on every loop iteration.
sars_d_list = [drug_key for drug_key in prediction_drug_keys for _seq in sars_sequences]
sars_t_list = sars_sequences * len(prediction_drug_keys)

# Naming the columns at construction keeps this valid for empty inputs too.
predict_df = pd.DataFrame({'InChiKey' : sars_d_list, 'target_aa_code' : sars_t_list})

# NOTE(review): keep = False removes *every* copy of a duplicated pair, so a
# duplicated sequence or InChiKey is dropped from the predictions entirely.
# Confirm this is intended rather than keeping one copy.
predict_df = predict_df.drop_duplicates(keep = False)
predict_df

Unnamed: 0,InChiKey,target_aa_code
0,XLYOFNOQVPJJNP-UHFFFAOYSA-N,MPAAAGDGLLGEPAAPGGGGGAEDAARPAAACEGSFLPAWVSGVPR...
1,XLYOFNOQVPJJNP-UHFFFAOYSA-N,MGSGSSSYRPKAIYLDIDGRIQKVIFSKYCNSSDIMDLFCIATGLP...
2,XLYOFNOQVPJJNP-UHFFFAOYSA-N,MPTTIEREFEELDTQRRWQPLYLEIRNESHDYPHRVAKFPENRNRN...
3,XLYOFNOQVPJJNP-UHFFFAOYSA-N,MAAQRRSLLQSEQQPSWTDDLPLCHLSGVGSASNRSYSADGKGTES...
4,XLYOFNOQVPJJNP-UHFFFAOYSA-N,MARGSALPRRPLLCIPAVWAAAALLLSVSRTSGEVEVLDPNDPLGP...
...,...,...
16495,QZNPNKJXABGCRC-LFRDXLMFSA-N,MANFQEHLSCSSSPHLPFSESKTFNGLQDELTAMGNHPSPKLLEDQ...
16496,QZNPNKJXABGCRC-LFRDXLMFSA-N,MDSSAVITQISKEEARGPLRGKGDQKSAASQKPRSRGILHSLFCCV...
16497,QZNPNKJXABGCRC-LFRDXLMFSA-N,MGQQVGRVGEAPGLQQPQPRGIRGSSAARPSGRRRDPAGRTTETGF...
16498,QZNPNKJXABGCRC-LFRDXLMFSA-N,MGTVLSLSPSYRKATLFEDGAATVGHYTAVQNSKNAKDKNLKRHSI...


In [37]:
# NOTE(review): `vaenet_object` is not created anywhere in the visible part of
# this notebook (the object built here is `siamese_object`), so this cell
# raises NameError on a fresh kernel unless it is defined elsewhere -- confirm
# which object is intended.
# Point the object at the ProtVec file before requesting embeddings.
vaenet_object.protvec_location = '/home/sars-busters/Mol2Vec/Results/protVec_100d_3grams.csv'

# Compute embeddings for the rows of `predict_df`; the result is returned
# rather than stored on the object (replace_dataframe = False).
sars_embeddings = vaenet_object.get_protvec_embeddings(prediction_interactions = predict_df,
                                                       embedding_dimension = 100,
                                                       replace_dataframe = False,
                                                       return_normalisation_conststants = False,
                                                       delimiter = '\t')

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [51]:
# Display the embeddings computed for the prediction targets.
sars_embeddings

Unnamed: 0,target_aa_code,normalized_embeddings
0,MPAAAGDGLLGEPAAPGGGGGAEDAARPAAACEGSFLPAWVSGVPR...,"[-18.630093000000002, -2.896413, -3.0235929999..."
1,MGSGSSSYRPKAIYLDIDGRIQKVIFSKYCNSSDIMDLFCIATGLP...,"[-12.565796999999996, -2.817929999999998, -1.3..."
2,MPTTIEREFEELDTQRRWQPLYLEIRNESHDYPHRVAKFPENRNRN...,"[-8.348188, -3.7263290000000002, -1.7557660000..."
3,MAAQRRSLLQSEQQPSWTDDLPLCHLSGVGSASNRSYSADGKGTES...,"[-12.543451000000006, -4.744106, -0.1855949999..."
4,MARGSALPRRPLLCIPAVWAAAALLLSVSRTSGEVEVLDPNDPLGP...,"[-22.265352, -4.667877, -7.002666000000003, -2..."
5,MEAPLRPAADILRRNPQQDYELVQRVGSGTYGDVYKARNVHTGELA...,"[-19.799501999999997, -2.3652699999999993, -0...."
6,MEPPRGPPANGAEPSRAVGTVKVYLPNKQRTVVTVRDGMSVYDSLD...,"[-11.395638999999996, -1.5503529999999994, -5...."
7,MNFNTILEEILIKRSQQKKKTSPLNYKERLFVLTKSMLTYYEGRAE...,"[-11.858782999999999, 0.575775, -2.40721400000..."
8,MSEVPVARVWLVLLLLTVQVGVTAGAPWQCAPCSAEKLALCPPVSA...,"[-4.9076900000000006, -1.9370750000000003, -2...."
9,MEPSRALLGCLASAAAAAPPGEDGAGAGAEEEEEEEEEAAAAVGPG...,"[-26.994364999999995, -9.187720999999994, -3.6..."


In [53]:
# Run the prediction over `predict_df`, supplying the target embeddings
# computed earlier; no drug embeddings are supplied.
# NOTE(review): `vaenet_object` is not created anywhere in the visible part of
# this notebook -- confirm which object is intended.
vaenet_object.get_fold_averaged_prediction_results(
    model_name = None,
    version_number = None,
    model_paths = [],
    optimal_validation_model = None,
    test_sets = [predict_df],

    # Drug embeddings
    get_drug_embed = False,
    pred_drug_embeddings = None,
    drug_embed_normalized = False,

    # Target embeddings
    get_target_embed = True,
    pred_target_embeddings = sars_embeddings,
    target_embed_normalized = False,

    drug_filter_list = [],
    target_filter_list = [],
    return_dataframes = False,
)


Testing on model :  /data/sars-busters-consolidated/test_remove_if_needed/Run_0/vaenet_ds2_5_fold_2x_v00_run0_06-18_17h58_epoch_0_idx_0.model
filtered_nodes_test :  (16500, 2)
Drugs :  500
Targets :  33
pred_drugs_dataframe :  (629, 2)
X0, X1 :  (16500, 100) (16500, 200)
unseen_targets_pred :  (16500,)
list :  16500
