### Imports

In [1]:
import time
import random
import json
import lxml
import importlib
import os
import subprocess

import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
import pandas as pd
import numpy as np
import pickle as pkl
import plotly.graph_objects as go
import umap.umap_ as umap
import tensorflow.keras

from plotly.subplots import make_subplots
from tqdm import tqdm
from tqdm.notebook import tqdm
from pandarallel import pandarallel
from ast import literal_eval
from mpl_toolkits.mplot3d import Axes3D
from collections import Counter

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole

from gensim.models import Word2Vec
from gensim.models import word2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import FastText
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec


from mol2vec import features
from mol2vec import helpers
from mol2vec.features import mol2alt_sentence, MolSentence, DfVec, sentences2vec
from mol2vec.helpers import depict_identifier, plot_2D_vectors, IdentifierTable, mol_to_svg

from Bio import SeqUtils

from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Activation, Dropout, Conv1D, Flatten, MaxPooling1D,\
                        AveragePooling1D, Concatenate, LeakyReLU, Embedding,\
                        GlobalMaxPooling1D,GlobalAveragePooling1D,GaussianNoise,BatchNormalization,Add
from tensorflow.keras.initializers import glorot_uniform

from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.metrics import confusion_matrix, f1_score, classification_report


from IPython.core.display import display, HTML
pandarallel.initialize(progress_bar = True)
tqdm.pandas()


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.


Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.

Traceback (most recent call last):
  File "/miniconda/lib/python3.6/site-packages/rdkit/Chem/PandasTools.p

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### GPU Settings

In [2]:
str(subprocess.check_output('nvidia-smi', shell = True)).split('\\n')

["b'Tue Jun  1 16:20:38 2021       ",
 '+-----------------------------------------------------------------------------+',
 '| NVIDIA-SMI 418.87.01    Driver Version: 418.87.01    CUDA Version: 10.1     |',
 '|-------------------------------+----------------------+----------------------+',
 '| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |',
 '| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |',
 '|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |',
 '| N/A   66C    P0    30W /  70W |  14688MiB / 15079MiB |      0%      Default |',
 '+-------------------------------+----------------------+----------------------+',
 '|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |',
 '| N/A   77C    P0    35W /  70W |   1717MiB / 15079MiB |      0%      Default |',
 '+-------------------------------+----------------------+----------------------+',
 '|   2  Tesla T4            Off  | 00

In [3]:
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

### Class Definition

In [4]:
class AIBind():
    
    # Class Initialisation
    def __init__(self,
                 
                 interactions_location = None,
                 interactions = None,
                 interaction_y_name = 'Y',
                 
                 drugs_location = None,
                 drugs_dataframe = None,
                 drug_inchi_name = None,
                 drug_smile_name = None,
                 
                 targets_location = None,
                 targets_dataframe = None, 
                 target_seq_name = None,
                 
                 mol2vec_location = None,
                 mol2vec_model = None,
                 
                 protvec_location = None, 
                 protvec_model = None,
                 
                 nodes_test = None, 
                 nodes_validation = None, 
                 
                 edges_test = None, 
                 edges_validation = None, 
                 
                 model_out_dir = None,
                 
                 debug = False):
        
        '''
         Class initialisation
         
         Inputs : 
             
             Optional - one of two below
                 interactions_location : String - Location of interactions file (CSV / Pickle)
                 interactions : Pandas DataFrame - Interactions dataframe
             
             interaction_y_name : String - Column name for true variable in interactions file

             Optional - one of two below
                 drugs_location : String - Location of drugs file (CSV / Pickle)
                 drugs_dataframe : Pandas DataFrame - Drugs DataFrame
             drug_inchi_name : String - Column name of field that contains the InChi Key 
             drug_smile_name : String - Column name of field that contains the chemical SMILE

             Optional - one of two below
                 targets_location : String - Location of targets file (CSV / Pickle)
                 targets_dataframe : Pandas DataFrame - Targets DataFrame
             target_seq_name : String - Column name of field that contains the amino acid sequence

             Optional - one of two below
                 mol2vec_location : String - Location of Mol2Vec model file
                 mol2vec_model : Word2Vec - Word2Vec model
 
             Optional - one of two below
                 protvec_location : String - Location of ProtVec model file 
                 protvec_model : Pandas DataFrame - ProtVec model DataFrame

             nodes_test : List - List of DataFrames of test set where all nodes must be unseen in the train set
             nodes_validation : List - List of DataFrames of validation set where all nodes must be unseen in the train set

             edges_test : List - List of DataFrames of test set where the rows must be unseen in the train set
             edges_validation : List - List of DataFrames of validation set where the rows must be unseen in the train set

             model_out_dir : String - Path to save trained models
             
             debug : Bool - Flag to print debug lines
         
        '''
        
        # Set Variables
        self.interactions_location = interactions_location
        self.interactions = interactions
        self.interaction_y_name = interaction_y_name
        
        self.drugs_location = drugs_location
        self.drugs_dataframe = drugs_dataframe 
        self.drug_inchi_name = drug_inchi_name
        self.drug_smile_name = drug_smile_name
        
        self.targets_location = targets_location
        self.targets_dataframe = targets_dataframe
        self.target_seq_name = target_seq_name
        
        self.mol2vec_location = mol2vec_location
        self.mol2vec_model = mol2vec_model
        
        self.protvec_location = protvec_location
        self.protvec_model = protvec_model
        
        self.nodes_test = nodes_test
        self.nodes_validation = nodes_validation
        self.edges_test = edges_test
        self.edges_validation = edges_validation
        
        self.model_out_dir = model_out_dir
        
        self.debug = debug
        
        # Read In Drugs 
        if type(self.drugs_dataframe) == type(None):
            self.drugs_dataframe = self.read_input_files(self.drugs_location)
        
        # Read In Targets
        if type(self.targets_dataframe) == type(None):
            self.targets_dataframe = self.read_input_files(self.targets_location)

        # Create Drug Target Lists
        self.drug_list = list(self.drugs_dataframe[self.drug_inchi_name])
        self.target_list = list(self.targets_dataframe[self.target_seq_name])
        
        # Read In Interactions File
        if type(self.interactions) == type(None):
            self.interactions = self.read_input_files(self.interactions_location)
            
        # Column Name Assertions 
        assert self.drug_inchi_name in self.interactions.columns, "Please ensure columns with InChi Keys have the same name across all dataframes"
        assert self.drug_inchi_name in self.drugs_dataframe.columns, "Please ensure columns with InChi Keys have the same name across all dataframes"
        assert self.drug_inchi_name in self.nodes_test[0].columns, "Please ensure columns with InChi Keys have the same name across all dataframes"
        assert self.drug_inchi_name in self.nodes_validation[0].columns, "Please ensure columns with InChi Keys have the same name across all dataframes"
        assert self.drug_inchi_name in self.edges_test[0].columns, "Please ensure columns with InChi Keys have the same name across all dataframes"
        assert self.drug_inchi_name in self.edges_validation[0].columns, "Please ensure columns with InChi Keys have the same name across all dataframes"
        
        assert self.target_seq_name in self.interactions.columns, "Please ensure columns with Amino Acid Sequences have the same name across all dataframes"
        assert self.target_seq_name in self.targets_dataframe.columns, "Please ensure columns with Amino Acid Sequences have the same name across all dataframes"
        assert self.target_seq_name in self.nodes_test[0].columns, "Please ensure columns with Amino Acid Sequences have the same name across all dataframes"
        assert self.target_seq_name in self.nodes_validation[0].columns, "Please ensure columns with Amino Acid Sequences have the same name across all dataframes"
        assert self.target_seq_name in self.edges_test[0].columns, "Please ensure columns with Amino Acid Sequences have the same name across all dataframes"
        assert self.target_seq_name in self.edges_validation[0].columns, "Please ensure columns withAmino Acid Sequences have the same name across all dataframes"
    
    
    ###################################################
    ############    General Functions      ############
    ###################################################
    
    # Read Input Files 
    def read_input_files(self, input_location):
        
        '''
        Reads in files into a dataframe given a file location. Currently works with CSV and Pickle files. 
        
        Inputs : 
            input_location : String - Location of file to read in - accepts only CSV and Pickle files
        Outputs : 
            Pandas DatraFrame 
        
        '''
        
        assert type(input_location) == type(""), 'Location should be of type str'
        
        if input_location.split('.')[-1] == 'pkl':
            with open(input_location, 'rb') as file: 
                return pkl.load(file)
                
        elif input_location.split('.')[-1] == 'csv':
            return pd.read_csv(input_location)
        
        else:
            raise TypeError("Unknown input file type, only pkl and csv are supported")
          
    def create_test_splits(self):
        None
    
    def create_train_sets(self, unseen_nodes_flag = True, data_leak_check = True):    
        
        self.train_sets = []
        self.train_pos_neg_ratio = []

        for i in tqdm(range(len(self.nodes_test))):

            # Unseen Targets
            unseen_targets = list(set(self.nodes_test[i][self.target_seq_name])) + list(set(self.nodes_validation[i][self.target_seq_name]))
            
            # Unseen Drugs
            unseen_drugs = list(set(self.nodes_test[i][self.drug_inchi_name])) + list(set(self.nodes_validation[i][self.drug_inchi_name]))

            # Seen Targets
            seen_targets = set(self.targets_dataframe[self.target_seq_name]).difference(unseen_targets)
            
            # Seen Drugs
            seen_drugs = set(drugs[self.drug_inchi_name]).difference(unseen_drugs)

            # Seen Targets 
            seen_target_df = self.interactions[self.interactions[self.target_seq_name].isin(seen_targets)]
            seen_target_df = seen_target_df[[self.drug_inchi_name, self.target_seq_name, self.interaction_y_name]]

            # Create dataframe with train interactions
            # pd.concat + drop duplicates amounts to a set interesection
            train_interactions = pd.concat([seen_target_df,
                                            self.edges_test[i],
                                            self.edges_test[i],
                                            self.edges_validation[i],
                                            self.edges_validation[i]]).drop_duplicates(keep = False)
            
            # Ensure unseen nodes if flag is on, else train sets only satisfy unseen targets criteria
            if unseen_nodes_flag: 
                # Ensure Unseen Drugs
                train_interactions = train_interactions.reset_index(drop = True)
                drop_index = []
                for idx, row in tqdm(train_interactions.iterrows()):
                    if row[self.drug_inchi_name] in unseen_drugs:
                        drop_index.append(idx)
                train_interactions.drop(train_interactions.index[drop_index], inplace = True)

            self.train_sets.append(train_interactions)
            self.train_pos_neg_ratio.append(1 / np.divide(*np.array(train_interactions['Y'].value_counts().values)))
            
        # Sanity check section
        if data_leak_check:
            for i in range(len(self.nodes_test)):

                print ("Set : ", i)

                # No Overlap Between Unseen Nodes and Train
                unseen_targets = list(set(self.nodes_test[i][self.target_seq_name])) + list(set(self.nodes_validation[i][self.target_seq_name]))
                print ("Train - Test - Validation Overlap For Unseen Targets : ", len(list(set(self.train_sets[i][self.target_seq_name]).intersection(unseen_targets))))

                if unseen_nodes_flag:
                    # No overlap Between Drugs
                    unseen_drugs = list(set(self.nodes_test[i][self.drug_inchi_name])) + list(set(self.nodes_validation[i][self.drug_inchi_name]))
                    print ("Train - Test - Validation Overlap For Unseen Drugs : ", len(list(set(self.train_sets[i][self.drug_inchi_name]).intersection(unseen_drugs))))


                # No Overlap Between Unseen Edges and Train
                train_edges = list(zip(list(self.train_sets[i][self.drug_inchi_name]), list(self.train_sets[i][self.target_seq_name])))
                temp_df = pd.concat([self.edges_test[i], self.edges_validation[i]])
                test_edges = list(zip(list(temp_df[self.drug_inchi_name]), list(temp_df[self.target_seq_name])))
                train_edges = set(train_edges)
                test_edges = set(test_edges)
                print ("Train - Test - Validation Overlap For Unseen Edges : ", len(list(train_edges.intersection(test_edges))))

                print ("Train Set : ", self.train_sets[i].shape)
                print ("Nodes Test : ", self.nodes_test[i].shape)
                print ("Nodes Val : ", self.nodes_validation[i].shape)
                print ("Edge Test : ", self.edges_test[i].shape)
                print ("Edge Val : ", self.edges_validation[i].shape)
                print ("Positive / Negatative Ratio : ", self.train_pos_neg_ratio[i])
                print ("")
                
    def dataframe_to_embed_array(self, interactions_df, drug_list, target_list, drug_embed_len):
    
        X_0_list = []
        X_1_list = []

        skipped_drugs = 0
        
        # Iterate over all rows in dataframe
        for idx, row in interactions_df.iterrows():
            
            # Get InChiKey and AA Sequence
            drug = row[self.drug_inchi_name]
            target = row[self.target_seq_name]
            
            # Get drug index for this drug in drug_list
            try:
                drug_index = drug_list.index(drug)
            except: 
                drug_index = -1
            
            # Get target index for this target in target_list
            target_index = target_list.index(target)
        
            # Index into target embedding array and add to X_0
            X_0_list.append(self.normalized_target_embeddings[target_index])
            
            # If drug index not found, add random vector to X_1
            if drug_index == -1:
                X_1_list.append(np.random.randn(drug_embed_len,))
                skipped_drugs = skipped_drugs + 1
            else:
                # Index into drug embedding array and add to X_1
                try:
                    X_1_list.append(self.normalized_drug_embeddings[drug_index])
                # If drug index not found, add random vector to X_1
                except: 
                    X_1_list.append(np.random.randn(drug_embed_len,))
                    skipped_drugs = skipped_drugs + 1
        
        # Convert lists to arrays
        X_0 = np.array(X_0_list)
        X_1 = np.array(X_1_list)
        Y   = np.array(list(interactions_df['Y']))

        if self.debug:
            print ("Number of drugs skipped : ", skipped_drugs)

        return X_0, X_1, Y
    
    def get_validation_results(self, model_name = None, show_plots = True, plot_title = None, num_cols = 2, plot_height = 1500, plot_width = 1500, write_plot_to_html = False, plot_dir = None, plot_name = None):

        self.averaged_results = {}
        
        if type(model_name) == type(None):
            model_name = list(self.results.keys())[0]

        num_rows = (len(self.train_sets) // num_cols) + (len(self.train_sets) % num_cols)

        fig = make_subplots(
            rows = num_rows, cols = num_cols,
            subplot_titles = ['temp' for _ in range(num_rows * num_cols)])

        row_counter = 1
        col_counter = 1

        # Get length of the x axis to ensure avergaes make sense 
        x_length = [len(self.results[model_name][run]['val_auc_ut']) for run in self.results[model_name].keys()]
        # Pick the length that is most common to compute aligned averages
        x_length = list(Counter(x_length))[0]

        for run in self.results[model_name].keys():

            # Plot legend only once
            if run == 0:
                legend = True
            else: 
                legend = False
                
            # X axis list
            x_list = [x for x in range(len(self.results[model_name][run]['val_auc_ut']))]

            # Ensure lengths match up 
            if len(x_list) == x_length:

                # Save validation AUC averaged scores for Unseen Nodes
                if 'val_auc_ut' in self.averaged_results:
                    self.averaged_results['val_auc_ut'] = self.averaged_results['val_auc_ut'] + np.array(self.results[model_name][run]['val_auc_ut']).reshape(-1, 1)
                elif 'val_auc_ut' not in self.averaged_results: 
                    self.averaged_results['val_auc_ut'] = np.array(self.results[model_name][run]['val_auc_ut']).reshape(-1, 1)

                # Save validation AUC averaged scores for Unseen Edges
                if 'val_auc_ue' in self.averaged_results:
                    self.averaged_results['val_auc_ue'] = self.averaged_results['val_auc_ue'] + np.array(self.results[model_name][run]['val_auc_ue']).reshape(-1, 1)
                elif 'val_auc_ue' not in self.averaged_results: 
                    self.averaged_results['val_auc_ue'] = np.array(self.results[model_name][run]['val_auc_ue']).reshape(-1, 1)

                # Save validation AUP averaged scores for Unseen Nodes
                if 'val_aup_ut' in self.averaged_results:
                    self.averaged_results['val_aup_ut'] = self.averaged_results['val_aup_ut'] + np.array(self.results[model_name][run]['val_aup_ut']).reshape(-1, 1)
                elif 'val_aup_ut' not in self.averaged_results: 
                    self.averaged_results['val_aup_ut'] = np.array(self.results[model_name][run]['val_aup_ut']).reshape(-1, 1)

                # Save validation AUP averaged scores for Unseen Edges
                if 'val_aup_ue' in self.averaged_results:
                    self.averaged_results['val_aup_ue'] = self.averaged_results['val_aup_ue'] + np.array(self.results[model_name][run]['val_aup_ue']).reshape(-1, 1)
                elif 'val_aup_ue' not in self.averaged_results: 
                    self.averaged_results['val_aup_ue'] = np.array(self.results[model_name][run]['val_aup_ue']).reshape(-1, 1)

            if show_plots:
                # Plot validation AUC for Unseen Nodes    
                

                fig.add_trace(go.Scatter(x = x_list,
                                         y = self.results[model_name][run]['val_auc_ut'],
                                         mode = 'lines',
                                         name = 'Unseen Targets AUC',
                                         line_color = 'deepskyblue',
                                         legendgroup = str(run),
                                         showlegend = legend),
                             row = row_counter,
                             col = col_counter )


                # Plot validation AUC for Unseen Edges
                fig.add_trace(go.Scatter(x = x_list,
                                         y = self.results[model_name][run]['val_auc_ue'],
                                         mode = 'lines',
                                         name = 'Unseen Edges AUC',
                                         line_color = 'blue',
                                         legendgroup = str(run),
                                         showlegend = legend),
                             row = row_counter,
                             col = col_counter )



                # Plot validation AUP for Unseen Nodes
                fig.add_trace(go.Scatter(x = x_list,
                                         y = self.results[model_name][run]['val_aup_ut'],
                                         mode = 'lines',
                                         name = 'Unseen Targets AUP',
                                         line_color = 'red',
                                         legendgroup = str(run),
                                         showlegend = legend),
                             row = row_counter,
                             col = col_counter )


                # Plot validation AUP for Unseen Edges
                fig.add_trace(go.Scatter(x = x_list,
                                         y = self.results[model_name][run]['val_aup_ue'],
                                         mode = 'lines',
                                         name = 'Unseen Edges AUP',
                                         line_color = 'green',
                                         legendgroup = str(run),
                                         showlegend = legend),
                             row = row_counter,
                             col = col_counter)

    
                fig.update_xaxes(title_text = "Epochs * Chunks", row = row_counter, col = col_counter)
                fig.update_yaxes(title_text = "Performance", row = row_counter, col = col_counter)
                fig.layout.annotations[run]['text'] = model_name + " Run " + str(run)

                if col_counter == num_cols: 
                    col_counter = 1
                    row_counter = row_counter + 1
                else: 
                    col_counter = col_counter + 1

            

            # Averaged Results Plot
            avg_fig = go.Figure()

            x_list = [x for x in range(len(self.averaged_results['val_auc_ut']))]

            avg_fig.add_trace(go.Scatter(x = x_list,
                                         y = (self.averaged_results['val_auc_ut'] / len(x_list)).ravel(),
                                         mode = 'lines',
                                         name = 'Unseen Targets AUC',
                                         line_color = 'deepskyblue'),
                         )

            avg_fig.add_trace(go.Scatter(x = x_list,
                                         y = (self.averaged_results['val_auc_ue'] / len(x_list)).ravel(),
                                         mode = 'lines',
                                         name = 'Unseen Edges AUC',
                                         line_color = 'blue'),
                         )

            avg_fig.add_trace(go.Scatter(x = x_list,
                                         y = (self.averaged_results['val_aup_ut'] / len(x_list)).ravel(),
                                         mode = 'lines',
                                         name = 'Unseen Targets AUP',
                                         line_color = 'red'),
                         )

            avg_fig.add_trace(go.Scatter(x = x_list,
                                         y = (self.averaged_results['val_aup_ue'] / len(x_list)).ravel(),
                                         mode = 'lines',
                                         name = 'Unseen Edges AUP',
                                         line_color = 'green'),
                         )

            


        
        # Optimal epoch
        perf = np.zeros((self.averaged_results['val_aup_ue'].shape[0], 4))
        ut_c = 0
        ut_p = 1
        ue_c = 2
        ue_p = 3

        perf[:, ut_c] = self.averaged_results['val_auc_ut'].ravel()
        perf[:, ut_p] = self.averaged_results['val_aup_ut'].ravel()
        perf[:, ue_c] = self.averaged_results['val_auc_ue'].ravel()
        perf[:, ue_p] = self.averaged_results['val_aup_ue'].ravel()
        perf = perf / self.averaged_results['val_aup_ue'].shape[0]

        # UT AUC + UE AUC
        edge_target = np.argmax(np.sum(perf[:, [ut_c, ue_c]], axis = 1))

        # UT AUC + UT AUP
        target_only = np.argmax(np.sum(perf[:, [ut_c, ut_p]], axis = 1))

        # UE AUC + UE AUP
        edge_only = np.argmax(np.sum(perf[:, [ue_c, ue_p]], axis = 1))

        print ("(Epoch * Chunk) With Highest Unseen Node and Edge Score : ", edge_target)
        print ("(Epoch * Chunk) With Highest Unseen Node Score : ", target_only)
        print ("(Epoch * Chunk) With Highest Unseen Edge Score : ", edge_target)
        
        ut_auc = []
        ut_aup = []
        ue_auc = []
        ue_aup = []

        model_key = model_name
        best_model = edge_target

        for run in self.results[model_key].keys():

            ut_auc.append(self.results[model_key][run]['val_auc_ut'][best_model])
            ut_aup.append(self.results[model_key][run]['val_aup_ut'][best_model])
            ue_auc.append(self.results[model_key][run]['val_auc_ue'][best_model])
            ue_aup.append(self.results[model_key][run]['val_aup_ue'][best_model])

        print ("Validation Performance")
        print ("Best Model Suffix : ", self.model_name_index[model_name][best_model])
        print ("Unseen Node AUC : ", np.mean(ut_auc), "+/-", np.std(ut_auc))
        print ("Unseen Node AUP : ", np.mean(ut_aup), "+/-", np.std(ut_aup))
        print ("Unseen Edges AUC : ", np.mean(ue_auc), "+/-", np.std(ue_auc))
        print ("Unseen Edges AUP : ", np.mean(ue_aup), "+/-", np.std(ue_aup))
        
        self.optimal_validation_model = best_model
        
        
        if show_plots:
            fig.update_layout(title_text = plot_title, 
                                  height = plot_height,
                                  width = plot_width,
                                  showlegend = True)
            fig.show()
            
            avg_fig.update_layout(title_text = plot_title + " - Averaged Results Across " + str(len(x_list)) + " Runs", 
                              xaxis_title_text = 'Epochs * Chunks',
                              yaxis_title_text = 'Performance',
                              showlegend = True)
            avg_fig.show()

            if write_plot_to_html:
                fig.write_html(plot_dir.rstrip('/') + plot_name + '_k_fold_split_plots.html')
                avg_fig.write_html(plot_dir.rstrip('/') + plot_name + '_averaged_results_plots.html')

    def get_test_results(self, model_name = None, optimal_validation_model = None, drug_filter_list = [], target_filter_list = []):
        
        # Initialise dictionary
        try: 
            self.test_results
        except: 
            self.test_results = {}
            
        if type(model_name) == type(None):
                model_name = list(self.results.keys())[0]
        if type(optimal_validation_model) == type(None):    
            optimal_validation_model = self.optimal_validation_model
            
        if model_name not in self.test_results.keys():
            self.test_results[model_name] = {}
        
        for run_number in range(len(self.train_sets)):
                
            model_prefix = "_".join(os.listdir(self.model_out_dir.rstrip('/') + '/Run_' + str(run_number))[0].split('_')[:-4])
            model_suffix = self.model_name_index[model_name][optimal_validation_model]
            model_location = model_prefix + model_suffix
            
            drug_embed_len = self.normalized_drug_embeddings[0].shape[0]
            
            filtered_nodes_test = self.nodes_test[run_number]
            filtered_edges_test = self.edges_test[run_number]
            
            if drug_filter_list != [] and target_filter_list != []:
                filtered_nodes_test = filtered_nodes_test[(filtered_nodes_test[self.drug_inchi_name].isin(drug_filter_list)) & (filtered_nodes_test[self.target_seq_name].isin(target_filter_list))]
                filtered_edges_test = filtered_edges_test[(filtered_edges_test[self.drug_inchi_name].isin(drug_filter_list)) & (filtered_edges_test[self.target_seq_name].isin(target_filter_list))]
            
            elif drug_filter_list != [] and target_filter_list == []:
                filtered_nodes_test = filtered_nodes_test[(filtered_nodes_test[self.drug_inchi_name].isin(drug_filter_list))]
                filtered_edges_test = filtered_edges_test[(filtered_edges_test[self.drug_inchi_name].isin(drug_filter_list))]
            
            elif drug_filter_list == [] and target_filter_list != []:
                filtered_nodes_test = filtered_nodes_test[(filtered_nodes_test[self.target_seq_name].isin(target_filter_list))]
                filtered_edges_test = filtered_edges_test[(filtered_edges_test[self.target_seq_name].isin(target_filter_list))]
            
            else: 
                None
            
            
            
            
            X_0_test_ut, X_1_test_ut, Y_test_actual_ut = self.dataframe_to_embed_array(interactions_df = filtered_nodes_test,
                                                                              drug_list = self.drug_list,
                                                                              target_list = self.target_list,
                                                                              drug_embed_len = drug_embed_len)

            X_0_test_ue, X_1_test_ue, Y_test_actual_ue = self.dataframe_to_embed_array(interactions_df = filtered_edges_test,
                                                                                  drug_list = self.drug_list,
                                                                                  target_list = self.target_list,
                                                                                  drug_embed_len = drug_embed_len)
            
            model = load_model(self.model_out_dir.rstrip('/') + '/Run_' + str(run_number) + '/' + model_location)

            # Test on unseen nodes
            Y_test_predictions_ut = []
            Y_test_predictions_ut.extend(model.predict([X_0_test_ut, X_1_test_ut]))
            Y_test_predictions_ut = [x[0] if not np.isnan(x[0]) else 0 for x in Y_test_predictions_ut]

            true = Y_test_actual_ut
            pred = Y_test_predictions_ut

            f1_scores = []

            for j in np.arange(0.0, 1.0, 0.01):
                f1_scores.append(f1_score(true, [1 if (i > j) else 0 for i in pred]))

            f_1_thresh = [idx for idx, x in list(zip(np.arange(0.0, 1.0, 0.01), f1_scores)) if x == max(f1_scores)][0]

            pred_bin = [1 if (i > f_1_thresh) else 0 for i in pred]

                
            try: 
                self.test_results[model_name][run_number]
            except:
                self.test_results[model_name][run_number] = {}

            self.test_results[model_name][run_number]['unseen_targets_auc'] = roc_auc_score(true, pred)
            self.test_results[model_name][run_number]['unseen_targets_aup'] = average_precision_score(true, pred)
            self.test_results[model_name][run_number]['unseen_targets_f1_scores'] = f1_scores
            self.test_results[model_name][run_number]['unseen_targets_max_f1'] = np.max(f1_scores)
            self.test_results[model_name][run_number]['unseen_targets_f1_threshold'] = f_1_thresh
            self.test_results[model_name][run_number]['targets_confusion_matrix'] = confusion_matrix(true, pred_bin)
            
            # Test on unseen edges
            Y_test_predictions_ue = []
            Y_test_predictions_ue.extend(model.predict([X_0_test_ue, X_1_test_ue]))
            Y_test_predictions_ue = [x[0] if not np.isnan(x[0]) else 0 for x in Y_test_predictions_ue]

            true = Y_test_actual_ue
            pred = Y_test_predictions_ue

            f1_scores = []

            for j in np.arange(0.0, 1.0, 0.01):
                f1_scores.append(f1_score(true, [1 if (i > j) else 0 for i in pred]))

            f_1_thresh = [idx for idx, x in list(zip(np.arange(0.0, 1.0, 0.01), f1_scores)) if x == max(f1_scores)][0]

            pred_bin = [1 if (i > f_1_thresh) else 0 for i in pred]

            self.test_results[model_name][run_number]['unseen_edges_auc'] = roc_auc_score(true, pred)
            self.test_results[model_name][run_number]['unseen_edges_aup'] = average_precision_score(true, pred)
            self.test_results[model_name][run_number]['unseen_edges_f1_scores'] = f1_scores
            self.test_results[model_name][run_number]['unseen_edges_max_f1'] = np.max(f1_scores)
            self.test_results[model_name][run_number]['unseen_edges_f1_threshold'] = f_1_thresh
            self.test_results[model_name][run_number]['edges_confusion_matrix'] = confusion_matrix(true, pred_bin)
            
        ue_auc = []
        ue_aup = []
        ut_auc = []
        ut_aup = []
        f1_t_e = []
        f1_t_t = []
        f1_t = []
        f1_e = []

        conf_t = []
        conf_e = []


        for run_number in self.test_results[model_name].keys():
            
            # Averaged confusion matrix 
            conf_tot_t = np.sum(self.test_results[model_name][run_number]['targets_confusion_matrix'], axis = 0)
            conf_tot_e = np.sum(self.test_results[model_name][run_number]['edges_confusion_matrix'], axis = 0)

            ue_auc.append(self.test_results[model_name][run_number]['unseen_edges_auc'])
            ue_aup.append(self.test_results[model_name][run_number]['unseen_edges_aup'])
            ut_auc.append(self.test_results[model_name][run_number]['unseen_targets_auc'])
            ut_aup.append(self.test_results[model_name][run_number]['unseen_targets_aup'])
            f1_t_e.append(self.test_results[model_name][run_number]['unseen_edges_f1_threshold'])
            f1_t_t.append(self.test_results[model_name][run_number]['unseen_targets_f1_threshold'])    
            f1_t.append(self.test_results[model_name][run_number]['unseen_targets_max_f1'])
            f1_e.append(self.test_results[model_name][run_number]['unseen_edges_max_f1'])
            if self.test_results[model_name][run_number]['targets_confusion_matrix'][0][0] != 0:
                conf_t.append(self.test_results[model_name][run_number]['targets_confusion_matrix'] / conf_tot_t)
                conf_e.append(self.test_results[model_name][run_number]['edges_confusion_matrix'] / conf_tot_e)
        
        # Compute mean and deviation for the confusion matrix 
        target_conf = np.zeros((2, 2), dtype = object)
        t_conf_mean = np.mean(conf_t, axis = 0)
        t_conf_err = np.std(conf_t, axis = 0)

        for i in range(2):
            for j in range(2):
                target_conf[i][j] = str(np.round(t_conf_mean[i][j], 2)) + " +/- " + str(np.round(t_conf_err[i][j], 2))
        target_conf = pd.DataFrame(target_conf) 

        print ("Test Set Performance : ")
        print ("")
        print ("\tUnseen Nodes : \n")
        print ("\t\tAUC          : ", np.mean(ut_auc), "+/-", np.std(ut_auc))
        print ("\t\tAUP          : ", np.mean(ut_aup), "+/-", np.std(ut_aup))
        print ("\t\tMax F1 Score : ", np.mean(f1_t), "+/-", np.std(f1_t))
        print ("\t\tF1 Threshold : ", np.mean(f1_t_t), "+/-", np.std(f1_t_t))
        print ("\t\tConfusion Matrix : ")
        target_conf.columns = ['Pred (0)', 'Pred (1)']
        target_conf.index = ['True (0)', 'True (1)']
        display(target_conf)
        
        # Compute mean and deviation for the confusion matrix 
        edge_conf = np.zeros((2, 2), dtype = object)
        e_conf_mean = np.mean(conf_e, axis = 0)
        e_conf_err = np.std(conf_e, axis = 0)

        for i in range(2):
            for j in range(2):
                edge_conf[i][j] = str(np.round(e_conf_mean[i][j], 2)) + " +/- " + str(np.round(e_conf_err[i][j], 2))
        edge_conf = pd.DataFrame(edge_conf) 

        print ("")
        print ("\tUnseen Edges : \n")
        print ("\t\tAUC          : ", np.mean(ue_auc), "+/-", np.std(ue_auc))
        print ("\t\tAUP          : ", np.mean(ue_aup), "+/-", np.std(ue_aup))
        print ("\t\tMax F1 Score : ", np.mean(f1_e), "+/-", np.std(f1_e))
        print ("\t\tF1 Threshold : ", np.mean(f1_t_e), "+/-", np.std(f1_t_e))
        print ("\t\tConfusion Matrix : ")
        edge_conf.columns = ['Pred (0)', 'Pred (1)']
        edge_conf.index = ['True (0)', 'True (1)']
        display(edge_conf)

        
    ###################################################
    ############ VecNet Specific Functions ############
    ###################################################
    
    # Get Drug Embeddings From Mol2Vec
    def get_mol2vec_embeddings(self, embedding_dimension = 300, replace_dataframe = True, return_normalisation_conststants = False):
        
        '''
        Generate Mol2Vec embeddings for all drugs in the drugs dataframe 
        
        Inputs : 
            embedding_dimension : Integer - Number of dimensions the Mol2Vec model expects
            replace_dataframe : Bool - Replace existing drugs dataframe with one that contains InChi Key and its respective normalised Mol2Vec embedding
            return_normalisation_conststants : Bool - Returns normalisation constant if true
        
        Outputs (optional): 
            centered_drug_embeddings : Numpy Array
            centered_drug_embeddings_length : Float
            normalized_drug_embeddings : Numpy Array
        '''
        
        # Create dictionary to hold drug_inchi : drug_smile
        drug_smiles = {}

        for index, row in tqdm(self.drugs_dataframe.iterrows()):

            drug_id = row[self.drug_inchi_name]
            drug_smile = row[self.drug_smile_name]

            drug_smiles[drug_id] = drug_smile

        # Read in Mol2Vec model
        if type(self.mol2vec_model) == type(None):
            self.mol2vec_model = word2vec.Word2Vec.load(self.mol2vec_location)
        
        # Create empty array to hold embeddings
        drug_embeddings = np.zeros((len(drug_smiles.keys()), embedding_dimension))
        miss_words = []
        hit_words = 0
        bad_mol = 0
        percent_unknown = []
    
        # Iterate over all drugs in dataset
        for idx, drug in tqdm(enumerate(drug_smiles.keys())):
            flag = 0
            mol_miss_words = 0
            
            # Create molecule object from smiles
            molecule = Chem.MolFromSmiles(drug_smiles[drug])
            try:
                # Get fingerprint from molecule
                sub_structures = mol2alt_sentence(molecule, 2)
            except Exception as e: 
                if self.debug: 
                    print (e)
                percent_unknown.append(100)
                continue    
            
            # Iterate over each sub structure
            for sub in sub_structures:
                # Check to see if substructure exists
                try:
                    drug_embeddings[idx, :] = drug_embeddings[idx, :] + self.mol2vec_model.wv[sub]
                    hit_words = hit_words + 1
                
                # If not, replace with UNK (unknown)
                except Exception as e:
                    if self.debug : 
                        print ("Sub structure not found")
                        print (e)
                    drug_embeddings[idx, :] = drug_embeddings[idx, :] + self.mol2vec_model.wv['UNK']
                    miss_words.append(sub)
                    flag = 1
                    mol_miss_words = mol_miss_words + 1

            percent_unknown.append((mol_miss_words / len(sub_structures)) * 100)

            if flag == 1:
                bad_mol = bad_mol + 1 
        
        # Normalise embeddings
        self.centered_drug_embeddings = drug_embeddings - np.mean(drug_embeddings, axis = 0)
        self.centered_drug_embeddings_length = np.mean(np.sqrt(np.sum(self.centered_drug_embeddings * self.centered_drug_embeddings, axis = 1)))
        self.normalized_drug_embeddings = self.centered_drug_embeddings / np.expand_dims(self.centered_drug_embeddings_length, axis = -1)

        # Replace drugs dataframe with one with two columns - InChi Key and 'normalized_embeddings'
        if replace_dataframe: 
            self.drugs_dataframe = pd.DataFrame([list(drug_smiles.keys()), self.normalized_drug_embeddings]).T
            self.drugs_dataframe.columns = [self.drug_inchi_name, 'normalized_embeddings']
            self.drug_list = list(self.drugs_dataframe[self.drug_inchi_name])
        
        # Return normalized constants and values to save
        if return_normalisation_conststants: 
            return self.centered_drug_embeddings, self.centered_drug_embeddings_length, self.normalized_drug_embeddings
    
    # Get Target Embeddings From ProtVec
    def get_protvec_embeddings(self, embedding_dimension = 100, replace_dataframe = True, return_normalisation_conststants = False, delimiter = '\t'):
        
        # Read in ProtVec model
        if type(self.protvec_model) == type(None): 
            self.protvec_model = pd.read_csv(self.protvec_location, delimiter = delimiter)
            
        # Create dictionary of words : values for faster indexing
        trigram_dict = {}
        for idx, row in tqdm(self.protvec_model.iterrows()):

            trigram_dict[row['words']] = self.protvec_model.iloc[idx, 1:].values.astype(np.float)

        trigram_list = set(trigram_dict.keys())

        self.target_embeddings = np.zeros((len(self.target_list), embedding_dimension))
        length_of_target = [0 for _ in range(len(self.target_list))]
        
        # For each target in target list
        for idx, target in tqdm(enumerate(self.target_list)):

            n = 3
            split_by_three = [target[i : i + n] for i in range(0, len(target), n)]
            length_of_target[idx] = len(split_by_three)

            for trigram in split_by_three: 

                if len(trigram) == 2: 
                    trigram = "X" + trigram

                elif len(trigram) == 1:
                    trigram = "XX" + trigram

                if trigram in trigram_list:
                    self.target_embeddings[idx, :] = self.target_embeddings[idx, :] + trigram_dict[trigram]
        
        self.centered_target_embeddings = self.target_embeddings - np.mean(self.target_embeddings, axis = 0)
        self.centered_target_embeddings_length = np.mean(np.sqrt(np.sum(self.centered_target_embeddings * self.centered_target_embeddings, axis = 1)))
        self.normalized_target_embeddings = self.centered_target_embeddings / np.expand_dims(self.centered_target_embeddings_length, axis = -1)
        
        # Replace targets dataframe with 
        if replace_dataframe: 
            self.targets_dataframe = pd.DataFrame([self.target_list, self.normalized_target_embeddings]).T
            self.targets_dataframe.columns = [self.target_seq_name, 'normalized_embeddings']
        
        if return_normalisation_conststants:
            return self.target_embeddings, self.centered_target_embeddings_length, self.normalized_target_embeddings
 
    def vecnet_2048_2048_concat_512_512(self):
    
        target_input = Input(shape = (100,))
        X_0 = Dense(2048, kernel_initializer = glorot_uniform(), activation = 'relu')(target_input)

        drugs_input = Input(shape = (300,))
        X_1 = Dense(2048, kernel_initializer = glorot_uniform(), activation = 'relu')(drugs_input)

        combined = Concatenate(axis = -1)([X_0, X_1])
        X = Dropout(0.5)(combined)

        X = Dense(512, kernel_initializer = glorot_uniform())(X)
        X = Activation('relu')(X)

        X = Dense(512, kernel_initializer = glorot_uniform())(X)
        X = Activation('relu')(X)

        X = Dense(1, kernel_initializer = glorot_uniform())(X)
        X = Activation('sigmoid')(X)

        model = Model(inputs = [target_input, drugs_input] , outputs = X)

        return model
    
    def train_vecnet(self, model_name, epochs, version = None, learning_rate = 0.00001, beta_1 = 0.9, beta_2 = 0.999, batch_size = 16, chunk_test_frequency = 250):
        
        self.normalized_target_embeddings = np.array(list(self.targets_dataframe['normalized_embeddings']))
        self.normalized_drug_embeddings = np.array(list(self.drugs_dataframe['normalized_embeddings']))
        
        # Check if variable exists
        try:
            self.results
        except:
            self.results = {}
        try:
            self.model_name_index
        except:
            self.model_name_index = {}
        
        if type(version) == type(None):
            version = input("Version : ")
        version = str(version)
        v_num = version
        
        # Iterate over k folds
        for run_number in tqdm(range(len(self.train_sets))):
            
            # Set class weights to reflect train set positive to negative ratio
            class_weight = {0: self.train_pos_neg_ratio[run_number],
                            1: 1}
            
            # Create Lists To Hold Information
            val_auc_ut = []
            val_auc_ue = []
            val_aup_ut = []
            val_aup_ue = []

            loss = []
            acc = []

            # Reinitialise Model At Each Run 
            model = self.vecnet_2048_2048_concat_512_512()
            model_optimizer = tensorflow.keras.optimizers.Adam(lr = learning_rate, beta_1 = beta_1, beta_2 = beta_2, amsgrad = False)
            model.compile(loss = 'binary_crossentropy', optimizer = model_optimizer, metrics = ['binary_accuracy'])

            # Create TQDM Object So We Can Play With Printed String
            t = tqdm(np.random.choice(range(epochs), epochs, replace = False))
            
            # Create File Name To Save Model
            version = v_num + "_run" + str(run_number) + "_" + pd.to_datetime(time.time(), unit = 's').strftime('%m-%d_%Hh%M')

            # Create Validation DataFrames For Each Run
            drug_embed_len = self.normalized_drug_embeddings[0].shape[0]
            
            X_0_val_ut, X_1_val_ut, Y_val_actual_ut = self.dataframe_to_embed_array(interactions_df = self.nodes_validation[run_number],
                                                                                  drug_list = self.drug_list,
                                                                                  target_list = self.target_list,
                                                                                  drug_embed_len = drug_embed_len)

            X_0_val_ue, X_1_val_ue, Y_val_actual_ue = self.dataframe_to_embed_array(interactions_df = self.edges_validation[run_number],
                                                                                  drug_list = self.drug_list,
                                                                                  target_list = self.target_list,
                                                                                  drug_embed_len = drug_embed_len)

            # Create Variable For Seen Targets Needed Later
            seen_targets = list(self.train_sets[run_number][self.target_seq_name])
            
            # Counter to keep track of model names during testing
            model_index_counter = 0
            
            model_key = model_name + '_v' + str(v_num)
            if model_key not in self.model_name_index.keys():
                self.model_name_index[model_key] = {}
            
            
            # For Each Epoch
            for ep, i in enumerate(t):


                # Slice Into Chunks
                interactions_sliced = np.array_split(self.train_sets[run_number], len(self.train_sets[run_number]) / 500)
                
                # Train On Each Chunk
                for idx, interaction in enumerate(interactions_sliced):

                    output_string = ""

                    X_0, X_1, Y = self.dataframe_to_embed_array(interactions_df = interaction,
                                                           drug_list = self.drug_list, 
                                                           target_list = self.target_list,
                                                           drug_embed_len = drug_embed_len)

                    history = model.fit([X_0, X_1], Y,
                                          batch_size = batch_size,
                                          epochs = 1,
                                          class_weight = class_weight,
                                          verbose = 0)

                    if idx % chunk_test_frequency == 0:

                        # Calculate and Save Unseen Target Performance
                        Y_val_predictions_ut = []
                        Y_val_predictions_ut.extend(model.predict([X_0_val_ut, X_1_val_ut]))
                        Y_val_predictions_ut = [x[0] for x in Y_val_predictions_ut]
                        curr_val_auc = roc_auc_score(Y_val_actual_ut, Y_val_predictions_ut)
                        curr_val_aup = average_precision_score(Y_val_actual_ut, Y_val_predictions_ut)
                        val_auc_ut.append(curr_val_auc)
                        val_aup_ut.append(curr_val_aup)

                        Y_val_predictions_ue = []
                        Y_val_predictions_ue.extend(model.predict([X_0_val_ue, X_1_val_ue]))
                        Y_val_predictions_ue = [x[0] for x in Y_val_predictions_ue]
                        curr_val_auc = roc_auc_score(Y_val_actual_ue, Y_val_predictions_ue)
                        curr_val_aup = average_precision_score(Y_val_actual_ue, Y_val_predictions_ue)
                        val_aup_ue.append(curr_val_aup)
                        val_auc_ue.append(curr_val_auc)

                        # Print Stuff
                        output_string = output_string + "Unseen Nodes AUC : " + str(np.round(val_auc_ut[-1], 2)) + "\nUnseen Edges AUC : " +  str(np.round(val_auc_ue[-1], 2)) + "\n"
                        output_string = output_string + "Unseen Nodes AUP : " + str(np.round(val_aup_ut[-1], 2)) + "\nUnseen Edges AUP : " +  str(np.round(val_aup_ue[-1], 2)) + "\n"

                        # Save Model
                        if not os.path.isdir(self.model_out_dir.rstrip('/') + '/Run_' + str(run_number)):
                            os.mkdir(self.model_out_dir.rstrip('/') + '/Run_' + str(run_number))
                        model.save(self.model_out_dir.rstrip('/') + '/Run_' + str(run_number) + '/' + model_name + str(version) + "_epoch_" + str(ep) + "_idx_" + str(idx) + '.model')
                        
                        self.model_name_index[model_key][model_index_counter] = "_epoch_" + str(ep) + "_idx_" + str(idx) + '.model'
                        model_index_counter = model_index_counter + 1
                        
                        t.write(output_string)

                        loss = loss + history.history['loss']
                        acc = acc + history.history['binary_accuracy']
                        
            
    
            try:
                self.results[model_key]
            except: 
                self.results[model_key] = {}

            self.results[model_key][run_number] = {}
            self.results[model_key][run_number]['val_auc_ut'] = val_auc_ut
            self.results[model_key][run_number]['val_auc_ue'] = val_auc_ue
            self.results[model_key][run_number]['val_aup_ut'] = val_aup_ut
            self.results[model_key][run_number]['val_aup_ue'] = val_aup_ue
            self.results[model_key][run_number]['loss'] = loss
            self.results[model_key][run_number]['acc'] = acc  
            
            with open(self.model_out_dir.rstrip('/') + '/results_' + str(v_num) + '.json', 'w') as file: 
                json.dump(self.results, file)
                



### VecNet

In [5]:
with open('/data/sars-busters-consolidated/interactions/targets_test.pkl', 'rb') as file: 
    nodes_test = pkl.load(file)
    
with open('/data/sars-busters-consolidated/interactions/targets_validation.pkl', 'rb') as file: 
    nodes_validation = pkl.load(file)
    
with open('/data/sars-busters-consolidated/interactions/edges_test.pkl', 'rb') as file: 
    edges_test = pkl.load(file)
    
with open('/data/sars-busters-consolidated/interactions/edges_validation.pkl', 'rb') as file: 
    edges_validation = pkl.load(file)
    
with open('/data/sars-busters/Mol2Vec/chemicals_01_w_embed.pkl', 'rb') as file: 
    drugs = pkl.load(file)
    
with open('/data/sars-busters/Mol2Vec/amino_01_w_embed.pkl', 'rb') as file: 
    targets = pkl.load(file)
    
    
drugs = drugs.rename(columns = {'Label' : 'InChiKey'})
targets = targets.rename(columns = {'Label' : 'target_aa_code'})

In [6]:
vecnet_object = AIBind(interactions_location = '/data/sars-busters-consolidated/interactions/dataset_2_filtered.csv',
                       interaction_y_name = 'Y',
                       drugs_location = None,
                       drugs_dataframe = drugs,
                       drug_inchi_name = 'InChiKey',
                       targets_location = None,
                       targets_dataframe = targets, 
                       target_seq_name = 'target_aa_code',
                       drug_smile_name = 'SMILE',
                       mol2vec_location = '/data/sars-busters/Mol2Vec/model_300dim.pkl',
                       protvec_location = '/home/sars-busters/Mol2Vec/Results/protVec_100d_3grams.csv',
                       nodes_test = nodes_test,
                       nodes_validation = nodes_validation,
                       edges_test = edges_test,
                       edges_validation = edges_validation,
                       model_out_dir = './',
                       debug = False)


In [7]:
vecnet_object.get_mol2vec_embeddings()
vecnet_object.get_protvec_embeddings()

vecnet_object.create_train_sets(unseen_nodes_flag = True,
                                data_leak_check = True)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

Set :  0
Train - Test - Validation Overlap For Unseen Targets :  0
Train - Test - Validation Overlap For Unseen Drugs :  0
Train - Test - Validation Overlap For Unseen Edges :  0
Train Set :  (14005, 3)
Nodes Test :  (5473, 3)
Nodes Val :  (5474, 3)
Edge Test :  (3843, 3)
Edge Val :  (3843, 3)
Positive / Negatative Ratio :  0.5287632354546448

Set :  1
Train - Test - Validation Overlap For Unseen Targets :  0
Train - Test - Validation Overlap For Unseen Drugs :  0
Train - Test - Validation Overlap For Unseen Edges :  0
Train Set :  (10716, 3)
Nodes Test :  (5755, 3)
Nodes Val :  (5755, 3)
Edge Test :  (3759, 3)
Edge Val :  (3759, 3)
Positive / Negatative Ratio :  0.5899109792284867

Set :  2
Train - Test - Validation Overlap For Unseen Targets :  0
Train - Test - Validation Overlap For Unseen Drugs :  0
Train - Test - Validation Overlap For Unseen Edges :  0
Train Set :  (14871, 3)
Nodes Test :  (5460, 3)
Nodes Val :  (5461, 3)
Edge Test :  (3847, 3)
Edge Val :  (3847, 3)
Positive / Ne

In [8]:
vecnet_object.train_vecnet(model_name = 'vecnet_class_test',
                           epochs = 3,
                           version = 0,
                           learning_rate = 0.00001,
                           beta_1 = 0.9,
                           beta_2 = 0.999,
                           batch_size = 16,
                           chunk_test_frequency = 250)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Unseen Nodes AUC : 0.43
Unseen Edges AUC : 0.4
Unseen Nodes AUP : 0.47
Unseen Edges AUP : 0.49

Unseen Nodes AUC : 0.71
Unseen Edges AUC : 0.7
Unseen Nodes AUP : 0.69
Unseen Edges AUP : 0.7

Unseen Nodes AUC : 0.75
Unseen Edges AUC : 0.73
Unseen Nodes AUP : 0.72
Unseen Edges AUP : 0.7



  0%|          | 0/3 [00:00<?, ?it/s]

Unseen Nodes AUC : 0.39
Unseen Edges AUC : 0.42
Unseen Nodes AUP : 0.45
Unseen Edges AUP : 0.49

Unseen Nodes AUC : 0.71
Unseen Edges AUC : 0.69
Unseen Nodes AUP : 0.72
Unseen Edges AUP : 0.68

Unseen Nodes AUC : 0.75
Unseen Edges AUC : 0.71
Unseen Nodes AUP : 0.71
Unseen Edges AUP : 0.69



  0%|          | 0/3 [00:00<?, ?it/s]

Unseen Nodes AUC : 0.44
Unseen Edges AUC : 0.41
Unseen Nodes AUP : 0.46
Unseen Edges AUP : 0.48

Unseen Nodes AUC : 0.68
Unseen Edges AUC : 0.71
Unseen Nodes AUP : 0.67
Unseen Edges AUP : 0.7

Unseen Nodes AUC : 0.71
Unseen Edges AUC : 0.74
Unseen Nodes AUP : 0.65
Unseen Edges AUP : 0.7



  0%|          | 0/3 [00:00<?, ?it/s]

Unseen Nodes AUC : 0.44
Unseen Edges AUC : 0.49
Unseen Nodes AUP : 0.46
Unseen Edges AUP : 0.53

Unseen Nodes AUC : 0.72
Unseen Edges AUC : 0.71
Unseen Nodes AUP : 0.67
Unseen Edges AUP : 0.71

Unseen Nodes AUC : 0.74
Unseen Edges AUC : 0.75
Unseen Nodes AUP : 0.66
Unseen Edges AUP : 0.73



  0%|          | 0/3 [00:00<?, ?it/s]

Unseen Nodes AUC : 0.42
Unseen Edges AUC : 0.44
Unseen Nodes AUP : 0.46
Unseen Edges AUP : 0.52

Unseen Nodes AUC : 0.67
Unseen Edges AUC : 0.72
Unseen Nodes AUP : 0.63
Unseen Edges AUP : 0.72

Unseen Nodes AUC : 0.69
Unseen Edges AUC : 0.75
Unseen Nodes AUP : 0.63
Unseen Edges AUP : 0.72



  0%|          | 0/3 [00:00<?, ?it/s]

Unseen Nodes AUC : 0.39
Unseen Edges AUC : 0.41
Unseen Nodes AUP : 0.43
Unseen Edges AUP : 0.47

Unseen Nodes AUC : 0.69
Unseen Edges AUC : 0.7
Unseen Nodes AUP : 0.65
Unseen Edges AUP : 0.69

Unseen Nodes AUC : 0.72
Unseen Edges AUC : 0.74
Unseen Nodes AUP : 0.65
Unseen Edges AUP : 0.7



In [9]:
vecnet_object.model_name_index

{'vecnet_class_test_v0': {0: '_epoch_0_idx_0.model',
  1: '_epoch_1_idx_0.model',
  2: '_epoch_2_idx_0.model'}}

In [10]:
vecnet_object.get_validation_results(model_name = 'vecnet_class_test_v0',
                   show_plots = False,
                   plot_title = 'Test Plots',
                   num_cols = 2,
                   plot_height = 1500,
                   plot_width = 1500,
                   write_plot_to_html = False,
                   plot_dir = None,
                   plot_name = None)

(Epoch * Chunk) With Highest Unseen Node and Edge Score :  2
(Epoch * Chunk) With Highest Unseen Node Score :  2
(Epoch * Chunk) With Highest Unseen Edge Score :  2
Validation Performance
Best Model Suffix :  _epoch_2_idx_0.model
Unseen Node AUC :  0.725797632817363 +/- 0.02329358871730686
Unseen Node AUP :  0.6689468802904407 +/- 0.03167942244216613
Unseen Edges AUC :  0.7353252305640452 +/- 0.0117381953913359
Unseen Edges AUP :  0.7062604226306105 +/- 0.014499403534990064


In [11]:
vecnet_object.get_test_results(model_name = None,
                          optimal_validation_model = 2,
                          drug_filter_list = [],
                          target_filter_list = [])

Test Set Performance : 

	Unseen Nodes : 

		AUC          :  0.7301269066303475 +/- 0.027035049410379233
		AUP          :  0.6712119089930869 +/- 0.03362075458573387
		Max F1 Score :  0.6907250884317606 +/- 0.016198227580438193
		F1 Threshold :  0.12666666666666668 +/- 0.014907119849998597
		Confusion Matrix : 


Unnamed: 0,Pred (0),Pred (1)
True (0),0.75 +/- 0.02,0.37 +/- 0.03
True (1),0.25 +/- 0.02,0.63 +/- 0.03



	Unseen Edges : 

		AUC          :  0.7382971684116503 +/- 0.00657318589583467
		AUP          :  0.7033040927572177 +/- 0.009990008088617981
		Max F1 Score :  0.7088187006501232 +/- 0.0044808720371446505
		F1 Threshold :  0.10833333333333334 +/- 0.02192157739660984
		Confusion Matrix : 


Unnamed: 0,Pred (0),Pred (1)
True (0),0.74 +/- 0.01,0.37 +/- 0.02
True (1),0.26 +/- 0.01,0.63 +/- 0.02
