# Imports

In [1]:
import math
import re
import string
from collections import Counter, defaultdict
from itertools import product
from math import sqrt

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from matplotlib import rc
from pylab import rcParams
from sklearn.metrics import accuracy_score, average_precision_score
from sklearn.model_selection import train_test_split
from tensorflow import keras

2023-01-23 17:14:52.528095: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-23 17:14:54.579026: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-23 17:14:54.579943: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


# Preparing Data

In [2]:
# Read the tab-separated file "pauwel-dataset.txt"; its first line supplies the column names.
pauwel = pd.read_csv("pauwel-dataset.txt", sep='\t', header=0)
# Bare expression: the notebook renders the frame as this cell's output.
pauwel

Index(['carnitine', 'GABA', 'delta-aminolevulinic acid', 'leucovorin', 'PGE2',
       'prostacyclin', 'acetate', 'acetylcholine', 'adenosine', 'galactose',
       ...
       'rifapentine', 'sucralfate', 'cefdinir', 'ivermectin', 'rifaximin',
       'pimecrolimus', 'auranofin', 'cefditoren', 'nitroprusside',
       'gold sodium thiomalate'],
      dtype='object', length=888)

In [3]:
# Read the tab-separated file "mizutani-dataset.txt"; its first line supplies the column names.
mizutani = pd.read_csv("mizutani-dataset.txt", sep='\t', header=0)
# Bare expression: the notebook renders the frame as this cell's output.
mizutani # drug names are coded with pubchem ID

Unnamed: 0,abdominal.cramps,abdominal.distention,abdominal.pain,malformations,spontaneous.abortion,missed.abortion,abscess,acanthosis.nigricans,acidosis,renal.tubular.acidosis,...,vitamin.deficiency,drug.dependence,diverticulosis,prostatic.hypertrophy,allergic.reaction,dysphonia,eosinophilic.pneumonia,retinal.vein.thrombosis,renal.insufficiency,glioblastoma.multiforme
85,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
119,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
137,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
143,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
158,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6398525,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6398970,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
6447131,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6918453,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Count missing values in both datasets.
# Every NaN is first replaced by the sentinel 999; the positions holding the
# sentinel are then collected and their number is printed.
new_pauwel = pauwel.fillna(value=999).values
num_missing_pauwel = np.argwhere(new_pauwel == 999)
print(num_missing_pauwel.shape[0])

new_mizutani = mizutani.fillna(value=999).values
num_missing_mizutani = np.argwhere(new_mizutani == 999)
print(num_missing_mizutani.shape[0])

0
0


In [5]:
# Density of the pauwel dataset: non-zero cells as a percentage of all cells
density = float(np.nonzero(pauwel.to_numpy())[0].size) / (pauwel.shape[0]*pauwel.shape[1]) * 100
print('{:.2f}%'.format(density))

4.97%


In [11]:
# Density of the mizutani dataset: non-zero cells as a percentage of all cells
density = float(np.nonzero(mizutani.to_numpy())[0].size) / (mizutani.shape[0]*mizutani.shape[1]) * 100
print('{:.2f}%'.format(density))

5.57%


# Training

## Model Implementation

In [13]:
class SGD_Recommender:
    """Low-rank matrix factorisation trained with stochastic gradient descent.

    The drug/side-effect matrix is approximated by ``U @ V.T`` where ``U`` has
    one row of ``k`` latent features per dataset row and ``V`` one row of ``k``
    latent features per dataset column. Only the non-zero cells of the training
    frame are used as observations.
    """

    def __init__(self, k: int, lmbda: float, n_iter: int = 350, learn_rate: float = 0.005,
                 tolerance: float = 1e-04, random_state=None):
        """Set the parameters for SGD.

        Args:
            k (int): Number of latent features
            lmbda (float): The regularization term
            n_iter (int): Maximum number of passes over the training observations
            learn_rate (float): Step size of every SGD update
            tolerance (float): Largest per-entry change of U and V between two
                epochs that still counts as converged
            random_state: Seed (or anything accepted by ``np.random.default_rng``)
                for the initialisation and the shuffling; ``None`` gives a
                different run every time.
        """

        self.k = k
        self.lmbda = lmbda
        self.n_iter = n_iter
        self.learn_rate = learn_rate
        self.tolerance = tolerance
        self.random_state = random_state

    def predictions(self, U: np.ndarray, V: np.ndarray) -> np.ndarray:
        """Return dot product of the matrices U and V."""

        return np.dot(U, V.T)

    def check_convergence(self, U_old: np.ndarray, U_curr: np.ndarray, V_old: np.ndarray, V_curr: np.ndarray) -> bool:
        """Check if matrices have reached convergence.

        Returns True when no entry of U or V moved by more than ``tolerance``.
        """

        U_converged = np.all(np.abs(np.subtract(U_old, U_curr)) <= self.tolerance)
        V_converged = np.all(np.abs(np.subtract(V_old, V_curr)) <= self.tolerance)
        # Cast so callers get a plain Python bool rather than a numpy bool
        return bool(U_converged and V_converged)

    def fit(self, train: pd.DataFrame, dataset: pd.DataFrame) -> None:
        """Train the SGD model.

        Args:
            train (pd.DataFrame): The training set; its row and column labels
                must all be present in ``dataset``
            dataset (pd.DataFrame): The dataset; fixes the shape of U and V.
                Its row and column labels must be unique.
        Returns:
            None
        Raises:
            KeyError: If ``train`` has a label that ``dataset`` does not have.
        """

        m, n = dataset.shape
        rng = np.random.default_rng(self.random_state)

        # Initialize the low rank matrices U and V with values drawn from a
        # normal distribution with mean 0 and standard deviation 0.1 (variance 0.01)
        mu, sigma = 0, 0.1
        self.U = rng.normal(loc=mu, scale=sigma, size=(m, self.k))
        self.V = rng.normal(loc=mu, scale=sigma, size=(n, self.k))

        # Positions (within train) and values of the known drug-se associations
        train_values = train.to_numpy()
        drugs, side_fx = train_values.nonzero()
        observed = train_values[drugs, side_fx].astype(float)

        # Translate train positions into dataset positions, one lookup per label
        # instead of one per observation
        row_positions = dataset.index.get_indexer(train.index)
        col_positions = dataset.columns.get_indexer(train.columns)
        if (row_positions < 0).any() or (col_positions < 0).any():
            raise KeyError("train contains row or column labels that are missing from dataset")
        drug_index = row_positions[drugs]
        side_fx_index = col_positions[side_fx]

        n_obs = observed.size

        # Start of training
        for epoch in range(self.n_iter):
            U_old = self.U.copy()
            V_old = self.V.copy()

            # Visit the observations in a fresh random order every epoch
            for obs in rng.permutation(n_obs):
                drug = drug_index[obs]
                se = side_fx_index[obs]

                error = observed[obs] - self.predictions(self.U[drug, :], self.V[se, :])
                # Both updates are computed from the pre-update vectors and
                # only then written back
                temp_u = self.U[drug, :] + self.learn_rate * (error * self.V[se, :] - self.lmbda * self.U[drug, :])
                temp_v = self.V[se, :] + self.learn_rate * (error * self.U[drug, :] - self.lmbda * self.V[se, :])
                self.U[drug, :] = temp_u
                self.V[se, :] = temp_v

            if self.check_convergence(U_old, self.U, V_old, self.V):
                break

    def predict(self) -> np.ndarray:
        """Predict the entire drug-side effect matrix values.

        Raises:
            AttributeError: If the model has not been fitted yet.
        """

        if not hasattr(self, "U") or not hasattr(self, "V"):
            raise AttributeError("SGD_Recommender has no factors yet; call fit() before predict()")
        return self.predictions(self.U, self.V)

In [8]:
# def five_fold_split(dataset):
#     """
#     Splits dataset into 5 equal parts
#     Input: Dataframe
#     """
#     np.random.shuffle(dataset)
#     fold_size = dataset.shape[0]//5
    
#     a = dataset.iloc[0:fold_size]
#     b = dataset.iloc[fold_size:2*fold_size]
#     c = dataset.iloc[2*fold_size:3*fold_size]
#     d = dataset.iloc[3*fold_size:4*fold_size]
#     e = dataset.iloc[4*fold_size:5*fold_size]
    
#     return a,b,c,d,e

In [14]:
def cross_val_5(dataset: pd.DataFrame, lmbda: float|int, k: int) -> np.ndarray:
    """Perform 5 fold CV, given regularization term (lambda) and number of latent features (k).

    The rows of the dataset are shuffled (on a copy, the caller's frame is left
    untouched) and cut into 5 equally sized test folds. For every fold a fresh
    model is trained on the remaining rows and scored on the held-out rows.
    When the number of rows is not a multiple of 5, the leftover rows at the
    end of the shuffled frame are never used as test rows.

    Args:
        dataset (pd.DataFrame): The dataset
        lmbda (float): The regularization term
        k (int): Number of latent features
    Returns:
        np.ndarray: Array of length 6 containing AUPR values of the 5 folds and the average value.
    Raises:
        ValueError: If the dataset has fewer than 5 rows.

    """

    n_folds = 5
    fold_size = dataset.shape[0] // n_folds
    if fold_size == 0:
        raise ValueError("cross_val_5 needs a dataset with at least 5 rows")

    # sample(frac=1) returns all rows in random order as a new frame
    shuffled = dataset.sample(frac=1)

    fold_scores = []
    for fold in range(n_folds):
        start, stop = fold * fold_size, (fold + 1) * fold_size

        # Held-out rows of this fold and the training rows that remain
        test_fold = shuffled.iloc[start:stop]
        train_fold = shuffled.drop(test_fold.index)

        model = SGD_Recommender(k=k, lmbda=lmbda)
        model.fit(train_fold, shuffled)

        # predict() is row-aligned with `shuffled`, so the same positional
        # slice selects the predictions of the held-out rows
        pred = model.predict()
        fold_scores.append(get_aupr(truth=test_fold.values, predictions=pred[start:stop]))

    return np.array(fold_scores + [np.mean(fold_scores)])
    
    

## Evaluation Functions

In [1]:
def param_selection(dataset: pd.DataFrame) -> dict:
    """Find the optimal set of parameters for the SGD model.

    Runs 5 fold CV for every combination of the candidate numbers of latent
    features and the candidate regularization terms.

    Args:
        dataset (pd.DataFrame): The dataset
    Returns:
        dict: Maps every ``(k, lmbda)`` pair to the array returned by
        ``cross_val_5`` for that pair.
    """

    k_values = [1,5,10,15,20,25,30,35,40,50,100]
    lmda_values = [0.1,1,5,10,15,20]

    # Do 5 fold CV for each possible combination of lambda and k
    return {
        (k, lmbda): cross_val_5(dataset=dataset, lmbda=lmbda, k=k)
        for (k, lmbda) in product(k_values, lmda_values)
    }
        
    

In [None]:
def train_optimal_params(dataset: pd.DataFrame, lmbda: float|int, k: int, n_runs: int = 20) -> dict:
    """Do repeated runs of 5 fold CV (20 by default).

    Args:
        dataset (pd.DataFrame): The dataset
        lmbda (float): The regularization term
        k (int): Number of latent features
        n_runs (int): How many times the 5 fold CV is repeated
    Returns:
        dict: Maps the run number (starting at 0) to the array returned by
        ``cross_val_5`` for that run.
    """

    results = {}

    for run in range(n_runs):
        results[run] = cross_val_5(dataset=dataset, lmbda=lmbda, k=k)

    return results


In [None]:
def aupr(truth: np.ndarray, predictions: np.ndarray) -> float:
    """Get the area under the precision-recall curve.

    Precision and recall are evaluated at 99 thresholds spread evenly between
    the smallest and the largest prediction, and the area is integrated with
    the trapezoidal rule.

    Args:
        truth (np.ndarray): Ground truth, 1 for a positive and 0 for a negative
        predictions (np.ndarray): Predicted scores, same shape as ``truth``
    Returns:
        float: The area under the precision-recall curve.
    Raises:
        ValueError: If the shapes differ or ``truth`` holds no positive entry
            (recall is undefined in that case).
    """

    truth = np.asarray(truth).ravel()
    predictions = np.asarray(predictions, dtype=float).ravel()
    if truth.shape != predictions.shape:
        raise ValueError("truth and predictions must have the same number of elements")

    is_positive = truth == 1
    is_negative = truth == 0
    if not is_positive.any():
        raise ValueError("aupr is undefined when truth contains no positive entries")

    max_value = predictions.max()
    min_value = predictions.min()
    # Create an array of 99 representing the thresholds
    threshold = min_value + (max_value-min_value)*(np.arange(1,100,1)/100)

    tp = np.zeros(threshold.size)
    fp = np.zeros(threshold.size)
    fn = np.zeros(threshold.size)

    # Calculate the tp, fp, fn for every threshold (tn is not needed here).
    for index, cutoff in enumerate(threshold):
        tp[index] = np.logical_and(predictions >= cutoff, is_positive).sum()
        fp[index] = np.logical_and(predictions >= cutoff, is_negative).sum()
        fn[index] = np.logical_and(predictions < cutoff, is_positive).sum()

    recall = tp/(tp+fn)
    flagged = tp + fp
    # A threshold that flags nothing gets precision 1 instead of 0/0
    prec = np.divide(tp, flagged, out=np.ones_like(tp), where=flagged > 0)

    # Order the curve by recall; the stable sort keeps points of equal recall
    # in threshold order so the result is deterministic
    sorted_index = np.argsort(recall, kind="stable")
    x = recall[sorted_index]
    y = prec[sorted_index]

    # Note that Zhang implementation sets the first values for x and y as 0 and 1 respectively
    x[0] = 0
    y[0] = 1
    # Close the curve at recall 1 / precision 0: x and y now hold 100 points,
    # i.e. 99 trapezoids
    x = np.append(x, 1.0)
    y = np.append(y, 0.0)

    # Calculate the area using the trapezoidal rule: (b-a)*0.5*(f(b)+f(a))
    area = np.sum(np.diff(x) * 0.5 * (y[:-1] + y[1:]))

    return float(area)

In [7]:
def get_aupr(truth: np.ndarray, predictions: np.ndarray) -> float:
    """Get the AUPR.

    Both arrays are collapsed to one dimension and handed to
    ``average_precision_score``, whose result is returned unchanged.
    """

    labels_1d = truth.ravel()
    scores_1d = predictions.ravel()
    return average_precision_score(labels_1d, scores_1d)

In [None]:
def auroc(truth: np.ndarray, predictions: np.ndarray) -> float:
    """Get the area under the ROC curve.

    Sensitivity and specificity are evaluated at 99 thresholds spread evenly
    between the smallest and the largest prediction, and the area under
    (1 - specificity, sensitivity) is integrated with the trapezoidal rule.

    Args:
        truth (np.ndarray): Ground truth, 1 for a positive and 0 for a negative
        predictions (np.ndarray): Predicted scores, same shape as ``truth``
    Returns:
        float: The area under the ROC curve.
    Raises:
        ValueError: If the shapes differ, or ``truth`` lacks positives or
            negatives (sensitivity or specificity is undefined in that case).
    """

    truth = np.asarray(truth).ravel()
    predictions = np.asarray(predictions, dtype=float).ravel()
    if truth.shape != predictions.shape:
        raise ValueError("truth and predictions must have the same number of elements")

    is_positive = truth == 1
    is_negative = truth == 0
    if not is_positive.any() or not is_negative.any():
        raise ValueError("auroc is undefined unless truth contains both positive and negative entries")

    max_value = predictions.max()
    min_value = predictions.min()
    # Create an array of 99 representing the thresholds
    threshold = min_value + (max_value-min_value)*(np.arange(1,100,1)/100)

    tn = np.zeros(threshold.size)
    tp = np.zeros(threshold.size)
    fn = np.zeros(threshold.size)
    fp = np.zeros(threshold.size)

    # Calculate the tp, tn, fp, fn for every threshold.
    for index, cutoff in enumerate(threshold):
        tp[index] = np.logical_and(predictions >= cutoff, is_positive).sum()
        tn[index] = np.logical_and(predictions < cutoff, is_negative).sum()
        fp[index] = np.logical_and(predictions >= cutoff, is_negative).sum()
        fn[index] = np.logical_and(predictions < cutoff, is_positive).sum()

    # Coordinates of the ROC curve: x = false positive rate, y = sensitivity
    sn = tp/(tp+fn)
    sp = tn/(tn+fp)
    x = 1 - sp
    y = sn.copy()

    # Sort by y and, among equal y, by x. The second key is not redundant:
    # points that share a sensitivity must still be visited with increasing x,
    # otherwise the trapezoids below get negative widths.
    sorted_index = np.lexsort((x, y))
    x = x[sorted_index]
    y = y[sorted_index]

    # Close the curve at (1, 1): x and y now hold 100 points, i.e. 99 trapezoids
    x = np.append(x, 1.0)
    y = np.append(y, 1.0)

    # Calculate the area using the trapezoidal rule: (b-a)*0.5*(f(b)+f(a))
    # First the triangle between the origin and the first point of the curve
    area = 0.5*x[0]*y[0]
    area += np.sum(np.diff(x) * 0.5 * (y[:-1] + y[1:]))

    return float(area)

## Running Training