# Imports

In [102]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import seaborn as sns
from pylab import rcParams
import string
import re
import matplotlib.pyplot as plt
import math
from matplotlib import rc
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict
from sklearn.metrics import accuracy_score, average_precision_score
import matplotlib.ticker as ticker
from math import sqrt
from itertools import product


# Preparing Data

In [103]:
pauwel = pd.read_csv("pauwel-dataset.txt", sep='\t', header=0)
pauwel

Unnamed: 0,abdominal cramps,abdominal distention,abdominal pain,malformations,spontaneous abortion,missed abortion,abscess,acanthosis nigricans,acidosis,renal tubular acidosis,...,drug dependence,diverticulosis,prostatic hypertrophy,allergic reaction,dysphonia,eosinophilic pneumonia,retinal vein thrombosis,renal insufficiency,glioblastoma multiforme,portal cirrhosis
carnitine,1,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
GABA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
delta-aminolevulinic acid,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
leucovorin,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGE2,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pimecrolimus,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
auranofin,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cefditoren,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
nitroprusside,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [104]:
mizutani = pd.read_csv("mizutani-dataset.txt", sep='\t', header=0)
mizutani # drug names are coded with pubchem ID

Unnamed: 0,abdominal.cramps,abdominal.distention,abdominal.pain,malformations,spontaneous.abortion,missed.abortion,abscess,acanthosis.nigricans,acidosis,renal.tubular.acidosis,...,vitamin.deficiency,drug.dependence,diverticulosis,prostatic.hypertrophy,allergic.reaction,dysphonia,eosinophilic.pneumonia,retinal.vein.thrombosis,renal.insufficiency,glioblastoma.multiforme
85,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
119,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
137,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
143,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
158,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6398525,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6398970,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
6447131,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6918453,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
# Check number of missing values
new_pauwel = pauwel.fillna(999).values
num_missing_pauwel = np.argwhere(new_pauwel == 999)
print(len(num_missing_pauwel))

new_mizutani = mizutani.fillna(999).values
num_missing_mizutani = np.argwhere(new_mizutani == 999)
print(len(num_missing_mizutani))

0
0


In [106]:
# Check the density of the pauwel dataset
density = float(len(np.nonzero(pauwel.to_numpy())[0]))
density /= (pauwel.shape[0]*pauwel.shape[1])
density *= 100
print('{:.2f}%'.format(density))

4.97%


In [107]:
# Check the density of the mizutani dataset
density = float(len(np.nonzero(mizutani.to_numpy())[0]))
density /= (mizutani.shape[0]*mizutani.shape[1])
density *= 100
print('{:.2f}%'.format(density))

5.57%


# Training

## Model Implementation

In [108]:
class SGD_Recommender:
    
    def __init__(self, k, lmbda, n_iter=350, learn_rate=0.005, tolerance=1e-04):
        """Set the parameters for SGD"""
        
        self.k=k
        self.lmbda=lmbda
        self.n_iter=n_iter
        self.learn_rate=learn_rate
        self.tolerance=tolerance
        
    def predictions(self, U: np.ndarray, V: np.ndarray) -> np.ndarray:
        """Return dot product of the matrices U and V."""
        
        return np.dot(U, V.T)
    
    def check_convergence(self, U_old: np.ndarray, U_curr: np.ndarray, V_old: np.ndarray, V_curr: np.ndarray) -> bool:
        """Check if matrices have reached convergence."""
        
        # subtract old and current matrices and see if each value in result matrix is less than tolerance 
        U_converged = np.all(np.abs(np.subtract(U_old,U_curr)) <= self.tolerance)
        V_converged = np.all(np.abs(np.subtract(V_old,V_curr)) <= self.tolerance)
        return U_converged and V_converged
        
    def fit(self, train: pd.DataFrame, dataset: pd.DataFrame) -> None:
        """Train the SGD model.
        
        Args:
            train (pd.DataFrame): The training set
            dataset (pd.DataFrame): The dataset
        Returns: 
            None        
        """     
        
        m, n = dataset.shape
        
        # Initialize the low rank matrices U and V with values from the normal distribution N(0,0.01)
        mu, sigma = 0, 0.1
        self.U = np.random.normal(loc=mu, scale=sigma, size=(m, self.k))
        self.V = np.random.normal(loc=mu, scale=sigma, size=(n, self.k))
        
        # Get the indices of known drug-se associations in the training set from the dataset
        drugs, side_fx = train.values.nonzero()
        drug_names = train.index[drugs]
        side_fx_names = train.columns[side_fx]
        drug_index = [dataset.index.get_loc(x) for x in drug_names]
        side_fx_index = [dataset.columns.get_loc(x) for x in side_fx_names]
        
        drug_se = list(zip(drug_index, side_fx_index))
        rng = np.random.default_rng()
        
        # Start of training
        for epoch in range(self.n_iter):
            rng.shuffle(drug_se) # Shuffle in place
            U_old = self.U.copy()
            V_old = self.V.copy()
            
            for (drug, se) in drug_se:
                error = dataset.iloc[drug,se]-self.predictions(self.U[drug,:], self.V[se,:])
                temp_u = self.U[drug,:] + self.learn_rate*(error*self.V[se,:] - self.lmbda*self.U[drug,:])
                temp_v = self.V[se,:] + self.learn_rate*(error*self.U[drug,:] - self.lmbda*self.V[se,:])
                self.U[drug,:] = temp_u 
                self.V[se,:] = temp_v
            
            if self.check_convergence(U_old, self.U, V_old, self.V):
                break
            
    def predict(self) -> np.ndarray:
        """Predict the entire drug-side effect matrix values."""
        
        return self.predictions(self.U, self.V)

In [109]:
# def five_fold_split(dataset):
#     """
#     Splits dataset into 5 equal parts
#     Input: Dataframe
#     """
#     np.random.shuffle(dataset)
#     fold_size = dataset.shape[0]//5
    
#     a = dataset.iloc[0:fold_size]
#     b = dataset.iloc[fold_size:2*fold_size]
#     c = dataset.iloc[2*fold_size:3*fold_size]
#     d = dataset.iloc[3*fold_size:4*fold_size]
#     e = dataset.iloc[4*fold_size:5*fold_size]
    
#     return a,b,c,d,e

In [1]:
def cross_val_5_param_selection(dataset: pd.DataFrame, lmbda: float|int, k: int) -> float:
    """Perform 5 fold CV, given regularization term (lambda) and number of latent features (k).
    
    Args:
        dataset (pd.DataFrame): The dataset
        lmbda (float): The regularization term
        k (int): Number of latent features
    Returns:
        np.ndarray: Average AUPR over the 5 folds.
    """
    
#     np.random.shuffle(dataset)
    dataset.sample(frac=1)
    fold_size = dataset.shape[0]//5
    
    # Specify the test sets of each fold
    a = dataset.iloc[0:fold_size]
    b = dataset.iloc[fold_size:2*fold_size]
    c = dataset.iloc[2*fold_size:3*fold_size]
    d = dataset.iloc[3*fold_size:4*fold_size]
    e = dataset.iloc[4*fold_size:5*fold_size]
    
    # model a
#     print("First Fold")
    model = SGD_Recommender(k=k, lmbda=lmbda)
    model.fit(dataset.drop(a.index), dataset)
    pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    aupr_a = aupr(truth=a.values, predictions=pred.iloc[0:fold_size].values)
    
    
    # model b
#     print("Second Fold")
    model = SGD_Recommender(k=k, lmbda=lmbda)
    model.fit(dataset.drop(b.index), dataset)
    pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    aupr_b = aupr(truth=b.values, predictions=pred.iloc[fold_size:2*fold_size].values)
    
    # model c
#     print("Third Fold")
    model = SGD_Recommender(k=k, lmbda=lmbda)
    model.fit(dataset.drop(c.index), dataset)
    pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    aupr_c = aupr(truth=c.values, predictions=pred.iloc[2*fold_size:3*fold_size].values)
    
    # model d
#     print("Fourth Fold")
    model = SGD_Recommender(k=k, lmbda=lmbda)
    model.fit(dataset.drop(d.index), dataset)
    pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    aupr_d = aupr(truth=d.values, predictions=pred.iloc[3*fold_size:4*fold_size].values)
    
    # model e
#     print("Fifth Fold")
    model = SGD_Recommender(k=k, lmbda=lmbda)
    model.fit(dataset.drop(e.index), dataset)
    pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    aupr_e = aupr(truth=e.values, predictions=pred.iloc[4*fold_size:5*fold_size].values)
    
    mean = np.mean([aupr_a, aupr_b, aupr_c, aupr_d, aupr_e])
    print("Done 5-fold CV")
    return mean
    

NameError: name 'pd' is not defined

## Evaluation Functions

In [111]:
# def param_selection(dataset: pd.DataFrame) -> dict:
#     """Find the optimal set of parameters for the SGD model.
    
#     Args:
#         dataset (pd.DataFrame): The dataset
#     Returns:
#         dict: Dictionary of parameter combinations and associated aupr score.
#     """
    
#     k_values = [1,5,10,15,20,25,30,35,40,50,100]
#     lmda_values = [0.1,1,5,10,15,20]
    
#     results = {}
    
#     # Do 5 fold CV for each possible combination of lambda and k (product function gives the cartesian product)
#     for (k,lmbda) in product(k_values, lmda_values):
#         print("Trying parameters: (" , k, ", ", lmbda, ")")
#         res = cross_val_5_param_selection(dataset=dataset, lmbda=lmbda, k=k)
#         results[(k,lmbda)] = res
    
#     return results
        
    

In [None]:
def param_selection_1(dataset: pd.DataFrame) -> dict:
    """Find the optimal set of parameters for the SGD model.
    
    Args:
        dataset (pd.DataFrame): The dataset
    Returns:
        dict: Dictionary of parameter combinations and associated aupr score.
    """
    
    k_values = [1,5,10,15,20]
    lmda_values = [0.1,1,5,10,15,20]
    
    results = {}
    
    # Do 5 fold CV for each possible combination of lambda and k (product function gives the cartesian product)
    for (k,lmbda) in product(k_values, lmda_values):
        print("Trying parameters: (" , k, ", ", lmbda, ")")
        res = cross_val_5_param_selection(dataset=dataset, lmbda=lmbda, k=k)
        results[(k,lmbda)] = res
    
    return results

In [None]:
def param_selection_2(dataset: pd.DataFrame) -> dict:
    """Find the optimal set of parameters for the SGD model.
    
    Args:
        dataset (pd.DataFrame): The dataset
    Returns:
        dict: Dictionary of parameter combinations and associated aupr score.
    """
    
    k_values = [25,30,35]
    lmda_values = [0.1,1,5,10,15,20]
    
    results = {}
    
    # Do 5 fold CV for each possible combination of lambda and k (product function gives the cartesian product)
    for (k,lmbda) in product(k_values, lmda_values):
        print("Trying parameters: (" , k, ", ", lmbda, ")")
        res = cross_val_5_param_selection(dataset=dataset, lmbda=lmbda, k=k)
        results[(k,lmbda)] = res
    
    return results

In [None]:
def param_selection_3(dataset: pd.DataFrame) -> dict:
    """Find the optimal set of parameters for the SGD model.
    
    Args:
        dataset (pd.DataFrame): The dataset
    Returns:
        dict: Dictionary of parameter combinations and associated aupr score.
    """
    
    k_values = [40,50,100]
    lmda_values = [0.1,1,5,10,15,20]
    
    results = {}
    
    # Do 5 fold CV for each possible combination of lambda and k (product function gives the cartesian product)
    for (k,lmbda) in product(k_values, lmda_values):
        print("Trying parameters: (" , k, ", ", lmbda, ")")
        res = cross_val_5_param_selection(dataset=dataset, lmbda=lmbda, k=k)
        results[(k,lmbda)] = res
    
    return results

In [112]:
def cross_val_5(dataset: pd.DataFrame, lmbda: float|int, k: int) -> np.ndarray:
    """Perform 5 fold CV, given regularization term (lambda) and number of latent features (k).
    
    Args:
        dataset (pd.DataFrame): The dataset
        lmbda (float): The regularization term
        k (int): Number of latent features
    Returns:
        np.ndarray: Average of the metrics (sn, sp, prec, recall, acc, f1, aupr, auc) over the 5 folds.
    
    """
    
#     np.random.shuffle(dataset)
    dataset.sample(frac=1)

    fold_size = dataset.shape[0]//5
    
    # Specify the test sets of each fold
    a = dataset.iloc[0:fold_size]
    b = dataset.iloc[fold_size:2*fold_size]
    c = dataset.iloc[2*fold_size:3*fold_size]
    d = dataset.iloc[3*fold_size:4*fold_size]
    e = dataset.iloc[4*fold_size:5*fold_size]   
    
    # model a
    print("First Fold")
    model = SGD_Recommender(k=k, lmbda=lmbda)
    model.fit(dataset.drop(a.index), dataset)
    pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    
    aupr_a = aupr(truth=a.values, predictions=pred.iloc[0:fold_size].values)
    auroc_a = auroc(truth=a.values, predictions=pred.iloc[0:fold_size].values)
    sn_a, sp_a, prec_a, acc_a, f1_a = get_metric(truth=a.values, predictions=pred.iloc[0:fold_size].values)
    
    
    # model b
    print("Second Fold")
    model = SGD_Recommender(k=k, lmbda=lmbda)
    model.fit(dataset.drop(b.index), dataset)
    pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    
    aupr_b = aupr(truth=b.values, predictions=pred.iloc[fold_size:2*fold_size].values)
    auroc_b = auroc(truth=b.values, predictions=pred.iloc[fold_size:2*fold_size].values)
    sn_b, sp_b, prec_b, acc_b, f1_b = get_metric(truth=b.values, predictions=pred.iloc[fold_size:2*fold_size].values)

    
    # model c
    print("Third Fold")
    model = SGD_Recommender(k=k, lmbda=lmbda)
    model.fit(dataset.drop(c.index), dataset)
    pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    
    aupr_c = aupr(truth=c.values, predictions=pred.iloc[2*fold_size:3*fold_size].values)
    auroc_c = auroc(truth=c.values, predictions=pred.iloc[2*fold_size:3*fold_size].values)
    sn_c, sp_c, prec_c, acc_c, f1_c = get_metric(truth=c.values, predictions=pred.iloc[2*fold_size:3*fold_size].values)
    
    # model d
    print("Fourth Fold")
    model = SGD_Recommender(k=k, lmbda=lmbda)
    model.fit(dataset.drop(d.index), dataset)
    pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    
    aupr_d = aupr(truth=d.values, predictions=pred.iloc[3*fold_size:4*fold_size].values)
    auroc_d = auroc(truth=d.values, predictions=pred.iloc[3*fold_size:4*fold_size].values)
    sn_d, sp_d, prec_d, acc_d, f1_d = get_metric(truth=d.values, predictions=pred.iloc[3*fold_size:4*fold_size].values)
    
    # model e
    print("Fifth Fold")
    model = SGD_Recommender(k=k, lmbda=lmbda)
    model.fit(dataset.drop(e.index), dataset)
    pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    
    aupr_e = aupr(truth=e.values, predictions=pred.iloc[4*fold_size:5*fold_size].values)
    auroc_e = auroc(truth=e.values, predictions=pred.iloc[4*fold_size:5*fold_size].values)
    sn_e, sp_e, prec_e, acc_e, f1_e = get_metric(truth=e.values, predictions=pred.iloc[4*fold_size:5*fold_size].values)
    
    
    # Calculate the average
    aupr_mean = np.mean([aupr_a, aupr_b, aupr_c, aupr_d, aupr_e])
    auroc_mean = np.mean([auroc_a, auroc_b, auroc_c, auroc_d, auroc_e])
    sn_mean = np.mean([sn_a, sn_b, sn_c, sn_d, sn_e])
    sp_mean = np.mean([sp_a, sp_b, sp_c, sp_d, sp_e])
    prec_mean = np.mean([prec_a, prec_b, prec_c, prec_d, prec_e])
    acc_mean = np.mean([acc_a, acc_b, acc_c, acc_d, acc_e])
    f1_mean = np.mean([f1_a, f1_b, f1_c, f1_d, f1_e])
    
    return aupr_mean, auroc_mean, sn_mean, sp_mean, prec_mean, acc_mean, f1_mean
    

In [113]:
def train_optimal_params(dataset: pd.DataFrame, lmbda: float|int, k: int) -> np.ndarray:
    """Do 20 runs of 5 fold CV."""
        
    for run in range(20):
        t_aupr, t_auroc, t_sn, t_sp, t_prec, t_acc, t_f1 =  cross_val_5(dataset=dataset, lmbda=lmbda, k=k)
        aupr, auroc, sn, sp, prec, acc, f1 = aupr+t_aupr, auroc+t_auroc, sn+t_sn, sp+t_sp, prec+t_prec, acc+t_acc, f1+t_f1    
    
    # Return the mean of the metrics over the 20 runs
    return np.array([aupr, auroc, sn, sp, prec, acc, f1])/20


In [114]:
def aupr(truth: np.ndarray, predictions: np.ndarray) -> float:
    """Get the area under the precision-recall curve."""
#     truth = truth.flatten()
#     predictions = predictions.flatten()
    
    max_value = predictions.max()
    min_value = predictions.min()
    # Create an array of 99 representing the thresholds
    threshold = min_value + (max_value-min_value)*(np.arange(1,100,1)/100)
    
    tn = np.zeros((threshold.size, 1))
    tp = np.zeros((threshold.size, 1))
    fn = np.zeros((threshold.size, 1))
    fp = np.zeros((threshold.size, 1))
    
    # Calculate the tp, tn, fp, fn for every threshold.
    for index in range(99):
        tp[index,0] = np.logical_and(predictions>=threshold[index], truth==1).sum()
        tn[index,0] = np.logical_and(predictions<threshold[index], truth==0).sum()
        fp[index,0] = np.logical_and(predictions>=threshold[index], truth==0).sum()
        fn[index,0] = np.logical_and(predictions<threshold[index], truth==1).sum()
    
    # Calculate the area under the precision-recall curve
    recall = tp/(tp+fn)
    prec = tp/(tp+fp)
    
    x = recall.copy()
    y = prec.copy()
    
    sorted_index = x.argsort()
    y = np.take_along_axis(y, sorted_index, axis=0)
    x = np.take_along_axis(x, sorted_index, axis=0)

    # Note that Zhang implementation sets the first values for x and y as 0 and 1 respectively
    x[0][0] = 0
    y[0][0] = 1
#     x = np.insert(x, 0, 0, 0)
#     y = np.insert(y, 0, 1, 0)
    # x and y now have 100 values, not 99, causing no errors in trapezoidal rule calculation (indexoutofbound)
    x = np.append(x, [[1]], 0)
    y = np.append(y, [[0]], 0)

    
    # Calculate the area using the trapezoidal rule: (b-a)*0.5*(f(b)+f(a))
    area = 0
    area = 0.5*x[0][0]*(1+y[0][0]) # still 0
    for idx in range(99):
        area += (x[idx+1][0]-x[idx][0])*0.5*(y[idx][0]+ y[idx+1][0])
    
    return area

In [115]:
# def get_aupr(truth: np.ndarray, predictions: np.ndarray) -> float:
#     """Get the AUPR."""
    
#     return average_precision_score(truth.flatten(), predictions.flatten())

In [116]:
def auroc(truth: np.ndarray, predictions: np.ndarray) -> float:
    """Get the area under the ROC curve."""
    
#     truth = truth.flatten()
#     predictions = predictions.flatten()
    
    max_value = predictions.max()
    min_value = predictions.min()
    # Create an array of 99 representing the thresholds
    threshold = min_value + (max_value-min_value)*(np.arange(1,100,1)/100)
    
    tn = np.zeros((threshold.size, 1))
    tp = np.zeros((threshold.size, 1))
    fn = np.zeros((threshold.size, 1))
    fp = np.zeros((threshold.size, 1))
    
    # Calculate the tp, tn, fp, fn for every threshold.
    for index in range(99):
        tp[index,0] = np.logical_and(predictions>=threshold[index], truth==1).sum()
        tn[index,0] = np.logical_and(predictions<threshold[index], truth==0).sum()
        fp[index,0] = np.logical_and(predictions>=threshold[index], truth==0).sum()
        fn[index,0] = np.logical_and(predictions<threshold[index], truth==1).sum()
    
    # Calculate the area under the precision-recall curve
    sn = tp/(tp+fn)
    sp = tn/(tn+fp)
    x = 1 - sp
    y = sn.copy()
    
    # Zhang sorted twice - I think it's redundant
#     sorted_index = x.argsort()
#     x = x[sorted_index]
#     y = y[sorted_index]

    sorted_index = y.argsort()
    y = np.take_along_axis(y, sorted_index, axis=0)
    x = np.take_along_axis(x, sorted_index, axis=0)


    # x and y now have 100 values, not 99, causing no errors in trapezoidal rule calculation (indexoutofbound)
    x = np.append(x, [[1]], 0)
    y = np.append(y, [[1]], 0)

    
    # Calculate the area using the trapezoidal rule: (b-a)*0.5*(f(b)+f(a))
    area = 0
    area = 0.5*x[0][0]*(y[0][0]) # still 0
    for idx in range(99):
        area += (x[idx+1][0]-x[idx][0])*0.5*(y[idx][0]+ y[idx+1][0])
    
    return area

In [117]:
def classification_metric(truth: np.ndarray, predictions: np.ndarray) -> tuple:
    """Calculate the evaulation metrics given 1-D vector of ground truth and predictions."""
#     truth = truth.flatten()
#     predictions = predictions.flatten()
    
    tp = np.logical_and(predictions==1, truth==1).sum()
    tn = np.logical_and(predictions==0, truth==0).sum()
    fp = np.logical_and(predictions==1, truth==0).sum()
    fn = np.logical_and(predictions==0, truth==1).sum()
    
    acc = (tp+tn)/(tn+tp+fn+fp)
    sn = tp/(tp+fn)
    recall = sn
    sp = tn/(tn+fp)
    prec = tp/(tp+fp)
    f1 = (2*prec*recall)/(recall+prec)
    
    return sn,sp,prec,acc,f1
    

In [118]:
def get_metric(truth: np.ndarray, predictions: np.ndarray) -> tuple:
    """Calculate the metrics of the drug-side effect matrix."""
#     truth = truth.flatten()
#     predictions = predictions.flatten()
    
    max_value = predictions.max()
    min_value = predictions.min()
    # Create an 1-D array of 999 threshold values in ascending order
    threshold = min_value + (max_value-min_value)*(np.arange(1,1000,1)/1000)
    
    temp_sn = np.zeros(999)
    temp_sp = np.zeros(999)
    temp_prec = np.zeros(999)
    temp_acc = np.zeros(999)
    temp_f1 = np.zeros(999)
    
    for index in range(999):
        # assign values above threshold to 1
        pred = predictions>threshold[index]
        # calculate the metrics for the predictions under threshold i
        temp_sn[index],temp_sp[index],temp_prec[index],temp_acc[index],temp_f1[index] = classification_metric(truth, pred)
    
    # Get index corresponding to max f1 score (optimal value of prec and recall)
    index_max_f1 = temp_f1.argmax()
    sn = temp_sn[index_max_f1]
    sp = temp_sp[index_max_f1]
    prec = temp_prec[index_max_f1]
    acc = temp_acc[index_max_f1]
    f1 = temp_f1[index_max_f1]
    
    return sn, sp, prec, acc, f1    

## Running Training

### Training Pauwel

In [119]:
import time

In [None]:
start = time.time()

print("Finding optimal params.....")
results_1 = param_selection_1(pauwel)
# Get the parameter combination corresponding to highest AUPR
# k, lmda = max(res, key=res.get)

elapsed_time_1 = time.time() - start
print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))


In [None]:
start = time.time()

print("Finding optimal params.....")
results_2 = param_selection_2(pauwel)
# Get the parameter combination corresponding to highest AUPR
# k, lmda = max(res, key=res.get)

elapsed_time_2 = time.time() - start
print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))


In [None]:
start = time.time()

print("Finding optimal params.....")
results_3 = param_selection_3(pauwel)
# Get the parameter combination corresponding to highest AUPR
# k, lmda = max(res, key=res.get)

elapsed_time_3 = time.time() - start
print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))


In [None]:
# find the optimal number of params

# print the dictionaries



In [None]:
print("Now training with optimal params.....")

# 20 runs of 5-CV with optimal parameters
res_optimal = train_optimal_params(dataset=pauwel, lmbda=lmda, k=k)
