# Imports

In [1]:
import numpy as np
import pandas as pd
from itertools import product
from numba import jit, njit, prange
import time
%matplotlib inline

# Preparing Data

In [2]:
pauwel = pd.read_csv("pauwel-dataset.txt", sep='\t', header=0)
pauwel

Unnamed: 0,abdominal cramps,abdominal distention,abdominal pain,malformations,spontaneous abortion,missed abortion,abscess,acanthosis nigricans,acidosis,renal tubular acidosis,...,drug dependence,diverticulosis,prostatic hypertrophy,allergic reaction,dysphonia,eosinophilic pneumonia,retinal vein thrombosis,renal insufficiency,glioblastoma multiforme,portal cirrhosis
carnitine,1,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
GABA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
delta-aminolevulinic acid,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
leucovorin,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGE2,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pimecrolimus,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
auranofin,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cefditoren,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
nitroprusside,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
mizutani = pd.read_csv("mizutani-dataset.txt", sep='\t', header=0)
mizutani # drug names are coded with pubchem ID

Unnamed: 0,abdominal.cramps,abdominal.distention,abdominal.pain,malformations,spontaneous.abortion,missed.abortion,abscess,acanthosis.nigricans,acidosis,renal.tubular.acidosis,...,vitamin.deficiency,drug.dependence,diverticulosis,prostatic.hypertrophy,allergic.reaction,dysphonia,eosinophilic.pneumonia,retinal.vein.thrombosis,renal.insufficiency,glioblastoma.multiforme
85,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
119,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
137,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
143,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
158,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6398525,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6398970,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
6447131,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6918453,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
liu = pd.read_csv("liu-dataset.csv", sep=',', header=None)
liu

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1375,1376,1377,1378,1379,1380,1381,1382,1383,1384
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
827,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
828,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
829,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
830,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [5]:
# Check number of missing values
new_pauwel = pauwel.fillna(999).values
num_missing_pauwel = np.argwhere(new_pauwel == 999)
print("The number of missing values in pauwel's dataset:", len(num_missing_pauwel))

The number of missing values in pauwel's dataset: 0


In [6]:
new_mizutani = mizutani.fillna(999).values
num_missing_mizutani = np.argwhere(new_mizutani == 999)
print("The number of missing values in mizutani's dataset:",len(num_missing_mizutani))

new_liu = liu.fillna(999).values
num_missing_liu = np.argwhere(new_liu == 999)
print("The number of missing values in pauwel's dataset:" ,len(num_missing_liu))


The number of missing values in mizutani's dataset: 0
The number of missing values in pauwel's dataset: 0


In [7]:
def density(df):
    """Calculate the density of the matrix."""
    
    density = float(len(np.nonzero(df.to_numpy())[0]))
    known_se = density
    density /= (df.shape[0]*df.shape[1])
    density *= 100
    return '{:.2f}%'.format(density), known_se

In [8]:
print("Pauwel:", density(pauwel))
print("Liu:", density(liu))
print("Mizutani:", density(mizutani))

Pauwel: ('4.97%', 61102.0)
Liu: ('5.14%', 59205.0)
Mizutani: ('5.57%', 49051.0)


# Model Implementation

### Non-Numba Implementation

In [9]:
class SGD_Recommender:
    
    def __init__(self, k:float, lmbda:float, max_iter:int=350, learn_rate=0.005, tolerance=1e-06):
        """Sets the parameters for SGD."""
        
        self.k=k
        self.lmbda=lmbda
        self.max_iter=max_iter
        self.learn_rate=learn_rate
        self.tolerance=tolerance

    # Original Implementation
    # def fit(self, drug_index: np.ndarray, side_fx_index: np.ndarray, dataset: np.ndarray) -> None:
    #     """Train the SGD model.
        
    #     Args:
    #         drug_index (np.ndarray): Row indices of the non-zero values in train set.
    #         side_fx_index (np.ndarray): Column indices of the non-zero values in train set.
    #         dataset (np.ndarray): The dataset
    #     Returns: 
    #         None        
    #     """     
        
    #     m, n = dataset.shape
        
    #     # Initialize the low rank matrices U and V with values from the normal distribution N(0,0.01)
    #     mu, sigma = 0, 0.1
    #     self.U = np.random.normal(loc=mu, scale=sigma, size=(m, self.k))
    #     self.V = np.random.normal(loc=mu, scale=sigma, size=(n, self.k))
        
    #     # Get the indices of known drug-se associations in the training set with respect to the dataset
    #     drug_se = np.array(list(zip(drug_index,side_fx_index)))
    #     rng = np.random.default_rng()
        
    #     # Start of training
    #     for _ in range(self.max_iter):
    #         rng.shuffle(drug_se) # Shuffle in place
    #         U_old = self.U.copy()
    #         V_old = self.V.copy()
            
    #         for (drug, se) in drug_se:
    #             error = dataset[drug,se] - self.predictions(self.U[drug,:], self.V[se,:])
    #             temp_u = self.U[drug,:] + self.learn_rate*(error*self.V[se,:] - self.lmbda*self.U[drug,:])
    #             temp_v = self.V[se,:] + self.learn_rate*(error*self.U[drug,:] - self.lmbda*self.V[se,:])
    #             self.U[drug,:] = temp_u 
    #             self.V[se,:] = temp_v
            
    #         if self.converged(U_old, self.U) and self.converged(V_old, self.V):
    #             break
            
    def fit(self, train: np.ndarray) -> None:
        """Train the SGD model.
        
        Args:            
            train (np.ndarray): The training set
        Returns: 
            None        
        """     
        
        m, n = train.shape
        
        # Initialize the low rank matrices U and V with values from the normal distribution N(0,0.01)
        mu, sigma = 0, 0.1
        self.U = np.random.normal(loc=mu, scale=sigma, size=(m, self.k))
        self.V = np.random.normal(loc=mu, scale=sigma, size=(n, self.k))
        
        # Get non-zero values in train set
        drug, se = train.nonzero()
        drug_se = list(zip(drug, se))
        rng = np.random.default_rng()
        
        # Start of training
        for _ in range(self.max_iter):
            rng.shuffle(drug_se) # Shuffle in place
            U_old = self.U.copy()
            V_old = self.V.copy()
            
            for (drug, se) in drug_se:
                error = train[drug,se] - self.predictions(self.U[drug,:], self.V[se,:])
                temp_u = self.U[drug,:] + self.learn_rate*(error*self.V[se,:] - self.lmbda*self.U[drug,:])
                temp_v = self.V[se,:] + self.learn_rate*(error*self.U[drug,:] - self.lmbda*self.V[se,:])
                self.U[drug,:] = temp_u 
                self.V[se,:] = temp_v
            
            if self.converged(U_old, self.U) and self.converged(V_old, self.V):
                break
    
    def predict(self) -> np.ndarray:
        """Predict the entire drug-side effect matrix values."""
        
        return self.predictions(self.U, self.V)

            
    def predictions(self, U: np.ndarray, V: np.ndarray) -> np.ndarray:
        """Return dot product of the matrices U and V."""
        
        return np.dot(U, V.T)
    
    def converged(self, old: np.ndarray, curr: np.ndarray) -> bool:
        """Check if matrices have reached convergence."""
        
        return np.all(np.abs(np.subtract(old,curr)) <= self.tolerance)
            
    

### Numba Implementation

In [10]:
  
@njit(cache=True)
def predictions(U: np.ndarray, V: np.ndarray) -> np.ndarray:
    """Return dot product of the matrices U and V."""
    return np.dot(U, V.T)

@njit(cache=True)
def converged(old: np.ndarray, curr: np.ndarray, tolerance:float=1e-06) -> bool:
    """Check if matrices have reached convergence."""

    return np.all(np.abs(np.subtract(old,curr)) <= tolerance)
    
@njit(cache=True)
def fit(drug_index: np.ndarray, side_fx_index: np.ndarray, dataset: np.ndarray, 
        k: float, lmbda: float, max_iter:int=350, learn_rate:float=0.005):
    """Train the SGD model.

    Args:
        drug_index (np.ndarray): Row indices of the non zero values in train set.
        side_fx_index (np.ndarray): Column indices of the non zero values in train set
        dataset (np.ndarray): The dataset
        k (float): Number of latent features
        lmbda (float): Regularization term.
        max_iter (int): Max number of iterations
        learn_rate (float): Learning rate
    Returns: 
        U, V (np.ndarray): The low rank matrix representation of the drug-side effect matrix.        
    """     

    m, n = dataset.shape
    # Initialize the low rank matrices U and V with values from the normal distribution N(0,0.01)
    mu, sigma = 0, 0.1
    U = np.random.normal(loc=mu, scale=sigma, size=(m, k))
    V = np.random.normal(loc=mu, scale=sigma, size=(n, k))

    
    drug_se = np.array(list(zip(drug_index,side_fx_index)))

    # Start of training
    for epoch in range(max_iter):

        np.random.shuffle(drug_se) # Shuffle in place
        U_old = U.copy()
        V_old = V.copy()

        # Learn from the known  associations in the training set (drug_se)
        for (drug, se) in drug_se:
            error = dataset[drug,se] - predictions(U[drug,:], V[se,:])
            temp_u = U[drug,:] + learn_rate*(error*V[se,:] - lmbda*U[drug,:])
            temp_v = V[se,:] + learn_rate*(error*U[drug,:] - lmbda*V[se,:])
            U[drug,:] = temp_u 
            V[se,:] = temp_v

        if converged(U_old, U) and converged(V_old, V):
            break
    
    return U,V

@njit(cache=True)    
def predict(U:np.ndarray, V:np.ndarray) -> np.ndarray:
        """Predict the entire drug-side effect matrix values."""
        
        return predictions(U, V)

# Evaluation Functions

## AUPR

In [11]:

def aupr(truth: np.ndarray, predictions: np.ndarray) -> float:
    """Get the area under the precision-recall curve, using trapezoidal rule.
    
    Args:
        truth: 1-D vector of ground truth values
        predictions: 1-D vector of predictions

    Returns:
        (float): The area.
    """
    
    max_value = predictions.max()
    min_value = predictions.min()
    # Create an array of 99 representing the thresholds
    threshold = min_value + (max_value-min_value)*np.arange(1,100,1)/100
    
    tn = np.zeros((threshold.size, 1))
    tp = np.zeros((threshold.size, 1))
    fn = np.zeros((threshold.size, 1))
    fp = np.zeros((threshold.size, 1))
    
    # Calculate the tp, tn, fp, fn for every threshold.
    for i in range(threshold.size):
        tp[i,0] = np.logical_and(predictions>=threshold[i], truth==1).sum()
        tn[i,0] = np.logical_and(predictions<threshold[i], truth==0).sum()
        fp[i,0] = np.logical_and(predictions>=threshold[i], truth==0).sum()
        fn[i,0] = np.logical_and(predictions<threshold[i], truth==1).sum()
    
    # Calculate the area under the precision-recall curve
    recall = tp/(tp+fn)
    prec = tp/(tp+fp)
    
    x = recall
    y = prec
    
    sorted_index = x.argsort(axis=0)
    y = np.take_along_axis(y, sorted_index, axis=0)
    x = np.take_along_axis(x, sorted_index, axis=0)    

    x[0][0] = 0
    y[0][0] = 1
    x = np.append(x, [[1]], axis=0)
    y = np.append(y, [[0]], axis=0)

    
    # Calculate the area using the trapezoidal rule: (b-a)*0.5*(f(b)+f(a))
    area = 0
    area = 0.5*x[0][0]*(1+y[0][0]) # still 0
    for i in range(threshold.size):
        area += (y[i][0]+ y[i+1][0])*(x[i+1][0]-x[i][0])*0.5
    return area
    

## AUROC

In [12]:

def auroc(truth: np.ndarray, predictions: np.ndarray) -> float:
    """Get the area under the ROC curve.
    
    Args:
        truth: 1-D vector of ground truth values
        predictions: 1-D vector of predictions

    Returns:
        (float): The area.
    """
    
    max_value = predictions.max()
    min_value = predictions.min()
    # Create an array of 99 representing the thresholds
    threshold = min_value + (max_value-min_value)*np.arange(1,100,1)/100
    
    tn = np.zeros((threshold.size, 1))
    tp = np.zeros((threshold.size, 1))
    fn = np.zeros((threshold.size, 1))
    fp = np.zeros((threshold.size, 1))
    
    # Calculate the tp, tn, fp, fn for every threshold.
    for i in range(threshold.size):
        tp[i,0] = np.logical_and(predictions>=threshold[i], truth==1).sum()
        tn[i,0] = np.logical_and(predictions<threshold[i], truth==0).sum()
        fp[i,0] = np.logical_and(predictions>=threshold[i], truth==0).sum()
        fn[i,0] = np.logical_and(predictions<threshold[i], truth==1).sum()
    
    # Calculate the area under the precision-recall curve
    sn = tp/(tp+fn)
    sp = tn/(tn+fp)
    x = 1 - sp
    y = sn

    sorted_index = x.argsort(axis=0)
    y = np.take_along_axis(y, sorted_index, axis=0)
    x = np.take_along_axis(x, sorted_index, axis=0)

    sorted_index = y.argsort(axis=0)
    y = np.take_along_axis(y, sorted_index, axis=0)
    x = np.take_along_axis(x, sorted_index, axis=0)

    x = np.append(x, [[1]], 0)
    y = np.append(y, [[1]], 0)
    
    # Calculate the area using the trapezoidal rule: (b-a)*0.5*(f(b)+f(a))
    area = 0
    area = 0.5*x[0][0]*y[0][0] # still 0
    for i in range(threshold.size):
        area += (y[i][0]+ y[i+1][0])*(x[i+1][0]-x[i][0])*0.5
    
    return area

## Other Metrics
Sensitivity, specificity, precision, recall, accuracy, f1-measure

In [13]:

def classification_metric(truth: np.ndarray, predictions: np.ndarray) -> tuple:
    """Calculate the evaulation metrics given 1-D vector of ground truth and predictions.
    
    Args:
        truth: 1-D vector of ground truth values
        predictions: 1-D vector of predictions

    Returns:
        sensitivity, specificity, precision, accuracy, f1
    """
    
    tp = np.logical_and(predictions==1, truth==1).sum()
    tn = np.logical_and(predictions==0, truth==0).sum()
    fp = np.logical_and(predictions==1, truth==0).sum()
    fn = np.logical_and(predictions==0, truth==1).sum()
    
    acc = (tp+tn)/(tn+tp+fn+fp)
    sn = tp/(tp+fn)
    recall = sn
    sp = tn/(tn+fp)
    prec = tp/(tp+fp)
    f1 = (2.0*prec*recall)/(recall+prec)  
      
    return sn,sp,prec,acc,f1
    

In [14]:

def get_metric(truth: np.ndarray, predictions: np.ndarray) -> tuple:
    """Calculate the metrics of the drug-side effect matrix.
    Args:
        truth: 1-D vector of ground truth values
        predictions: 1-D vector of predictions

    Returns:
        sensitivity, specificity, precision, accuracy, f1
    """

    max_value = predictions.max()
    min_value = predictions.min()
    # Create an 1-D array of 999 threshold values in ascending order
    threshold = min_value + (max_value-min_value)*np.arange(1,1000,1)/1000
    temp_sn = np.zeros(threshold.size)
    temp_sp = np.zeros(threshold.size)
    temp_prec = np.zeros(threshold.size)
    temp_acc = np.zeros(threshold.size)
    temp_f1 = np.zeros(threshold.size)

    for i in range(threshold.size):
        # assign values above threshold to 1
        predict_label = predictions>threshold[i]
        # calculate the metrics for the predictions under threshold i
        temp_sn[i],temp_sp[i],temp_prec[i],temp_acc[i],temp_f1[i] = classification_metric(truth, predict_label)
    
    # Get index corresponding to max f1 score (optimal value of prec and recall)
    indx_max_f1 = np.nanargmax(temp_f1)
    sn = temp_sn[indx_max_f1]
    sp = temp_sp[indx_max_f1]
    prec = temp_prec[indx_max_f1]
    acc = temp_acc[indx_max_f1]
    f1 = temp_f1[indx_max_f1]
    
    return sn, sp, prec, acc, f1    

# Cross-validation

In [15]:
def cross_val(cv:int, dataset:np.ndarray, lmda:float, k:float) -> np.ndarray:
    interaction_matrix = dataset
    row, col = dataset.shape
    cv_matrix = np.ceil(np.random.rand(row,col)*cv)

    predict_score_matrix = np.zeros((row,col))

    for fold in range(cv):
        print("Performing CV fold: {0}".format(fold))
        test_index_matrix = (cv_matrix==fold)
        train_index_matrix = np.logical_not(test_index_matrix)
        train_interaction_matrix = np.multiply(interaction_matrix,train_index_matrix)

        sgd = SGD_Recommender(k=k, lmbda=lmda)
        sgd.fit(train=train_interaction_matrix)
        predict_matrix = sgd.predict()

        predict_score_matrix = predict_score_matrix + np.multiply(predict_matrix, test_index_matrix)
    
    auc = auroc(truth=interaction_matrix.flatten(), predictions=predict_score_matrix.flatten())
    auprc = aupr(truth=interaction_matrix.flatten(), predictions=predict_score_matrix.flatten())
    sn, sp, prec, acc, f1 = get_metric(truth=interaction_matrix.flatten(), predictions=predict_score_matrix.flatten())

    return np.array([auprc, auc, sn, sp, prec, acc, f1])
        
        



# Training

### Cross validation for parameter selection (Not used)
Only calculates the aupr values.

In [16]:

def cross_val_5_param_selection(dataset: np.ndarray, lmbda: float, k: float) -> float:
    """Perform 5 fold CV, given regularization term (lambda) and number of latent features (k).
    
    Args:
        dataset (np.ndarray): The dataset
        lmbda (float): The regularization term
        k (int): Number of latent features
    Returns:
        np.ndarray: Average AUPR over the 5 folds.
    """    
 
    np.random.shuffle(dataset)

    fold_size = dataset.shape[0]//5
    a = dataset[0:fold_size]
    b = dataset[fold_size:2*fold_size]
    c = dataset[2*fold_size:3*fold_size]
    d = dataset[3*fold_size:4*fold_size]
    e = dataset[4*fold_size:]
    

    # First fold: model a
    
    test = a
    train = np.concatenate((b,c,d,e),0)
    # get the row and col indices of the nonzero values in train, relative to the dataset
    known_drug_se = train.nonzero()
    side_fx_index = known_drug_se[1]
    drug_index = known_drug_se[0] + fold_size
    # training
    U,V = fit(drug_index, side_fx_index, dataset, k, lmbda)
    aupr_a = aupr(truth=test, predictions=predict(U,V)[0:fold_size])


    # Second fold: model b
    
    test = b
    known_drug_se = a.nonzero()
    # get the row and col indices of the nonzero values in train, relative to the dataset
    side_fx_index = known_drug_se[1]
    drug_index = known_drug_se[0]
    cde = np.concatenate((c,d,e),axis=0)
    known_drug_se = cde.nonzero()
    side_fx_index = np.concatenate((side_fx_index,known_drug_se[1]), axis=0)
    drug_index = np.concatenate((drug_index,(known_drug_se[0]+(2*fold_size))), axis=0)
    # training
    U,V = fit(drug_index=drug_index, side_fx_index= side_fx_index, dataset=dataset, k=k, lmbda=lmbda)
    aupr_b = aupr(truth=test, predictions=predict(U,V)[fold_size:2*fold_size])


    # Third fold: model c
    test = c
    # get the row and col indices of the nonzero values in train, relative to the dataset
    ab = np.concatenate((a,b),0)
    known_drug_se = ab.nonzero()
    side_fx_index = known_drug_se[1]
    drug_index = known_drug_se[0]
    cde = np.concatenate((d,e),axis=0)
    known_drug_se = cde.nonzero()
    side_fx_index = np.concatenate((side_fx_index,known_drug_se[1]), axis=0)
    drug_index = np.concatenate((drug_index,known_drug_se[0]+(3*fold_size)), axis=0)
    # training
    U,V = fit(drug_index=drug_index, side_fx_index= side_fx_index, dataset=dataset, k=k, lmbda=lmbda)
    aupr_c = aupr(truth=test, predictions=predict(U,V)[2*fold_size:3*fold_size])
    
   
    # Fourth fold: model d
    test = d
    # get the row and col indices of the nonzero values in train, relative to the dataset
    ab = np.concatenate((a,b,c),0)
    known_drug_se = ab.nonzero()
    side_fx_index = known_drug_se[1]
    drug_index = known_drug_se[0]
    known_drug_se = e.nonzero()
    side_fx_index = np.concatenate((side_fx_index,known_drug_se[1]), axis=0)
    drug_index = np.concatenate((drug_index,(known_drug_se[0]+(4*fold_size))), axis=0)
    #training
    U,V = fit(drug_index=drug_index, side_fx_index= side_fx_index, dataset=dataset, k=k, lmbda=lmbda)
    aupr_d = aupr(truth=test, predictions=predict(U,V)[3*fold_size:4*fold_size])
    

    # Fifth fold: model e
    test = e
    # get the row and col indices of the nonzero values in train, relative to the dataset
    train = np.concatenate((a,b,c,d),0)
    known_drug_se = train.nonzero()
    side_fx_index = known_drug_se[1]
    drug_index = known_drug_se[0]
    U, V = fit(drug_index=drug_index, side_fx_index= side_fx_index, dataset=dataset, k=k, lmbda=lmbda)
    aupr_e = aupr(truth=test, predictions=predict(U,V)[4*fold_size:])
     
    return (aupr_a + aupr_b + aupr_c + aupr_d + aupr_e)/5
    

### Finding the optimal parameters (Not used)

In [17]:

# def param_selection(dataset: np.ndarray, k_values: np.ndarray, lmda_values: np.ndarray) -> dict:
#     """Calculate the metrics of the model, for each pair of given k and lambda parameter values.
    
#     Args:
#         dataset (np.ndarray): The dataset
#         k_values (np.ndarray): List of k values
#         lmda_values (np.ndarray): List of lambda values
#     Returns:
#         dict: Dictionary of parameter combinations and their associated score.
#     """
#     results = {}
#     # Do 5 fold CV for each possible combination of lambda and k (product function gives the cartesian product)
#     for (k,lmbda) in product(k_values, lmda_values):
#         print("Trying parameters: (" , k, ", ", lmbda, ")")
#         # print("Hello World")
#         res = cross_val_5_param_selection(dataset=dataset, lmbda=lmbda, k=k)
#         print("AUPR: ", res)
#         results[(k,lmbda)] = res
    
#     return results
        
    

## Finding the optimal parameters (currently used)
Optimal parameters are those which yield highest AUPR

In [18]:

def param_selection(dataset: np.ndarray, k_values: np.ndarray, lmda_values: np.ndarray) -> dict:
    """Calculate the metrics of the model, for each pair of given k and lambda parameter values.
    
    Args:
        dataset (np.ndarray): The dataset
        k_values (np.ndarray): List of k values
        lmda_values (np.ndarray): List of lambda values
    Returns:
        dict: Dictionary of parameter combinations and their associated score.
    """
    results = {}
    # Do 5 fold CV for each possible combination of lambda and k (product function gives the cartesian product)
    for (k,lmbda) in product(k_values, lmda_values):
        print("Trying parameters: (" , k, ", ", lmbda, ")")
        res = cross_val(cv=5, dataset=dataset, k=k, lmda=lmbda)
        print(res)
        results[(k,lmbda)] = res
    
    return results
        
    

### Cross Validation with the optimal parameters (Not used)
Calculates SN, SP, Precision, Recall, AUPR, F1, AUROC with the optimal parameters.


In [19]:

def cross_val_5_optimal(dataset: np.ndarray, lmbda: float, k: float) -> float:
    """Perform 5 fold CV, given regularization term (lambda) and number of latent features (k).
    
    Args:
        dataset (np.ndarray): The dataset
        lmbda (float): The regularization term
        k (int): Number of latent features
    Returns:
        np.ndarray: Average AUPR over the 5 folds.
    """    
 
    np.random.shuffle(dataset)

    fold_size = dataset.shape[0]//5
    a = dataset[0:fold_size]
    b = dataset[fold_size:2*fold_size]
    c = dataset[2*fold_size:3*fold_size]
    d = dataset[3*fold_size:4*fold_size]
    e = dataset[4*fold_size:]
    

    # First fold: model a
    
    test = a
    train = np.concatenate((b,c,d,e),0)
    # get the row and col indices of the nonzero values in train, relative to the dataset
    known_drug_se = train.nonzero()
    side_fx_index = known_drug_se[1]
    drug_index = known_drug_se[0] + fold_size
    # training
    U,V = fit(drug_index, side_fx_index, dataset, k, lmbda)
    aupr_a = aupr(truth=test, predictions=predict(U,V)[0:fold_size])
    auroc_a = auroc(truth=test, predictions=predict(U,V)[0:fold_size])
    sn_a, sp_a, prec_a, acc_a, f1_a = get_metric(truth=test, predictions=predict(U,V)[0:fold_size])


    # Second fold: model b
    
    test = b
    known_drug_se = a.nonzero()
    # get the row and col indices of the nonzero values in train, relative to the dataset
    side_fx_index = known_drug_se[1]
    drug_index = known_drug_se[0]
    cde = np.concatenate((c,d,e),axis=0)
    known_drug_se = cde.nonzero()
    side_fx_index = np.concatenate((side_fx_index,known_drug_se[1]), axis=0)
    drug_index = np.concatenate((drug_index,(known_drug_se[0]+(2*fold_size))), axis=0)
    # training
    U,V = fit(drug_index=drug_index, side_fx_index= side_fx_index, dataset=dataset, k=k, lmbda=lmbda)
    aupr_b = aupr(truth=test, predictions=predict(U,V)[fold_size:2*fold_size])
    auroc_b = auroc(truth=test, predictions=predict(U,V)[fold_size:2*fold_size])
    sn_b, sp_b, prec_b, acc_b, f1_b = get_metric(truth=test, predictions=predict(U,V)[fold_size:2*fold_size])


    # Third fold: model c
    test = c
    # get the row and col indices of the nonzero values in train, relative to the dataset
    ab = np.concatenate((a,b),0)
    known_drug_se = ab.nonzero()
    side_fx_index = known_drug_se[1]
    drug_index = known_drug_se[0]
    cde = np.concatenate((d,e),axis=0)
    known_drug_se = cde.nonzero()
    side_fx_index = np.concatenate((side_fx_index,known_drug_se[1]), axis=0)
    drug_index = np.concatenate((drug_index,known_drug_se[0]+(3*fold_size)), axis=0)
    # training
    U,V = fit(drug_index=drug_index, side_fx_index= side_fx_index, dataset=dataset, k=k, lmbda=lmbda)
    aupr_c = aupr(truth=test, predictions=predict(U,V)[2*fold_size:3*fold_size])
    auroc_c = auroc(truth=test, predictions=predict(U,V)[2*fold_size:3*fold_size])
    sn_c, sp_c, prec_c, acc_c, f1_c = get_metric(truth=test, predictions=predict(U,V)[2*fold_size:3*fold_size])

   
    # Fourth fold: model d
    test = d
    # get the row and col indices of the nonzero values in train, relative to the dataset
    ab = np.concatenate((a,b,c),0)
    known_drug_se = ab.nonzero()
    side_fx_index = known_drug_se[1]
    drug_index = known_drug_se[0]
    known_drug_se = e.nonzero()
    side_fx_index = np.concatenate((side_fx_index,known_drug_se[1]), axis=0)
    drug_index = np.concatenate((drug_index,(known_drug_se[0]+(4*fold_size))), axis=0)
    #training
    U,V = fit(drug_index=drug_index, side_fx_index= side_fx_index, dataset=dataset, k=k, lmbda=lmbda)
    aupr_d = aupr(truth=test, predictions=predict(U,V)[3*fold_size:4*fold_size])
    auroc_d = auroc(truth=test, predictions=predict(U,V)[3*fold_size:4*fold_size])
    sn_d, sp_d, prec_d, acc_d, f1_d = get_metric(truth=test, predictions=predict(U,V)[3*fold_size:4*fold_size])


    # Fifth fold: model e
    test = e
    # get the row and col indices of the nonzero values in train, relative to the dataset
    train = np.concatenate((a,b,c,d),0)
    known_drug_se = train.nonzero()
    side_fx_index = known_drug_se[1]
    drug_index = known_drug_se[0]
    U, V = fit(drug_index=drug_index, side_fx_index= side_fx_index, dataset=dataset, k=k, lmbda=lmbda)
    aupr_e = aupr(truth=test, predictions=predict(U,V)[4*fold_size:])
    auroc_e = auroc(truth=test, predictions=predict(U,V)[4*fold_size:])
    sn_e, sp_e, prec_e, acc_e, f1_e = get_metric(truth=test, predictions=predict(U,V)[4*fold_size:])

    # Calculate the average
    aupr_mean = np.mean([aupr_a, aupr_b, aupr_c, aupr_d, aupr_e])
    auroc_mean = np.mean([auroc_a, auroc_b, auroc_c, auroc_d, auroc_e])
    sn_mean = np.mean([sn_a, sn_b, sn_c, sn_d, sn_e])
    sp_mean = np.mean([sp_a, sp_b, sp_c, sp_d, sp_e])
    prec_mean = np.mean([prec_a, prec_b, prec_c, prec_d, prec_e])
    acc_mean = np.mean([acc_a, acc_b, acc_c, acc_d, acc_e])
    f1_mean = np.mean([f1_a, f1_b, f1_c, f1_d, f1_e])
    
    return aupr_mean, auroc_mean, sn_mean, sp_mean, prec_mean, acc_mean, f1_mean
    
    # return aupr_a, auroc_a, sn_a, sp_a, prec_a, acc_a, f1_a

In [20]:
# def cross_val_5(dataset: pd.DataFrame, lmbda: float|int, k: int) -> np.ndarray:
#     """Perform 5 fold CV, given regularization term (lambda) and number of latent features (k).
    
#     Args:
#         dataset (pd.DataFrame): The dataset
#         lmbda (float): The regularization term
#         k (int): Number of latent features
#     Returns:
#         np.ndarray: Average of the metrics (sn, sp, prec, recall, acc, f1, aupr, auc) over the 5 folds.
    
#     """
    
# #     np.random.shuffle(dataset)
#     dataset.sample(frac=1)

#     fold_size = dataset.shape[0]//5
    
#     # Specify the test sets of each fold
#     a = dataset.iloc[0:fold_size]
#     b = dataset.iloc[fold_size:2*fold_size]
#     c = dataset.iloc[2*fold_size:3*fold_size]
#     d = dataset.iloc[3*fold_size:4*fold_size]
#     e = dataset.iloc[4*fold_size:5*fold_size]   
    
#     # model a
#     print("First Fold")
#     model = SGD_Recommender(k=k, lmbda=lmbda)
#     model.fit(dataset.drop(a.index), dataset)
#     pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    
#     aupr_a = aupr(truth=a.values, predictions=pred.iloc[0:fold_size].values)
#     auroc_a = auroc(truth=a.values, predictions=pred.iloc[0:fold_size].values)
#     sn_a, sp_a, prec_a, acc_a, f1_a = get_metric(truth=a.values, predictions=pred.iloc[0:fold_size].values)
    
    
#     # model b
#     print("Second Fold")
#     model = SGD_Recommender(k=k, lmbda=lmbda)
#     model.fit(dataset.drop(b.index), dataset)
#     pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    
#     aupr_b = aupr(truth=b.values, predictions=pred.iloc[fold_size:2*fold_size].values)
#     auroc_b = auroc(truth=b.values, predictions=pred.iloc[fold_size:2*fold_size].values)
#     sn_b, sp_b, prec_b, acc_b, f1_b = get_metric(truth=b.values, predictions=pred.iloc[fold_size:2*fold_size].values)

    
#     # model c
#     print("Third Fold")
#     model = SGD_Recommender(k=k, lmbda=lmbda)
#     model.fit(dataset.drop(c.index), dataset)
#     pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    
#     aupr_c = aupr(truth=c.values, predictions=pred.iloc[2*fold_size:3*fold_size].values)
#     auroc_c = auroc(truth=c.values, predictions=pred.iloc[2*fold_size:3*fold_size].values)
#     sn_c, sp_c, prec_c, acc_c, f1_c = get_metric(truth=c.values, predictions=pred.iloc[2*fold_size:3*fold_size].values)
    
#     # model d
#     print("Fourth Fold")
#     model = SGD_Recommender(k=k, lmbda=lmbda)
#     model.fit(dataset.drop(d.index), dataset)
#     pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    
#     aupr_d = aupr(truth=d.values, predictions=pred.iloc[3*fold_size:4*fold_size].values)
#     auroc_d = auroc(truth=d.values, predictions=pred.iloc[3*fold_size:4*fold_size].values)
#     sn_d, sp_d, prec_d, acc_d, f1_d = get_metric(truth=d.values, predictions=pred.iloc[3*fold_size:4*fold_size].values)
    
#     # model e
#     print("Fifth Fold")
#     model = SGD_Recommender(k=k, lmbda=lmbda)
#     model.fit(dataset.drop(e.index), dataset)
#     pred = pd.DataFrame(model.predict(), index=dataset.index, columns=dataset.columns)
    
#     aupr_e = aupr(truth=e.values, predictions=pred.iloc[4*fold_size:5*fold_size].values)
#     auroc_e = auroc(truth=e.values, predictions=pred.iloc[4*fold_size:5*fold_size].values)
#     sn_e, sp_e, prec_e, acc_e, f1_e = get_metric(truth=e.values, predictions=pred.iloc[4*fold_size:5*fold_size].values)
    
    
#     # Calculate the average
#     aupr_mean = np.mean([aupr_a, aupr_b, aupr_c, aupr_d, aupr_e])
#     auroc_mean = np.mean([auroc_a, auroc_b, auroc_c, auroc_d, auroc_e])
#     sn_mean = np.mean([sn_a, sn_b, sn_c, sn_d, sn_e])
#     sp_mean = np.mean([sp_a, sp_b, sp_c, sp_d, sp_e])
#     prec_mean = np.mean([prec_a, prec_b, prec_c, prec_d, prec_e])
#     acc_mean = np.mean([acc_a, acc_b, acc_c, acc_d, acc_e])
#     f1_mean = np.mean([f1_a, f1_b, f1_c, f1_d, f1_e])
    
#     return aupr_mean, auroc_mean, sn_mean, sp_mean, prec_mean, acc_mean, f1_mean
    

## Training with optimal parameters (Not used)

In [21]:
# def train_optimal_params(dataset: np.ndarray, lmbda: float|int, k: int) -> dict:
#     """Calculate the metrics of the model given k and lambda.
   
#     Args:
#         dataset (np.ndarray): The dataset
#         k (float): k
#         lmbda (float): lambda
#     Returns:
#         np.ndarray: array of resulting metrics.
#     """
#     aupr,auroc,sn,sp,prec,acc,f1 = 0,0,0,0,0,0,0
#     # Do 5 fold CV for each possible combination of lambda and k (product function gives the cartesian product)
#     for run in range(20):
#         t_aupr, t_auroc, t_sn, t_sp, t_prec, t_acc, t_f1 =  cross_val_5_optimal(dataset=dataset, lmbda=lmbda, k=k)
#         aupr, auroc, sn, sp, prec, acc, f1 = aupr+t_aupr, auroc+t_auroc, sn+t_sn, sp+t_sp, prec+t_prec, acc+t_acc, f1+t_f1
    
#     print("dividing by 20")
#     return np.array([aupr, auroc, sn, sp, prec, acc, f1])/20
 
    


## Training with the optimal parameters (currently used)
20 independent runs of 5 fold CV, using optimal paramters.

In [22]:
def train_optimal_params(dataset: np.ndarray, lmbda: float|int, k: int) -> dict:
    """Calculate the metrics of the model given k and lambda.
   
    Args:
        dataset (np.ndarray): The dataset
        k (float): k
        lmbda (float): lambda
    Returns:
        np.ndarray: array of resulting metrics.
    """
    # stores the aupr, auroc, sn, sp, prec, acc, f1
    results = np.zeros(7)
    for _ in range(20):
        res =  cross_val(cv=5, dataset=dataset, lmbda=lmbda, k=k)
        # element wise addition
        results = results + res    
    
    # return the mean over the 2 independent runs
    return results/20

## Training Pauwel's Dataset

Possible k values = 1,5,10,15,20,25,30,35,40,50,100

Possible lambda values = 0.1,1,5,10,15,20

In [23]:
%time param_selection(dataset=pauwel.values.copy(), k_values=np.array([1]), lmda_values=np.array([0.1]))

Trying parameters: ( 1 ,  0.1 )
Performing CV fold: 0
Performing CV fold: 1
Performing CV fold: 2
Performing CV fold: 3
Performing CV fold: 4
[0.06184474 0.63055674 0.29514582 0.91428056 0.15254481 0.88352116
 0.20113427]
CPU times: user 18min 7s, sys: 2.8 s, total: 18min 10s
Wall time: 18min 4s


{(1,
  0.1): array([0.06184474, 0.63055674, 0.29514582, 0.91428056, 0.15254481,
        0.88352116, 0.20113427])}

In [22]:
print("Now training with optimal params.....")
lmda=0.1
k=1
# 20 runs of 5-CV with optimal parameters
%time train_optimal_params(dataset=pauwel.values.copy(), lmbda=lmda, k=k)


Now training with optimal params.....


  f1 = (2.0*prec*recall)/(recall+prec)


dividing by 20
CPU times: user 17min 12s, sys: 2min 59s, total: 20min 11s
Wall time: 14min 23s


array([0.05496916, 0.50412523, 0.55073693, 0.4977201 , 0.05990099,
       0.50045108, 0.10144114])

In [23]:
# Finding optimal parameters using Mizutani's Dataset
%time param_selection(dataset=mizutani.values.copy(), k_values=np.array([1,5,10,15,20,25,30,35,40,50,100]), lmda_values=np.array([0.1,1,5,10,15,20]))

Trying parameters: ( 1 ,  0.1 )
AUPR:  0.06224889497564064
Trying parameters: ( 1 ,  1.0 )
AUPR:  0.03645186577854685
Trying parameters: ( 1 ,  5.0 )
AUPR:  0.03280606033577939
Trying parameters: ( 1 ,  10.0 )
AUPR:  0.03306018644423086
Trying parameters: ( 1 ,  15.0 )
AUPR:  0.032504376300669865
Trying parameters: ( 1 ,  20.0 )
AUPR:  0.032932212431575125
Trying parameters: ( 5 ,  0.1 )
AUPR:  0.0564371150905675
Trying parameters: ( 5 ,  1.0 )
AUPR:  0.03808118178787601
Trying parameters: ( 5 ,  5.0 )
AUPR:  0.03277053595075863
Trying parameters: ( 5 ,  10.0 )
AUPR:  0.03236784080140081
Trying parameters: ( 5 ,  15.0 )
AUPR:  0.032142919327458955
Trying parameters: ( 5 ,  20.0 )
AUPR:  0.03268587290494078
Trying parameters: ( 10 ,  0.1 )
AUPR:  0.060898310400403624
Trying parameters: ( 10 ,  1.0 )
AUPR:  0.03972126113037351
Trying parameters: ( 10 ,  5.0 )
AUPR:  0.03271260040598352
Trying parameters: ( 10 ,  10.0 )
AUPR:  0.03272305039167829
Trying parameters: ( 10 ,  15.0 )
AUPR:  0

{(1, 0.1): 0.06224889497564064,
 (1, 1.0): 0.03645186577854685,
 (1, 5.0): 0.03280606033577939,
 (1, 10.0): 0.03306018644423086,
 (1, 15.0): 0.032504376300669865,
 (1, 20.0): 0.032932212431575125,
 (5, 0.1): 0.0564371150905675,
 (5, 1.0): 0.03808118178787601,
 (5, 5.0): 0.03277053595075863,
 (5, 10.0): 0.03236784080140081,
 (5, 15.0): 0.032142919327458955,
 (5, 20.0): 0.03268587290494078,
 (10, 0.1): 0.060898310400403624,
 (10, 1.0): 0.03972126113037351,
 (10, 5.0): 0.03271260040598352,
 (10, 10.0): 0.03272305039167829,
 (10, 15.0): 0.032979733186549454,
 (10, 20.0): 0.031985267328962355,
 (15, 0.1): 0.05937660070739539,
 (15, 1.0): 0.043081448817618044,
 (15, 5.0): 0.03307239914292412,
 (15, 10.0): 0.03283109614777878,
 (15, 15.0): 0.033021528559364334,
 (15, 20.0): 0.032692192919139565,
 (20, 0.1): 0.05694684998742794,
 (20, 1.0): 0.04226775803392507,
 (20, 5.0): 0.03247992499145989,
 (20, 10.0): 0.03244009097124427,
 (20, 15.0): 0.032636527685880413,
 (20, 20.0): 0.03293269335417152

In [24]:
print("Now training with optimal params.....")
lmda=0.1
k=1
# 20 runs of 5-CV with optimal parameters
%time train_optimal_params(dataset=mizutani.values.copy(), lmbda=lmda, k=k)


Now training with optimal params.....
dividing by 20
CPU times: user 15min 1s, sys: 2min 55s, total: 17min 57s
Wall time: 12min 12s


array([0.06288223, 0.50032762, 0.55613453, 0.49648276, 0.06792611,
       0.49970188, 0.11356298])

In [25]:
%time param_selection(dataset=liu.values.copy(), k_values=np.array([1,5,10,15,20,25,30,35,40,50,100]), lmda_values=np.array([0.1,1,5,10,15,20]))

Trying parameters: ( 1 ,  0.1 )
AUPR:  0.05877837406950227
Trying parameters: ( 1 ,  1.0 )
AUPR:  0.03889767327475126
Trying parameters: ( 1 ,  5.0 )
AUPR:  0.02960439416843994
Trying parameters: ( 1 ,  10.0 )
AUPR:  0.029599090510526366
Trying parameters: ( 1 ,  15.0 )
AUPR:  0.02936143563393645
Trying parameters: ( 1 ,  20.0 )
AUPR:  0.029052640206685642
Trying parameters: ( 5 ,  0.1 )
AUPR:  0.05201806715522176
Trying parameters: ( 5 ,  1.0 )
AUPR:  0.033261331880243716
Trying parameters: ( 5 ,  5.0 )
AUPR:  0.029305674179257106
Trying parameters: ( 5 ,  10.0 )
AUPR:  0.02956662086196128
Trying parameters: ( 5 ,  15.0 )
AUPR:  0.02901916476444617
Trying parameters: ( 5 ,  20.0 )
AUPR:  0.02996375135889895
Trying parameters: ( 10 ,  0.1 )
AUPR:  0.050351145350373674
Trying parameters: ( 10 ,  1.0 )
AUPR:  0.03537850430212669
Trying parameters: ( 10 ,  5.0 )
AUPR:  0.029173890683740634
Trying parameters: ( 10 ,  10.0 )
AUPR:  0.029372869865217043
Trying parameters: ( 10 ,  15.0 )
AUPR

{(1, 0.1): 0.05877837406950227,
 (1, 1.0): 0.03889767327475126,
 (1, 5.0): 0.02960439416843994,
 (1, 10.0): 0.029599090510526366,
 (1, 15.0): 0.02936143563393645,
 (1, 20.0): 0.029052640206685642,
 (5, 0.1): 0.05201806715522176,
 (5, 1.0): 0.033261331880243716,
 (5, 5.0): 0.029305674179257106,
 (5, 10.0): 0.02956662086196128,
 (5, 15.0): 0.02901916476444617,
 (5, 20.0): 0.02996375135889895,
 (10, 0.1): 0.050351145350373674,
 (10, 1.0): 0.03537850430212669,
 (10, 5.0): 0.029173890683740634,
 (10, 10.0): 0.029372869865217043,
 (10, 15.0): 0.02969057323496998,
 (10, 20.0): 0.02930076452054133,
 (15, 0.1): 0.050746038508815816,
 (15, 1.0): 0.03882678977547897,
 (15, 5.0): 0.029223757668999394,
 (15, 10.0): 0.029570739040979822,
 (15, 15.0): 0.029457837831893048,
 (15, 20.0): 0.029237339075591052,
 (20, 0.1): 0.05394841604178578,
 (20, 1.0): 0.03835953159686649,
 (20, 5.0): 0.029206082605668378,
 (20, 10.0): 0.029200098079882875,
 (20, 15.0): 0.02950767655486153,
 (20, 20.0): 0.029157045410

In [26]:
print("Now training with optimal params.....")
lmda=0.1
k=1
# 20 runs of 5-CV with optimal parameters
%time train_optimal_params(dataset=liu.values.copy(), lmbda=lmda, k=k)


Now training with optimal params.....


  f1 = (2.0*prec*recall)/(recall+prec)


dividing by 20
CPU times: user 16min 38s, sys: 2min 56s, total: 19min 35s
Wall time: 13min 49s


array([0.05735136, 0.49895421, 0.61117217, 0.43393913, 0.06015061,
       0.44329375, 0.1037746 ])