# Imports

In [1]:
import numpy as np
import pandas as pd
from itertools import product
from numba import njit
import time
%matplotlib inline
import pickle

# Preparing Data

Relevant SIDER data was downloaded from https://sideeffects.embl.de/, converted to DataFrame and stored in a .pkl file.

In [18]:
drugs = pd.read_pickle('sider-data/drugs.pkl') # drug STITCH code, ATC Code, and drug name
meddra_all_se = pd.read_pickle('sider-data/meddra_all_se.pkl') # Side effect and their drugs information
meddra = pd.read_pickle('sider-data/meddra.pkl') # Side effect information


In [19]:
drugs

Unnamed: 0,ID,ATC_Code,Drug_Name
0,CID100000085,A16AA01,carnitine
1,CID100000119,L03AA03,gamma-aminobutyric
2,CID100000119,N03AG03,gamma-aminobutyric
3,CID100000137,L01XD04,5-aminolevulinic
4,CID100000143,V03AF03,leucovorin
...,...,...,...
1555,CID154681041,J01AA12,tigecycline
1556,CID154687131,J01AA04,lymecycline
1557,CID156603655,S01LA03,pegaptanib
1558,CID170695640,C10AC01,colestyramine


In [21]:
meddra

Unnamed: 0,UML_Concept_ID,Kind_of_Term,MedDRA_ID,SE
0,C0000727,LT,10000647,Acute abdomen
1,C0000727,PT,10000647,Acute abdomen
2,C0000727,LT,10042784,Syndrome abdominal acute
3,C0000727,LT,10000096,Abdominal syndrome acute
4,C0000729,LT,10000057,Abdominal cramps
...,...,...,...,...
95907,C3666015,LT,10074155,Device material degradation
95908,C3666016,LT,10074210,Coarse breath sounds
95909,C3666017,LT,10074226,Cholangiopathy
95910,C3666018,LT,10074267,Spontaneous ejaculation


In [22]:
len(meddra_all_se.Stitch_Compound_ID_1.unique()), len(meddra_all_se.Stitch_Compound_ID_2.unique())

(1430, 1556)

In [23]:
# Combining the Stitch Columns because they are not 1-to-1, and both are used in determining the side effect.

drug_se_filtered = meddra_all_se.copy()
drug_se_filtered['Stitch'] = drug_se_filtered['Stitch_Compound_ID_1'] + ", " + drug_se_filtered['Stitch_Compound_ID_2']
drug_se_filtered

Unnamed: 0,Stitch_Compound_ID_1,Stitch_Compound_ID_2,UML_Concept_ID,MedDRA_Concept_Type,UML_Concept_ID_For_MedDRA,SE_Term,Stitch
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps,"CID100000085, CID000010917"
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain,"CID100000085, CID000010917"
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain,"CID100000085, CID000010917"
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain,"CID100000085, CID000010917"
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain,"CID100000085, CID000010917"
...,...,...,...,...,...,...,...
309844,CID171306834,CID071306834,C3203358,PT,C1145670,Respiratory failure,"CID171306834, CID071306834"
309845,CID171306834,CID071306834,C3665386,LLT,C3665386,Abnormal vision,"CID171306834, CID071306834"
309846,CID171306834,CID071306834,C3665386,PT,C3665347,Visual impairment,"CID171306834, CID071306834"
309847,CID171306834,CID071306834,C3665596,LLT,C3665596,Warts,"CID171306834, CID071306834"


In [24]:
# Considering the MedDra preferred terms, indicated by 'PT' in 'Concept Type' column.
drug_se_filtered = drug_se_filtered[drug_se_filtered['MedDRA_Concept_Type']=='PT']
drug_se_filtered

Unnamed: 0,Stitch_Compound_ID_1,Stitch_Compound_ID_2,UML_Concept_ID,MedDRA_Concept_Type,UML_Concept_ID_For_MedDRA,SE_Term,Stitch
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain,"CID100000085, CID000010917"
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain,"CID100000085, CID000010917"
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain,"CID100000085, CID000010917"
6,CID100000085,CID000010917,C0002418,PT,C0002418,Amblyopia,"CID100000085, CID000010917"
8,CID100000085,CID000010917,C0002871,PT,C0002871,Anaemia,"CID100000085, CID000010917"
...,...,...,...,...,...,...,...
309842,CID171306834,CID071306834,C3203358,PT,C0242184,Hypoxia,"CID171306834, CID071306834"
309843,CID171306834,CID071306834,C3203358,PT,C3203358,Hypoventilation,"CID171306834, CID071306834"
309844,CID171306834,CID071306834,C3203358,PT,C1145670,Respiratory failure,"CID171306834, CID071306834"
309846,CID171306834,CID071306834,C3665386,PT,C3665347,Visual impairment,"CID171306834, CID071306834"


In [25]:
# 1556 unique drugs
len(drug_se_filtered.Stitch.unique())

1556

In [26]:
drug_se_filtered = drug_se_filtered.drop_duplicates()
drug_se_filtered = drug_se_filtered[['Stitch', 'UML_Concept_ID']].drop_duplicates()
drug_se_filtered

Unnamed: 0,Stitch,UML_Concept_ID
1,"CID100000085, CID000010917",C0000729
3,"CID100000085, CID000010917",C0000737
6,"CID100000085, CID000010917",C0002418
8,"CID100000085, CID000010917",C0002871
10,"CID100000085, CID000010917",C0003123
...,...,...
309838,"CID171306834, CID071306834",C2830004
309840,"CID171306834, CID071306834",C2979982
309842,"CID171306834, CID071306834",C3203358
309846,"CID171306834, CID071306834",C3665386


In [27]:
# Getting the list of drugs and side effects that appear more than 5 times
drug_counts = drug_se_filtered['Stitch'].value_counts()
drug_counts_greater_than_5 = drug_counts > 5
drug_list_keep = drug_counts_greater_than_5[lambda x: x==True].index.values
print(drug_list_keep, len(drug_list_keep))

se_counts = drug_se_filtered['UML_Concept_ID'].value_counts()
se_counts_greater_than_5 = se_counts > 5
se_list_keep = se_counts_greater_than_5[lambda x: x==True].index.values
print(se_list_keep, len(se_list_keep))

['CID100060795, CID000060795' 'CID100125889, CID005486971'
 'CID100002771, CID000146570' ... 'CID100047471, CID000047471'
 'CID100065281, CID006714010' 'CID110178705, CID010178705'] 1507
['C0027497' 'C0018681' 'C0042963' ... 'C0151898' 'C0221395' 'C0001364'] 2188


In [28]:
drug_se_filtered = drug_se_filtered[drug_se_filtered.Stitch.isin(drug_list_keep)]
drug_se_filtered = drug_se_filtered[drug_se_filtered.UML_Concept_ID.isin(se_list_keep)]
drug_se_filtered = drug_se_filtered.drop_duplicates()
drug_se_filtered

Unnamed: 0,Stitch,UML_Concept_ID
1,"CID100000085, CID000010917",C0000729
3,"CID100000085, CID000010917",C0000737
6,"CID100000085, CID000010917",C0002418
8,"CID100000085, CID000010917",C0002871
10,"CID100000085, CID000010917",C0003123
...,...,...
309838,"CID171306834, CID071306834",C2830004
309840,"CID171306834, CID071306834",C2979982
309842,"CID171306834, CID071306834",C3203358
309846,"CID171306834, CID071306834",C3665386


In [29]:
len(drug_se_filtered['Stitch'].unique())

1507

In [30]:
len(drug_se_filtered['UML_Concept_ID'].unique())

2188

In [31]:
drug_names_fitered_unique = drug_se_filtered.Stitch.unique()
se_names_fitered_unique = drug_se_filtered.UML_Concept_ID.unique()

In [32]:
se_names_fitered_unique, len(se_names_fitered_unique)

(array(['C0000729', 'C0000737', 'C0002418', ..., 'C0149801', 'C0239940',
        'C0877365'], dtype=object),
 2188)

In [33]:
drug_names_fitered_unique, len(drug_names_fitered_unique)

(array(['CID100000085, CID000010917', 'CID100000137, CID000000137',
        'CID100000143, CID000000143', ..., 'CID170683024, CID070683024',
        'CID170695640, CID070695640', 'CID171306834, CID071306834'],
       dtype=object),
 1507)

In [34]:
# To be able to convert the dataset into a list of tuples.
drug_se_filtered.set_index(['Stitch', 'UML_Concept_ID'], inplace=True)

In [35]:
drug_se_filtered

Stitch,UML_Concept_ID
"CID100000085, CID000010917",C0000729
"CID100000085, CID000010917",C0000737
"CID100000085, CID000010917",C0002418
"CID100000085, CID000010917",C0002871
"CID100000085, CID000010917",C0003123
...,...
"CID171306834, CID071306834",C2830004
"CID171306834, CID071306834",C2979982
"CID171306834, CID071306834",C3203358
"CID171306834, CID071306834",C3665386


In [36]:
drug_se_filtered.index.values, len(drug_se_filtered.index.values)

(array([('CID100000085, CID000010917', 'C0000729'),
        ('CID100000085, CID000010917', 'C0000737'),
        ('CID100000085, CID000010917', 'C0002418'), ...,
        ('CID171306834, CID071306834', 'C3203358'),
        ('CID171306834, CID071306834', 'C3665386'),
        ('CID171306834, CID071306834', 'C3665596')], dtype=object),
 138424)

In [37]:
side_effect_matrix = pd.DataFrame(columns=se_names_fitered_unique, index=drug_names_fitered_unique)

In [38]:
se_associations = drug_se_filtered.index.values

In [39]:
# Each association between a drug and side effect is indicated with 1.
for drug, se in se_associations:
    side_effect_matrix.loc[drug, se] = 1

In [40]:
side_effect_matrix

Unnamed: 0,C0000729,C0000737,C0002418,C0002871,C0003123,C0003467,C0003811,C0004093,C0004238,C0004604,...,C0235329,C0853698,C0860901,C0221245,C0521500,C0037926,C1619692,C0149801,C0239940,C0877365
"CID100000085, CID000010917",1,1,1,1,1,1,1,1,1,1,...,,,,,,,,,,
"CID100000137, CID000000137",,,,1,,,,,,,...,,,,,,,,,,
"CID100000143, CID000000143",,,,,1,,,,,,...,,,,,,,,,,
"CID100000143, CID000006006",,,,,1,,,,,,...,,,,,,,,,,
"CID100000143, CID000149436",,1,,,1,,,1,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"CID156603655, CID056603655",,,,,,,,,,1,...,,,,,,,,,,
"CID156842239, CID056842239",,,,,,,,,,,...,,,,,,,,,,
"CID170683024, CID070683024",,,,,,,,,,,...,,,,,,,,,,
"CID170695640, CID070695640",,1,,1,1,1,,,,1,...,,,,,,,,,,


In [41]:
side_effect_matrix.fillna(0, inplace=True)

In [42]:
side_effect_matrix.isnull().values.any()

False

In [43]:
# Final side effect matrix to be used for Galeano's methodology.
side_effect_matrix

Unnamed: 0,C0000729,C0000737,C0002418,C0002871,C0003123,C0003467,C0003811,C0004093,C0004238,C0004604,...,C0235329,C0853698,C0860901,C0221245,C0521500,C0037926,C1619692,C0149801,C0239940,C0877365
"CID100000085, CID000010917",1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
"CID100000137, CID000000137",0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"CID100000143, CID000000143",0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"CID100000143, CID000006006",0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"CID100000143, CID000149436",0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"CID156603655, CID056603655",0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
"CID156842239, CID056842239",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"CID170683024, CID070683024",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"CID170695640, CID070695640",0,1,0,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [44]:
pauwel = pd.read_csv("datasets/pauwel-dataset.txt", sep='\t', header=0)
pauwel

Unnamed: 0,abdominal cramps,abdominal distention,abdominal pain,malformations,spontaneous abortion,missed abortion,abscess,acanthosis nigricans,acidosis,renal tubular acidosis,...,drug dependence,diverticulosis,prostatic hypertrophy,allergic reaction,dysphonia,eosinophilic pneumonia,retinal vein thrombosis,renal insufficiency,glioblastoma multiforme,portal cirrhosis
carnitine,1,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
GABA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
delta-aminolevulinic acid,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
leucovorin,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PGE2,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pimecrolimus,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
auranofin,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
cefditoren,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
nitroprusside,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
mizutani = pd.read_csv("datasets/mizutani-dataset.txt", sep='\t', header=0)
mizutani # drug names are coded with pubchem ID

Unnamed: 0,abdominal.cramps,abdominal.distention,abdominal.pain,malformations,spontaneous.abortion,missed.abortion,abscess,acanthosis.nigricans,acidosis,renal.tubular.acidosis,...,vitamin.deficiency,drug.dependence,diverticulosis,prostatic.hypertrophy,allergic.reaction,dysphonia,eosinophilic.pneumonia,retinal.vein.thrombosis,renal.insufficiency,glioblastoma.multiforme
85,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
119,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
137,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
143,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
158,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6398525,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6398970,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
6447131,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6918453,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
liu = pd.read_csv("datasets/liu-dataset.csv", sep=',', header=None)
liu

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1375,1376,1377,1378,1379,1380,1381,1382,1383,1384
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
827,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
828,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
829,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
830,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [47]:
pd.isna(pauwel).values.sum()

0

In [48]:
# Check number of missing values
print("The number of missing values in pauwel's dataset:", pd.isna(pauwel).values.sum())

print("The number of missing values in mizutani's dataset:", pd.isna(mizutani).values.sum())

print("The number of missing values in liu's dataset:" , pd.isna(liu).values.sum())

The number of missing values in pauwel's dataset: 0
The number of missing values in mizutani's dataset: 0
The number of missing values in pauwel's dataset: 0


In [49]:
def density(df):
    """Calculate the density of the matrix."""
    
    density = len(np.nonzero(df.to_numpy())[0])
    known_se = density
    size = (df.shape[0]*df.shape[1])
    density /= size
    density *= 100
    return 'density: {0}%\n number of known drug-se associations: {1}\n total drug-se associations: {2}'.format(np.around(density, 2), known_se, size)

print("Pauwel:\n", density(pauwel))
print("Liu:\n", density(liu))
print("Mizutani:\n", density(mizutani))
print("Galeano:\n", density(side_effect_matrix))

# Implementation

## Original (Non-Numba) Implementation

In [53]:
class SGD_Recommender:
    
    def __init__(self, k:int, lmbda:float, max_iter:int=1000, learn_rate=0.005, tolerance=1e-06, seed:int=0):
        """Sets the parameters for SGD."""
        
        self.k=k
        self.lmbda=lmbda
        self.max_iter=max_iter
        self.learn_rate=learn_rate
        self.tolerance=tolerance
        self.seed=seed
            
    def fit(self, train: np.ndarray) -> None:
        """Train the SGD model.
        
        Args:            
            train (np.ndarray): The training set
        Returns: 
            None        
        """     
        np.random.seed(self.seed)
        m, n = train.shape
        
        # Initialize the low rank matrices U and V with values from the normal distribution N(0,0.01)
        mu, sigma = 0, 0.01
        self.U = np.random.normal(loc=mu, scale=sigma, size=(m, self.k))
        self.V = np.random.normal(loc=mu, scale=sigma, size=(n, self.k))
        
        # Get non-zero values in train set
        drug, se = train.nonzero()
        drug_se = list(zip(drug, se))
        
        # Start of training
        for _ in range(self.max_iter):
            np.random.shuffle(drug_se) # Shuffle in place
            U_old = self.U.copy()
            V_old = self.V.copy()
            
            for (drug, se) in drug_se:
                error = train[drug,se] - self.predictions(self.U[drug,:], self.V[se,:])
                temp_u = self.U[drug,:] + self.learn_rate*(error*self.V[se,:] - self.lmbda*self.U[drug,:])
                temp_v = self.V[se,:] + self.learn_rate*(error*self.U[drug,:] - self.lmbda*self.V[se,:])
                self.U[drug,:] = temp_u 
                self.V[se,:] = temp_v
            
            if self.converged(U_old, self.U) and self.converged(V_old, self.V):
                break
    
    def predict(self) -> np.ndarray:
        """Predict the entire drug-side effect matrix values."""
        
        return self.predictions(self.U, self.V)

            
    def predictions(self, U: np.ndarray, V: np.ndarray) -> np.ndarray:
        """Return dot product of the matrices U and V."""
        
        return np.dot(U, V.T)
    
    def converged(self, old: np.ndarray, curr: np.ndarray) -> bool:
        """Check if matrices have reached convergence."""
        
        return np.all(np.abs(np.subtract(old,curr)) <= self.tolerance)
            
    

## Numba Implementation

In [54]:
  
@njit(cache=True)
def predictions(U: np.ndarray, V: np.ndarray) -> np.ndarray:
    """Return dot product of the matrices U and V."""
    return np.dot(U, V.T)

@njit(cache=True)
def converged(old: np.ndarray, curr: np.ndarray, tolerance:float=1e-4) -> bool:
    """Check if matrices have reached convergence."""

    return np.all(np.abs(np.subtract(old,curr)) <= tolerance)
    
@njit(cache=True)
def fit(train: np.ndarray, k: int, lmbda: float, max_iter:int=1000, learn_rate:float=0.005, seed:int=5, tolerance:float=1e-4):
    """Train the SGD model.

    Args:
        train (np.ndarray): The training set
        k (float): Number of latent features
        lmbda (float): Regularization term.
        max_iter (int): Max number of iterations
        learn_rate (float): Learning rate
    Returns: 
        U, V (np.ndarray): The low rank matrix representation of the drug-side effect matrix.        
    """     
    np.random.seed(seed)
    m, n = train.shape
    # Initialize the low rank matrices U and V with values from the normal distribution N(0,0.01)
    mu, sigma = 0, 0.01
    U = np.random.normal(mu, sigma, (m, k))
    V = np.random.normal(mu, sigma, (n, k))
    
    drug, se = train.nonzero()
    drug_se = np.array(list(zip(drug, se)))

    converged_flag = False
    # Start of training
    for epoch in range(max_iter):

        np.random.shuffle(drug_se) # Shuffle in place
        U_old = U.copy()
        V_old = V.copy()

        # Learn from the known  associations in the training set (drug_se)
        for (drug, se) in drug_se:
            error = train[drug,se] - predictions(U[drug,:], V[se,:])
            temp_u = U[drug,:] + learn_rate*(error*V[se,:] - lmbda*U[drug,:])
            temp_v = V[se,:] + learn_rate*(error*U[drug,:] - lmbda*V[se,:])
            U[drug,:] = temp_u 
            V[se,:] = temp_v

        if converged(U_old, U, tolerance) and converged(V_old, V, tolerance):
            print("Convergence reached at epoch", epoch)
            converged_flag=True
            break
    print("Convergence state:", converged_flag)
    return U, V

@njit(cache=True)   
def predict(U:np.ndarray, V:np.ndarray) -> np.ndarray:
        """Predict the entire drug-side effect matrix values."""
        
        return predictions(U, V)

## Performance Metrics

### AUPR

In [55]:
def aupr(truth: np.ndarray, predictions: np.ndarray) -> float:
    """Get the area under the precision-recall curve, using trapezoidal rule.
    
    Args:
        truth: 1-D vector of ground truth values
        predictions: 1-D vector of predictions

    Returns:
        (float): The area.
    """
    
    max_value = predictions.max()
    min_value = predictions.min()
    # Create an array of 99 representing the thresholds
    threshold = min_value + (max_value-min_value)*np.arange(1,100,1)/100
    
    tn = np.zeros((threshold.size, 1))
    tp = np.zeros((threshold.size, 1))
    fn = np.zeros((threshold.size, 1))
    fp = np.zeros((threshold.size, 1))
    
    # Calculate the tp, tn, fp, fn for every threshold.
    for i in range(threshold.size):
        tp[i,0] = np.logical_and(predictions>=threshold[i], truth==1).sum()
        tn[i,0] = np.logical_and(predictions<threshold[i], truth==0).sum()
        fp[i,0] = np.logical_and(predictions>=threshold[i], truth==0).sum()
        fn[i,0] = np.logical_and(predictions<threshold[i], truth==1).sum()
    
    # Calculate the area under the precision-recall curve
    recall = tp/(tp+fn)
    prec = tp/(tp+fp)
    
    x = recall
    y = prec
    
    sorted_index = x.argsort(axis=0)
    y = np.take_along_axis(y, sorted_index, axis=0)
    x = np.take_along_axis(x, sorted_index, axis=0)    

    x[0][0] = 0
    y[0][0] = 1
    x = np.append(x, [[1]], axis=0)
    y = np.append(y, [[0]], axis=0)

    
    # Calculate the area using the trapezoidal rule: (b-a)*0.5*(f(b)+f(a))
    area = 0
    area = 0.5*x[0][0]*(1+y[0][0]) # still 0
    for i in range(threshold.size):
        area += (y[i][0]+ y[i+1][0])*(x[i+1][0]-x[i][0])*0.5
    return area
    

### AUROC

In [70]:

def auroc(truth: np.ndarray, predictions: np.ndarray) -> float:
    """Get the area under the ROC curve.
    
    Args:
        truth: 1-D vector of ground truth values
        predictions: 1-D vector of predictions

    Returns:
        (float): The area.
    """
    
    max_value = predictions.max()
    min_value = predictions.min()
    # Create an array of 99 representing the thresholds
    threshold = min_value + (max_value-min_value)*np.arange(1,100,1)/100
    
    tn = np.zeros((threshold.size, 1))
    tp = np.zeros((threshold.size, 1))
    fn = np.zeros((threshold.size, 1))
    fp = np.zeros((threshold.size, 1))
    
    # Calculate the tp, tn, fp, fn for every threshold.
    for i in range(threshold.size):
        tp[i,0] = np.logical_and(predictions>=threshold[i], truth==1).sum()
        tn[i,0] = np.logical_and(predictions<threshold[i], truth==0).sum()
        fp[i,0] = np.logical_and(predictions>=threshold[i], truth==0).sum()
        fn[i,0] = np.logical_and(predictions<threshold[i], truth==1).sum()
    
    # Calculate the area under the precision-recall curve
    sn = tp/(tp+fn)
    sp = tn/(tn+fp)
    x = 1 - sp
    y = sn

    sorted_index = x.argsort(axis=0)
    y = np.take_along_axis(y, sorted_index, axis=0)
    x = np.take_along_axis(x, sorted_index, axis=0)

    sorted_index = y.argsort(axis=0)
    y = np.take_along_axis(y, sorted_index, axis=0)
    x = np.take_along_axis(x, sorted_index, axis=0)

    x = np.append(x, [[1]], 0)
    y = np.append(y, [[1]], 0)
    
    # Calculate the area using the trapezoidal rule: (b-a)*0.5*(f(b)+f(a))
    area = 0
    area = 0.5*x[0][0]*y[0][0] # still 0
    for i in range(threshold.size):
        area += (y[i][0]+ y[i+1][0])*(x[i+1][0]-x[i][0])*0.5
    
    return area

### Sensitivity, specificity, precision, recall, accuracy, f1-measure

In [71]:
@njit(cache=True)
def classification_metric(truth: np.ndarray, predictions: np.ndarray) -> tuple:
    """Calculate the evaulation metrics given 1-D vector of ground truth and predictions.
    
    Args:
        truth: 1-D vector of ground truth values
        predictions: 1-D vector of predictions

    Returns:
        sensitivity, specificity, precision, accuracy, f1
    """
    
    tp = np.logical_and(predictions==1, truth==1).sum()
    tn = np.logical_and(predictions==0, truth==0).sum()
    fp = np.logical_and(predictions==1, truth==0).sum()
    fn = np.logical_and(predictions==0, truth==1).sum()
    
    try:
        acc = (tp+tn)/(tn+tp+fn+fp)
    except:
        acc = 0
    try:
        sn = tp/(tp+fn)
    except:
        sn=0
    recall = sn
    try:
        sp = tn/(tn+fp)
    except:
        sp = 0
    try:
        prec = tp/(tp+fp)
    except:
        prec = 0
    try:
        f1 = (2.0*prec*recall)/(recall+prec)  
    except:
        f1 = 0
    return sn,sp,prec,acc,f1
    

In [72]:
def get_metric(truth: np.ndarray, predictions: np.ndarray) -> tuple:
    """Calculate the metrics of the drug-side effect matrix.
    Args:
        truth: 1-D vector of ground truth values
        predictions: 1-D vector of predictions

    Returns:
        sensitivity, specificity, precision, accuracy, f1
    """

    max_value = predictions.max()
    min_value = predictions.min()
    # Create an 1-D array of 999 threshold values in ascending order
    threshold = min_value + (max_value-min_value)*np.arange(1,1000,1)/1000
    temp_sn = np.zeros(threshold.size)
    temp_sp = np.zeros(threshold.size)
    temp_prec = np.zeros(threshold.size)
    temp_acc = np.zeros(threshold.size)
    temp_f1 = np.zeros(threshold.size)

    for i in range(threshold.size):
        # assign values above threshold to 1
        predict_label = predictions>threshold[i]
        # calculate the metrics for the predictions under threshold i
        temp_sn[i],temp_sp[i],temp_prec[i],temp_acc[i],temp_f1[i] = classification_metric(truth, predict_label)
    
    # Get index corresponding to max f1 score (optimal value of prec and recall)
    try:
        indx_max_f1 = np.nanargmax(temp_f1)
        sn = temp_sn[indx_max_f1]
        sp = temp_sp[indx_max_f1]
        prec = temp_prec[indx_max_f1]
        acc = temp_acc[indx_max_f1]
        f1 = temp_f1[indx_max_f1]
    except:
        print("Exception: F1 vector is all 0.0")
        sn, sp, prec, acc, f1  = 1, 1, 1, 1, 1
        
    return sn, sp, prec, acc, f1    

## Cross-validation

In [73]:
def cross_val(cv:int, dataset:np.ndarray, lmda:float, k:int, 
              learn_rate:float=0.005, tolerance:float=1e-4, seed:int=0) -> np.ndarray:
    interaction_matrix = dataset
    row, col = dataset.shape
    np.random.seed(seed)
    cv_matrix = np.ceil(np.random.rand(row,col)*cv)
    predict_score_matrix = np.zeros((row,col))

    for fold in range(cv):
        test_index_matrix = (cv_matrix==fold)
        train_index_matrix = np.logical_not(test_index_matrix)
        train_interaction_matrix = np.multiply(interaction_matrix,train_index_matrix)        
        
        U, V = fit(train=train_interaction_matrix, k=k, lmbda=lmda, learn_rate=learn_rate, tolerance=tolerance, seed=seed)
        predict_matrix = predict(U, V)

        predict_score_matrix = predict_score_matrix + np.multiply(predict_matrix, test_index_matrix)
    
    auc = auroc(truth=interaction_matrix.flatten(), predictions=predict_score_matrix.flatten())
    auprc = aupr(truth=interaction_matrix.flatten(), predictions=predict_score_matrix.flatten())
    sn, sp, prec, acc, f1 = get_metric(truth=interaction_matrix.flatten(), predictions=predict_score_matrix.flatten())

    return np.array([auprc, auc, sn, sp, prec, acc, f1])


## Functions to run training

Optimal parameters are those which yield highest AUPR

In [74]:
def param_selection(dataset: np.ndarray, k_values: np.ndarray, lmda_values: np.ndarray, 
                    learn_rates: np.ndarray=np.empty(0), tolerance_values:np.ndarray=np.empty(0), cv=5) -> dict:
    """Calculate the metrics of the model, for each pair of given k and lambda parameter values.
    
    Args:
        dataset (np.ndarray): The dataset
        k_values (np.ndarray): List of k values
        lmda_values (np.ndarray): List of lambda values
        learn_rates (np.ndarray): List of learn rates
        tolerance_values (np.ndarray): List of tolerance values
    Returns:
        dict: Dictionary of parameter combinations and their associated score.
    """
    results = {}
    if len(learn_rates) == 0:
        # Do 5 fold CV for each possible combination of lambda and k (product function gives the cartesian product)
        for (k,lmbda) in product(k_values, lmda_values):
            print("Trying parameters: (" , k, ", ", lmbda, ")")
            res = cross_val(cv=cv, dataset=dataset, k=k, lmda=lmbda)
            print(res)
            results[(k,lmbda)] = res
    else:
        # Do 5 fold CV for each possible combination of lambda and k and learn rate
        for (k,lmbda,learn_rate,tolerance_rate) in product(k_values, lmda_values, learn_rates, tolerance_values):
            print("Trying parameters: k={0},lambda={1},learn_rate={2},tolerance={3}".format(k,lmbda,learn_rate,tolerance_rate))
            res = cross_val(cv=cv, dataset=dataset, k=k, lmda=lmbda, learn_rate = learn_rate, tolerance=tolerance_rate)
            print(res)
            results[(k, lmbda, learn_rate, tolerance_rate)] = res
    
    return results
        
    

20 independent runs of 5 fold CV, using optimal paramters.

In [75]:
def train_optimal_params(dataset: np.ndarray, lmbda: float, k: int, learn_rate: float, tolerance: float) -> dict:
    """20 independent runs of 5 fold CV.
   
    Args:
        dataset (np.ndarray): The dataset
        k (float): k
        lmbda (float): Lambda
        learn_rate (float): Learning rate
        tolerance (float): Tolerance value
    Returns:
        np.ndarray: array of resulting metrics.
    """
    # stores the aupr, auroc, sn, sp, prec, acc, f1
    results = np.zeros(7)
    for i in range(20):
        # ensures independence through the different seed value in every run
        res =  cross_val(cv=5, dataset=dataset, lmda=lmbda, k=k, learn_rate=learn_rate, tolerance=tolerance, seed=i)
        # element wise addition
        results = results + res    
    
    # return the mean over the 20 independent runs
    return results/20.0

# Training parameters


In [45]:
k_values = np.array([1,3,5,10,15,20,25,30,35,40,50,100])
lambda_values = np.array([0.01,0.1,0.5,1.,5.,10.,15.,20.])
learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
tolerance_values = np.array([1e-4,1e-3,1e-5])


In [46]:
k_values, lambda_values, learning_rates, tolerance_values

(array([  1,   3,   5,  10,  15,  20,  25,  30,  35,  40,  50, 100]),
 array([1.0e-02, 1.0e-01, 5.0e-01, 1.0e+00, 5.0e+00, 1.0e+01, 1.5e+01,
        2.0e+01]),
 array([1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05]),
 array([1.e-04, 1.e-03, 1.e-05]))

## Galeano's Methodology

### Splitting the data 90/10 train test

In [76]:
# Splitting data 90% train and 10% test
interaction_matrix = side_effect_matrix.values.copy()
row,col = interaction_matrix.shape
np.random.seed(0)
train_test_matrix = np.random.choice([0,1], size=(row,col), p=[0.1, 0.9])
test_index_matrix = (train_test_matrix==0)
train_index_matrix = np.logical_not(test_index_matrix)

# Used for training, and finding optimal parameters
train_interaction_matrix = np.multiply(interaction_matrix, train_index_matrix) 
# Used for testing
test_interaction_matrix = np.multiply(interaction_matrix, test_index_matrix) 

In [77]:
test_interaction_matrix.sum()

13729

### Selecting Optimal Parameters using Grid Search
Parameter search was divided to reduce strain on compute resources, and results were saved as .pkl files in the /results folder. The results are later compounded into one file in the 'Analyzing Results' section below.

In [50]:
# %%time
# k_values = np.array([1,3,5])
# lambda_values = np.array([0.01,0.1,0.5,1.,5.,10.,15.,20.])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
# tolerance_values = np.array([1e-4,1e-3])
# results = param_selection(cv=10, dataset=train_interaction_matrix.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates, tolerance_values=tolerance_values)

# with open('results-latest/results_retry_again.pkl', 'wb') as f:
#     pickle.dump(results, f)

# %%time
# k_values = np.array([10,15,20,25])
# lambda_values = np.array([0.01,0.1,0.5,1.,5.,10.,15.,20.])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
# tolerance_values = np.array([1e-4])
# results_2 = param_selection(cv=10, dataset=train_interaction_matrix.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates, tolerance_values=tolerance_values)

# with open('results-latest/results_2_retry_again.pkl', 'wb') as f:
#     pickle.dump(results_2, f)


# %%time
# k_values = np.array([30,35,40,50,100])
# lambda_values = np.array([0.01,0.1,0.5,1.,5.,10.,15.,20.])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
# tolerance_values = np.array([1e-4])
# results_3 = param_selection(cv=10, dataset=train_interaction_matrix.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates, tolerance_values=tolerance_values)

# with open('results-latest/results_3_retry.pkl', 'wb') as f:
#     pickle.dump(results_3, f)


# %%time
# k_values = np.array([10,15,20,25])
# lambda_values = np.array([0.01,0.1,0.5,1.,5.,10.,15.,20.])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
# tolerance_values = np.array([1e-3])
# results_4 = param_selection(cv=10, dataset=train_interaction_matrix.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates, tolerance_values=tolerance_values)

# with open('results-latest/results_4_retry.pkl', 'wb') as f:
#     pickle.dump(results_4, f)


# %%time
# k_values = np.array([30,35,40,50,100])
# lambda_values = np.array([0.01,0.1,0.5,1.,5.,10.,15.,20.])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
# tolerance_values = np.array([1e-3])
# results_5 = param_selection(cv=10, dataset=train_interaction_matrix.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates, tolerance_values=tolerance_values)

# with open('results-latest/results_5_retry.pkl', 'wb') as f:
#     pickle.dump(results_5, f)

# %%time
# k_values = np.array([1,3,5,10,15,20,25])
# lambda_values = np.array([15.,20.])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
# tolerance_values = np.array([1e-5])
# results_6 = param_selection(cv=10, dataset=train_interaction_matrix.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates, tolerance_values=tolerance_values)

# with open('results-latest/results_6_retry_1.pkl', 'wb') as f:
#     pickle.dump(results_6, f)

# %%time
# k_values = np.array([1,3,5,10,15,20,25])
# lambda_values = np.array([5.,10.])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
# tolerance_values = np.array([1e-5])
# results_6 = param_selection(cv=10, dataset=train_interaction_matrix.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates, tolerance_values=tolerance_values)

# with open('results-latest/results_6_retry_2.pkl', 'wb') as f:
#     pickle.dump(results_6, f)

# %%time
# k_values = np.array([1,3,5,10,15,20,25])
# lambda_values = np.array([0.5,1.])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
# tolerance_values = np.array([1e-5])
# results_6 = param_selection(cv=10, dataset=train_interaction_matrix.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates, tolerance_values=tolerance_values)

# with open('results-latest/results_6_retry_3.pkl', 'wb') as f:
#     pickle.dump(results_6, f)

# %%time
# k_values = np.array([1,3,5,10,15,20,25])
# lambda_values = np.array([0.01,0.1])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
# tolerance_values = np.array([1e-5])
# results_6 = param_selection(cv=10, dataset=train_interaction_matrix.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates, tolerance_values=tolerance_values)

# with open('results-latest/results_6_retry_4.pkl', 'wb') as f:
#     pickle.dump(results_6, f)

# %%time
# k_values = np.array([30,35,40,50,100])
# lambda_values = np.array([0.01,0.1,0.5,1.,5.,10.,15.,20.])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
# tolerance_values = np.array([1e-5])
# results_7 = param_selection(cv=10, dataset=train_interaction_matrix.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates, tolerance_values=tolerance_values)

# with open('results-latest/results_7_retry.pkl', 'wb') as f:
#     pickle.dump(results_7, f)

### Analyzing Results of grid search

#### Loading results and aggregating them into one DataFrame

In [51]:


results_1 = pd.read_pickle('results-latest/results_retry_again.pkl')
results_2 = pd.read_pickle('results-latest/results_2_retry_again.pkl')
results_3 = pd.read_pickle('results-latest/results_3_retry.pkl')
results_4 = pd.read_pickle('results-latest/results_4_retry.pkl')
results_5 = pd.read_pickle('results-latest/results_5_retry.pkl')
results_6_1 = pd.read_pickle('results-latest/results_6_retry_1.pkl')
results_6_2 = pd.read_pickle('results-latest/results_6_retry_2.pkl')
results_6_3 = pd.read_pickle('results-latest/results_6_retry_3.pkl')
results_6_4 = pd.read_pickle('results-latest/results_6_retry_4.pkl')
results_7 = pd.read_pickle('results-latest/results_7_retry.pkl')

In [52]:
galeano_cross_val_res_1 = pd.DataFrame.from_dict(results_1)
galeano_cross_val_res_1.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']), inplace=True)
galeano_cross_val_res_1.replace(1.0,0.0,inplace=True)
galeano_cross_val_res_1 = galeano_cross_val_res_1.T
galeano_cross_val_res_1.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
galeano_cross_val_res_1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.01,0.10000,0.0001,0.463702,0.494749,0.999976,0.000044,0.037818,0.037859,0.072880
1,0.01,0.10000,0.0010,0.463702,0.494755,0.882241,0.141302,0.038814,0.169322,0.074356
1,0.01,0.01000,0.0001,0.463702,0.494755,0.871695,0.215585,0.041849,0.240398,0.079864
1,0.01,0.01000,0.0010,0.462535,0.476780,0.799318,0.538000,0.063670,0.547882,0.117946
1,0.01,0.00100,0.0001,0.031156,0.584665,0.723862,0.680280,0.081714,0.681928,0.146850
...,...,...,...,...,...,...,...,...,...,...
5,20.00,0.00100,0.0010,0.019525,0.500208,0.999310,0.029459,0.038895,0.066136,0.074875
5,20.00,0.00010,0.0001,0.019498,0.495389,0.998982,0.044055,0.039452,0.080168,0.075907
5,20.00,0.00010,0.0010,0.026926,0.529047,0.922571,0.344021,0.052381,0.365900,0.099134
5,20.00,0.00001,0.0001,0.028541,0.546219,0.893364,0.373537,0.053074,0.393195,0.100195


In [53]:
galeano_cross_val_res_2 = pd.DataFrame.from_dict(results_2)
galeano_cross_val_res_2.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']), inplace=True)
galeano_cross_val_res_2.replace(1.0,0.0,inplace=True)
galeano_cross_val_res_2 = galeano_cross_val_res_2.T
galeano_cross_val_res_2.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
galeano_cross_val_res_2


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10,0.01,0.10000,0.0001,0.463702,0.494749,0.999976,0.000044,0.037818,0.037859,0.072880
10,0.01,0.01000,0.0001,0.463687,0.494973,0.856017,0.335870,0.048217,0.355541,0.091291
10,0.01,0.00100,0.0001,0.031652,0.590804,0.722699,0.693908,0.084917,0.694997,0.151977
10,0.01,0.00010,0.0001,0.037693,0.499856,0.971803,0.029614,0.037870,0.065245,0.072900
10,0.01,0.00001,0.0001,0.037701,0.499858,0.971707,0.029642,0.037868,0.065268,0.072895
...,...,...,...,...,...,...,...,...,...,...
25,20.00,0.10000,0.0001,,,0.000000,0.000000,0.000000,0.962183,0.000000
25,20.00,0.01000,0.0001,0.018909,0.500000,0.000000,0.000043,0.037819,0.037859,0.072881
25,20.00,0.00100,0.0001,0.018910,0.500042,0.999952,0.001479,0.037869,0.039238,0.072975
25,20.00,0.00010,0.0001,0.019534,0.480124,0.999302,0.028290,0.038849,0.065010,0.074791


In [54]:
galeano_cross_val_res_3 = pd.DataFrame.from_dict(results_3)
galeano_cross_val_res_3.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']), inplace=True)
galeano_cross_val_res_3.replace(1.0,0.0,inplace=True)
galeano_cross_val_res_3 = galeano_cross_val_res_3.T
galeano_cross_val_res_3.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
galeano_cross_val_res_3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
30,0.01,0.10000,0.0001,0.463703,0.494748,0.889972,0.100025,0.037413,0.129899,0.071806
30,0.01,0.01000,0.0001,0.463658,0.495376,0.553222,0.819672,0.107603,0.809596,0.180164
30,0.01,0.00100,0.0001,0.033459,0.608673,0.706620,0.733143,0.094263,0.732140,0.166336
30,0.01,0.00010,0.0001,0.050683,0.509467,0.081832,0.985855,0.185258,0.951668,0.113520
30,0.01,0.00001,0.0001,0.037724,0.501256,0.993897,0.006595,0.037835,0.043932,0.072895
...,...,...,...,...,...,...,...,...,...,...
100,20.00,0.10000,0.0001,,,0.000000,0.000000,0.000000,0.962183,0.000000
100,20.00,0.01000,0.0001,0.018909,0.500000,0.000000,0.000021,0.037818,0.037837,0.072880
100,20.00,0.00100,0.0001,0.018914,0.500153,0.999912,0.001795,0.037879,0.039541,0.072993
100,20.00,0.00010,0.0001,0.019344,0.499750,0.998845,0.042327,0.039379,0.078499,0.075771


In [55]:
galeano_cross_val_res_4 = pd.DataFrame.from_dict(results_4)
galeano_cross_val_res_4.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']), inplace=True)
galeano_cross_val_res_4.replace(1.0,0.0,inplace=True)
galeano_cross_val_res_4 = galeano_cross_val_res_4.T
galeano_cross_val_res_4.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
galeano_cross_val_res_4

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10,0.01,0.10000,0.001,0.463702,0.494842,0.749749,0.656635,0.079037,0.660156,0.143000
10,0.01,0.01000,0.001,0.440390,0.635060,0.536686,0.766185,0.082750,0.757506,0.143391
10,0.01,0.00100,0.001,0.037662,0.499949,0.972942,0.028712,0.037879,0.064420,0.072919
10,0.01,0.00010,0.001,0.037697,0.499860,0.971731,0.029623,0.037868,0.065251,0.072895
10,0.01,0.00001,0.001,0.037701,0.499858,0.971707,0.029642,0.037868,0.065268,0.072895
...,...,...,...,...,...,...,...,...,...,...
25,20.00,0.10000,0.001,,,0.000000,0.000000,0.000000,0.962183,0.000000
25,20.00,0.01000,0.001,0.018911,0.500062,0.999984,0.000258,0.037826,0.038065,0.072895
25,20.00,0.00100,0.001,0.019340,0.495855,0.999415,0.025059,0.038730,0.061906,0.074570
25,20.00,0.00010,0.001,0.026177,0.524891,0.942780,0.312929,0.051171,0.336748,0.097074


In [56]:
galeano_cross_val_res_5 = pd.DataFrame.from_dict(results_5)
galeano_cross_val_res_5.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']), inplace=True)
galeano_cross_val_res_5.replace(1.0,0.0,inplace=True)
galeano_cross_val_res_5 = galeano_cross_val_res_5.T
galeano_cross_val_res_5.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
galeano_cross_val_res_5

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
30,0.01,0.10000,0.001,0.463680,0.495071,0.815758,0.544822,0.065804,0.555068,0.121783
30,0.01,0.01000,0.001,0.032192,0.577789,0.730286,0.706472,0.089075,0.707373,0.158783
30,0.01,0.00100,0.001,0.035389,0.509084,0.083187,0.981139,0.147736,0.947181,0.106440
30,0.01,0.00010,0.001,0.037718,0.501242,0.993881,0.006597,0.037835,0.043934,0.072894
30,0.01,0.00001,0.001,0.037724,0.501256,0.993897,0.006595,0.037835,0.043932,0.072895
...,...,...,...,...,...,...,...,...,...,...
100,20.00,0.10000,0.001,,,0.000000,0.000000,0.000000,0.962183,0.000000
100,20.00,0.01000,0.001,0.018909,0.500000,0.999992,0.000261,0.037826,0.038068,0.072895
100,20.00,0.00100,0.001,0.019323,0.499792,0.998861,0.038232,0.039218,0.074560,0.075474
100,20.00,0.00010,0.001,0.024887,0.461777,0.902274,0.371590,0.053418,0.391659,0.100864


In [57]:
galeano_cross_val_res_6_1 = pd.DataFrame.from_dict(results_6_1)
galeano_cross_val_res_6_1.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']), inplace=True)
galeano_cross_val_res_6_1.replace(1.0,0.0,inplace=True)
galeano_cross_val_res_6_1 = galeano_cross_val_res_6_1.T
galeano_cross_val_res_6_1.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
galeano_cross_val_res_6_1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,15.0,0.10000,0.00001,0.018909,0.499994,0.000000,0.000020,0.037818,0.037836,0.072880
1,15.0,0.01000,0.00001,0.018909,0.500000,0.000000,0.000040,0.037819,0.037855,0.072881
1,15.0,0.00100,0.00001,0.020055,0.499871,0.000000,0.000346,0.037830,0.038150,0.072902
1,15.0,0.00010,0.00001,0.019501,0.499840,0.999960,0.003179,0.037932,0.040874,0.073091
1,15.0,0.00001,0.00001,0.019878,0.504334,0.997618,0.067043,0.040332,0.102235,0.077530
...,...,...,...,...,...,...,...,...,...,...
25,20.0,0.10000,0.00001,,,0.000000,0.000000,0.000000,0.962183,0.000000
25,20.0,0.01000,0.00001,0.018909,0.500000,0.000000,0.000002,0.037817,0.037819,0.072878
25,20.0,0.00100,0.00001,0.018909,0.500000,0.000000,0.000036,0.037818,0.037852,0.072881
25,20.0,0.00010,0.00001,0.018926,0.500478,0.999976,0.000980,0.037852,0.038759,0.072943


In [58]:
galeano_cross_val_res_6_2 = pd.DataFrame.from_dict(results_6_2)
galeano_cross_val_res_6_2.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']), inplace=True)
galeano_cross_val_res_6_2.replace(1.0,0.0,inplace=True)
galeano_cross_val_res_6_2 = galeano_cross_val_res_6_2.T
galeano_cross_val_res_6_2.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
galeano_cross_val_res_6_2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,5.0,0.10000,0.00001,0.018909,0.500000,0.000000,1.386866e-05,0.037818,0.037830,0.072879
1,5.0,0.01000,0.00001,0.022679,0.499917,0.999984,1.468817e-04,0.037822,0.037958,0.072887
1,5.0,0.00100,0.00001,0.019884,0.500007,0.999968,1.072615e-03,0.037855,0.038848,0.072948
1,5.0,0.00010,0.00001,0.019280,0.498831,0.999816,1.134425e-02,0.038228,0.048725,0.073640
1,5.0,0.00001,0.00001,0.021556,0.482739,0.980617,1.962541e-01,0.045758,0.225916,0.087437
...,...,...,...,...,...,...,...,...,...,...
25,10.0,0.10000,0.00001,0.518909,0.500000,0.000000,3.151968e-07,0.037817,0.037817,0.072878
25,10.0,0.01000,0.00001,0.018909,0.500000,0.999992,4.160598e-05,0.037818,0.037857,0.072880
25,10.0,0.00100,0.00001,0.018909,0.500000,0.999984,1.957372e-04,0.037824,0.038005,0.072890
25,10.0,0.00010,0.00001,0.019456,0.501027,0.999960,2.398648e-03,0.037903,0.040124,0.073038


In [59]:
galeano_cross_val_res_6_3 = pd.DataFrame.from_dict(results_6_3)
galeano_cross_val_res_6_3.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']), inplace=True)
galeano_cross_val_res_6_3.replace(1.0,0.0,inplace=True)
galeano_cross_val_res_6_3 = galeano_cross_val_res_6_3.T
galeano_cross_val_res_6_3.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
galeano_cross_val_res_6_3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.5,0.10000,0.00001,0.465781,0.494751,0.999976,0.000044,0.037818,0.037859,0.072880
1,0.5,0.01000,0.00001,0.465781,0.494751,0.999976,0.000044,0.037818,0.037859,0.072880
1,0.5,0.00100,0.00001,0.462154,0.532268,0.853803,0.352921,0.049303,0.371863,0.093223
1,0.5,0.00010,0.00001,0.151500,0.782935,0.439023,0.940408,0.224538,0.921447,0.297116
1,0.5,0.00001,0.00001,0.032230,0.495078,0.820739,0.310989,0.044724,0.330266,0.084825
...,...,...,...,...,...,...,...,...,...,...
25,1.0,0.10000,0.00001,0.465781,0.494774,0.999976,0.000044,0.037818,0.037859,0.072880
25,1.0,0.01000,0.00001,0.025318,0.494134,0.885208,0.123150,0.038164,0.151969,0.073173
25,1.0,0.00100,0.00001,0.023816,0.486736,0.876154,0.185379,0.040558,0.211502,0.077527
25,1.0,0.00010,0.00001,0.022156,0.507215,0.939677,0.291296,0.049532,0.315816,0.094103


In [60]:
galeano_cross_val_res_6_4 = pd.DataFrame.from_dict(results_6_4)
galeano_cross_val_res_6_4.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']), inplace=True)
galeano_cross_val_res_6_4.replace(1.0,0.0,inplace=True)
galeano_cross_val_res_6_4 = galeano_cross_val_res_6_4.T
galeano_cross_val_res_6_4.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
galeano_cross_val_res_6_4

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.01,0.10000,0.00001,0.463702,0.494749,0.999976,0.000044,0.037818,0.037859,0.072880
1,0.01,0.01000,0.00001,0.463702,0.494749,0.999976,0.000044,0.037818,0.037859,0.072880
1,0.01,0.00100,0.00001,0.031156,0.584665,0.723862,0.680280,0.081714,0.681928,0.146850
1,0.01,0.00010,0.00001,0.264120,0.790450,0.421813,0.963391,0.311699,0.942910,0.358491
1,0.01,0.00001,0.00001,0.037715,0.498200,0.679354,0.336194,0.038669,0.349171,0.073172
...,...,...,...,...,...,...,...,...,...,...
25,0.10,0.10000,0.00001,0.463703,0.494748,0.890012,0.099487,0.037393,0.129383,0.071770
25,0.10,0.01000,0.00001,0.463703,0.494748,0.890012,0.099487,0.037393,0.129383,0.071770
25,0.10,0.00100,0.00001,0.452461,0.579969,0.845615,0.408155,0.053170,0.424699,0.100049
25,0.10,0.00010,0.00001,0.256286,0.788342,0.417916,0.962784,0.306208,0.942178,0.353445


In [61]:
galeano_cross_val_res_7 = pd.DataFrame.from_dict(results_7)
galeano_cross_val_res_7.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']), inplace=True)
galeano_cross_val_res_7.replace(1.0,0.0,inplace=True)
galeano_cross_val_res_7 = galeano_cross_val_res_7.T
galeano_cross_val_res_7.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
galeano_cross_val_res_7

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
30,0.01,0.10000,0.00001,0.463703,0.494748,0.890012,0.099487,0.037393,0.129383,0.071770
30,0.01,0.01000,0.00001,0.463703,0.494748,0.889907,0.101625,0.037474,0.131435,0.071920
30,0.01,0.00100,0.00001,0.033295,0.602833,0.707703,0.728034,0.092785,0.727265,0.164060
30,0.01,0.00010,0.00001,0.234102,0.787691,0.429704,0.957508,0.284418,0.937548,0.342282
30,0.01,0.00001,0.00001,0.038028,0.503530,0.965243,0.036694,0.037890,0.071809,0.072918
...,...,...,...,...,...,...,...,...,...,...
100,20.00,0.10000,0.00001,,,0.000000,0.000000,0.000000,0.962183,0.000000
100,20.00,0.01000,0.00001,0.018909,0.500000,0.000000,0.000013,0.037818,0.037830,0.072879
100,20.00,0.00100,0.00001,0.018909,0.500000,0.999984,0.000188,0.037823,0.037997,0.072890
100,20.00,0.00010,0.00001,0.018909,0.500000,0.999992,0.000778,0.037845,0.038566,0.072930


##### Aggregating the results of running Galeano's methodology

In [62]:
# Combining all results
galeano_cross_val_res = pd.concat([galeano_cross_val_res_1, galeano_cross_val_res_2, galeano_cross_val_res_3, galeano_cross_val_res_4, galeano_cross_val_res_5, galeano_cross_val_res_6_1, galeano_cross_val_res_6_2, galeano_cross_val_res_6_3, galeano_cross_val_res_6_4, galeano_cross_val_res_7], axis=0)
# Save combined results
galeano_cross_val_res.fillna(0, inplace=True)

galeano_cross_val_res.to_csv('results-latest/galeano_cross_val_results-combined.csv', index=True, header=True)
galeano_cross_val_res


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.01,0.10000,0.00010,0.463702,0.494749,0.999976,0.000044,0.037818,0.037859,0.072880
1,0.01,0.10000,0.00100,0.463702,0.494755,0.882241,0.141302,0.038814,0.169322,0.074356
1,0.01,0.01000,0.00010,0.463702,0.494755,0.871695,0.215585,0.041849,0.240398,0.079864
1,0.01,0.01000,0.00100,0.462535,0.476780,0.799318,0.538000,0.063670,0.547882,0.117946
1,0.01,0.00100,0.00010,0.031156,0.584665,0.723862,0.680280,0.081714,0.681928,0.146850
...,...,...,...,...,...,...,...,...,...,...
100,20.00,0.10000,0.00001,0.000000,0.000000,0.000000,0.000000,0.000000,0.962183,0.000000
100,20.00,0.01000,0.00001,0.018909,0.500000,0.000000,0.000013,0.037818,0.037830,0.072879
100,20.00,0.00100,0.00001,0.018909,0.500000,0.999984,0.000188,0.037823,0.037997,0.072890
100,20.00,0.00010,0.00001,0.018909,0.500000,0.999992,0.000778,0.037845,0.038566,0.072930


#### Getting the best models per metric

In [63]:
galeano_cross_val_best_models = pd.DataFrame()
galeano_cross_val_best_models['max'] = galeano_cross_val_res.max()
galeano_cross_val_best_models['index'] = galeano_cross_val_res.idxmax()
galeano_cross_val_best_models[['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = np.random.randint(10, size=(7,7))
galeano_cross_val_best_models.loc['auprc', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = galeano_cross_val_res.loc[galeano_cross_val_best_models.loc['auprc','index'], :].values
galeano_cross_val_best_models.loc['auroc', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = galeano_cross_val_res.loc[galeano_cross_val_best_models.loc['auroc','index'], :].values
galeano_cross_val_best_models.loc['sn', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = galeano_cross_val_res.loc[galeano_cross_val_best_models.loc['sn','index'], :].values
galeano_cross_val_best_models.loc['sp', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = galeano_cross_val_res.loc[galeano_cross_val_best_models.loc['sp','index'], :].values
galeano_cross_val_best_models.loc['prec', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = galeano_cross_val_res.loc[galeano_cross_val_best_models.loc['prec','index'], :].values
galeano_cross_val_best_models.loc['acc', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = galeano_cross_val_res.loc[galeano_cross_val_best_models.loc['acc','index'], :].values
galeano_cross_val_best_models.loc['f1', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = galeano_cross_val_res.loc[galeano_cross_val_best_models.loc['f1','index'], :].values

galeano_cross_val_best_models

Unnamed: 0,max,index,aupr_val,auroc_val,sn_val,sp_val,prec_val,acc_val,f1_val
auprc,0.518909,"(50, 15.0, 0.1, 1e-05)",0.518909,0.500001,0.0,3e-06,0.037817,0.03782,0.072878
auroc,0.790644,"(1, 0.1, 0.0001, 0.0001)",0.283463,0.790644,0.382052,0.97154,0.345385,0.949247,0.362795
sn,0.999992,"(1, 5.0, 0.1, 0.001)",0.025951,0.499994,0.999992,2.7e-05,0.037818,0.037843,0.072879
sp,1.0,"(1, 10.0, 0.1, 0.0001)",0.0,0.5,0.0,1.0,0.0,0.962183,0.0
prec,0.345385,"(1, 0.1, 0.0001, 0.0001)",0.283463,0.790644,0.382052,0.97154,0.345385,0.949247,0.362795
acc,0.962183,"(1, 20.0, 0.1, 0.0001)",0.0,0.0,0.0,0.0,0.0,0.962183,0.0
f1,0.362795,"(1, 0.1, 0.0001, 0.0001)",0.283463,0.790644,0.382052,0.97154,0.345385,0.949247,0.362795


In [64]:
galeano_cross_val_best_models.sort_values(by=['aupr_val'], ascending=False)

Unnamed: 0,max,index,aupr_val,auroc_val,sn_val,sp_val,prec_val,acc_val,f1_val
auprc,0.518909,"(50, 15.0, 0.1, 1e-05)",0.518909,0.500001,0.0,3e-06,0.037817,0.03782,0.072878
auroc,0.790644,"(1, 0.1, 0.0001, 0.0001)",0.283463,0.790644,0.382052,0.97154,0.345385,0.949247,0.362795
prec,0.345385,"(1, 0.1, 0.0001, 0.0001)",0.283463,0.790644,0.382052,0.97154,0.345385,0.949247,0.362795
f1,0.362795,"(1, 0.1, 0.0001, 0.0001)",0.283463,0.790644,0.382052,0.97154,0.345385,0.949247,0.362795
sn,0.999992,"(1, 5.0, 0.1, 0.001)",0.025951,0.499994,0.999992,2.7e-05,0.037818,0.037843,0.072879
sp,1.0,"(1, 10.0, 0.1, 0.0001)",0.0,0.5,0.0,1.0,0.0,0.962183,0.0
acc,0.962183,"(1, 20.0, 0.1, 0.0001)",0.0,0.0,0.0,0.0,0.0,0.962183,0.0


### Training and Testing with Optimal Params
Chosen models: Max AUPR, F1/Prec/AUROC


In [57]:
def train_test(train_interaction_matrix:np.ndarray, test_interaction_matrix:np.ndarray,lmda:float, k:int, 
              learn_rate:float=0.005, tolerance:float=1e-4, seed:int=0) -> np.ndarray:      
    
    U, V = fit(train=train_interaction_matrix, k=k, lmbda=lmda, learn_rate=learn_rate, tolerance=tolerance, seed=seed)
    predict_matrix = predict(U, V)
    
    test_predictions = np.multiply(predict_matrix, test_index_matrix)

    auc = auroc(truth=test_interaction_matrix.flatten(), predictions=test_predictions.flatten())
    auprc = aupr(truth=test_interaction_matrix.flatten(), predictions=test_predictions.flatten())
    sn, sp, prec, acc, f1 = get_metric(truth=test_interaction_matrix.flatten(), predictions=test_predictions.flatten())

    return np.array([auprc, auc, sn, sp, prec, acc, f1])


In [66]:
side_effect_matrix

Unnamed: 0,C0000729,C0000737,C0002418,C0002871,C0003123,C0003467,C0003811,C0004093,C0004238,C0004604,...,C0235329,C0853698,C0860901,C0221245,C0521500,C0037926,C1619692,C0149801,C0239940,C0877365
"CID100000085, CID000010917",1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
"CID100000137, CID000000137",0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"CID100000143, CID000000143",0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"CID100000143, CID000006006",0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"CID100000143, CID000149436",0,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"CID156603655, CID056603655",0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
"CID156842239, CID056842239",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"CID170683024, CID070683024",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"CID170695640, CID070695640",0,1,0,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


Max F1/Prec/AUROC: (1, 0.1, 0.0001, 0.0001)	

In [67]:
%%time

max_f1_prec_auroc_k=1
max_f1_prec_auroc_lmda=0.1
max_f1_prec_auroc_learn_rate=0.0001
max_f1_prec_auroc_tolerance=0.0001

# train model
max_f1_res = train_test(train_interaction_matrix=train_interaction_matrix.copy(), test_interaction_matrix=test_interaction_matrix.copy(), k=max_f1_prec_auroc_k, lmda=max_f1_prec_auroc_lmda, learn_rate=max_f1_prec_auroc_learn_rate, tolerance=max_f1_prec_auroc_tolerance)


Convergence state: False
CPU times: user 1min 19s, sys: 1.65 s, total: 1min 21s
Wall time: 1min 18s


Max AUPRC: (50, 15.0, 0.1, 1e-05)

In [68]:
%%time
max_aupr_k=50
max_aupr_lmda=15.0
max_aupr_learn_rate=0.1
max_aupr_tolerance=1e-05

# train model
max_aupr_res = train_test(train_interaction_matrix=train_interaction_matrix.copy(), test_interaction_matrix=test_interaction_matrix.copy(), k=max_aupr_k, lmda=max_aupr_lmda, learn_rate=max_aupr_learn_rate, tolerance=max_aupr_tolerance)


Convergence reached at epoch 4
Convergence state: True
CPU times: user 23.8 s, sys: 957 ms, total: 24.8 s
Wall time: 23.4 s


In [69]:
# results
max_f1_res, max_aupr_res

(array([0.31893528, 0.98767998, 0.4763639 , 0.996167  , 0.3419429 ,
        0.9940027 , 0.39811292]),
 array([2.08190094e-03, 4.99976828e-01, 1.00000000e+00, 2.10136049e-05,
        4.16377663e-03, 4.18461561e-03, 8.29302296e-03]))

#### More investigation
Looking at the performance of the top 30 model in terms of AUROC and AUPRC.

In [15]:
galeano_cross_val_res = pd.read_csv('results-latest/galeano_cross_val_results-combined.csv', index_col= [0,1,2,3])

In [82]:
top_models = galeano_cross_val_res.sort_values(by=['auroc', 'auprc', 'f1'], ascending=False)
top_models_parameters= top_models.head(30).index
top_models_parameters

MultiIndex([( 1,  0.1, 0.0001, 0.0001),
            ( 1,  0.1, 0.0001,  1e-05),
            ( 1, 0.01, 0.0001,  1e-05),
            ( 3,  0.1, 0.0001, 0.0001),
            ( 3,  0.1, 0.0001,  1e-05),
            ( 3, 0.01, 0.0001,  1e-05),
            ( 5,  0.1, 0.0001, 0.0001),
            ( 5,  0.1, 0.0001,  1e-05),
            (15,  0.1, 0.0001, 0.0001),
            (15,  0.1, 0.0001,  1e-05),
            (10,  0.1, 0.0001, 0.0001),
            (10,  0.1, 0.0001,  1e-05),
            (20,  0.1, 0.0001, 0.0001),
            (20,  0.1, 0.0001,  1e-05),
            ( 5, 0.01, 0.0001,  1e-05),
            (40,  0.1, 0.0001, 0.0001),
            (40,  0.1, 0.0001,  1e-05),
            (15, 0.01, 0.0001,  1e-05),
            (25,  0.1, 0.0001, 0.0001),
            (25,  0.1, 0.0001,  1e-05),
            (20, 0.01, 0.0001,  1e-05),
            (40, 0.01, 0.0001,  1e-05),
            (30,  0.1, 0.0001, 0.0001),
            (30,  0.1, 0.0001,  1e-05),
            (10, 0.01, 0.0001,  1e-05),


In [83]:
%%time
# getting the test results of the top 30 models ordered by AUROC, AUPRC, F1
top_models_test_results = {}
for k,lmda,learn_rate,tol in top_models_parameters.values:
    res = train_test(train_interaction_matrix=train_interaction_matrix.copy(), test_interaction_matrix=test_interaction_matrix.copy(), k=k, lmda=lmda, learn_rate=learn_rate, tolerance=tol)
    top_models_test_results[(k,lmda,learn_rate,tol)] = res



Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
Convergence state: False
CPU times: user 38min 14s, sys: 24.2 s, total: 38min 38s
Wall time: 37min 59s


In [85]:
with open('top_models_test_results.pkl', 'wb') as f:
    pickle.dump(top_models_test_results, f)
top_models_test_results

{(1,
  0.1,
  0.0001,
  0.0001): array([0.31893528, 0.98767998, 0.4763639 , 0.996167  , 0.3419429 ,
        0.9940027 , 0.39811292]),
 (1,
  0.1,
  0.0001,
  1e-05): array([0.31893528, 0.98767998, 0.4763639 , 0.996167  , 0.3419429 ,
        0.9940027 , 0.39811292]),
 (1,
  0.01,
  0.0001,
  1e-05): array([0.29199285, 0.98755986, 0.4744701 , 0.99586123, 0.32401512,
        0.99369032, 0.38506783]),
 (3,
  0.1,
  0.0001,
  0.0001): array([0.31295791, 0.98761911, 0.4814626 , 0.99600924, 0.33529471,
        0.99386683, 0.39529946]),
 (3,
  0.1,
  0.0001,
  1e-05): array([0.31295791, 0.98761911, 0.4814626 , 0.99600924, 0.33529471,
        0.99386683, 0.39529946]),
 (3,
  0.01,
  0.0001,
  1e-05): array([0.28397315, 0.98746792, 0.47993299, 0.99563465, 0.3149166 ,
        0.99348743, 0.38029551]),
 (5,
  0.1,
  0.0001,
  0.0001): array([0.29967477, 0.98748901, 0.47927744, 0.99590752, 0.32870417,
        0.99375644, 0.38996059]),
 (5,
  0.1,
  0.0001,
  1e-05): array([0.29967477, 0.98748901, 0

In [2]:
top_models_test_results = pd.read_pickle('top_models_test_results.pkl')

In [82]:
galeano_top_auc = pd.DataFrame.from_dict(top_models_test_results)
galeano_top_auc.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']), inplace=True)
galeano_top_auc.replace(1.0,0.0,inplace=True)
galeano_top_auc = galeano_top_auc.T
galeano_top_auc.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
galeano_top_auc.sort_values(by=['auroc'],ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.1,0.0001,0.0001,0.318935,0.98768,0.476364,0.996167,0.341943,0.994003,0.398113
1,0.1,0.0001,1e-05,0.318935,0.98768,0.476364,0.996167,0.341943,0.994003,0.398113
3,0.1,0.0001,0.0001,0.312958,0.987619,0.481463,0.996009,0.335295,0.993867,0.395299
3,0.1,0.0001,1e-05,0.312958,0.987619,0.481463,0.996009,0.335295,0.993867,0.395299
1,0.01,0.0001,1e-05,0.291993,0.98756,0.47447,0.995861,0.324015,0.99369,0.385068
5,0.1,0.0001,0.0001,0.299675,0.987489,0.479277,0.995908,0.328704,0.993756,0.389961
5,0.1,0.0001,1e-05,0.299675,0.987489,0.479277,0.995908,0.328704,0.993756,0.389961
3,0.01,0.0001,1e-05,0.283973,0.987468,0.479933,0.995635,0.314917,0.993487,0.380296
15,0.1,0.0001,0.0001,0.293675,0.987413,0.492243,0.995594,0.318398,0.993498,0.38668
15,0.1,0.0001,1e-05,0.293675,0.987413,0.492243,0.995594,0.318398,0.993498,0.38668


In [61]:
top_models_aupr = galeano_cross_val_res.sort_values(by=['auprc', 'auroc'], ascending=False)
top_models_parameters_aupr= top_models_aupr.head(30).index
top_models_parameters_aupr


(MultiIndex([( 50, 15.0,  0.1,  1e-05),
             ( 40, 10.0,  0.1, 0.0001),
             ( 40, 10.0,  0.1,  1e-05),
             ( 40,  5.0,  0.1,  1e-05),
             (  1, 10.0,  0.1,  0.001),
             (  5, 10.0,  0.1, 0.0001),
             (  5, 10.0,  0.1,  0.001),
             ( 25, 10.0,  0.1, 0.0001),
             ( 35, 10.0,  0.1, 0.0001),
             (100, 10.0,  0.1, 0.0001),
             ( 25, 10.0,  0.1,  0.001),
             ( 35, 10.0,  0.1,  0.001),
             ( 40, 10.0,  0.1,  0.001),
             (100, 10.0,  0.1,  0.001),
             (  5, 10.0,  0.1,  1e-05),
             ( 25, 10.0,  0.1,  1e-05),
             ( 35, 10.0,  0.1,  1e-05),
             (100, 10.0,  0.1,  1e-05),
             (  1, 10.0,  0.1,  1e-05),
             ( 50,  5.0,  0.1, 0.0001),
             (  3,  1.0,  0.1,  1e-05),
             (  3, 0.01, 0.01, 0.0001),
             ( 25,  1.0,  0.1,  1e-05),
             (  3, 0.01,  0.1,  0.001),
             (  1,  0.5, 0.01, 0.0001),


In [16]:
galeano_cross_val_res

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.01,0.10000,0.00010,0.463702,0.494749,0.999976,0.000044,0.037818,0.037859,0.072880
1,0.01,0.10000,0.00100,0.463702,0.494755,0.882241,0.141302,0.038814,0.169322,0.074356
1,0.01,0.01000,0.00010,0.463702,0.494755,0.871695,0.215585,0.041849,0.240398,0.079864
1,0.01,0.01000,0.00100,0.462535,0.476780,0.799318,0.538000,0.063670,0.547882,0.117946
1,0.01,0.00100,0.00010,0.031156,0.584665,0.723862,0.680280,0.081714,0.681928,0.146850
...,...,...,...,...,...,...,...,...,...,...
100,20.00,0.10000,0.00001,0.000000,0.000000,0.000000,0.000000,0.000000,0.962183,0.000000
100,20.00,0.01000,0.00001,0.018909,0.500000,0.000000,0.000013,0.037818,0.037830,0.072879
100,20.00,0.00100,0.00001,0.018909,0.500000,0.999984,0.000188,0.037823,0.037997,0.072890
100,20.00,0.00010,0.00001,0.018909,0.500000,0.999992,0.000778,0.037845,0.038566,0.072930


In [78]:
%%time
# getting the test results of the top 30 models ordered by AUROC, AUPRC, F1
top_models_test_results_aupr = {}
for k,lmda,learn_rate,tol in top_models_parameters_aupr.values:
    res = train_test(train_interaction_matrix=train_interaction_matrix.copy(), test_interaction_matrix=test_interaction_matrix.copy(), k=k, lmda=lmda, learn_rate=learn_rate, tolerance=tol)
    top_models_test_results_aupr[(k,lmda,learn_rate,tol)] = res

Convergence reached at epoch 4
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergence reached at epoch 4
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergence reached at epoch 2
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergence reached at epoch 2
Convergence state: True
Convergence reached at epoch 1
Convergence state: True
Convergenc

In [92]:

with open('top_models_test_results_aupr.pkl', 'wb') as f:
    pickle.dump(top_models_test_results_aupr, f)
top_models_test_results_aupr

{(50,
  15.0,
  0.1,
  1e-05): array([2.08190094e-03, 4.99976828e-01, 1.00000000e+00, 2.10136049e-05,
        4.16377663e-03, 4.18461561e-03, 8.29302296e-03]),
 (40,
  10.0,
  0.1,
  0.0001): array([2.08184538e-03, 5.00000000e-01, 1.00000000e+00, 6.09089998e-07,
        4.16369202e-03, 4.16429605e-03, 8.29285515e-03]),
 (40,
  10.0,
  0.1,
  1e-05): array([2.08184538e-03, 5.00000000e-01, 1.00000000e+00, 6.09089998e-07,
        4.16369202e-03, 4.16429605e-03, 8.29285515e-03]),
 (40,
  5.0,
  0.1,
  1e-05): array([2.08184601e-03, 5.00000152e-01, 1.00000000e+00, 1.85772449e-05,
        4.16376653e-03, 4.18218939e-03, 8.29300293e-03]),
 (1,
  10.0,
  0.1,
  0.001): array([0.        , 0.49999985, 0.        , 0.9999997 , 0.        ,
        0.99583601, 0.        ]),
 (5,
  10.0,
  0.1,
  0.0001): array([0.        , 0.49999985, 0.        , 0.9999997 , 0.        ,
        0.99583601, 0.        ]),
 (5,
  10.0,
  0.1,
  0.001): array([0.        , 0.49999985, 0.        , 0.9999997 , 0.        ,


In [88]:
galeano_top_aupr = pd.DataFrame.from_dict(top_models_test_results_aupr)
galeano_top_aupr.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']), inplace=True)
galeano_top_aupr.replace(1.0,0.0,inplace=True)
galeano_top_aupr = galeano_top_aupr.T
galeano_top_aupr.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
galeano_top_aupr.sort_values(by=['auroc'],ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,1.0,0.1,1e-05,0.520776,0.952057,0.993226,0.9061837,0.042389,0.906546,0.081307
1,0.5,0.1,0.001,0.520802,0.951842,0.999927,0.9037324,0.041621,0.904133,0.079916
3,0.1,0.1,0.001,0.520802,0.951842,0.999927,0.9037339,0.041622,0.904134,0.079917
3,0.01,0.01,0.0001,0.520802,0.951842,0.944716,0.937374,0.05933,0.937405,0.111648
25,1.0,0.1,1e-05,0.520802,0.951842,0.998325,0.9042239,0.041762,0.904616,0.08017
3,0.01,0.1,0.001,0.520802,0.951842,0.972103,0.9205725,0.048681,0.920787,0.092719
1,0.5,0.01,0.0001,0.520802,0.951842,0.999927,0.9037361,0.041623,0.904137,0.079919
3,0.1,0.01,0.0001,0.520802,0.951842,0.999053,0.9039955,0.041696,0.904391,0.08005
3,0.5,0.1,0.001,0.520802,0.951842,0.999927,0.9037355,0.041623,0.904136,0.079918
3,0.5,0.01,0.0001,0.520802,0.951842,0.996067,0.9049396,0.041972,0.905319,0.080549


## Pauwel

### Grid search over the above parameters

Parameters to be tested over:
- k_values = np.array([1,3,5,10,15,20,25,30,35,40,50,100])
- lambda_values = np.array([0.01,0.1,0.5,1.,5.,10.,15.,20.])
- learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
- tolerance_values = np.array([1e-3,1e-4,1e-5])

Time taken for training parameters with Pauwel dataset:

- For tolerance [1e-3, 1e-4] (all else same): 
    - CPU times: user 7h 14min 31s
    - sys: 47min 32s
    - total: 8h 2min 4s
    - Wall time: 6h 30min 42s
- For learn_rate [0.1] and tolerance [1e-5] (all else same):
    - CPU times: user 1h 24min 1s
    - sys: 4min 53s
    - total: 1h 28min 54s
    - Wall time: 1h 19min 29s
- For learn_rate [0.01,1e-3,1e-4,1e-5] and tolerance [1e-5] (all else same):
    - CPU times: user 12h 26min 13s
    - sys: 19min 45s
    - total: 12h 45min 58s
    - Wall time: 12h 7min 53s

In [26]:
# k_values = np.array([1,3,5,10,15,20,25,30,35,40,50,100])
# lambda_values = np.array([0.01,0.1,0.5,1.,5.,10.,15.,20.])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
# tolerance_values = np.array([1e-4,1e-3])

# %time pauwel_results= param_selection(dataset=pauwel.values.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates, tolerance_values=tolerance_values)

# training with tolerance value 1e-5
# tolerance_values = np.array([1e-4,1e-3])

# training with learn_rate 0.1 
# learning_rates = np.array([0.1])
# %time pauwel_results_2= param_selection(dataset=pauwel.values.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates, tolerance_values=tolerance_values)

# training with rest of learn rates 0.01,1e-3,1e-4,1e-5
# learning_rates = np.array([0.01,1e-3,1e-4,1e-5])
# %time pauwel_results_3= param_selection(dataset=pauwel.values.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates, tolerance_values=tolerance_values)



### Analysing Pauwel Results

#### Combining and saving results

In [27]:
# pauwel_results_df_1 = pd.DataFrame.from_dict(pauwel_results)
# pauwel_results_df_1.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']), inplace=True)
# pauwel_results_df_1.replace(1.0,0.0,inplace=True)
# Save to CSV
# pauwel_results_df_1.to_csv('results/results_pauwel.csv', index=True)

In [28]:
# pauwel_results_df_2 = pd.DataFrame.from_dict(pauwel_results_2)
# pauwel_results_df_2.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']),inplace=True)
# pauwel_results_df_2.replace(1.0,0.0,inplace=True)
# Save to CSV
# pauwel_results_df_2.to_csv('results/results_pauwel_2.csv', index=True)

In [29]:
# pauwel_results_df_3 = pd.DataFrame.from_dict(pauwel_results_3)
# pauwel_results_df_3.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']),inplace=True)
# pauwel_results_df_3.replace(1.0,0.0,inplace=True)
# Save to CSV
# pauwel_results_df_3.to_csv('results/results_pauwel_3.csv', index=True)

In [30]:
# pauwel_results_df_1 = pd.read_csv('results/results_pauwel.csv', header=[0,1,2,3], index_col=0)
# # transpose the dataframe
# pauwel_results_df_1 = pauwel_results_df_1.T
# pauwel_results_df_1.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
# pauwel_results_df_1

In [31]:
# pauwel_results_df_2 = pd.read_csv('results/results_pauwel_2.csv', header=[0,1,2,3], index_col=0)
# # transpose the dataframe
# pauwel_results_df_2 = pauwel_results_df_2.T
# pauwel_results_df_2.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
# pauwel_results_df_2

In [32]:
# pauwel_results_df_3 = pd.read_csv('results/results_pauwel_3.csv', header=[0,1,2,3],index_col=0)
# # transpose the dataframe
# pauwel_results_df_3 = pauwel_results_df_3.T
# pauwel_results_df_3.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
# pauwel_results_df_3

In [33]:
# Combine all results
# pauwel_results_df = pd.concat([pauwel_results_df_1, pauwel_results_df_2,pauwel_results_df_3], axis=0)
# Save combined results
# pauwel_results_df.to_csv('results/results_pauwel_all.csv', index=True, header=True)



#### Analysing results

In [34]:
pauwel_results_df = pd.read_csv('results/results_pauwel_all.csv', header=0, index_col=[0,1,2,3])
pauwel_results_df.fillna(0, inplace=True)
pauwel_results_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.01,0.10000,0.00010,0.428557,0.515248,0.792527,0.349199,0.059853,0.371224,0.111300
1,0.01,0.10000,0.00100,0.428557,0.515851,0.786210,0.482105,0.073528,0.497214,0.134479
1,0.01,0.01000,0.00010,0.428557,0.525599,0.783771,0.514532,0.077833,0.527908,0.141603
1,0.01,0.01000,0.00100,0.348701,0.682339,0.548165,0.823717,0.139832,0.810027,0.222824
1,0.01,0.00100,0.00010,0.092456,0.697855,0.588753,0.829858,0.153190,0.817880,0.243122
...,...,...,...,...,...,...,...,...,...,...
100,15.00,0.00001,0.00001,0.026290,0.486961,0.997087,0.119426,0.055887,0.163030,0.105842
100,20.00,0.01000,0.00001,0.024841,0.500002,0.000000,0.000016,0.049682,0.049697,0.094661
100,20.00,0.00100,0.00001,0.024841,0.500002,0.000000,0.000186,0.049690,0.049858,0.094676
100,20.00,0.00010,0.00001,0.024841,0.500002,0.000000,0.001037,0.049730,0.050667,0.094749


In [35]:
pauwel_res = pd.DataFrame()
pauwel_res['max'] = pauwel_results_df.max()
pauwel_res['index'] = pauwel_results_df.idxmax()
pauwel_res[['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = np.random.randint(10, size=(7,7))
pauwel_res.loc['auprc', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = pauwel_results_df.loc[pauwel_res.loc['auprc','index'], :].values
pauwel_res.loc['auroc', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = pauwel_results_df.loc[pauwel_res.loc['auroc','index'], :].values
pauwel_res.loc['sn', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = pauwel_results_df.loc[pauwel_res.loc['sn','index'], :].values
pauwel_res.loc['sp', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = pauwel_results_df.loc[pauwel_res.loc['sp','index'], :].values
pauwel_res.loc['prec', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = pauwel_results_df.loc[pauwel_res.loc['prec','index'], :].values
pauwel_res.loc['acc', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = pauwel_results_df.loc[pauwel_res.loc['acc','index'], :].values
pauwel_res.loc['f1', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = pauwel_results_df.loc[pauwel_res.loc['f1','index'], :].values

pauwel_res

Unnamed: 0,max,index,aupr_val,auroc_val,sn_val,sp_val,prec_val,acc_val,f1_val
auprc,0.434356,"(15, 0.5, 0.001, 1e-05)",0.434356,0.662233,0.696295,0.705447,0.109989,0.704992,0.18997
auroc,0.799473,"(40, 0.5, 0.0001, 1e-05)",0.420982,0.799473,0.439233,0.973283,0.462214,0.946751,0.45043
sn,0.999984,"(3, 5.0, 0.1, 0.0001)",0.024841,0.499999,0.999984,0.00017,0.049689,0.049842,0.094673
sp,0.984322,"(5, 0.1, 0.0001, 0.0001)",0.134105,0.521267,0.141174,0.984322,0.320074,0.942433,0.19593
prec,0.46791,"(10, 0.5, 0.0001, 1e-05)",0.42016,0.793674,0.434683,0.974158,0.46791,0.947357,0.450685
acc,0.950319,"(1, 20.0, 0.1, 1e-05)",0.0,0.0,0.0,0.0,0.0,0.950319,0.0
f1,0.456726,"(1, 0.01, 0.0001, 1e-05)",0.395196,0.769959,0.458774,0.971237,0.454696,0.945777,0.456726


## Liu

#### Gridsearch over Liu Dataset

Parameters to be tested over:
- k_values = np.array([1,3,5,10,15,20,25,30,35,40,50,100])
- lambda_values = np.array([0.01,0.1,0.5,1,5,10,15,20])
- learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
- tolerance_values = np.array([1e-4,1e-3,1e-5])

Time taken for gridsearch over all parameters:
- CPU times: user 16h 39min 25s
- sys: 1h 11min 17s
- total: 17h 50min 42s
- Wall time: 15h 33min 8s



In [36]:

# k_values = np.array([1,3,5,10,15,20,25,30,35,40,50,100])
# lambda_values = np.array([0.01,0.1,0.5,1.,5.,10.,15.,20.])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
# tolerance_values = np.array([1e-3,1e-4,1e-5])

# %time liu_results = param_selection(dataset=liu.values.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates,tolerance_values=tolerance_values)

#### Saving results

In [37]:
# liu_results_df = pd.DataFrame.from_dict(liu_results)
# liu_results_df.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']),inplace=True)
# liu_results_df.replace(1.0,0.0,inplace=True)
# Save to CSV
# liu_results_df.to_csv('results/results_liu.csv', index=True)
# liu_results_df = liu_results_df.T
# liu_results_df.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)
# Save transposed dataframe
# liu_results_df.to_csv('results/results_liu_all.csv', header=True, index=True)

#### Analysing Results

In [38]:
liu_results_df = pd.read_csv('results/results_liu_all.csv', header=0, index_col=[0,1,2,3])
liu_results_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.01,0.10000,0.00001,0.428709,0.515478,0.793480,0.239005,0.053455,0.267493,0.100162
1,0.01,0.10000,0.00010,0.428709,0.515478,0.792078,0.307429,0.058330,0.332330,0.108659
1,0.01,0.10000,0.00100,0.428707,0.516102,0.571236,0.803311,0.135919,0.791387,0.219590
1,0.01,0.01000,0.00001,0.428709,0.515478,0.789798,0.398866,0.066433,0.418951,0.122557
1,0.01,0.01000,0.00010,0.428707,0.553022,0.775965,0.574342,0.089863,0.584701,0.161072
...,...,...,...,...,...,...,...,...,...,...
100,20.00,0.00010,0.00010,0.026133,0.494928,0.999493,0.042426,0.053508,0.091599,0.101578
100,20.00,0.00010,0.00100,0.037364,0.549129,0.946542,0.311307,0.069283,0.343944,0.129115
100,20.00,0.00001,0.00001,0.026591,0.480687,0.975830,0.251177,0.065928,0.288409,0.123511
100,20.00,0.00001,0.00010,0.039087,0.554015,0.905734,0.351089,0.070284,0.379586,0.130446


In [39]:
liu_res = pd.DataFrame()
liu_res['max'] = liu_results_df.max()
liu_res['index'] = liu_results_df.idxmax()
liu_res[['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = np.random.randint(10, size=(7,7))
liu_res.loc['auprc', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = liu_results_df.loc[liu_res.loc['auprc','index'], :].values
liu_res.loc['auroc', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = liu_results_df.loc[liu_res.loc['auroc','index'], :].values
liu_res.loc['sn', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = liu_results_df.loc[liu_res.loc['sn','index'], :].values
liu_res.loc['sp', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = liu_results_df.loc[liu_res.loc['sp','index'], :].values
liu_res.loc['prec', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = liu_results_df.loc[liu_res.loc['prec','index'], :].values
liu_res.loc['acc', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = liu_results_df.loc[liu_res.loc['acc','index'], :].values
liu_res.loc['f1', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = liu_results_df.loc[liu_res.loc['f1','index'], :].values

liu_res

Unnamed: 0,max,index,aupr_val,auroc_val,sn_val,sp_val,prec_val,acc_val,f1_val
auprc,0.435891,"(5, 0.1, 0.01, 0.001)",0.435891,0.637053,0.238308,0.926213,0.148881,0.890869,0.183267
auroc,0.804669,"(3, 0.5, 0.0001, 1e-05)",0.422199,0.804669,0.456076,0.969386,0.446558,0.943012,0.451267
sn,0.999983,"(3, 10.0, 0.01, 0.001)",0.025697,0.5,0.999983,0.002261,0.051489,0.053522,0.097934
sp,0.980183,"(30, 0.1, 0.0001, 0.0001)",0.136187,0.512371,0.152588,0.980183,0.294305,0.937662,0.200977
prec,0.475,"(5, 0.5, 0.0001, 1e-05)",0.425172,0.800874,0.433392,0.974056,0.475,0.946277,0.453243
acc,0.946277,"(5, 0.5, 0.0001, 1e-05)",0.425172,0.800874,0.433392,0.974056,0.475,0.946277,0.453243
f1,0.459098,"(1, 0.01, 0.0001, 1e-05)",0.394419,0.771716,0.444523,0.973353,0.47466,0.946182,0.459098


## Mizutani

#### Gridsearch over parameters

Parameters to be tested over: 
- k_values = np.array([1,3,5,10,15,20,25,30,35,40,50,100])
- lambda_values = np.array([0.01,0.1,0.5,1,5,10,15,20])
- learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
- tolerance_values = np.array([1e-3,1e-4,1e-5])

Time taken for training parameters with Mizutani dataset:
- For tolerance [1e-4] (all else same):
    - CPU times: 
        - user 3h 51min 7s
        - sys: 23min 18s,
        - total: 4h 14min 25s
    - Wall time: 3h 29min 28s
- For tolerance [1e-3] and learning rate [0.1] (all else same):
    - 36min
- For tolerance [1e-3] and learning rates [0.01]:
    - CPU times: 
        - user 26min 55s
        - sys: 4min 44s
        - total: 31min 39s
    - Wall time: 22min 32s
- For tolerance [1e-3] and learning rates [1e-3, 1e-4, 1e-5]:
    - CPU times: 
        - user 42min 40s
        - sys: 12min 17s
        - total: 54min 58s
    - Wall time: 31min 1s
- For tolerance [1e-5] (all else same):
    - CPU times: 
        - user 7h 56min 2s
        - sys: 23min 39s
        - total: 8h 19min 41s
    - Wall time: 7h 33min 51s


In [40]:
# k_values = np.array([1,3,5,10,15,20,25,30,35,40,50,100])
# lambda_values = np.array([0.01,0.1,0.5,1,5,10,15,20])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])

# Training with tolerance value 1e-4
# tolerance_values = np.array([1e-4])
# %time mizutani_results = param_selection(dataset=mizutani.values.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates,tolerance_values=tolerance_values)

# Training with tolerance value 1e-3
# tolerance_values = np.array([1e-3])

# learning_rates = np.array([0.1])
# %time mizutani_results_2 = param_selection(dataset=mizutani.values.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates,tolerance_values=tolerance_values)
# learning_rates = np.array([0.01])
# %time mizutani_results_3 = param_selection(dataset=mizutani.values.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates,tolerance_values=tolerance_values)
# learning_rates = np.array([1e-3,1e-4,1e-5])
# %time mizutani_results_4 = param_selection(dataset=mizutani.values.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates,tolerance_values=tolerance_values)

# Training with tolerance value 1e-5
# tolerance_values = np.array([1e-5])
# learning_rates = np.array([0.1,0.01,1e-3,1e-4,1e-5])
# %time mizutani_results_5 = param_selection(dataset=mizutani.values.copy(), k_values=k_values, lmda_values=lambda_values, learn_rates=learning_rates,tolerance_values=tolerance_values)


#### Combining and Saving Results

In [41]:
# mizutani_results_df_1 = pd.DataFrame.from_dict(mizutani_results)
# mizutani_results_df_1.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']),inplace=True)
# mizutani_results_df_1.replace(1.0,0.0,inplace=True)
# Save to CSV
# mizutani_results_df_1.to_csv('results/results_mizutani.csv', index=True)

# mizutani_results_df_2 = pd.DataFrame.from_dict(mizutani_results_2)
# mizutani_results_df_2.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']),inplace=True)
# mizutani_results_df_2.replace(1.0,0.0,inplace=True)
# Save to CSV
# mizutani_results_df_2.to_csv('results/results_mizutani_2.csv', index=True)

# mizutani_results_df_3 = pd.DataFrame.from_dict(mizutani_results_3)
# mizutani_results_df_3.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']),inplace=True)
# mizutani_results_df_3.replace(1.0,0.0,inplace=True)
# Save to CSV
# mizutani_results_df_3.to_csv('results/results_mizutani_3.csv', index=True)

# mizutani_results_df_4 = pd.DataFrame.from_dict(mizutani_results_4)
# mizutani_results_df_4.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']),inplace=True)
# mizutani_results_df_4.replace(1.0,0.0,inplace=True)
# Save to CSV
# mizutani_results_df_4.to_csv('results/results_mizutani_4.csv', index=True)

# mizutani_results_df_5 = pd.DataFrame.from_dict(mizutani_results_5)
# mizutani_results_df_5.set_index(pd.Series(['auprc','auroc', 'sn', 'sp', 'prec', 'acc', 'f1']),inplace=True)
# mizutani_results_df_5.replace(1.0,0.0,inplace=True)
# # Save to CSV
# mizutani_results_df_5.to_csv('results/results_mizutani_5.csv', index=True)


In [42]:
# mizutani_results_df_1 = pd.read_csv('results/results_mizutani.csv', header=[0,1,2,3],index_col=0)
# mizutani_results_df_1 = mizutani_results_df_1.T
# mizutani_results_df_1.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)

# mizutani_results_df_2 = pd.read_csv('results/results_mizutani_2.csv', header=[0,1,2,3],index_col=0)
# mizutani_results_df_2 = mizutani_results_df_2.T
# mizutani_results_df_2.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)

# mizutani_results_df_3 = pd.read_csv('results/results_mizutani_3.csv', header=[0,1,2,3],index_col=0)
# mizutani_results_df_3 = mizutani_results_df_3.T
# mizutani_results_df_3.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)

# mizutani_results_df_4 = pd.read_csv('results/results_mizutani_4.csv', header=[0,1,2,3],index_col=0)
# mizutani_results_df_4 = mizutani_results_df_4.T
# mizutani_results_df_4.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)

# mizutani_results_df_5 = pd.read_csv('results/results_mizutani_5.csv', header=[0,1,2,3],index_col=0)
# mizutani_results_df_5 = mizutani_results_df_5.T
# mizutani_results_df_5.index.rename(['k', 'lambda', 'learn_rate', 'tolerance'], inplace=True)


In [43]:
# # Combine all results
# mizutani_results_df = pd.concat([mizutani_results_df_1, mizutani_results_df_2,mizutani_results_df_3, mizutani_results_df_4, mizutani_results_df_5], axis=0)
# # Save combined results
# mizutani_results_df.to_csv('results/results_mizutani_all.csv', index=True, header=True)


In [44]:
mizutani_results_df = pd.read_csv('results/results_mizutani_all.csv', header=0, index_col=[0,1,2,3])
mizutani_results_df.fillna(0, inplace=True)
mizutani_results_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auprc,auroc,sn,sp,prec,acc,f1
k,lambda,learn_rate,tolerance,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.01,0.10000,0.00010,0.430974,0.516698,0.789933,0.322845,0.064348,0.348849,0.119002
1,0.01,0.01000,0.00010,0.430972,0.565253,0.775071,0.558920,0.093871,0.570954,0.167461
1,0.01,0.00100,0.00010,0.117559,0.688247,0.600579,0.842325,0.183378,0.828867,0.280967
1,0.01,0.00010,0.00010,0.056574,0.500732,0.999715,0.000343,0.055676,0.055980,0.105477
1,0.01,0.00001,0.00010,0.056576,0.500737,0.999715,0.000344,0.055676,0.055981,0.105477
...,...,...,...,...,...,...,...,...,...,...
100,20.00,0.10000,0.00001,0.000000,0.000000,0.000000,0.000000,0.000000,0.944327,0.000000
100,20.00,0.01000,0.00001,0.034511,0.499590,0.999837,0.000549,0.055693,0.056182,0.105509
100,20.00,0.00100,0.00001,0.035865,0.499628,0.999918,0.000581,0.055699,0.056216,0.105520
100,20.00,0.00010,0.00001,0.030316,0.499009,0.999959,0.001828,0.055767,0.057397,0.105642


#### Analysing Results

In [45]:
mizutani_res = pd.DataFrame()
mizutani_res['max'] = mizutani_results_df.max()
mizutani_res['index'] = mizutani_results_df.idxmax()
mizutani_res[['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = np.random.randint(10, size=(7,7))
mizutani_res.loc['auprc', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = mizutani_results_df.loc[mizutani_res.loc['auprc','index'], :].values
mizutani_res.loc['auroc', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = mizutani_results_df.loc[mizutani_res.loc['auroc','index'], :].values
mizutani_res.loc['sn', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = mizutani_results_df.loc[mizutani_res.loc['sn','index'], :].values
mizutani_res.loc['sp', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = mizutani_results_df.loc[mizutani_res.loc['sp','index'], :].values
mizutani_res.loc['prec', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = mizutani_results_df.loc[mizutani_res.loc['prec','index'], :].values
mizutani_res.loc['acc', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = mizutani_results_df.loc[mizutani_res.loc['acc','index'], :].values
mizutani_res.loc['f1', ['aupr_val', 'auroc_val', 'sn_val', 'sp_val', 'prec_val', 'acc_val', 'f1_val']] = mizutani_results_df.loc[mizutani_res.loc['f1','index'], :].values

mizutani_res

Unnamed: 0,max,index,aupr_val,auroc_val,sn_val,sp_val,prec_val,acc_val,f1_val
auprc,0.438365,"(1, 0.1, 0.01, 0.001)",0.438365,0.646563,0.272329,0.915365,0.159451,0.879566,0.201135
auroc,0.796717,"(20, 0.5, 0.0001, 1e-05)",0.434011,0.796717,0.452203,0.969797,0.468844,0.940981,0.460373
sn,0.99998,"(1, 20.0, 0.01, 0.0001)",0.032073,0.499944,0.99998,0.000379,0.055691,0.056029,0.105507
sp,0.980655,"(50, 0.1, 0.0001, 0.0001)",0.147276,0.523637,0.151781,0.980655,0.31627,0.93451,0.205122
prec,0.490348,"(15, 0.01, 0.0001, 1e-05)",0.412714,0.766206,0.446372,0.972648,0.490348,0.943349,0.467328
acc,0.944327,"(1, 20.0, 0.1, 1e-05)",0.0,0.0,0.0,0.0,0.0,0.944327,0.0
f1,0.468023,"(5, 0.01, 0.0001, 1e-05)",0.416887,0.770814,0.452876,0.97156,0.484218,0.942684,0.468023


##  Measuring Performance of Pauwel, Liu, and Mizutani using Zhang's Methodology
20 rounds of 5-fold CV


In [None]:
# Time for below : CPU times: user 38min 35s, sys: 1min 4s, total: 39min 40s
# Wall time: 37min 41s

In [40]:
# Optimal parameters chosen for Pauwel are
# k=15, lambda=0.5, alpha=0.001, tolerance=1e-5
pauwel_lmda=0.5
pauwel_k=15
pauwel_learn_rate=0.001
pauwel_tolerance=1e-5
%time pauwel_optimal_param_results = train_optimal_params(dataset=pauwel.values.copy(), lmbda=pauwel_lmda, k=pauwel_k, tolerance=pauwel_tolerance, learn_rate=pauwel_learn_rate)


Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV

In [46]:
pauwel_optimal_param_results

array([0.43477066, 0.66247097, 0.69621616, 0.70573663, 0.11007599,
       0.70526364, 0.19009591])

In [41]:
# For the high auroc model on Pauwel's dataset,
# k=40, lambda=0.5, alpha=0.0001, tolerance=1e-5
pauwel_lmda=0.5
pauwel_k=40
pauwel_learn_rate=0.0001
pauwel_tolerance=1e-5
%time pauwel_optimal_param_results_2 = train_optimal_params(dataset=pauwel.values.copy(), lmbda=pauwel_lmda, k=pauwel_k, tolerance=pauwel_tolerance, learn_rate=pauwel_learn_rate)


Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV

In [47]:
pauwel_optimal_param_results_2


array([0.419457  , 0.78841673, 0.44161075, 0.97237833, 0.45543303,
       0.94600912, 0.44834695])

In [42]:
# Optimal params for Liu
# high aupr model params: k=5, lambda=0.1, alpha=0.001, tolerance=1e-3

liu_lmda=0.1
liu_k=5
liu_learn_rate=0.01
liu_tolerance=0.001
%time liu_optimal_param_results_1 = train_optimal_params(dataset=liu.values.copy(), lmbda=liu_lmda, k=liu_k, tolerance=liu_tolerance, learn_rate=liu_learn_rate)


Performing CV fold: 0
Convergence reached at epoch 209
Convergence state: True
Performing CV fold: 1
Convergence reached at epoch 214
Convergence state: True
Performing CV fold: 2
Convergence reached at epoch 213
Convergence state: True
Performing CV fold: 3
Convergence reached at epoch 214
Convergence state: True
Performing CV fold: 4
Convergence reached at epoch 210
Convergence state: True
Performing CV fold: 0
Convergence reached at epoch 204
Convergence state: True
Performing CV fold: 1
Convergence reached at epoch 212
Convergence state: True
Performing CV fold: 2
Convergence reached at epoch 215
Convergence state: True
Performing CV fold: 3
Convergence reached at epoch 200
Convergence state: True
Performing CV fold: 4
Convergence reached at epoch 208
Convergence state: True
Performing CV fold: 0
Convergence reached at epoch 208
Convergence state: True
Performing CV fold: 1
Convergence reached at epoch 208
Convergence state: True
Performing CV fold: 2
Convergence reached at epoch 2

In [48]:
liu_optimal_param_results_1

array([0.25024515, 0.63206191, 0.46982856, 0.83534839, 0.13982588,
       0.81656836, 0.2050049 ])

In [43]:
# high prec/acc (high general performance) model params
# k=5, lambda=0.5, alpha=0.0001, tolerance=1e-5

liu_lmda=0.5
liu_k=5
liu_learn_rate=0.0001
liu_tolerance=1e-5
%time liu_optimal_param_results_2 = train_optimal_params(dataset=liu.values.copy(), lmbda=liu_lmda, k=liu_k, tolerance=liu_tolerance, learn_rate=liu_learn_rate)


Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV

In [49]:
liu_optimal_param_results_2

array([0.42300036, 0.79587728, 0.44745883, 0.97118075, 0.45704242,
       0.94427247, 0.45208091])

In [46]:
# high auroc model params
# k=3, lambda=0.5, alpha=0.0001, tolerance=1e-5

liu_lmda=0.5
liu_k=3
liu_learn_rate=0.0001
liu_tolerance=1e-5
%time liu_optimal_param_results_3 = train_optimal_params(dataset=liu.values.copy(), lmbda=liu_lmda, k=liu_k, tolerance=liu_tolerance, learn_rate=liu_learn_rate)


Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV

In [47]:
liu_optimal_param_results_3

array([0.40976545, 0.79523269, 0.44249641, 0.97059376, 0.44971347,
       0.94346067, 0.44587815])

In [44]:
# high aupr model for mizutani
# params: k=1,lambda=0.1, alpha=0.01, tolerance=0.001
mizutani_lmda=0.1
mizutani_k=1
mizutani_learn_rate=0.01
mizutani_tolerance=0.001
%time mizutani_optimal_param_results_1 = train_optimal_params(dataset=mizutani.values.copy(), lmbda=mizutani_lmda, k=mizutani_k, tolerance=mizutani_tolerance, learn_rate=mizutani_learn_rate)


Performing CV fold: 0
Convergence reached at epoch 241
Convergence state: True
Performing CV fold: 1
Convergence reached at epoch 243
Convergence state: True
Performing CV fold: 2
Convergence reached at epoch 252
Convergence state: True
Performing CV fold: 3
Convergence reached at epoch 242
Convergence state: True
Performing CV fold: 4
Convergence reached at epoch 245
Convergence state: True
Performing CV fold: 0
Convergence reached at epoch 239
Convergence state: True
Performing CV fold: 1
Convergence reached at epoch 250
Convergence state: True
Performing CV fold: 2
Convergence reached at epoch 241
Convergence state: True
Performing CV fold: 3
Convergence reached at epoch 242
Convergence state: True
Performing CV fold: 4
Convergence reached at epoch 242
Convergence state: True
Performing CV fold: 0
Convergence reached at epoch 240
Convergence state: True
Performing CV fold: 1
Convergence reached at epoch 238
Convergence state: True
Performing CV fold: 2
Convergence reached at epoch 2

In [50]:
mizutani_optimal_param_results_1

array([0.38492202, 0.61862054, 0.46632383, 0.81371881, 0.14066109,
       0.79437843, 0.20018319])

In [45]:
# high auroc model for mizutani
# params: k=20, lambda=0.5, alpha=0.0001, tolerance=1e-5
mizutani_lmda=0.5
mizutani_k=20
mizutani_learn_rate=0.0001
mizutani_tolerance=1e-5
%time mizutani_optimal_param_results_2 = train_optimal_params(dataset=mizutani.values.copy(), lmbda=mizutani_lmda, k=mizutani_k, tolerance=mizutani_tolerance, learn_rate=mizutani_learn_rate)


Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV fold: 1
Convergence state: False
Performing CV fold: 2
Convergence state: False
Performing CV fold: 3
Convergence state: False
Performing CV fold: 4
Convergence state: False
Performing CV fold: 0
Convergence state: False
Performing CV

In [51]:
mizutani_optimal_param_results_2

array([0.43594312, 0.79055276, 0.45695603, 0.96914229, 0.46631413,
       0.94062756, 0.46148682])