# Preterm Birth Prediction Microbiome Model Framework (Code)

Challenge website:
https://www.synapse.org/#!Synapse:syn26133770/wiki/618018

In [1]:
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
import sklearn as sk

from sklearn import metrics
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

from collections import Counter,defaultdict, OrderedDict
from itertools import islice

%matplotlib inline
import matplotlib.pyplot as plt






def dataset_splitID(meta_data, prop, myseed):
    
    subjects = list(np.unique(meta_data["participant_id"]))
    numsubjects = len(subjects)
    
    if myseed != None:
        random.seed(myseed)

    subjects_shuffle = random.sample(subjects, numsubjects)
    
    train_subjects = subjects_shuffle[0:(int(numsubjects*prop[0])+1)] 
    valid_subjects = subjects_shuffle[(int(numsubjects*prop[0])+2):(int(numsubjects*(prop[0]+prop[1]))+1)]
    test_subjects = subjects_shuffle[(int(numsubjects*(prop[0]+prop[1]))+2):numsubjects]
    
    splitID_train = meta_data['participant_id'].isin(train_subjects)
    splitID_valid = meta_data['participant_id'].isin(valid_subjects)
    splitID_test = meta_data['participant_id'].isin(test_subjects)
    
    return splitID_train, splitID_valid, splitID_test


# Possible, but not used here
def dataset_pjt_splitID(meta_data, prop, myseed):
    
    projects = meta_data['project']

    splitID_train = []
    splitID_valid = []
    splitID_test  = []
    
    for pjt in np.unique(projects):
        
        submeta = meta_data[projects == pjt]
        subsubjects = list(np.unique(submeta["participant_id"]))
        numsub = len(subsubjects)
        
        subsubjects_shuffle = random.sample(subsubjects, numsub)
        
        train_subsubjects = subsubjects_shuffle[0:(int(numsub*prop[0])+1)] 
        valid_subsubjects = subsubjects_shuffle[(int(numsub*prop[0])+2):(int(numsub*(prop[0]+prop[1]))+1)]
        test_subsubjects  = subsubjects_shuffle[(int(numsub*(prop[0]+prop[1]))+2):numsub]
        
        splitID_train.extend(submeta['participant_id'].isin(train_subsubjects))
        splitID_valid.extend(submeta['participant_id'].isin(valid_subsubjects))
        splitID_test.extend(submeta['participant_id'].isin(test_subsubjects))
        
    return splitID_train, splitID_valid, splitID_test


def Data_Reshaper_Input(data, seq_length):
    
    numsubjects = len(np.unique(data['participant_id']))
    myvary = list(data.columns.values)[2:data.shape[1]]
    num_covariates = len(myvary)
    
    myinput = np.zeros((numsubjects, seq_length, num_covariates), dtype=np.float32)
    for i in range(num_covariates):
        data_wide = data.pivot_table(index=['participant_id'], columns='collect_period', values=myvary[i])
        data_wide = data_wide.sort_index(axis=1)
        data_wide = data_wide.fillna(0)
        tmpindex = data_wide._get_numeric_data().columns.values - 1
        tmpindex = tmpindex.astype(int)
        # time varying variables need to impute all and no records are denoted as 0
        for j in range(numsubjects):
                myinput[j,tmpindex,i] = data_wide.iloc[[j]]
    return myinput



def Data_Reshaper_Output_ManytoMany_0(data, seq_length, classlabel):

    num_samples = len(np.unique(data['participant_id']))
    
    data_wide = data.pivot_table(index=['participant_id'], columns='collect_period', values=classlabel)
    data_wide = data_wide.sort_index(axis=1)
    
    myoutput = np.zeros((num_samples, seq_length, 2), dtype=np.float32)
    for i in range(num_samples):
        tmp = data_wide.iloc[i,:]
        
        if np.nanmax(tmp) == 1:
            # label linear smoonthing from 0.5 to 1
            # fill all position 1 to have final labels equal to 1
            myoutput[i,:,0].fill(1)
            myoutput[i,:,0] = np.linspace(start=0.5, stop=1, num=seq_length)
        else:
            # label linear smoonthing from 0.5 to 0
            # fill all position 0 to have final labels equal to 0 
            #     but array alrady initialize as 0
            myoutput[i,:,0] = np.linspace(start=0.5, stop=0, num=seq_length)
            
        myoutput[i,:,1] = 1 - myoutput[i,:,0]
    return myoutput



def evaluate(model, device, myinput, myoutput, finalperiod, cutoff=0.5):
    
    model.eval()
    
    # predicted labels
    myinput  = torch.from_numpy(myinput).float().to(device)
    myoutput_nn, hidden = model(myinput, device)
    myoutput_nn = myoutput_nn.reshape((myoutput.shape))
    output_prob = nn.functional.softmax(myoutput_nn, dim=2)
    mypredprob = output_prob[:,finalperiod-1,:].cpu().detach().numpy()
    mypred = 1*(mypredprob[:,0] > cutoff)
    # observed labels
    myobs  = myoutput[:,finalperiod-1,0]
    
    return myobs, mypred, mypredprob



def metadata_loader(meta_dir, alpha_dir, cst_dir, task, finalperiod):
    
    meta_data = pd.DataFrame(pd.read_csv(meta_dir, delimiter=','))
    meta_data.replace('Unknown', np.nan, inplace=True)
    meta_data = meta_data[['participant_id', 'project', 'delivery_wk', 'collect_wk', 'age', 'race']]
    
    alpha_data = pd.DataFrame(pd.read_csv(alpha_dir, delimiter=','))
    cst_data = pd.DataFrame(pd.read_csv(cst_dir, delimiter=','))
    
    meta_data = pd.concat([meta_data, alpha_data['shannon'], alpha_data['inv_simpson'], alpha_data['rooted_pd'], cst_data['CST']], axis=1)

    for i in range(1,meta_data.shape[1]):
        if meta_data.iloc[:,i].dtypes == object:
            meta_data.iloc[:,i] = meta_data.iloc[:,i].astype('category').cat.codes + 1
            meta_data.iloc[:,i] = meta_data.iloc[:,i].astype('float64')
            
    # create new variable 'collect_period'
    meta_data['collect_period'] = 1
    meta_data.loc[(meta_data['collect_wk']>=8)  & (meta_data['collect_wk']<=14),'collect_period'] = 2
    meta_data.loc[(meta_data['collect_wk']>=15) & (meta_data['collect_wk']<=21),'collect_period'] = 3
    meta_data.loc[(meta_data['collect_wk']>=22) & (meta_data['collect_wk']<=28),'collect_period'] = 4
    meta_data.loc[(meta_data['collect_wk']>=29) & (meta_data['collect_wk']<=32),'collect_period'] = 5
    meta_data.loc[(meta_data['collect_wk']>=33), 'collect_period']                                = 6
    
    # print(meta_data['collect_period'].value_counts())
    
    # create task class label
    if task == "was_preterm":
        meta_data[task] = 1*(meta_data['delivery_wk'] < 37)
    elif task == "was_early_preterm":
        meta_data[task] = 1*(meta_data['delivery_wk'] < 32)
        
    # Filtered out observations with "collect_wk<=32" OR "collect_period<=5" 
    # Filtered out observations with "collect_wk<=28" OR "collect_period<=4" 
    meta_data = meta_data[meta_data['collect_period']<=finalperiod]
    # Average within each collection period
    meta_data = meta_data.groupby(['participant_id', 'collect_period'], as_index = False).mean()

    return meta_data



In [2]:
def InputLoader(data_dir, meta_data, trainID, validID, testID, myprop, myseed, finalperiod):
    
    participant_id = meta_data['participant_id']
    collect_period = meta_data['collect_period']
   
    Input_data = pd.DataFrame(pd.read_csv(data_dir, delimiter=','))
    Input_data = pd.concat([participant_id, collect_period, Input_data], axis=1)
    
    #---- Filter 1 on columns ----#
    # columns/taxons that were observed in fewer than 10 samples
    Input_reads = Input_data.iloc[:,3:Input_data.shape[1]]
    # pt = np.where((1*(Input_reads != 0)).sum(axis = 0)  > 10)[0]+3
    pt = np.where((1*(Input_reads != 0)).sum(axis = 0)  > Input_data.shape[0]*0.01)[0]+3
    pt = np.concatenate(([0, 1, 2], pt), axis=None)
    Input_data = Input_data.iloc[:,pt]
    
    #---- Filter 2 on rows    ----#
    # Filtered out observations with "collect_wk<=32" OR "collect_period<=6" 
    # Filtered out observations with "collect_wk<=28" OR "collect_period<=5" 
    Input_data = Input_data[Input_data['collect_period']<=finalperiod]
    
    # Average within each collection period
    Input_data = Input_data.groupby(['participant_id', 'collect_period'], as_index = False).mean()
    
    Input_data_train = Input_data[trainID]
    Input_data_valid = Input_data[validID]
    Input_data_test  = Input_data[testID]
    
    print("## Input: train/valid/test (before reshape)")
    print(Input_data_train.shape)
    print(Input_data_valid.shape)
    print(Input_data_test.shape)
    
    #---- Input features reshaper ----#
    mytrain_input = Data_Reshaper_Input(data=Input_data_train, seq_length=finalperiod)
    myvalid_input = Data_Reshaper_Input(data=Input_data_valid, seq_length=finalperiod)
    mytest_input  = Data_Reshaper_Input(data=Input_data_test, seq_length=finalperiod)
    
    print("## Input: train/valid/test (after reshape)")
    print(mytrain_input.shape)
    print(myvalid_input.shape)
    print(mytest_input.shape)
    
    return mytrain_input, myvalid_input, mytest_input

In [3]:
def OutputLoader(meta_data, trainID, validID, testID, task, finalperiod):
    
    meta_data_train = meta_data[trainID]
    meta_data_valid = meta_data[validID]
    meta_data_test  = meta_data[testID]
    
    print("################ Output: train/valid/test (before reshape)")
    print(meta_data_train.shape)
    print(meta_data_valid.shape)
    print(meta_data_test.shape)
    
    #---- Output label reshaper ----#
    mytrain_output = Data_Reshaper_Output_ManytoMany_0(data=meta_data_train, seq_length=finalperiod, classlabel=task)
    myvalid_output = Data_Reshaper_Output_ManytoMany_0(data=meta_data_valid, seq_length=finalperiod, classlabel=task)
    mytest_output = Data_Reshaper_Output_ManytoMany_0(data=meta_data_test, seq_length=finalperiod, classlabel=task)
    
    print("################ Output: train/valid/test (after reshape)")
    print(mytrain_output.shape)
    print(myvalid_output.shape)
    print(mytest_output.shape)
    
    return mytrain_output, myvalid_output, mytest_output

In [4]:
def InputLoaderMtd(meta_data, trainID, validID, testID, task, finalperiod):
    
    meta_data_train = meta_data[trainID]
    meta_data_valid = meta_data[validID]
    meta_data_test  = meta_data[testID]
    
    #---- Input features reshaper ----#
    mytrain_input_mtd = meta_data_train.drop(['project', 'delivery_wk', task], axis=1)
    myvalid_input_mtd = meta_data_valid.drop(['project', 'delivery_wk', task], axis=1)
    mytest_input_mtd  = meta_data_test.drop(['project', 'delivery_wk', task], axis=1)
    
    # scale the input features in this data set
    columns = ['collect_wk', 'age', 'race', 'shannon', 'inv_simpson', 'rooted_pd', 'CST']
    for col in columns:
        mytrain_input_mtd[col] = MinMaxScaler().fit_transform(np.array(mytrain_input_mtd[col]).reshape(-1,1))
        myvalid_input_mtd[col] = MinMaxScaler().fit_transform(np.array(myvalid_input_mtd[col]).reshape(-1,1))
        mytest_input_mtd[col]  = MinMaxScaler().fit_transform(np.array(mytest_input_mtd[col]).reshape(-1,1))
    
    print("## Input: train/valid/test (before reshape)")
    print(mytrain_input_mtd.shape)
    print(myvalid_input_mtd.shape)
    print(mytest_input_mtd.shape)
    
    mytrain_input_mtd = Data_Reshaper_Input(data=mytrain_input_mtd, seq_length=finalperiod)
    myvalid_input_mtd = Data_Reshaper_Input(data=myvalid_input_mtd, seq_length=finalperiod)
    mytest_input_mtd  = Data_Reshaper_Input(data=mytest_input_mtd,  seq_length=finalperiod) 
    
    print("## Input: train/valid/test (after reshape)")
    print(mytrain_input_mtd.shape)
    print(myvalid_input_mtd.shape)
    print(mytest_input_mtd.shape)
    
    return mytrain_input_mtd, myvalid_input_mtd, mytest_input_mtd

In [5]:
def LSTMtrain(model, device, criterion, optimizer, mytrain_input, mytrain_output, myvalid_input, myvalid_output, max_epochs, batch_size, finalperiod, patience, earlystop='loss', verbose=True):
    
    # training and validation set class proportion
    trainprior = sum(mytrain_output[:,finalperiod-1,0])/mytrain_output.shape[0]
    class1ID_train = mytrain_output[:,finalperiod-1,0] == 1
    class2ID_train = mytrain_output[:,finalperiod-1,0] == 0
    
    validprior = sum(myvalid_output[:,finalperiod-1,0])/myvalid_output.shape[0]
    class1ID_valid = myvalid_output[:,finalperiod-1,0] == 1
    class2ID_valid = myvalid_output[:,finalperiod-1,0] == 0
    model = model.to(device)
    
    # Track the value of the loss function and model accuracy across epochs
    history_train_valid = {'TrainLoss': [], 'TrainAcc': [], 'TrainAUC': [],
                           'ValidLoss': [], 'ValidAcc': [], 'ValidAUC': []}
    
    # Same reshaped Validation set for each epoch    
    myvalid_input  = torch.from_numpy(myvalid_input).float().to(device)
    myvalid_output = torch.from_numpy(myvalid_output).float().to(device)
        
    valid_loss_min = np.inf
    valid_losses = []
    
    valid_auc_max = np.NINF
    valid_auces = []
    
    last_valid_loss = 100
    last_valid_auc  = 100
    
    trigger_times = 0
    
    for epoch in range(max_epochs):
        
        #----  shuffle the training set to avoid the batch(project) effects ----#
        shuffleindex = list(range(mytrain_output.shape[0]))
        random.shuffle(shuffleindex)
        mytrain_output = mytrain_output[shuffleindex]
        mytrain_input = mytrain_input[shuffleindex]
        
        #-------------- Batch-wise training model --------------#
        model.train()
        # train_loss = 0.0
        train_num_correct = 0
        train_prob = []
        for batch_idx in range(0, mytrain_input.shape[0], batch_size):
            
            # subset a batch of sequences and class labels
            tmpindex = list(range(batch_idx, min(batch_idx+batch_size, mytrain_input.shape[0])))
            mytrain_input_batch  = mytrain_input[tmpindex,:]
            mytrain_output_batch = mytrain_output[tmpindex,:]
            
            batchprior = sum(mytrain_output_batch[:,finalperiod-1,0])/mytrain_output_batch.shape[0]
            class1ID_batch = mytrain_output_batch[:,finalperiod-1,0] == 1
            class2ID_batch = mytrain_output_batch[:,finalperiod-1,0] == 0
            
            mytrain_input_batch  = torch.from_numpy(mytrain_input_batch).float().to(device)
            mytrain_output_batch = torch.from_numpy(mytrain_output_batch).float().to(device)
            
            # forward pass of RNN model
            output, hidden = model(mytrain_input_batch, device)
            output = output.reshape((mytrain_output_batch.shape))
            output_prob = nn.functional.softmax(output, dim=2)
            # weighted MSE
            loss = batchprior*criterion(output_prob[class1ID_batch,:,0], mytrain_output_batch[class1ID_batch,:,0]) + (1-batchprior)*criterion(output_prob[class2ID_batch,:,1], mytrain_output_batch[class2ID_batch,:,1])
            # loss = trainprior*criterion(output_prob[class1ID_batch,:,0], mytrain_output_batch[class1ID_batch,:,0]) + (1-trainprior)*criterion(output_prob[class2ID_batch,:,1], mytrain_output_batch[class2ID_batch,:,1])
            # loss = criterion(output_prob, mytrain_output_batch)
            # Clear existing gradients from previous epoch
            optimizer.zero_grad()
            # Does backpropagation and calculates gradients
            loss.backward()
            # Updates the weights accordingly
            optimizer.step()
            # Number correct prediction on trainning set collection
            tmppred = 1*(output_prob[:,finalperiod-1,0] > 0.5)
            train_num_correct += sum(1*(tmppred == mytrain_output_batch[:,finalperiod-1,0]))
            # Training function loss collection
            # train_loss += loss.item()
            train_prob = np.concatenate((train_prob, output_prob[:,finalperiod-1,0].cpu().detach().numpy()), axis=None)
            
        train_acc = (float(train_num_correct) / len(mytrain_output))*100
        train_auc = metrics.roc_auc_score(mytrain_output[:,finalperiod-1,0], train_prob)
        
        model.eval()
        
        # Training loss calculation
        tmpmytrain_input  = torch.from_numpy(mytrain_input).float().to(device)
        tmpmytrain_output = torch.from_numpy(mytrain_output).float().to(device)
        tmpoutputtrain, tmphidden = model(tmpmytrain_input, device)
        tmpoutputtrain = tmpoutputtrain.reshape((tmpmytrain_output.shape))
        tmpoutputtrain_prob = nn.functional.softmax(tmpoutputtrain, dim=2)
        # train_loss = criterion(tmpoutputtrain_prob, tmpmytrain_output)
        train_loss = trainprior*criterion(tmpoutputtrain_prob[class1ID_train,:,0], tmpmytrain_output[class1ID_train,:,0]) + (1-trainprior)*criterion(tmpoutputtrain_prob[class2ID_train,:,1], tmpmytrain_output[class2ID_train,:,1])
        history_train_valid['TrainLoss'].append(train_loss.item())
        history_train_valid['TrainAcc'].append(train_acc)
        history_train_valid['TrainAUC'].append(train_auc)
        

        #--------------       Validate model      --------------#
        outputvalid, hidden = model(myvalid_input, device)
        outputvalid = outputvalid.reshape((myvalid_output.shape))
        outputvalid_prob = nn.functional.softmax(outputvalid, dim=2)
        # validation loss
        # valid_loss = criterion(outputvalid_prob, myvalid_output)
        valid_loss = validprior*criterion(outputvalid_prob[class1ID_valid,:,0], myvalid_output[class1ID_valid,:,0]) + (1-validprior)*criterion(outputvalid_prob[class2ID_valid,:,1], myvalid_output[class2ID_valid,:,1])
        # Number correct prediction on trainning set collection
        tmppredprob = outputvalid_prob[:,finalperiod-1,0].cpu().detach().numpy()
        tmppred = 1*(tmppredprob > 0.5)
        tmpobs = myvalid_output[:,finalperiod-1,0].cpu().detach().numpy()
        valid_num_correct = sum(1*(tmppred == tmpobs))
        valid_acc = (float(valid_num_correct) / len(myvalid_output))*100
        valid_auc = metrics.roc_auc_score(tmpobs, tmppredprob)
        
        history_train_valid['ValidLoss'].append(valid_loss.item())
        history_train_valid['ValidAcc'].append(valid_acc)
        history_train_valid['ValidAUC'].append(valid_auc)
        
        if verbose or epoch + 1 == max_epochs:
            print(f'[E {epoch + 1}/{max_epochs}]'
                  f" T.Loss: {history_train_valid['TrainLoss'][-1]:.4f}, T.Acc: {history_train_valid['TrainAcc'][-1]:2.2f}, T.AUC: {history_train_valid['TrainAUC'][-1]:.4f}"
                  f" V.Loss: {history_train_valid['ValidLoss'][-1]:.4f}, V.Acc: {history_train_valid['ValidAcc'][-1]:2.2f}, V.AUC: {history_train_valid['ValidAUC'][-1]:.4f};")
        
        valid_auces.append(valid_auc.item())
        valid_losses.append(valid_loss.item())
        
        if earlystop == "auc":
            current_valid_auc = valid_auc
            if current_valid_auc < last_valid_auc:
                trigger_times += 1
                print('AUC Trigger Times:', trigger_times)
                if trigger_times >= patience:
                    print('Early stopping by AUC!.')
                    break
            else:
                print('trigger times: 0')
                trigger_times = 0
            last_valid_auc = np.mean(valid_auces[-10:])
            # last_valid_auc = current_valid_auc
        elif earlystop == "loss":
            current_valid_loss = valid_loss
            if current_valid_loss > last_valid_loss:
                trigger_times += 1
                print('Loss Trigger Times:', trigger_times)
                if trigger_times >= patience:
                    print('Early stopping by LOSS!.')
                    break
            else:
                print('Trigger times >= patience: 0')
                trigger_times = 0
            last_valid_loss = np.mean(valid_losses[-10:])
            # last_valid_loss = current_valid_loss
        
        
        
        # if earlystop == "auc":
        #     # start to considering early-stop after 20 epoch
        #     if epoch > 20:
        #        if np.mean(valid_auces) < valid_auc_max:
        #            print("Stopped here by AUC!")
        #            break
        #        valid_auc_max = np.mean(valid_auces)
        # elif earlystop == "loss":
        #    # start to considering early-stop after 20 epoch
        #    if epoch > 20:
        #        if np.mean(valid_losses) > valid_loss_min:
        #            print("Stopped here by LOSS!")
        #            break
        #        # valid_loss_min = np.mean(valid_losses[-20:])
        #        valid_loss_min = np.mean(valid_losses)
        
    return history_train_valid

In [6]:
class Model_Mtd(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, seq_len, n_layers, fc_size, dropoutrate):
        super(Model_Mtd, self).__init__()

        # Defining some parameters
        self.input_size  = input_size      # number of input node
        self.output_size = output_size     # number of output node
        self.seq_len     = seq_len         # seq_len: number of timepoints (collection period)
        self.fc_size     = fc_size         # size of the fully connected net
        self.n_layers    = n_layers        # number of LSTM/RNN layers
        self.hidden_dim  = hidden_dim      # hidden size of LSTM/RNN, also the size of fully connected NN 1
        
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc_1 = nn.Linear(in_features=hidden_dim*seq_len, out_features=fc_size[0], bias=False)
        self.fc_2 = nn.Linear(in_features=fc_size[0], out_features=output_size, bias=False)

        # define dropout proportion to prevent overfitting
        self.dropout = nn.Dropout(dropoutrate)
        self.tanh = nn.Tanh()
        
    def forward(self, x, device):
        
        # Initializing hidden state for first input using method defined below
        batch_size = x.size(0)
        h0 = self.init_hidden(batch_size, device)
        #------------ RNN  ------------#
        # outp, hidden = self.rnn(x, h0)
        #------------ LSTM ------------#
        # c0 = self.init_hidden(batch_size, device)
        # outp, hidden = self.lstm(x, (h0, c0))
        #------------ GRU  ------------#
        outp, hidden = self.gru(x, h0)
            
        outp = outp.reshape(outp.shape[0], -1)  # reshaping the data for Dense layer next

        outp = self.fc_1(outp)
        outp = self.tanh(outp)   # relu
        outp = self.dropout(outp)# dropout
        outp = self.fc_2(outp)
        
        return outp, hidden
    
    def init_hidden(self, batch_size, device):
        # This method generates the first hidden state of zeros which we'll use in the forward pass
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device)
        # We'll send the tensor holding the hidden state to the device we specified earlier as well
        return hidden

In [7]:
def FirstStage_Mtd(mytrain_input_mtd, mytrain_output, myvalid_input_mtd, myvalid_output, mytest_input_mtd, mytest_output, finalperiod):
    
    # 7 -> lstm -> 16 -> 8
    
    #---- Hyper-parameter set-up ----#
    input_size  = mytrain_input_mtd.shape[2]
    output_size = mytrain_output.shape[2]*finalperiod
    seq_len     = finalperiod
    hidden_dim  = 8
    fc_size     = [16]
    n_layers    = 1
    
    dropoutrate = 0.5
    lr          = 0.001
    max_epochs  = 2000
    batch_size  = 50
    
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    
    model_Mtd = Model_Mtd(input_size=input_size, output_size=output_size, hidden_dim=hidden_dim, seq_len=seq_len, 
                          n_layers=n_layers, fc_size=fc_size, dropoutrate=dropoutrate)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model_Mtd.parameters(), lr=lr) 
    
    print("################ Mtd LSTM training...")
    Mtd_hist = LSTMtrain(model_Mtd, device, criterion, optimizer, mytrain_input_mtd, mytrain_output, 
                         myvalid_input_mtd, myvalid_output, max_epochs, batch_size, finalperiod, patience=4, earlystop="loss", verbose=True)
    
    #---- testing set evaluation ----#
    Mtd_obs, Mtd_pred, Mtd_prob = evaluate(model_Mtd, device, mytest_input_mtd, mytest_output, finalperiod, cutoff=0.5)
    Mtdtest_auc = metrics.roc_auc_score(Mtd_obs, Mtd_prob[:,0])
    Mtdtest_acc = metrics.accuracy_score(Mtd_obs, Mtd_pred)
    Mtdtest_conf = metrics.confusion_matrix(Mtd_obs, Mtd_pred)

    return model_Mtd, Mtd_hist, Mtd_obs, Mtd_pred, Mtd_prob, Mtdtest_auc, Mtdtest_acc, Mtdtest_conf

In [8]:
class Model_pty(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, seq_len, n_layers, fc_size, dropoutrate):
        super(Model_pty, self).__init__()

        # Defining some parameters
        self.input_size  = input_size      # number of input node
        self.output_size = output_size     # number of output node
        self.seq_len     = seq_len         # seq_len: number of timepoints (collection period)
        self.fc_size     = fc_size         # size of the fully connected net
        self.n_layers    = n_layers        # number of LSTM/RNN layers
        self.hidden_dim  = hidden_dim      # hidden size of LSTM/RNN, also the size of fully connected NN 1
        
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc_1 = nn.Linear(in_features=hidden_dim*seq_len, out_features=fc_size[0], bias=False)
        self.fc_2 = nn.Linear(in_features=fc_size[0], out_features=fc_size[1], bias=False)
        self.fc_3 = nn.Linear(in_features=fc_size[1], out_features=output_size, bias=False)
        # self.fc_4 = nn.Linear(in_features=fc_size[2], out_features=output_size, bias=False)
        # self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        # define dropout proportion to prevent overfitting
        self.dropout = nn.Dropout(dropoutrate)

    
    def forward(self, x, device):
        
        # Initializing hidden state for first input using method defined below
        batch_size = x.size(0)
        h0 = self.init_hidden(batch_size, device)
        
        #------------ RNN  ------------#
        # outp, hidden = self.rnn(x, h0)
        #------------ LSTM ------------#
        # c0 = self.init_hidden(batch_size, device)
        # outp, hidden = self.lstm(x, (h0, c0))
        #------------ GRU  ------------#
        outp, hidden = self.gru(x, h0)
        
        outp = outp.reshape(outp.shape[0], -1)  # reshaping the data for Dense layer next
        
        outp = self.tanh(outp)   # relu
        outp = self.dropout(outp)# dropout
        outp = self.fc_1(outp)   # first Dense
        outp = self.tanh(outp)   # relu
        outp = self.dropout(outp)# dropout
        outp = self.fc_2(outp)   # 2nd Dense
        outp = self.tanh(outp)   # relu
        outp = self.dropout(outp)# dropout
        outp = self.fc_3(outp)   # 3rd Output
        
        return outp, hidden
    
    def init_hidden(self, batch_size, device):
        # This method generates the first hidden state of zeros which we'll use in the forward pass
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device)
        # We'll send the tensor holding the hidden state to the device we specified earlier as well
        return hidden


In [9]:
def FirstStage_pty(mytrain_input_pty, mytrain_output, myvalid_input_pty, myvalid_output, mytest_input_pty, mytest_output, finalperiod):
   
    #---- Hyper-parameter set-up ----#
    input_size  = mytrain_input_pty.shape[2]
    output_size = mytrain_output.shape[2]*finalperiod
    seq_len     = finalperiod
    hidden_dim  = 128
    n_layers    = 1
    fc_size     = [128, 64]
    
    dropoutrate = 0.1
    lr          = 0.0001
    max_epochs  = 2000
    batch_size  = 200
    
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    
    model_pty = Model_pty(input_size=input_size, output_size=output_size, hidden_dim=hidden_dim, seq_len=seq_len, 
                          n_layers=n_layers, fc_size=fc_size, dropoutrate=dropoutrate)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model_pty.parameters(), lr=lr) 
    
    #---- training lstm ----#
    print("################ pty LSTM training...")
    pty_hist = LSTMtrain(model_pty, device, criterion, optimizer, mytrain_input_pty, mytrain_output, 
                         myvalid_input_pty, myvalid_output, max_epochs, batch_size, finalperiod, patience=4, earlystop="loss", verbose=True)
    
    #---- testing set evaluation ----#
    pty_obs, pty_pred, pty_prob = evaluate(model_pty, device, mytest_input_pty, mytest_output, finalperiod, cutoff=0.5)
    ptytest_auc = metrics.roc_auc_score(pty_obs, pty_prob[:,0])
    ptytest_acc = metrics.accuracy_score(pty_obs, pty_pred)
    ptytest_conf = metrics.confusion_matrix(pty_obs, pty_pred)

    return model_pty, pty_hist, pty_obs, pty_pred, pty_prob, ptytest_auc, ptytest_acc, ptytest_conf

In [10]:
class Model_txy(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, seq_len, n_layers, fc_size, dropoutrate):
        super(Model_txy, self).__init__()

        # Defining some parameters
        self.input_size  = input_size      # number of input node
        self.output_size = output_size     # number of output node
        self.seq_len     = seq_len         # seq_len: number of timepoints (collection period)
        self.fc_size     = fc_size         # size of the fully connected net
        self.n_layers    = n_layers        # number of LSTM/RNN layers
        self.hidden_dim  = hidden_dim      # hidden size of LSTM/RNN, also the size of fully connected NN 1
        
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_dim, num_layers=n_layers, batch_first=True)
        self.fc_1 = nn.Linear(in_features=hidden_dim*seq_len, out_features=fc_size[0], bias=False)
        self.fc_2 = nn.Linear(in_features=fc_size[0], out_features=fc_size[1], bias=False)
        self.fc_3 = nn.Linear(in_features=fc_size[1], out_features=fc_size[2], bias=False)
        self.fc_4 = nn.Linear(in_features=fc_size[2], out_features=output_size, bias=False)
        # self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        # define dropout proportion to prevent overfitting
        self.dropout = nn.Dropout(dropoutrate)

    
    def forward(self, x, device):
        
        # Initializing hidden state for first input using method defined below
        batch_size = x.size(0)
        h0 = self.init_hidden(batch_size, device)
        #------------ RNN  ------------#
        # outp, hidden = self.rnn(x, h0)
        #------------ LSTM ------------#
        # c0 = self.init_hidden(batch_size, device)
        # outp, hidden = self.lstm(x, (h0, c0))
        #------------ GRU  ------------#
        outp, hidden = self.gru(x, h0)
        
        outp = outp.reshape(outp.shape[0], -1)  # reshaping the data for Dense layer next
        
        outp = self.tanh(outp)   # relu
        outp = self.dropout(outp)# dropout
        outp = self.fc_1(outp)   # first Dense
        outp = self.tanh(outp)   # relu
        outp = self.dropout(outp)# dropout
        outp = self.fc_2(outp)   # 2nd Dense
        outp = self.tanh(outp)   # relu
        outp = self.dropout(outp)# dropout
        outp = self.fc_3(outp)   # 3rd Output
        outp = self.tanh(outp)   # relu
        outp = self.dropout(outp)# dropout
        outp = self.fc_4(outp)   # 4th Ouuput
        outp = self.tanh(outp)   # relu
        
        return outp, hidden
    
    def init_hidden(self, batch_size, device):
        # This method generates the first hidden state of zeros which we'll use in the forward pass
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device)
        # We'll send the tensor holding the hidden state to the device we specified earlier as well
        return hidden

In [11]:
def FirstStage_txy(mytrain_input_pty, mytrain_output, myvalid_input_pty, myvalid_output, mytest_input_pty, mytest_output, finalperiod):
    
    #---- Hyper-parameter set-up ----#
    input_size  = mytrain_input_txy.shape[2]
    output_size = mytrain_output.shape[2]*finalperiod
    seq_len     = finalperiod
    hidden_dim  = 256
    n_layers    = 1
    fc_size     = [256, 128, 64]
    
    dropoutrate = 0.1
    lr          = 0.0001
    max_epochs  = 2000
    batch_size  = 200
    
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    
    model_txy = Model_txy(input_size=input_size, output_size=output_size, hidden_dim=hidden_dim, 
                          seq_len=seq_len, n_layers=n_layers, fc_size=fc_size, dropoutrate=dropoutrate)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model_txy.parameters(), lr=lr) 
    
    print("################ txy LSTM training...")
    txy_hist = LSTMtrain(model_txy, device, criterion, optimizer, mytrain_input_txy, mytrain_output, 
                         myvalid_input_txy, myvalid_output, max_epochs, batch_size, finalperiod, patience=4, earlystop="loss", verbose=True)
    
    #---- testing set evaluation ----#
    txy_obs, txy_pred, txy_prob = evaluate(model_txy, device, mytest_input_txy, mytest_output, finalperiod, cutoff=0.5)
    txytest_auc = metrics.roc_auc_score(txy_obs, txy_prob[:,0])
    txytest_acc = metrics.accuracy_score(txy_obs, txy_pred)
    txytest_conf = metrics.confusion_matrix(txy_obs, txy_pred)

    return model_txy, txy_hist, txy_obs, txy_pred, txy_prob, txytest_auc, txytest_acc, txytest_conf

# Main script start from here

In [12]:
# data directory
# meta_dir      = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/metadata/metadata.csv'
meta_dir      = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/metadata_imputed1.csv'
alpha_dir     = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/alpha_diversity/alpha_diversity.csv'
cst_dir       = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/community_state_types/cst_valencia.csv'

txy_dir_fam = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/taxonomy/taxonomy_relabd.family.csv'
txy_dir_gen = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/taxonomy/taxonomy_relabd.genus.csv'
txy_dir_spe = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/taxonomy/taxonomy_relabd.species.csv'

pty_dir_1dot = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/phylotypes/phylotype_relabd.1e0.csv'
pty_dir_dot5 = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/phylotypes/phylotype_relabd.5e_1.csv'
pty_dir_dot1 = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/phylotypes/phylotype_relabd.1e_1.csv'

# krdwide_dir   = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/pairwise_distance/krd_distance_wide.csv'


txy_dir = txy_dir_gen
pty_dir = pty_dir_1dot

task = "was_preterm"
finalperiod = 5
# task = "was_early_preterm"
# finalperiod = 4

myprop = [0.6, 0.3, 0.1]
myseed = 0


#-------------------------------------------#
#---- Data Preparation                  ----#
#-------------------------------------------#

meta_data = metadata_loader(meta_dir, alpha_dir, cst_dir, task, finalperiod)

#---- data set splitter ----#
trainID, validID, testID = dataset_splitID(meta_data=meta_data, prop=myprop, myseed=myseed)

#---- output loader ----#
mytrain_output, myvalid_output, mytest_output = OutputLoader(meta_data, trainID, validID, testID, task, finalperiod)


################ Output: train/valid/test (before reshape)
(998, 12)
(488, 12)
(162, 12)
################ Output: train/valid/test (after reshape)
(729, 5, 2)
(363, 5, 2)
(120, 5, 2)


In [13]:
print("################ meta:")
mytrain_input_mtd, myvalid_input_mtd, mytest_input_mtd = InputLoaderMtd(meta_data, trainID, validID, testID, task, finalperiod)

################ meta:
## Input: train/valid/test (before reshape)
(998, 9)
(488, 9)
(162, 9)
## Input: train/valid/test (after reshape)
(729, 5, 7)
(363, 5, 7)
(120, 5, 7)


In [14]:
print("################ pty:")
mytrain_input_pty, myvalid_input_pty, mytest_input_pty = InputLoader(pty_dir, meta_data, trainID, validID, testID, myprop, myseed, finalperiod)

################ pty:
## Input: train/valid/test (before reshape)
(998, 165)
(488, 165)
(162, 165)
## Input: train/valid/test (after reshape)
(729, 5, 163)
(363, 5, 163)
(120, 5, 163)


In [15]:
print("################ txy:")
mytrain_input_txy, myvalid_input_txy, mytest_input_txy = InputLoader(txy_dir, meta_data, trainID, validID, testID, myprop, myseed, finalperiod)

################ txy:
## Input: train/valid/test (before reshape)
(998, 250)
(488, 250)
(162, 250)
## Input: train/valid/test (after reshape)
(729, 5, 248)
(363, 5, 248)
(120, 5, 248)


In [16]:
#-------------------------------------------#
#---- First stage: Metadata             ----#
#-------------------------------------------#

model_Mtd, Mtd_hist, Mtdtest_obs, Mtdtest_pred, Mtdtest_prob, Mtdtest_auc, Mtdtest_acc, Mtdtest_conf = FirstStage_Mtd(mytrain_input_mtd, mytrain_output, myvalid_input_mtd, myvalid_output, mytest_input_mtd, mytest_output, finalperiod)
print(Mtdtest_acc)
print(Mtdtest_auc)
print(Mtdtest_conf)

################ Mtd LSTM training...
[E 1/2000] T.Loss: 0.0852, T.Acc: 66.39, T.AUC: 0.5498 V.Loss: 0.0869, V.Acc: 65.84, V.AUC: 0.6538;
Trigger times >= patience: 0
[E 2/2000] T.Loss: 0.0811, T.Acc: 69.14, T.AUC: 0.5085 V.Loss: 0.0842, V.Acc: 65.84, V.AUC: 0.6588;
Trigger times >= patience: 0
[E 3/2000] T.Loss: 0.0798, T.Acc: 69.00, T.AUC: 0.4840 V.Loss: 0.0836, V.Acc: 65.84, V.AUC: 0.6693;
Trigger times >= patience: 0
[E 4/2000] T.Loss: 0.0794, T.Acc: 69.14, T.AUC: 0.5274 V.Loss: 0.0834, V.Acc: 65.84, V.AUC: 0.6755;
Trigger times >= patience: 0
[E 5/2000] T.Loss: 0.0793, T.Acc: 69.14, T.AUC: 0.5439 V.Loss: 0.0832, V.Acc: 65.84, V.AUC: 0.6787;
Trigger times >= patience: 0
[E 6/2000] T.Loss: 0.0790, T.Acc: 69.27, T.AUC: 0.5413 V.Loss: 0.0831, V.Acc: 65.84, V.AUC: 0.6797;
Trigger times >= patience: 0
[E 7/2000] T.Loss: 0.0789, T.Acc: 69.27, T.AUC: 0.5445 V.Loss: 0.0826, V.Acc: 65.84, V.AUC: 0.6791;
Trigger times >= patience: 0
[E 8/2000] T.Loss: 0.0786, T.Acc: 69.27, T.AUC: 0.5546 V.Lo

In [17]:
#-------------------------------------------#
#---- First stage: phylotype data       ----#
#-------------------------------------------#

model_pty, pty_hist, ptytest_obs, ptytest_pred, ptytest_prob, ptytest_auc, ptytest_acc, ptytest_conf = FirstStage_pty(mytrain_input_pty, mytrain_output, myvalid_input_pty, myvalid_output, mytest_input_pty, mytest_output, finalperiod)
print(ptytest_acc)
print(ptytest_auc)
print(ptytest_conf)

################ pty LSTM training...
[E 1/2000] T.Loss: 0.0930, T.Acc: 62.41, T.AUC: 0.4917 V.Loss: 0.0932, V.Acc: 65.84, V.AUC: 0.4968;
Trigger times >= patience: 0
[E 2/2000] T.Loss: 0.0918, T.Acc: 69.27, T.AUC: 0.5252 V.Loss: 0.0921, V.Acc: 65.84, V.AUC: 0.5158;
Trigger times >= patience: 0
[E 3/2000] T.Loss: 0.0905, T.Acc: 69.27, T.AUC: 0.5229 V.Loss: 0.0911, V.Acc: 65.84, V.AUC: 0.5263;
Trigger times >= patience: 0
[E 4/2000] T.Loss: 0.0892, T.Acc: 69.27, T.AUC: 0.5278 V.Loss: 0.0901, V.Acc: 65.84, V.AUC: 0.5295;
Trigger times >= patience: 0
[E 5/2000] T.Loss: 0.0878, T.Acc: 69.27, T.AUC: 0.5271 V.Loss: 0.0890, V.Acc: 65.84, V.AUC: 0.5379;
Trigger times >= patience: 0
[E 6/2000] T.Loss: 0.0863, T.Acc: 69.27, T.AUC: 0.5155 V.Loss: 0.0879, V.Acc: 65.84, V.AUC: 0.5449;
Trigger times >= patience: 0
[E 7/2000] T.Loss: 0.0848, T.Acc: 69.27, T.AUC: 0.4631 V.Loss: 0.0868, V.Acc: 65.84, V.AUC: 0.5507;
Trigger times >= patience: 0
[E 8/2000] T.Loss: 0.0834, T.Acc: 69.27, T.AUC: 0.5273 V.Lo

[E 64/2000] T.Loss: 0.0778, T.Acc: 69.27, T.AUC: 0.6398 V.Loss: 0.0829, V.Acc: 65.84, V.AUC: 0.6767;
Trigger times >= patience: 0
[E 65/2000] T.Loss: 0.0777, T.Acc: 69.27, T.AUC: 0.6260 V.Loss: 0.0829, V.Acc: 65.84, V.AUC: 0.6769;
Trigger times >= patience: 0
[E 66/2000] T.Loss: 0.0776, T.Acc: 69.27, T.AUC: 0.6438 V.Loss: 0.0828, V.Acc: 65.84, V.AUC: 0.6770;
Trigger times >= patience: 0
[E 67/2000] T.Loss: 0.0776, T.Acc: 69.27, T.AUC: 0.6746 V.Loss: 0.0828, V.Acc: 65.84, V.AUC: 0.6770;
Trigger times >= patience: 0
[E 68/2000] T.Loss: 0.0775, T.Acc: 69.27, T.AUC: 0.6557 V.Loss: 0.0827, V.Acc: 65.84, V.AUC: 0.6766;
Trigger times >= patience: 0
[E 69/2000] T.Loss: 0.0774, T.Acc: 69.27, T.AUC: 0.6668 V.Loss: 0.0827, V.Acc: 65.84, V.AUC: 0.6761;
Trigger times >= patience: 0
[E 70/2000] T.Loss: 0.0773, T.Acc: 69.27, T.AUC: 0.6481 V.Loss: 0.0826, V.Acc: 65.84, V.AUC: 0.6763;
Trigger times >= patience: 0
[E 71/2000] T.Loss: 0.0772, T.Acc: 69.27, T.AUC: 0.6593 V.Loss: 0.0824, V.Acc: 65.84, V.AU

[E 128/2000] T.Loss: 0.0689, T.Acc: 72.29, T.AUC: 0.7197 V.Loss: 0.0778, V.Acc: 67.77, V.AUC: 0.6769;
Trigger times >= patience: 0
[E 129/2000] T.Loss: 0.0688, T.Acc: 72.02, T.AUC: 0.7296 V.Loss: 0.0779, V.Acc: 68.04, V.AUC: 0.6768;
Trigger times >= patience: 0
[E 130/2000] T.Loss: 0.0687, T.Acc: 72.57, T.AUC: 0.7210 V.Loss: 0.0785, V.Acc: 68.04, V.AUC: 0.6757;
Loss Trigger Times: 1
[E 131/2000] T.Loss: 0.0685, T.Acc: 72.43, T.AUC: 0.7255 V.Loss: 0.0783, V.Acc: 68.32, V.AUC: 0.6758;
Loss Trigger Times: 2
[E 132/2000] T.Loss: 0.0684, T.Acc: 72.57, T.AUC: 0.7260 V.Loss: 0.0780, V.Acc: 68.04, V.AUC: 0.6751;
Trigger times >= patience: 0
[E 133/2000] T.Loss: 0.0684, T.Acc: 72.29, T.AUC: 0.7176 V.Loss: 0.0777, V.Acc: 67.49, V.AUC: 0.6749;
Trigger times >= patience: 0
[E 134/2000] T.Loss: 0.0682, T.Acc: 73.39, T.AUC: 0.7263 V.Loss: 0.0784, V.Acc: 68.60, V.AUC: 0.6743;
Loss Trigger Times: 1
[E 135/2000] T.Loss: 0.0682, T.Acc: 71.74, T.AUC: 0.7234 V.Loss: 0.0786, V.Acc: 68.04, V.AUC: 0.6746;
Lo

In [18]:
#-------------------------------------------#
#---- First stage: taxonomy data        ----#
#-------------------------------------------#

model_txy, txy_hist, txytest_obs, txytest_pred, txytest_prob, txytest_auc, txytest_acc, txytest_conf = FirstStage_txy(mytrain_input_txy, mytrain_output, myvalid_input_txy, myvalid_output, mytest_input_txy, mytest_output, finalperiod)
print(txytest_acc)
print(txytest_auc)
print(txytest_conf)

################ txy LSTM training...
[E 1/2000] T.Loss: 0.0925, T.Acc: 62.69, T.AUC: 0.4933 V.Loss: 0.0928, V.Acc: 65.84, V.AUC: 0.5102;
Trigger times >= patience: 0
[E 2/2000] T.Loss: 0.0912, T.Acc: 69.27, T.AUC: 0.5094 V.Loss: 0.0917, V.Acc: 65.84, V.AUC: 0.5203;
Trigger times >= patience: 0
[E 3/2000] T.Loss: 0.0897, T.Acc: 69.27, T.AUC: 0.5216 V.Loss: 0.0905, V.Acc: 65.84, V.AUC: 0.5235;
Trigger times >= patience: 0
[E 4/2000] T.Loss: 0.0878, T.Acc: 69.27, T.AUC: 0.4831 V.Loss: 0.0890, V.Acc: 65.84, V.AUC: 0.5257;
Trigger times >= patience: 0
[E 5/2000] T.Loss: 0.0856, T.Acc: 69.27, T.AUC: 0.4855 V.Loss: 0.0875, V.Acc: 65.84, V.AUC: 0.5264;
Trigger times >= patience: 0
[E 6/2000] T.Loss: 0.0835, T.Acc: 69.27, T.AUC: 0.5035 V.Loss: 0.0860, V.Acc: 65.84, V.AUC: 0.5277;
Trigger times >= patience: 0
[E 7/2000] T.Loss: 0.0816, T.Acc: 69.27, T.AUC: 0.4753 V.Loss: 0.0850, V.Acc: 65.84, V.AUC: 0.5269;
Trigger times >= patience: 0
[E 8/2000] T.Loss: 0.0804, T.Acc: 69.27, T.AUC: 0.5355 V.Lo

[E 64/2000] T.Loss: 0.0741, T.Acc: 69.27, T.AUC: 0.6797 V.Loss: 0.0797, V.Acc: 67.22, V.AUC: 0.6777;
Trigger times >= patience: 0
[E 65/2000] T.Loss: 0.0737, T.Acc: 68.45, T.AUC: 0.6796 V.Loss: 0.0798, V.Acc: 65.29, V.AUC: 0.6782;
Trigger times >= patience: 0
[E 66/2000] T.Loss: 0.0734, T.Acc: 69.68, T.AUC: 0.6790 V.Loss: 0.0795, V.Acc: 67.22, V.AUC: 0.6787;
Trigger times >= patience: 0
[E 67/2000] T.Loss: 0.0731, T.Acc: 68.86, T.AUC: 0.6876 V.Loss: 0.0795, V.Acc: 67.77, V.AUC: 0.6784;
Trigger times >= patience: 0
[E 68/2000] T.Loss: 0.0727, T.Acc: 70.10, T.AUC: 0.6935 V.Loss: 0.0792, V.Acc: 67.77, V.AUC: 0.6785;
Trigger times >= patience: 0
[E 69/2000] T.Loss: 0.0725, T.Acc: 71.06, T.AUC: 0.6900 V.Loss: 0.0791, V.Acc: 67.77, V.AUC: 0.6781;
Trigger times >= patience: 0
[E 70/2000] T.Loss: 0.0722, T.Acc: 70.78, T.AUC: 0.6960 V.Loss: 0.0791, V.Acc: 68.32, V.AUC: 0.6770;
Trigger times >= patience: 0
[E 71/2000] T.Loss: 0.0719, T.Acc: 71.06, T.AUC: 0.6891 V.Loss: 0.0791, V.Acc: 68.60, V.AU

In [19]:
sum(mytest_output[:,finalperiod-1,0])/mytest_output.shape[0]

0.3416666666666667

In [20]:
#-------------------------------------------#
#---- Second stage: Logistic Regression ----# # use validation set only without class weights
#-------------------------------------------#

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

#---- validation set training ----#
Mtdvalid_obs, Mtdvalid_pred, Mtdvalid_prob = evaluate(model_Mtd, device, myvalid_input_mtd, myvalid_output, finalperiod, cutoff=0.5)
ptyvalid_obs, ptyvalid_pred, ptyvalid_prob = evaluate(model_pty, device, myvalid_input_pty, myvalid_output, finalperiod, cutoff=0.5)
txyvalid_obs, txyvalid_pred, txyvalid_prob = evaluate(model_txy, device, myvalid_input_txy, myvalid_output, finalperiod, cutoff=0.5)

x_valid = np.array(np.column_stack([Mtdvalid_prob, ptyvalid_prob, txyvalid_prob])).reshape(-1, 3*2)

S2prior = sum(Mtdvalid_obs)/len(Mtdvalid_obs)
print(S2prior)
wt = {0:S2prior, 1:1-S2prior} # use the prior class prob as class weights

L2Logistic_model = LogisticRegression(penalty='l2', solver='sag')
L2Logistic_model.fit(x_valid, Mtdvalid_obs)

#---- testing set evaluation ----#
x_test = np.array(np.column_stack([Mtdtest_prob, ptytest_prob, txytest_prob])).reshape(-1, 3*2)
final_obs  = Mtdtest_obs
final_prob = L2Logistic_model.predict_proba(x_test)[:,1]
final_pred = L2Logistic_model.predict(x_test)

final_acc  = metrics.accuracy_score(final_obs, final_pred)
final_auc  = metrics.roc_auc_score(final_obs, final_prob)
final_conf = metrics.confusion_matrix(final_obs, final_pred)

print(final_acc)
print(final_auc)
print(final_conf)

0.3415977961432507
0.7583333333333333
0.7665946279715962
[[74  5]
 [24 17]]


In [21]:
#-------------------------------------------#
#---- Second stage: Logistic Regression ----# # use validation set only with class weights (Best)
#-------------------------------------------#

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

#---- validation set training ----#
Mtdvalid_obs, Mtdvalid_pred, Mtdvalid_prob = evaluate(model_Mtd, device, myvalid_input_mtd, myvalid_output, finalperiod, cutoff=0.5)
ptyvalid_obs, ptyvalid_pred, ptyvalid_prob = evaluate(model_pty, device, myvalid_input_pty, myvalid_output, finalperiod, cutoff=0.5)
txyvalid_obs, txyvalid_pred, txyvalid_prob = evaluate(model_txy, device, myvalid_input_txy, myvalid_output, finalperiod, cutoff=0.5)

x_valid = np.array(np.column_stack([Mtdvalid_prob, ptyvalid_prob, txyvalid_prob])).reshape(-1, 3*2)

S2prior = sum(Mtdvalid_obs)/len(Mtdvalid_obs)
print(S2prior)
wt = {0:S2prior, 1:1-S2prior} # use the prior class prob as class weights

L2Logistic_model = LogisticRegression(penalty='l2', solver='sag', class_weight=wt)
L2Logistic_model.fit(x_valid, Mtdvalid_obs)

#---- testing set evaluation ----#
x_test = np.array(np.column_stack([Mtdtest_prob, ptytest_prob, txytest_prob])).reshape(-1, 3*2)
final_obs  = Mtdtest_obs
final_prob = L2Logistic_model.predict_proba(x_test)[:,1]
final_pred = L2Logistic_model.predict(x_test)

final_acc  = metrics.accuracy_score(final_obs, final_pred)
final_auc  = metrics.roc_auc_score(final_obs, final_prob)
final_conf = metrics.confusion_matrix(final_obs, final_pred)

print(final_acc)
print(final_auc)
print(final_conf)

0.3415977961432507
0.7333333333333333
0.7616548317381907
[[61 18]
 [14 27]]


In [22]:
#-------------------------------------------#
#---- Second stage: Logistic Regression ----# # use trianing+validation set without class weights
#-------------------------------------------#

MtdS2train_input = np.concatenate((mytrain_input_mtd, myvalid_input_mtd), axis=0)
ptyS2train_input = np.concatenate((mytrain_input_pty, myvalid_input_pty), axis=0)
txyS2train_input = np.concatenate((mytrain_input_txy, myvalid_input_txy), axis=0)

S2train_output = np.concatenate((mytrain_output, myvalid_output), axis=0)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

#---- validation set training ----#

MtdS2_obs, MtdS2_pred, MtdS2_prob = evaluate(model_Mtd, device, MtdS2train_input, S2train_output, finalperiod, cutoff=0.5)
ptyS2_obs, ptyS2_pred, ptyS2_prob = evaluate(model_pty, device, ptyS2train_input, S2train_output, finalperiod, cutoff=0.5)
txyS2_obs, txyS2_pred, txyS2_prob = evaluate(model_txy, device, txyS2train_input, S2train_output, finalperiod, cutoff=0.5)
x_valid = np.array(np.column_stack([MtdS2_prob, ptyS2_prob, txyS2_prob])).reshape(-1, 3*2)

S2prior = sum(MtdS2_obs)/len(MtdS2_obs)
print(S2prior)
wt = {0:S2prior, 1:1-S2prior} # use the prior class prob as class weights

L2Logistic_model = LogisticRegression(penalty='l2', solver='sag')
L2Logistic_model.fit(x_valid, MtdS2_obs)


#---- testing set evaluation ----#
# x_test = np.array(np.transpose([Mtdtest_prob, ptytest_prob, txytest_prob, krdtest_prob])).reshape(-1, 3*2)
x_test = np.array(np.column_stack([Mtdtest_prob, ptytest_prob, txytest_prob])).reshape(-1, 3*2)
final_obs  = Mtdtest_obs
final_prob = L2Logistic_model.predict_proba(x_test)[:,1]
final_pred = L2Logistic_model.predict(x_test)

final_acc  = metrics.accuracy_score(final_obs, final_pred)
final_auc  = metrics.roc_auc_score(final_obs, final_prob)
final_conf = metrics.confusion_matrix(final_obs, final_pred)

print(final_acc)
print(final_auc)
print(final_conf)

0.31868131868131866
0.7666666666666667
0.7542451373880827
[[72  7]
 [21 20]]


In [23]:
#-------------------------------------------#
#---- Second stage: Logistic Regression ----# # use trianing + validation set with class weights
#-------------------------------------------#

MtdS2train_input = np.concatenate((mytrain_input_mtd, myvalid_input_mtd), axis=0)
ptyS2train_input = np.concatenate((mytrain_input_pty, myvalid_input_pty), axis=0)
txyS2train_input = np.concatenate((mytrain_input_txy, myvalid_input_txy), axis=0)

S2train_output = np.concatenate((mytrain_output, myvalid_output), axis=0)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

#---- validation set training ----#

MtdS2_obs, MtdS2_pred, MtdS2_prob = evaluate(model_Mtd, device, MtdS2train_input, S2train_output, finalperiod, cutoff=0.5)
ptyS2_obs, ptyS2_pred, ptyS2_prob = evaluate(model_pty, device, ptyS2train_input, S2train_output, finalperiod, cutoff=0.5)
txyS2_obs, txyS2_pred, txyS2_prob = evaluate(model_txy, device, txyS2train_input, S2train_output, finalperiod, cutoff=0.5)
x_valid = np.array(np.column_stack([MtdS2_prob, ptyS2_prob, txyS2_prob])).reshape(-1, 3*2)

S2prior = sum(MtdS2_obs)/len(MtdS2_obs)
print(S2prior)
wt = {0:S2prior, 1:1-S2prior} # use the prior class prob as class weights

L2Logistic_model = LogisticRegression(penalty='l2', solver='sag', class_weight=wt)
L2Logistic_model.fit(x_valid, MtdS2_obs)


#---- testing set evaluation ----#
# x_test = np.array(np.transpose([Mtdtest_prob, ptytest_prob, txytest_prob, krdtest_prob])).reshape(-1, 3*2)
x_test = np.array(np.column_stack([Mtdtest_prob, ptytest_prob, txytest_prob])).reshape(-1, 3*2)
final_obs  = Mtdtest_obs
final_prob = L2Logistic_model.predict_proba(x_test)[:,1]
final_pred = L2Logistic_model.predict(x_test)

final_acc  = metrics.accuracy_score(final_obs, final_pred)
final_auc  = metrics.roc_auc_score(final_obs, final_prob)
final_conf = metrics.confusion_matrix(final_obs, final_pred)

print(final_acc)
print(final_auc)
print(final_conf)

0.31868131868131866
0.7166666666666667
0.7548626119172585
[[60 19]
 [15 26]]
