In [1]:
import sys
import os

# Add the relative project directories to the module search path (order preserved).
sys.path.extend([
    '../metal',
    '../heart-MRI-pytorch',
    '../data',
    '../../sequential_ws',
])

In [2]:
import numpy as np
import argparse
import torch
import logging
import warnings
import pandas
from glob import glob
from scipy.sparse import csr_matrix
import torchvision
import torch.nn as nn
from torch.nn.functional import normalize
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from utils import *
from metal.label_model import LabelModel
from metal.label_model.baselines import MajorityLabelVoter
from metal.analysis import lf_summary, confusion_matrix
from DP.label_model import DPLabelModel, optimize

In [3]:
# Logger named after the current module.
logger = logging.getLogger(__name__)

# Suppress deprecation and runtime warnings for the rest of the session.
warnings.simplefilter("ignore", category=DeprecationWarning)
#warnings.simplefilter("ignore", category=UndefinedMetricWarning)
warnings.simplefilter("ignore", category=RuntimeWarning)

In [4]:
def read_labels(label_list):
    '''
    Function to read labels given a list of .npy label files.

    Every file is loaded with np.load, the arrays are stacked along a new
    leading axis and all singleton axes are squeezed away. The rank of the
    stacked array then decides what is returned:

    * rank 2 -> true labels: flattened to 1-D and shifted by +1
      (0-indexing -> 1-indexing)
    * rank 3 -> LF labels: the first two axes are merged and the result is
      wrapped in a sparse matrix (values unchanged)

    NOTE: because every singleton axis is squeezed, a list with a single
    file (or files with a single frame / single LF) loses an axis and is
    therefore classified by its reduced rank.

    Input
    ----
    label_list: list of paths to .npy label files

    Output
    -----
    L: sparse matrix (#patients*#frames, #LFs)
    or L: numpy array (#patients*#frames,)

    Raises
    ------
    ValueError: if label_list is empty, or if the stacked array is not
                rank 2 or rank 3 after squeezing
    '''

    # An empty list (e.g. a glob on a wrong path) would otherwise surface
    # as an opaque "tuple index out of range" further down.
    if len(label_list) == 0:
        raise ValueError("read_labels: label_list is empty, no label files to read")

    stacked = np.squeeze(np.array([np.load(path) for path in label_list]))

    if stacked.ndim not in (2, 3):
        raise ValueError(
            "read_labels: expected a rank-2 (true labels) or rank-3 (LF labels) "
            "array after stacking, got shape %s" % (stacked.shape,)
        )

    # reshaping array from (PID,frames,) -> (PID*frames,)
    n_rows = stacked.shape[0] * stacked.shape[1]
    if stacked.ndim == 2:  # true labels
        # changing from 0-indexing to 1-indexing
        return stacked.reshape((n_rows,)) + 1

    # LF labels: (PID,frames,LFs) -> (PID*frames,LFs)
    return csr_matrix(stacked.reshape((n_rows, stacked.shape[2])))

In [5]:
def load_labels(args):
    '''
    Read LF labels and true labels for the data splits named in args.

    Input
    ----
    args: dict with keys "train", "dev" and "test", each the path of a split
          directory containing an 'lf_labels' sub-directory of .npy files
          ("dev" and "test" also need a 'true_labels' sub-directory)

    Output
    -----
    L: dict split name -> LF labels for "train", "dev", "test"
       (whatever read_labels returns for that split's lf_labels files)
    Y: dict split name -> true labels for "dev", "test"
       (whatever read_labels returns for that split's true_labels files)
    '''
    L = {}
    Y = {}

    # glob returns files in a filesystem-dependent order. LF labels and true
    # labels live in different directories and are later compared row by row,
    # so every listing is sorted to get a deterministic order.
    # NOTE(review): this aligns L and Y only if both directories use matching
    # file names per patient -- confirm against the data layout.
    for split in ("train", "dev", "test"):
        L[split] = read_labels(sorted(glob(args[split] + '/lf_labels/*.npy')))

    for split in ("dev", "test"):
        Y[split] = read_labels(sorted(glob(args[split] + '/true_labels/*.npy')))

    return L,Y

In [6]:
# loading data: one directory per split under ../data/open_close
args = {
    "train": '../data/open_close/train',
    "dev": '../data/open_close/dev',
    "test": '../data/open_close/test',
}

# L holds the LF labels per split, Y the true labels for dev and test
L, Y = load_labels(args)

In [7]:
#print(L["train"].todense().shape) # (18850,5)
#print(L["dev"].todense().shape) # (1500,5)
#print(Y["dev"].shape) # (1500,)

In [8]:
# labelling functions analysis on the dev split, compared against the true labels
print(
    lf_summary(
        L["dev"],
        Y=Y["dev"],
    )
)

     Polarity  Coverage  Overlaps  Conflicts  Correct  Incorrect  Emp. Acc.
0  [1.0, 2.0]       1.0       1.0   0.212667     1335        165   0.890000
1  [1.0, 2.0]       1.0       1.0   0.212667     1304        196   0.869333
2  [1.0, 2.0]       1.0       1.0   0.212667     1335        165   0.890000
3  [1.0, 2.0]       1.0       1.0   0.212667     1291        209   0.860667
4  [1.0, 2.0]       1.0       1.0   0.212667     1377        123   0.918000


In [9]:
# baseline: majority vote of LFs, scored on the dev split
mv = MajorityLabelVoter(seed=123)
print('Majority Label Voter Metrics:')
mv_score = mv.score(
    (L["dev"], Y["dev"]),
    metric=['accuracy', 'precision', 'recall', 'f1'],
)

Majority Label Voter Metrics:
Accuracy: 0.897
Precision: 0.876
Recall: 0.901
F1: 0.888
        y=1    y=2   
 l=1    612    87    
 l=2    67     734   


In [10]:
# defining parameters: number of label classes and the torch device to run on
num_classes = 2
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [11]:
# training label model - no temporal modelling
label_model = LabelModel(k=num_classes, seed=123)
label_model.train_model(
    L["train"],
    Y["dev"],
    n_epochs=500,
    log_train_every=50,
)

# evaluating label model on the dev split
print('Trained Label Model Metrics:')
lm_score = label_model.score(
    (L["dev"], Y["dev"]),
    metric=['accuracy', 'precision', 'recall', 'f1'],
)

Computing O...
Estimating \mu...
[50 epo]: TRAIN:[loss=0.017]
[100 epo]: TRAIN:[loss=0.002]
[150 epo]: TRAIN:[loss=0.002]
[200 epo]: TRAIN:[loss=0.002]
[250 epo]: TRAIN:[loss=0.002]
[300 epo]: TRAIN:[loss=0.002]
[350 epo]: TRAIN:[loss=0.002]
[400 epo]: TRAIN:[loss=0.002]
[450 epo]: TRAIN:[loss=0.002]
[500 epo]: TRAIN:[loss=0.002]
Finished Training
Trained Label Model Metrics:
Accuracy: 0.894
Precision: 0.865
Recall: 0.907
F1: 0.886
        y=1    y=2   
 l=1    616    96    
 l=2    63     725   


In [12]:
# training label model without temporal modelling
# ( this should reproduce the results above )
# naive model
m_per_task = L["train"].shape[1] # 5 (number of LF columns; no need to densify for the shape)
MRI_data_naive = {'Li_train': torch.FloatTensor(np.array(L["train"].todense())),
                'Li_dev': torch.FloatTensor(np.array(L["dev"].todense())),
                'R_dev':Y["dev"] }


naive_model = DPLabelModel(m=m_per_task, 
                       T=1,
                       edges=[],
                       # one independent [0] per LF (list multiplication would
                       # make every entry the same list object)
                       coverage_sets=[[0] for _ in range(m_per_task)],
                       mu_sharing=[[i] for i in range(m_per_task)],
                       phi_sharing=[],
                       device=device,
                       #class_balance=MRI_data_naive['class_balance'], 
                       seed=0)

# NOTE(review): the last recorded run of this cell ended with
# "RuntimeError: tensors used as indices must be long or byte tensors".
# The traceback is not available here, so it is unclear which tensor
# DPLabelModel / optimize uses as an index -- confirm whether the label
# matrices must be passed as LongTensor instead of FloatTensor.
optimize(naive_model, L_hat=MRI_data_naive['Li_train'], num_iter=3000, lr=1e-3, momentum=0.8, clamp=True, seed=0)

R_pred = naive_model.predict( MRI_data_naive['Li_dev'] )

# 'R_dev' is the numpy array returned by read_labels, which has no .cpu();
# convert it to a tensor once so the gold labels can be passed like R_pred.
R_gold = torch.as_tensor(MRI_data_naive['R_dev'])
for metric in ['accuracy', 'f1', 'recall', 'precision']:
    score = metric_score(R_gold.cpu(), R_pred.cpu(), metric)
    print(f"{metric.capitalize()}: {score:.3f}")

RuntimeError: tensors used as indices must be long or byte tensors

In [None]:
# training label model with temporal modelling
T = 50  # rows (frames) grouped per patient
n_patients_train = round(L["train"].todense().shape[0] / T)  # (377)
n_patients_dev = round(L["dev"].todense().shape[0] / T)  # (30)

# Regroup the per-frame LF rows so that each row holds all T frames of one patient.
MRI_data_temporal = {
    'Li_train': torch.FloatTensor(
        np.array(L["train"].todense()).reshape((n_patients_train, (m_per_task * T)))
    ),  # (377,250)
    'Li_dev': torch.FloatTensor(
        np.array(L["dev"].todense()).reshape((n_patients_dev, (m_per_task * T)))
    ),  # (30,250)
    'R_dev': Y["dev"],
    'm': m_per_task * T,
    'T': T,
}