In [65]:
def logging( i, print_every, no_samples, tic, elapsed_time):
    """
    Function for logging: timing and counting
    Inputs:
    - i:            Iteration i 
    - print_every:  Print logging in the interval of print_every
    - no_samples:   Number of samples
    - tic:          Start timer for last time
    - elapsed_time: Amount of time elapsed 
    """
    if not i % print_every:
        toc = time.clock()
        period_time = toc - tic;
        if i > 0:
            elapsed_time = (elapsed_time + period_time)
            mean_period_time = elapsed_time/ ((i)/print_every)
            minutes = round(mean_period_time*( (No_samples_train-i)//print_every)//60)
            seconds = round(mean_period_time*( (No_samples_train-i)/print_every)%60)
            print("Data loaded:", i,"/",No_samples_train,  "    Remaining time: ", minutes,":", seconds)
        tic = time.clock(); 
    else:
        tic = -1
        elapsed_time = -1
    return tic, elapsed_time


def uniform_stratified_sampler(labels, n=None):
    """
    Stratified sampler that distributes labels uniformly by
    sampling at most n data points per class
    """
    from functools import reduce
    # Only choose digits in n_labels
    (indices,) = np.where(reduce(lambda x, y: x | y, [labels.numpy() == i for i in classes]))

    # Ensure uniform distribution of labels
    np.random.shuffle(indices)
    indices = np.hstack([list(filter(lambda idx: labels[idx] == i, indices))[:n] for i in classes])

    indices = torch.from_numpy(indices)
    sampler = SubsetRandomSampler(indices)
    return sampler


def construct_DataLoader( Target, Image, batch_size, labels_per_class=None):
    """
    Contruct DataLoader with tensors
    Inputs:
    - Target:            Iteration i 
    - Image:  Print logging in the interval of print_every
    - batch_size
    """
    # Convert to Tensor
    Target = torch.Tensor(Target)
    Image = torch.Tensor(Image)
    Image = Image.unsqueeze(1)

    # Construct DataLoader
    loader = TensorDataset(Image, Target)
    if labels_per_class==None:
        dataLoader = DataLoader(loader, batch_size=batch_size, shuffle = True)
    else:    
        dataLoader = DataLoader(loader, batch_size=batch_size,
                     sampler=uniform_stratified_sampler(Target, labels_per_class))
    return dataLoader

In [71]:
warnings.filterwarnings("ignore")

######### Defining the data set #########

KAGGLE = False
 
# Display label format
if KAGGLE:
    df = pd.read_csv('../input/stage_2_detailed_class_info.csv')
    df.rename(columns={'class': 'Target'}, inplace=True)
    mapping = {'Normal': 0, 'Lung Opacity': 1, 'No Lung Opacity / Not Normal': 2}
    df = df.replace({'Target': mapping})
    df = df[ df['Target']!=2 ]
    df = df.reset_index(drop=True)
else:
    df = pd.read_csv('data/stage_1_train_labels.csv')

"""
# Defining the size of the data set
classes = [0,1]          # ['Normal , 'Lung Opacity']  
batch_size       = 64
labels_per_class = 32    # Specify how many labelled examples we want per class
No_samples_train_labelled = labels_per_class*len(classes)
No_samples_train = 64
No_samples_test  = 64
IMG_SIZE         = 32
img_dimension = [IMG_SIZE,IMG_SIZE]
"""

# Test connection to data set
No = 4
patientId = df['patientId'][No]
if KAGGLE:
    dcm_file = '../input/stage_2_train_images/%s.dcm' % patientId
else:
    dcm_file = 'data/stage_1_train_images/%s.dcm' % patientId
dcm_data = pydicom.read_file(dcm_file)
print("Connection to dataset established:")
print(df.iloc[No])
print(' ')

# Get only unique entrances from the provided data (some patients occur multiple times)
unq, idx = np.unique(df['patientId'], return_index = True)

patientId    00436515-870c-4b36-a041-de91049b9ab4
x                                             264
y                                             152
width                                         213
height                                        379
Target                                          1
Name: 4, dtype: object


In [72]:
######### Load data into DataLoader's #########
# Load unlabelled training data
# Load labelled test data
# Load labelled training data

"""
print_every = 10
"""
Do_img_eq = True

######### Loading UNLABELED TRAINING data #########
Target = []; Image = []
tic = time.clock(); elapsed_time = 0

print("Loading training images: 0 /", No_samples_train)
for i in range(0,No_samples_train):
    Target.append(df.Target[idx[i]]) # Get label  
    patientId = df['patientId'][idx[i]] # Get patient id from the idx 
    if KAGGLE:
        dcm_file = '../input/stage_2_train_images/%s.dcm' % patientId # find the image-file corresponding to the patient id
    else:
        dcm_file = 'data/stage_1_train_images/%s.dcm' % patientId # find the image-file corresponding to the patient id
    dcm_data = pydicom.read_file(dcm_file) # Load the image 
    Image.append(resize(dcm_data.pixel_array, output_shape=img_dimension, mode='reflect'))#, anti_aliasing=True)) # resize image
    if Do_img_eq:
        Image[-1] = equalize_hist(Image[-1])
    
    # Logging: counting and time remaining
    val1, val2 = logging(i, print_every, No_samples_train, tic, elapsed_time)
    if not val1 == -1:
        tic = val1
        elapsed_time = val2    
    i = i + 1
    
print("Train data loaded:", i)
train_loader = construct_DataLoader( Target, Image, batch_size)
print(' ')


######### Loading TEST data #########
Target = []; Image = []
tic = time.clock(); elapsed_time = 0
for i in range(0,No_samples_test):
    Target.append(df.Target[idx[No_samples_train+i]]) # Get label  
    patientId = df['patientId'][idx[No_samples_train+i]] # Get patient id from the idx 
    if KAGGLE:
        dcm_file = '../input/stage_2_train_images/%s.dcm' % patientId # find the image-file corresponding to the patient id
    else:
        dcm_file = 'data/stage_1_train_images/%s.dcm' % patientId # find the image-file corresponding to the patient id
    dcm_data = pydicom.read_file(dcm_file) # Load the image 
    Image.append(resize(dcm_data.pixel_array, output_shape=img_dimension, mode='reflect'))#, anti_aliasing=True)) # resize image
    if Do_img_eq:
        Image[-1] = equalize_hist(Image[-1])
        
    # Logging: counting and time remaining
    val1, val2 = logging(i, print_every, No_samples_test, tic, elapsed_time)
    if not val1 == -1:
        tic = val1
        elapsed_time = val2
    i = i + 1
    
print("Test data loaded:", i)
test_loader = construct_DataLoader( Target, Image, batch_size)
print(' ')


######### Loading LABELED TRAINING data #########
Target = []; Image = []
tic = time.clock(); elapsed_time = 0
count_unlabelled = 0; count_labelled = 0
count_old = 0;
i = 0
while count_labelled<labels_per_class or count_unlabelled<labels_per_class:
    Target.append(df.Target[idx[No_samples_test+No_samples_train+i]]) # Get label  

    if Target[i]==1:
        count_labelled = count_labelled + 1
    else:
        count_unlabelled = count_unlabelled + 1
        
    patientId = df['patientId'][idx[No_samples_test+No_samples_train+i]] # Get patient id from the idx 
    if KAGGLE:
        dcm_file = '../input/stage_2_train_images/%s.dcm' % patientId # find the image-file corresponding to the patient id
    else:
        dcm_file = 'data/stage_1_train_images/%s.dcm' % patientId # find the image-file corresponding to the patient id
    dcm_data = pydicom.read_file(dcm_file) # Load the image 
    Image.append(resize(dcm_data.pixel_array, output_shape=img_dimension, mode='reflect'))#, anti_aliasing=True)) # resize image
    if Do_img_eq:
        Image[-1] = equalize_hist(Image[-1])

    # Logging: counting and time remaining
    count = min(count_labelled, count_unlabelled)
    if (count > count_old):
        count_old = min(count_labelled, count_unlabelled)
        val1, val2 = logging(2*count, print_every, No_samples_train_labelled, tic, elapsed_time)
        if not val1 == -1:
            tic = val1
            elapsed_time = val2  
    i = i + 1

print("Labelled data loaded:", No_samples_train_labelled)
train_loader_labelled = construct_DataLoader( Target, Image, batch_size, labels_per_class)

del df, idx, unq

Loading training images: 0 / 64


  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


Data loaded: 10 / 64     Remaining time:  0 : 1
Data loaded: 20 / 64     Remaining time:  0 : 1
Data loaded: 30 / 64     Remaining time:  0 : 1
Data loaded: 40 / 64     Remaining time:  0 : 0
Data loaded: 50 / 64     Remaining time:  0 : 0
Data loaded: 60 / 64     Remaining time:  0 : 0
Train data loaded: 64
 
Data loaded: 10 / 64     Remaining time:  0 : 1
Data loaded: 20 / 64     Remaining time:  0 : 0
Data loaded: 30 / 64     Remaining time:  0 : 0
Data loaded: 40 / 64     Remaining time:  0 : 0
Data loaded: 50 / 64     Remaining time:  0 : 0
Data loaded: 60 / 64     Remaining time:  0 : 0
Test data loaded: 64
 
Data loaded: 10 / 64     Remaining time:  0 : 1
Data loaded: 20 / 64     Remaining time:  0 : 1
Data loaded: 30 / 64     Remaining time:  0 : 1
Data loaded: 40 / 64     Remaining time:  0 : 0
Data loaded: 50 / 64     Remaining time:  0 : 0
Data loaded: 60 / 64     Remaining time:  0 : 0
Labelled data loaded: 87
