In [17]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.nn.functional import one_hot


def load_labels(label_path, fill=True):
    labels = pd.read_csv(label_path)

    labels['Sex'] = labels['Sex'].mask(
                            labels['Sex'] == 'Female', 1).mask(
                            labels['Sex'] == 'Male', -1).mask(
                            labels['Sex'] == 'Unknown', 0)
    labels['Frontal/Lateral'] = labels['Frontal/Lateral'].mask(
                            labels['Frontal/Lateral'] == 'Frontal', 1).mask(
                            labels['Frontal/Lateral'] == 'Lateral', -1)
    labels['AP/PA'] = labels['AP/PA'].mask(
                            labels['AP/PA'] == 'PA', 1).mask(
                            labels['AP/PA'] == 'AP', 0).fillna(-1)

    labels.rename(columns={'Unnamed: 0':'id'}, inplace=True)
    labels.set_index('id', drop=True, inplace=True)
    
    if fill:
        mses = np.zeros((81,14))
        train, test = train_test_split(np.arange(labels.shape[0]))
        rs = np.linspace(-1.5, 1.5 ,81)
        cols = labels.columns[5:]

        fill = labels[cols]
        pred = np.asarray(fill.iloc[train]).mean(axis=0) * np.ones((len(test), fill.shape[1]))
        mses[-1,:] = np.mean((pred - np.asarray(fill.iloc[test]))**2, axis=0)

        for i, r in enumerate(rs):
            fill = labels[cols].fillna(r)
            filltrain = fill.iloc[train]
            filltest = fill.iloc[test]
            pred = np.asarray(filltrain).mean(axis=0) * np.ones((len(test), fill.shape[1]))
            mses[i,:] = np.mean((pred - np.asarray(filltest))**2, axis=0)

        fills = np.zeros(14)
        argmins = mses.argmin(axis=0)
        for n in range(14):
            if argmins[n] == len(rs):
                fills[n] = None
            else:
                fills[n] = int(round(rs[argmins[n]]))

        for i, col in enumerate(labels.columns[5:]):
            labels[col] = labels[col].fillna(fills[i])
    
    return labels


if __name__ == "__main__":
    
    label_path = "D:\\cs156\\train.csv"

    processed_labels = load_labels(label_path, fill=True)
    labels_needed = processed_labels[['No Finding', 'Enlarged Cardiomediastinum',
                                     'Cardiomegaly', 'Lung Opacity', 'Pneumonia',
                                     'Pleural Effusion', 'Pleural Other', 'Fracture',
                                     'Support Devices']]
    labels_1hot = pd.DataFrame()
    for col in labels_needed.columns:
        for lab in [-1, 0, 1]:
            labels_1hot[col + ' ' + str(lab)] = (labels_needed[col] == lab).astype(int)
    #processed_labels.to_csv('./labels_1hot.csv')

In [46]:
def load_labels(label_path, fill=True):
    labels = pd.read_csv(label_path)

    labels['Sex'] = labels['Sex'].mask(
                            labels['Sex'] == 'Female', 1).mask(
                            labels['Sex'] == 'Male', -1).mask(
                            labels['Sex'] == 'Unknown', 0)
    labels['Frontal/Lateral'] = labels['Frontal/Lateral'].mask(
                            labels['Frontal/Lateral'] == 'Frontal', 1).mask(
                            labels['Frontal/Lateral'] == 'Lateral', -1)
    labels['AP/PA'] = labels['AP/PA'].mask(
                            labels['AP/PA'] == 'PA', 1).mask(
                            labels['AP/PA'] == 'AP', 0).fillna(-1)

    labels.rename(columns={'Unnamed: 0':'id'}, inplace=True)
    labels.set_index('id', drop=True, inplace=True)
    
    if fill:
        mses = np.zeros((4,14))
        train, test = train_test_split(np.arange(labels.shape[0]))
        rs = [-1, 0, 1]
        cols = labels.columns[5:]

        fill = labels[cols]
        pred = np.nanmean(fill.iloc[train],axis=0) * np.ones((len(test), fill.shape[1]))
        mses[-1,:] = np.nanmean((pred - np.asarray(fill.iloc[test]))**2, axis=0)

        for i, r in enumerate(rs):
            fill = labels[cols].fillna(r)
            filltrain = fill.iloc[train]
            filltest = fill.iloc[test]
            pred = np.asarray(filltrain).mean(axis=0) * np.ones((len(test), fill.shape[1]))
            mses[i,:] = np.mean((pred - np.asarray(filltest))**2, axis=0)

        fills = np.zeros(14)
        argmins = mses.argmin(axis=0)
        for n in range(14):
            if argmins[n] == len(rs):
                fills[n] = None
            else:
                fills[n] = int(round(rs[argmins[n]]))

        for i, col in enumerate(labels.columns[5:]):
            labels[col] = labels[col].fillna(fills[i])
    return labels

load_labels("D:\\cs156\\train.csv")

Unnamed: 0_level_0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,train/pid50512/study1/view1_frontal.jpg,1,68,1,0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0,1.0
1,train/pid21580/study2/view1_frontal.jpg,1,87,1,0,-1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,train/pid21580/study1/view1_frontal.jpg,1,83,1,0,-1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3,train/pid21580/study1/view2_lateral.jpg,1,83,-1,-1,-1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,train/pid33839/study1/view1_frontal.jpg,-1,41,1,0,-1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223409,train/pid16708/study2/view1_frontal.jpg,-1,59,1,0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,1.0
223410,train/pid16708/study1/view1_frontal.jpg,-1,59,1,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.0,1.0
223411,train/pid32381/study1/view1_frontal.jpg,1,0,1,0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
223412,train/pid25865/study1/view1_frontal.jpg,1,0,1,0,-1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,-1.0,0.0,1.0,0.0,-1.0


In [25]:
labels_1hot = pd.DataFrame()
for col in labels_needed.columns:
    for lab in [-1, 0, 1]:
        labels_1hot[col + ' ' + str(lab)] = (labels_needed[col] == lab).astype(int)
labels_1hot

Unnamed: 0_level_0,No Finding -1,No Finding 0,No Finding 1,Enlarged Cardiomediastinum -1,Enlarged Cardiomediastinum 0,Enlarged Cardiomediastinum 1,Cardiomegaly -1,Cardiomegaly 0,Cardiomegaly 1,Lung Opacity -1,...,Pleural Effusion 1,Pleural Other -1,Pleural Other 0,Pleural Other 1,Fracture -1,Fracture 0,Fracture 1,Support Devices -1,Support Devices 0,Support Devices 1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,0,1,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
1,1,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1
2,1,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1
3,1,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1
4,1,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223409,1,0,0,0,1,0,0,1,0,0,...,1,0,0,1,0,1,0,0,0,1
223410,0,1,0,0,1,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
223411,0,1,0,0,1,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
223412,1,0,0,0,1,0,0,0,1,0,...,0,0,0,1,0,1,0,1,0,0


In [13]:
one_hot(torch.tensor(labels_1hot.to_numpy()), 3)

RuntimeError: one_hot is only applicable to index tensor.

In [47]:
def load_labels(label_path, fill=True):
    labels = pd.read_csv(label_path)

    labels['Sex'] = labels['Sex'].mask(
                            labels['Sex'] == 'Female', 1).mask(
                            labels['Sex'] == 'Male', -1).mask(
                            labels['Sex'] == 'Unknown', 0)
    labels['Frontal/Lateral'] = labels['Frontal/Lateral'].mask(
                            labels['Frontal/Lateral'] == 'Frontal', 1).mask(
                            labels['Frontal/Lateral'] == 'Lateral', -1)
    labels['AP/PA'] = labels['AP/PA'].mask(
                            labels['AP/PA'] == 'PA', 1).mask(
                            labels['AP/PA'] == 'AP', 0).fillna(-1)

    labels.rename(columns={'Unnamed: 0':'id'}, inplace=True)
    labels.set_index('id', drop=True, inplace=True)
    
    if fill:
        mses = np.zeros((4,14))
        train, test = train_test_split(np.arange(labels.shape[0]))
        rs = [-1, 0, 1]
        cols = labels.columns[5:]

        fill = labels[cols]
        pred = np.nanmean(fill.iloc[train],axis=0) * np.ones((len(test), fill.shape[1]))
        mses[-1,:] = np.nanmean((pred - np.asarray(fill.iloc[test]))**2, axis=0)

        for i, r in enumerate(rs):
            fill = labels[cols].fillna(r)
            filltrain = fill.iloc[train]
            filltest = fill.iloc[test]
            pred = np.asarray(filltrain).mean(axis=0) * np.ones((len(test), fill.shape[1]))
            mses[i,:] = np.mean((pred - np.asarray(filltest))**2, axis=0)

        fills = np.zeros(14)
        argmins = mses.argmin(axis=0)
        for n in range(14):
            if argmins[n] == len(rs):
                fills[n] = None
            else:
                fills[n] = int(round(rs[argmins[n]]))

        for i, col in enumerate(labels.columns[5:]):
            labels[col] = labels[col].fillna(fills[i])
    return labels

load_labels("D:\\cs156\\train.csv").to_csv('Optimal_Fill_Labels.csv')

In [59]:
labs = pd.read_csv('./Optimal_Fill_Labels.csv', index_col='id', low_memory=False)

In [79]:
path = labels_1hot['Path'][4]
path[15:path.find('/',15)]

'study1'

In [81]:
labels_needed = labs[['Path', 'No Finding', 'Enlarged Cardiomediastinum',
                                 'Cardiomegaly', 'Lung Opacity', 'Pneumonia',
                                 'Pleural Effusion', 'Pleural Other', 'Fracture',
                                 'Support Devices']]
labels_1hot = pd.DataFrame({'Path':labs['Path']})
path200x200 = []
for idx in labels_1hot.index:
    path = labels_1hot['Path'][idx]
    path200x200.append(path[6:14] + '_' + path[15:path.find('/',15)] + path[-12:-4] + '.npy')
labels_1hot['Path200x200'] = path200x200
for col in labels_needed.columns[1:]:
    for lab in [-1, 0, 1]:
        labels_1hot[col + ' ' + str(lab)] = (labels_needed[col] == lab).astype(int)
labels_1hot.to_csv('./Labels_Onehot.csv')

In [61]:
l1h = pd.read_csv('Labels_Onehot.csv', index_col='id', low_memory=False)

In [62]:
l1h

Unnamed: 0_level_0,Path,No Finding -1,No Finding 0,No Finding 1,Enlarged Cardiomediastinum -1,Enlarged Cardiomediastinum 0,Enlarged Cardiomediastinum 1,Cardiomegaly -1,Cardiomegaly 0,Cardiomegaly 1,...,Pleural Effusion 1,Pleural Other -1,Pleural Other 0,Pleural Other 1,Fracture -1,Fracture 0,Fracture 1,Support Devices -1,Support Devices 0,Support Devices 1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,train/pid50512/study1/view1_frontal.jpg,0,0,1,0,1,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
1,train/pid21580/study2/view1_frontal.jpg,1,0,0,0,1,0,0,1,0,...,0,0,0,1,0,0,1,0,0,1
2,train/pid21580/study1/view1_frontal.jpg,1,0,0,0,1,0,0,1,0,...,0,0,0,1,0,0,1,0,0,1
3,train/pid21580/study1/view2_lateral.jpg,1,0,0,0,1,0,0,1,0,...,0,0,0,1,0,0,1,0,0,1
4,train/pid33839/study1/view1_frontal.jpg,1,0,0,0,1,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223409,train/pid16708/study2/view1_frontal.jpg,1,0,0,0,1,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1
223410,train/pid16708/study1/view1_frontal.jpg,0,1,0,0,1,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
223411,train/pid32381/study1/view1_frontal.jpg,0,1,0,0,1,0,0,1,0,...,0,0,0,1,0,1,0,0,0,1
223412,train/pid25865/study1/view1_frontal.jpg,1,0,0,0,1,0,0,0,1,...,0,0,0,1,0,1,0,1,0,0
