# In this notebook we will generate CSV files with label vectors for the official train/test split

## Imports

In [1]:
import numpy as np
import pandas as pd 
import os
from os.path import join
from glob import glob
import matplotlib.pyplot as plt
from itertools import chain
from sklearn.preprocessing import MultiLabelBinarizer
import glob

## Load dataframe

In [2]:
# Load the NIH metadata table from the local metadata folder.
metadata_csv = 'metadata/Data_Entry_2017.csv'
all_xray_df = pd.read_csv(metadata_csv)

## Generate paths

In [3]:
# Root folder of the extracted image archives. Set the NIH_CXR_ROOT environment
# variable to use a different location; the original path remains the default.
PATH = os.environ.get("NIH_CXR_ROOT", "/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-Annotations")

# The pattern contains no "**" component, so glob's recursive flag would have
# no effect and is left out.
all_image_paths = sorted(glob.glob(f'{PATH}/images_*/images/*.png'))
# Map bare file name -> full path so metadata rows can be matched by 'Image Index'.
all_image_paths = {os.path.basename(x): x for x in all_image_paths}

print('Scans found:', len(all_image_paths), ', Total Headers', all_xray_df.shape[0])
# dict.get returns None for names that were not found on disk.
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)

all_xray_df.head()

Scans found: 112120 , Total Headers 112120


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11,path
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...


## Replace "No Finding" with ""

In [4]:
def _strip_no_finding(label_text):
    """Return the label string with every 'No Finding' occurrence removed."""
    return label_text.replace('No Finding', '')


# Healthy scans end up with an empty label string.
all_xray_df['Finding Labels'] = all_xray_df['Finding Labels'].map(_strip_no_finding)
all_xray_df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11,path
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
3,00000002_000.png,,0,2,81,M,PA,2500,2048,0.171,0.171,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...


## Replace "|" with ","

In [5]:
def _pipes_to_commas(label_text):
    """Return the label string with each '|' separator turned into ','."""
    return ','.join(label_text.split('|'))


# Multi-label rows become comma-separated lists of findings.
all_xray_df['Finding Labels'] = all_xray_df['Finding Labels'].map(_pipes_to_commas)
all_xray_df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11,path
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
1,00000001_001.png,"Cardiomegaly,Emphysema",1,1,58,M,PA,2894,2729,0.143,0.143,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
2,00000001_002.png,"Cardiomegaly,Effusion",2,1,58,M,PA,2500,2048,0.168,0.168,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
3,00000002_000.png,,0,2,81,M,PA,2500,2048,0.171,0.171,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,,/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...


In [6]:
# Collect every label token that appears in the table, de-duplicate, drop the
# empty token left by label-free rows, and keep the names in sorted order.
tokens_per_row = [entry.split(',') for entry in all_xray_df['Finding Labels']]
distinct_tokens = np.unique(list(chain.from_iterable(tokens_per_row)))
classes = sorted(token for token in distinct_tokens if len(token) > 0)
classes

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax']

## Convert labels to a vector

In [7]:
# Multi-hot encode the findings: one column per entry of `classes`, in that order.
encoder = MultiLabelBinarizer(classes=classes)
# Splitting an empty label string yields [''], which is not one of `classes` and
# makes the binarizer warn about an unknown class. Dropping empty tokens gives
# such rows an empty label set, i.e. the same all-zero vector without the warning.
label_sets = [
    [name for name in c.split(",") if name]
    for c in all_xray_df["Finding Labels"]
]
labels = encoder.fit_transform(label_sets)



In [8]:
# Show the encoded label matrix returned by the binarizer.
labels

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Create dataframe with the relevant data

In [9]:
# Initialise `df`, the reduced table of image name, label vector and file path.
df = pd.DataFrame()

In [10]:
# Keep only what downstream code needs: image name, multi-hot label list, file path.
df = pd.DataFrame(
    {
        "Image Index": all_xray_df["Image Index"],
        "label": labels.tolist(),
        "path": all_xray_df["path"],
    }
)
df.head()

Unnamed: 0,Image Index,label,path
0,00000001_000.png,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
1,00000001_001.png,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
2,00000001_002.png,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
3,00000002_000.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
4,00000003_000.png,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...


## Use the official train-test split

In [12]:
# Image names belonging to the official train/validation partition.
train_val_list = pd.read_fwf('metadata/train_val_list.txt', header=None)
# Assumes the file parses to a single column, so squeeze() yields a Series of names.
train_val_list = train_val_list.squeeze()
# Build the mask from df's own column so the selection does not depend on
# df and all_xray_df happening to share the same index.
train_df = df.loc[df['Image Index'].isin(train_val_list)]
train_df.head()

Unnamed: 0,Image Index,label,path
0,00000001_000.png,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
1,00000001_001.png,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
2,00000001_002.png,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
3,00000002_000.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
12,00000004_000.png,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...


In [13]:
# Image names belonging to the official test partition.
test_list = pd.read_fwf('metadata/test_list.txt', header=None)
# Assumes the file parses to a single column, so squeeze() yields a Series of names.
test_list = test_list.squeeze()
# Build the mask from df's own column so the selection does not depend on
# df and all_xray_df happening to share the same index.
test_df = df.loc[df['Image Index'].isin(test_list)]
test_df.head()

Unnamed: 0,Image Index,label,path
4,00000003_000.png,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
5,00000003_001.png,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
6,00000003_002.png,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
7,00000003_003.png,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...
8,00000003_004.png,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",/home/piotr/Desktop/Nih-Chest-X-Ray-Artifacts-...


In [14]:
# Summarise the split sizes; the two parts should add up to the full dataset.
n_train, n_test = len(train_df), len(test_df)
f"{n_train} examples in the trainset and {n_test} examples in the testset, {n_train + n_test}  in total"

'86524 examples in the trainset and 25596 examples in the testset, 112120  in total'

## Save train and test dfs as csv

In [15]:
# Write each split to its CSV file, without the pandas index column.
for split_frame, csv_path in (
    (train_df, "metadata/train_df.csv"),
    (test_df, "metadata/test_df.csv"),
):
    split_frame.to_csv(csv_path, index=False)