In [None]:
import matplotlib.pyplot as plt
from functools import partial
import pandas as pd
import numpy as np
import tarfile 
import os

from google.colab import drive
# Mount Google Drive at /content/drive; the copy cell further down reads the
# dataset archives from /content/drive/MyDrive.
drive.mount('/content/drive')

In [2]:
def add_path(file_, path):
  '''
  Prefix a file name with the folder that contains it.

  The two strings are concatenated exactly as given: no separator is
  inserted, so `path` must already end with one (e.g. '/content/images/').

  Args:
    file_ (str) --> name of a file located inside `path`
    path (str) --> folder prefix, including its trailing separator

  Returns:
    filepath (str) --> `path` immediately followed by `file_`
  '''

  filepath = path + file_
  return filepath

In [3]:
# Recursively copy the nih-files folder from the mounted Drive into /content,
# producing /content/nih-files.
!cp -r /content/drive/MyDrive/datasets-projeto/nih-files /content

In [4]:
# Unpack every entry found in /content/nih-files into /content/.
# Each entry is opened with tarfile.open, so a non-tar file in that folder
# raises tarfile.ReadError.
extract_files = os.listdir('/content/nih-files')

for archive_name in extract_files:
  # The context manager closes the archive even when extraction fails part-way
  # through, so a corrupt or truncated tarball cannot leak an open handle.
  # NOTE: extractall trusts the member paths stored in the archive; only run
  # this on archives from a trusted source.
  with tarfile.open(f'/content/nih-files/{archive_name}') as archive:
    archive.extractall('/content/')

In [16]:
# Read the NIH metadata CSV and discard, in the same expression, the columns
# that the rest of the notebook does not use.
nih_df = (
  pd.read_csv('/content/nih_original_csv_file.csv')
    .drop(columns = [
      'Follow-up #',
      'Patient ID',
      'Patient Age',
      'View Position',
      'OriginalImage[Width',
      'Height]',
      'OriginalImagePixelSpacing[x',
      'y]',
    ])
)

In [17]:
# Rename the gender column to 'Sex', spell out its values, replace the image
# file name with a full path, and tag every row with its dataset / label type.
# NOTE: this cell consumes the 'Image Index' column, so it must be preceded by
# the cell that loads nih_df; running it twice in a row raises KeyError.
nih_df = nih_df.rename(columns = {'Patient Gender': 'Sex'})

# Assign the replaced column back explicitly. Calling replace(..., inplace=True)
# on the nih_df['Sex'] selection is a chained in-place update: it warns on
# recent pandas and leaves nih_df unchanged when Copy-on-Write is enabled.
nih_df['Sex'] = nih_df['Sex'].replace({'M': 'male', 'F': 'female'})

# Build '/content/images/<file name>' for every row.
image_index = list(nih_df['Image Index'])
partial_add_path = partial(add_path, path = '/content/images/')
image_index = list(map(partial_add_path, image_index))

nih_df['Path'] = image_index
nih_df = nih_df.drop(columns = ['Image Index'])

# A scalar is broadcast to every row; no need to build a list of len(nih_df).
nih_df['Dataset'] = 'NIH'
nih_df['Label Type'] = 'Multilabel'

In [26]:
# Class names addressed by position: every digit of a code in `combinations`
# is an index into `classes`.
classes = ['Nodule', 'Effusion', 'Atelectasis',  'Pneumothorax', 'Mass']
combinations = ['01', '02', '03', '04', '11', '12', '13', '14', '22', '23',
                '24', '33', '34', '44', '012', '013', '014', '123', '124',
                '234', '0123', '0124', '1234']

# Decode each code into its list of class names, e.g. '012' becomes
# ['Nodule', 'Effusion', 'Atelectasis'].
# NOTE(review): codes with a repeated digit ('11', '22', '33', '44') decode to
# the same class twice -- confirm this is intended.
abnormalities_combinations = [
  [classes[int(digit)] for digit in code]
  for code in combinations
]

In [31]:
# Show the raw label string of the row labelled 1; several findings on one
# image are stored in a single string.
nih_df.loc[1, 'Finding Labels']

'Cardiomegaly|Emphysema'

In [28]:
# Count how many rows carry each distinct 'Finding Labels' string, most
# frequent first.
nih_df.loc[:, 'Finding Labels'].value_counts()

No Finding                                                         60361
Infiltration                                                        9547
Atelectasis                                                         4215
Effusion                                                            3955
Nodule                                                              2705
                                                                   ...  
Atelectasis|Consolidation|Edema|Effusion|Infiltration|Pneumonia        1
Atelectasis|Consolidation|Effusion|Emphysema|Mass|Pneumothorax         1
Cardiomegaly|Effusion|Pleural_Thickening|Pneumothorax                  1
Edema|Infiltration|Pneumothorax                                        1
Atelectasis|Consolidation|Mass|Pleural_Thickening|Pneumothorax         1
Name: Finding Labels, Length: 836, dtype: int64

In [None]:
classes = ['Nodule', 'Effusion', 'Atelectasis',  'Pneumothorax', 'Mass']

# Column order of every per-class frame: metadata first, then one 0/1
# indicator column per entry of `classes`.
columns = ['Dataset', 'Path', 'Sex', 'Finding Labels', 'Label Type'] + classes

# Map each class name to the rows whose 'Finding Labels' is exactly that class
# (rows with that single finding only), with the label one-hot encoded.
dict_abnormalities = dict()
for classe in classes:
  # .copy() detaches the filtered rows from nih_df, so the column assignments
  # below write to an independent frame instead of a slice of nih_df.
  subset = nih_df[nih_df['Finding Labels'] == classe].copy()

  # Assigning a scalar broadcasts it over the subset's own index. This avoids
  # building a separate zero-filled frame (whose width has to match the number
  # of remaining classes) and joining it on an index that does not line up
  # with the filtered rows.
  for other in classes:
    subset[other] = int(other == classe)

  dict_abnormalities[classe] = subset[columns]

In [27]:
# Display the decoded lists of class names, one list per combination code.
abnormalities_combinations

[['Nodule', 'Effusion'],
 ['Nodule', 'Atelectasis'],
 ['Nodule', 'Pneumothorax'],
 ['Nodule', 'Mass'],
 ['Effusion', 'Effusion'],
 ['Effusion', 'Atelectasis'],
 ['Effusion', 'Pneumothorax'],
 ['Effusion', 'Mass'],
 ['Atelectasis', 'Atelectasis'],
 ['Atelectasis', 'Pneumothorax'],
 ['Atelectasis', 'Mass'],
 ['Pneumothorax', 'Pneumothorax'],
 ['Pneumothorax', 'Mass'],
 ['Mass', 'Mass'],
 ['Nodule', 'Effusion', 'Atelectasis'],
 ['Nodule', 'Effusion', 'Pneumothorax'],
 ['Nodule', 'Effusion', 'Mass'],
 ['Effusion', 'Atelectasis', 'Pneumothorax'],
 ['Effusion', 'Atelectasis', 'Mass'],
 ['Atelectasis', 'Pneumothorax', 'Mass'],
 ['Nodule', 'Effusion', 'Atelectasis', 'Pneumothorax'],
 ['Nodule', 'Effusion', 'Atelectasis', 'Mass'],
 ['Effusion', 'Atelectasis', 'Pneumothorax', 'Mass']]