In [80]:
from functools import partial
import pandas as pd
import numpy as np
import tarfile 
import os

In [32]:
def add_path(file_, path):
  '''
  prefix a file name with the folder that contains it

  A '/' is inserted between folder and file only when the folder does not
  already end with one, so '/content/images' and '/content/images/' give the
  same result. An empty folder returns the file name unchanged.

  Args:
    file_ (str) --> file that is inside the folder
    path (str) --> folder that contains the files, with or without a
                   trailing '/'
  
  Returns:
    filepath (str) --> file with destination path
  '''

  # Plain concatenation would glue 'images' and 'a.png' into 'imagesa.png'
  # whenever the caller leaves out the trailing slash.
  separator = '' if not path or path.endswith('/') else '/'
  return path + separator + file_

In [35]:
# Recursively copy the nih-files directory from /content/drive/MyDrive/datasets
# into /content (shell command run through the notebook's `!` escape).
!cp -r /content/drive/MyDrive/datasets/nih-files /content

In [None]:
# Unpack every archive found in /content/nih-files into /content/.
extract_files = os.listdir('/content/nih-files')

for archive_name in extract_files:
  # The context manager closes the archive even when extraction stops
  # part-way with an exception, so no file handle is left open.
  with tarfile.open(f'/content/nih-files/{archive_name}') as archive:
    # NOTE(review): extractall writes whatever member paths the archive
    # contains; only run this on archives from a trusted source.
    archive.extractall('/content/')

In [8]:
# Load the NIH metadata table, then discard the columns listed below.
nih_df = pd.read_csv('/content/Data_Entry_2017_v2020.csv')

unused_columns = [
    'Follow-up #',
    'Patient ID',
    'Patient Age',
    'View Position',
    'OriginalImage[Width',
    'Height]',
    'OriginalImagePixelSpacing[x',
    'y]',
]
nih_df = nih_df.drop(columns = unused_columns)

In [40]:
# Rename the gender column and spell out its two codes.
# The results are assigned back instead of using inplace methods: calling
# `replace(..., inplace = True)` on a column selection is chained assignment,
# which warns on recent pandas and leaves the frame untouched once
# Copy-on-Write is active.
nih_df = nih_df.rename(columns = {'Patient Gender': 'Sex'})
nih_df['Sex'] = nih_df['Sex'].replace({'M': 'male', 'F': 'female'})

# Prefix every image file name with the folder the archives were extracted to.
partial_add_path = partial(add_path, path = '/content/images/')
image_index = [partial_add_path(name) for name in nih_df['Image Index']]

nih_df['Path'] = image_index
nih_df = nih_df.drop(columns = ['Image Index'])
# A scalar is broadcast to every row, so no list of len(nih_df) copies is needed.
nih_df['Dataset'] = 'NIH'

In [None]:
# Findings that get their own frame. The order of this list fixes the order of
# the indicator columns in every per-finding frame.
abnormalities = ['Infiltration', 'Nodule', 'Emphysema', 'Effusion',
                 'Atelectasis', 'Mass', 'Pleural_Thickening', 'Pneumothorax',
                 'Fibrosis', 'Consolidation', 'Edema', 'Pneumonia']
metadata_columns = ['Dataset', 'Path', 'Sex', 'Finding Labels']


def one_hot_single_finding(df, finding, all_findings):
  '''
  select the rows labelled with one finding and add indicator columns

  Args:
    df (pd.DataFrame) --> frame with a 'Finding Labels' column
    finding (str) --> label to select; 'Finding Labels' must equal it exactly
    all_findings (list) --> every label that gets an indicator column

  Returns:
    subset (pd.DataFrame) --> copy of the selected rows with one float column
                              per label: 1.0 for `finding`, 0.0 for the rest
  '''

  # Copy first so the new columns are written to an independent frame rather
  # than to a filtered view of `df`.
  subset = df[df['Finding Labels'] == finding].copy()

  # Scalars are broadcast over the subset's own index. This avoids joining a
  # zero-filled frame that carries a fresh 0..n-1 index (which does not line
  # up with the filtered rows) and then hiding the resulting NaNs with a
  # frame-wide fillna(0) that would also overwrite missing metadata values.
  # Floats keep the 1.0 / 0.0 values the concatenated frame had before.
  for label in all_findings:
    subset[label] = 1.0 if label == finding else 0.0

  return subset


# One frame per finding, columns ordered as metadata followed by indicators.
dict_abnormalities = {
    finding: one_hot_single_finding(nih_df, finding, abnormalities)[
        metadata_columns + abnormalities]
    for finding in abnormalities
}

In [154]:
# Stack the per-finding frames in a fixed order, then remove the raw label
# column from the combined frame.
concat_order = ('Infiltration', 'Nodule', 'Emphysema', 'Effusion',
                'Atelectasis', 'Pleural_Thickening', 'Pneumothorax', 'Mass',
                'Fibrosis', 'Consolidation', 'Edema', 'Pneumonia')
stacked_frames = [dict_abnormalities[finding] for finding in concat_order]

nih_df = pd.concat(stacked_frames)
nih_df = nih_df.drop(columns = ['Finding Labels'])

In [158]:
# Columns written to disk: the three metadata columns followed by one column
# per key of dict_abnormalities.
columns = np.array(['Dataset', 'Path', 'Sex', *dict_abnormalities.keys()])
nih_df.to_csv('nih_df.csv', columns = columns)