In [49]:
from functools import partial
import pandas as pd
import os

In [29]:
def add_path(file_, path):
  '''
  prepend the containing folder to a file name

  Args:
    file_ (str) --> name of a file that is inside the folder
    path (str) --> folder that contains the files, with or without a
      trailing separator
  
  Returns:
    filepath (str) --> folder and file name joined into a single path
  '''

  # os.path.join adds the separator only when it is missing, so a folder
  # passed without a trailing slash no longer produces 'folderfile'.
  # note: an absolute file_ is returned unchanged by os.path.join
  return os.path.join(path, file_)

In [34]:
def select_abnormalities(file_img):
  '''
  keep a file name only when it is flagged as an abnormal image

  The flag is the last character before the first '.' of the name; the
  file is kept when that character is the digit 1.

  Args:
    file_img (str) --> montgomery or shenzhen filename 
  
  Returns:
    file_img (str or None) --> the same filename if it is an abnormal
      image, otherwise None
  '''

  try:
    flag = int(file_img.split('.')[0][-1])
  except (AttributeError, IndexError, TypeError, ValueError):
    # not a usable name: non-string entry, name starting with '.', or a
    # stem that does not end in a digit --> treated as "not abnormal"
    return None

  return file_img if flag == 1 else None

In [102]:
def read_txt(file_):
  '''
  read a txt file and return its content in lower case
  
  Args:
    file_ (str) --> txt file path

  Returns:
    content (str) --> txt file text content, lower-cased
  '''

  # the context manager closes the handle even when reading fails, so
  # mapping this function over many files does not leak open descriptors
  with open(file_, "r") as handle:
    return handle.read().lower()

In [140]:
def find_sex(content_str, dataset):
  '''
  look up the patient's sex in the text of a clinical note
  
  Args:
    content_str (str) --> clinical information of a radiograph; the
      markers below are lower case, so the text must be lower case too
    dataset (str) --> reference data set, 'montgomery' or 'shenzhen'

  Returns:
    sex_info (str or None) --> 'female' or 'male'; None when no marker
      is found or the data set is not recognised
  '''

  # (marker, label) pairs are tried in order; the female marker comes
  # first because the string 'female' also contains 'male'
  if dataset == 'montgomery':
    markers = (('sex: f', 'female'), ('sex: m', 'male'))
  elif dataset == 'shenzhen':
    markers = (('female', 'female'), ('male', 'male'))
  else:
    return None

  for marker, label in markers:
    if marker in content_str:
      return label

  return None

In [None]:
# copy the dataset archive from the Drive folder (presumably a mounted
# Google Drive -- confirm it is mounted before running) into /content
!cp /content/drive/MyDrive/datasets/montgomery-shenzhen-datasets.zip /content
# extract the archive; no -d target is given, so the files land in the
# current working directory (the listing cell below reads from /content)
!unzip /content/montgomery-shenzhen-datasets.zip

In [103]:
# os.listdir returns entries in arbitrary order. The image list and the
# clinical-reading list of a dataset are later paired by position, so each
# listing is sorted to keep both lists in the same file-name order.
montgomery_f = sorted(os.listdir('/content/Montgomery/MontgomerySet/CXR_png'))
montgomery_l = sorted(os.listdir('/content/Montgomery/MontgomerySet/ClinicalReadings'))
shenzhen_f = sorted(os.listdir('/content/ChinaSet_AllFiles/ChinaSet_AllFiles/CXR_png'))
shenzhen_l = sorted(os.listdir('/content/ChinaSet_AllFiles/ChinaSet_AllFiles/ClinicalReadings'))

In [104]:
# each entry becomes either its own file name (flagged abnormal) or None
montgomery_f = [select_abnormalities(name) for name in montgomery_f]
montgomery_l = [select_abnormalities(name) for name in montgomery_l]
shenzhen_f = [select_abnormalities(name) for name in shenzhen_f]
shenzhen_l = [select_abnormalities(name) for name in shenzhen_l]

In [105]:
# drop the None placeholders, keeping only the selected file names;
# None is a singleton, so it is tested by identity rather than equality
montgomery_f = [name for name in montgomery_f if name is not None]
montgomery_l = [name for name in montgomery_l if name is not None]
shenzhen_f = [name for name in shenzhen_f if name is not None]
shenzhen_l = [name for name in shenzhen_l if name is not None]

In [106]:
# prefix every remaining file name with the folder it was listed from;
# partial_add_path is rebound for each folder before it is applied
partial_add_path = partial(
  add_path, path = '/content/Montgomery/MontgomerySet/CXR_png/')
montgomery_f = [partial_add_path(name) for name in montgomery_f]

partial_add_path = partial(
  add_path, path = '/content/Montgomery/MontgomerySet/ClinicalReadings/')
montgomery_l = [partial_add_path(name) for name in montgomery_l]

partial_add_path = partial(
  add_path, path = '/content/ChinaSet_AllFiles/ChinaSet_AllFiles/CXR_png/')
shenzhen_f = [partial_add_path(name) for name in shenzhen_f]

partial_add_path = partial(
  add_path, path = '/content/ChinaSet_AllFiles/ChinaSet_AllFiles/ClinicalReadings/')
shenzhen_l = [partial_add_path(name) for name in shenzhen_l]

In [151]:
# read every clinical note and reduce it to the sex label found in it
# (None when find_sex recognises no marker), one label per note, in list order
partial_find_sex = partial(find_sex, dataset = 'montgomery')
montgomery_l_sex = [partial_find_sex(read_txt(note)) for note in montgomery_l]

partial_find_sex = partial(find_sex, dataset = 'shenzhen')
shenzhen_l_sex = [partial_find_sex(read_txt(note)) for note in shenzhen_l]

In [175]:
def _build_tb_frame(paths, sexes, dataset_name):
  '''
  build the table of one dataset

  Args:
    paths (list) --> image paths, one row per path
    sexes (list) --> sex label per image, in the same order as paths
    dataset_name (str) --> value written to the Dataset column

  Returns:
    frame (pd.DataFrame) --> columns Dataset, Path, Sex, Tuberculosis
  '''

  frame = pd.DataFrame(data = paths, columns = ['Path'])
  frame['Dataset'] = [dataset_name] * len(paths)
  # every row is labelled 1
  frame['Tuberculosis'] = [1] * len(paths)
  # labels are paired with paths by position; pandas raises ValueError
  # when the two lists differ in length
  frame['Sex'] = sexes

  return frame[['Dataset', 'Path', 'Sex', 'Tuberculosis']]


montgomery_df = _build_tb_frame(montgomery_f, montgomery_l_sex, 'Montgomery')
shenzhen_df = _build_tb_frame(shenzhen_f, shenzhen_l_sex, 'Shenzhen')

# ignore_index renumbers the rows 0..n-1; without it both frames keep their
# own 0-based labels and the index column written to the csv has duplicates
montgomery_shenzhen_df = pd.concat([montgomery_df, shenzhen_df], 
  ignore_index = True)
montgomery_shenzhen_df.to_csv(path_or_buf = 'montgomery_shenzhen_df.csv', 
  columns = ['Dataset', 'Path', 'Sex', 'Tuberculosis'])