In [4]:
import pandas as pd
import random
import os
import cv2
import imghdr
import time
import numpy as np
import datetime
import pandas as pd
from PIL import Image
import datetime
from tqdm import tqdm

In [7]:
def assign_group(row, grouping):
    """Look up which group a single value belongs to.

    Meant to be passed (via a lambda) to ``Series.apply``: ``row`` is one
    value from the series and ``grouping`` maps each group name to the
    collection of values that make up that group.

    Returns the name of the first group, in dictionary order, whose
    collection contains ``row``; returns ``None`` when no group matches.
    """
    matching_groups = (
        name for name, members in grouping.items() if row in members
    )
    return next(matching_groups, None)

In [17]:
# Input: the raw MedPix spreadsheet.
# NOTE(review): this path climbs two directory levels ('../../data') while
# the output directory below climbs only one ('../data') -- confirm both
# resolve as intended from the notebook's working directory.
path = '../../data/raw/medpix/Dataset_MedPix_V1.xlsx'

# Output: directory and CSV file name for the resulting dataset. The
# directory is created up front if it does not exist yet.
save_path_dir = '../data/intermediate/'
save_path = f'{save_path_dir}inter_medpix.csv'
os.makedirs(save_path_dir, exist_ok=True)

# Read the raw df
df = pd.read_excel(path)
# Row filters. Every text match below is a case-sensitive substring match,
# and `na=False` makes a missing cell count as "does not match" so each mask
# is a plain boolean Series.

# (1) and (2) Drop rows whose Image_Title contains 'Figure', 'Film',
# 'Infection' or 'Replace'.
# NOTE(review): the original note for this step also mentioned titles that
# are just a single number; no such check is implemented -- confirm intent.
filter1_2 = df['Image_Title'].str.contains(
    'Figure|Film|Infection|Replace', na=False)
df = df.loc[~filter1_2]

# (3) Drop rows whose title mentions both 'Dermatology' and 'Seminar'.
filter3 = (
    df['Image_Title'].str.contains('Dermatology', na=False) &
    df['Image_Title'].str.contains('Seminar', na=False)
)
df = df.loc[~filter3]

# (4) Drop rows whose diagnosis mentions 'New', 'case' and 'builder'.
filter4 = (
    df['Diagnosis'].str.contains('New', na=False) &
    df['Diagnosis'].str.contains('case', na=False) &
    df['Diagnosis'].str.contains('builder', na=False)
)
df = df.loc[~filter4]

# (5) Drop rows whose diagnosis mentions 'Unknown'.
filter5 = df['Diagnosis'].str.contains('Unknown', na=False)
df = df.loc[~filter5]

# (7) and (8) Drop rows missing Findings, Location or Full_Modality.
# Core modality is deliberately not required here because mammography rows
# have no core modality but are still wanted, and the view/plane is not
# required because ultrasound rows have none.
df = df.dropna(axis='index', how='any',
               subset=['Findings', 'Location', 'Full_Modality'])

# (9) Drop rows whose caption mentions both 'Replace' and 'with'.
filter9 = (
    df['Caption'].str.contains('Replace', na=False) &
    df['Caption'].str.contains('with', na=False)
)
df = df.loc[~filter9]

# (10) Drop rows whose full modality contains 'Drawing', 'Not' (covers both
# 'Not specified' and 'Not assigned') or 'Empty'. The 'Empty' case used to
# be missing: the third condition was a duplicate of the 'Not' test.
filterextra = df['Full_Modality'].str.contains(
    'Drawing|Not|Empty', na=False)
df = df.loc[~filterextra]

# (11) Keep only rows whose Core_Modality is one of the codes listed below.
# Keys are the modality group names; values are the raw Core_Modality codes
# that each group absorbs.
groupings = {
    'MR': ['MR'],
    'CT': ['CT'],
    'XR': ['XR'],
    'US': ['US', 'US-D'],
    'AN': ['AN', 'CTA', 'MRA'],
    'HE': ['HE', 'Histology'],
    'PET/NM': ['PET', 'NM'],
}

valid_mods = [
    modality
    for modalities in groupings.values()
    for modality in modalities
]
# .copy() makes df an independent frame, so the column assignments below
# write to df itself instead of to a slice of the previous frame (which
# triggers SettingWithCopyWarning).
df = df.loc[df['Core_Modality'].isin(valid_mods)].copy()

# (12) Add primary modality group names
df['Modality_Group'] = df['Core_Modality'].apply(
    lambda x: assign_group(x, groupings))

# (13) Add primary anatomy groups. Keys are the group names; values are the
# raw Location strings folded into each group. Any location not listed in
# a group is labelled 'Other'.
anatomy_groupings = {
    'Brain': ['Brain and Neuro', 'Nerve, central'],
    'Musculoskeletal': ['MSK - Musculoskeletal'],
    'Pulmonary': ['Chest, Pulmonary (ex. Heart)'],
    'Breast': ['Breast and Mammography'],
    'Abdomen': ['Abdomen - Generalized', 'Gastrointestinal'],
    'Genitourinary': ['Genitourinary'],
    'Spine': ['Spine'],
    'Head and Neck': ['Head and Neck (ex. orbit)',
                      'Eye and Orbit (exclude Ophthalmology)'],
    'Cardiovascular': ['Vascular', 'Cardiovascular (inc. Heart)'],
}
# Assign the filled result back to the column. Calling
# `df.Anatomy_Group.fillna(..., inplace=True)` acts on a temporary Series,
# which pandas warns about and which does not update df under Copy-on-Write.
df['Anatomy_Group'] = df['Location'].apply(
    lambda x: assign_group(x, anatomy_groupings)).fillna('Other')

# Rename columns to fit standard
df = df.rename(columns={'Case_Diagnosis': 'Impression',
                        'Location': 'Anatomy',
                        'ID': 'Path',
                        'Case_URL': 'File URL',
                        'Image_URL': 'URL',
                        'Full_Modality': 'Modality',
                        'History': 'Patient history'})

# Take only relevant columns. .copy() makes df an independent frame so the
# Full_Caption assignment below writes to df itself rather than to a slice
# of the wider frame (which triggers SettingWithCopyWarning).
df = df[['Path', 'Modality', 'Plane', 'Anatomy', 'Patient history',
         'Findings', 'Impression', 'Diagnosis', 'Core_Modality',
         'Modality_Group', 'Anatomy_Group']].copy()


def _full_caption(row):
    """Build the caption string for one row of the dataset.

    Concatenates the row's Modality, Plane, Anatomy, Findings and
    Impression values, each converted with ``str`` (so a missing value
    appears as the text 'nan'), between '<start>' and '<end>' markers.
    """
    return (
        f"<start> Modality: {row['Modality']!s}"
        f" Plane: {row['Plane']!s}"
        f" Anatomy: {row['Anatomy']!s}"
        f" Findings: {row['Findings']!s}"
        f" Impression: {row['Impression']!s} <end>"
    )


# Generate the Full caption to be predicted by models.
df['Full_Caption'] = df.apply(_full_caption, axis=1)

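# NOTE: everything below is disabled. As written, this cell never writes the
# dataset to `save_path`, and `check_images` is not defined in the cells
# shown here -- define/import it before re-enabling these steps.
# # Make paths relative to source
# prefix = '../data/raw/medpix/Images/'
# suffix = '.jpg'
# df.Path = prefix+df.Path.astype(str)+suffix

# # Check that the images exist or otherwise eliminate them 
# bad_images=check_images(df.Path.to_list())
# print('listing bad images')
# print(bad_images)
# print(f'there are a total of {len(bad_images)} bad images')
# df=df.loc[~df.Path.isin(bad_images)]
# df.to_csv(save_path)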