In [4]:
import pandas as pd
import random
import os
import cv2
import imghdr
import time
import numpy as np
import datetime
import pandas as pd
from PIL import Image
import datetime
from tqdm import tqdm

In [168]:
def check_images(path_list):
    """
    Check the integrity of the image files listed in ``path_list``.

    A path is reported as bad when any of the following holds:

    * ``imghdr.what`` raises while probing it (e.g. the file is missing
      or cannot be read),
    * the detected file type is not one of the accepted image formats,
    * OpenCV cannot decode it (``cv2.imread`` returns ``None`` or raises).

    Args:
        path_list (list of str): list containing the paths to images.

    Returns:
        bad_images (list of str): paths that failed a check, in input
            order. Each entry of ``path_list`` is reported at most once,
            so ``len(bad_images)`` is the number of failing entries.

    """
    # imghdr reports JPEG files as 'jpeg'; 'jpg' is kept for completeness.
    # NOTE(review): imghdr is deprecated since Python 3.11 and removed in
    # 3.13 -- plan a replacement before upgrading the interpreter.
    valid_types = {'jpg', 'png', 'jpeg', 'gif', 'bmp'}

    bad_images = []
    for f_path in tqdm(path_list):
        # Step 1: sniff the file header. `except Exception` (rather than a
        # bare `except`) lets Ctrl-C / kernel interrupts stop the loop
        # instead of silently flagging the current file as bad.
        try:
            detected_type = imghdr.what(f_path)
        except Exception:
            bad_images.append(f_path)
            continue

        # Step 2: reject anything that is not a recognised image format.
        # `continue` so the same path is not appended again by step 3.
        if detected_type not in valid_types:
            bad_images.append(f_path)
            continue

        # Paths that are not regular files are not decoded (and, as
        # before, not flagged by the decoding step).
        if not os.path.isfile(f_path):
            continue

        # Step 3: make sure OpenCV can actually decode the pixel data.
        # cv2.imread signals failure by returning None instead of raising.
        try:
            img = cv2.imread(f_path)
        except Exception:
            img = None
        if img is None:
            bad_images.append(f_path)

    return bad_images

In [169]:
# Destination of the intermediate dataset produced by this notebook.
save_path_dir = '../data/intermediate/'
os.makedirs(save_path_dir, exist_ok=True)
save_path = f'{save_path_dir}inter_medpix.csv'

# Location of the raw MedPix spreadsheet.
path = '../data/raw/medpix/Dataset_MedPix_V1.xlsx'

# Columns kept from the raw sheet, in the order they appear in the output.
useful_cols = [
    'ID',
    'Plane',
    'Core_Modality',
    'Full_Modality',
    'Findings',
    'Case_Diagnosis',
    'Location',
]

# Raw column name -> name used by the rest of the pipeline.
renamings = {
    'ID': 'Path',
    'Full_Modality': 'Modality',
    'Case_Diagnosis': 'Impression',
    'Location': 'Anatomy',
}

# Read the sheet, keep the useful columns, rename them and drop every row
# that has at least one empty value. The resulting columns are
# ['Path', 'Plane', 'Core_Modality', 'Modality', 'Findings', 'Impression',
#  'Anatomy'].
df = (
    pd.read_excel(path)
    .loc[:, useful_cols]
    .rename(columns=renamings)
    .dropna()
)

# CLEANING ON THE "PLANE" COLUMN-------------------------
# Consolidate synonyms into a single canonical plane name.
# The column is replaced through `assign` instead of
# `df.Plane.replace(..., inplace=True)`: the chained in-place form mutates a
# temporary Series and is a silent no-op under pandas Copy-on-Write.
plane_synonyms = {
    'Transverse': 'Axial',
    'Lateral': 'Sagittal',
    'Frontal': 'Coronal',
}
df = df.assign(Plane=df['Plane'].replace(plane_synonyms))

# Keep rows whose plane value has a frequency higher than 100.
plane_counts = df['Plane'].value_counts()
valid_index = plane_counts.index[plane_counts > 100]
df = df.loc[df['Plane'].isin(valid_index)]

# Drop rows whose plane carries no usable information.
df = df.loc[~df['Plane'].isin(['NOS - Not specified',
                               'Other View (see caption)'])]

# CLEANING ON THE "CORE_MODALITY" COLUMN-------------------
# Consolidate synonyms under the same concept. No replacement value is also
# a key, so a single dict-based replace matches the one-by-one replaces.
core_modality_synonyms = {
    'US-D': 'US',
    'CTA': 'AN',
    'MRA': 'AN',
    'Histology': 'HE',
    'PET': 'PET/NM',
    'NM': 'PET/NM',
    'PET-CT': 'PET/NM',
    'MRS': 'MR',
}
df = df.assign(Core_Modality=df['Core_Modality'].replace(core_modality_synonyms))

# Keep rows whose Core_Modality value has a frequency higher than 100.
core_modality_counts = df['Core_Modality'].value_counts()
valid_index = core_modality_counts.index[core_modality_counts > 100]
df = df.loc[df['Core_Modality'].isin(valid_index)]

# Drop rows whose core modality is not specified.
df = df.loc[~df['Core_Modality'].isin(['NOS'])]

def _word_count(text):
    """Return the number of whitespace-separated words in ``text``."""
    return len(str(text).split())


# CLEANING ON "FINDINGS" COLUMN------------------------------
# Eliminate rows that have a "findings" word count larger than 100 words.
# The counts are used only as a filter mask and are never stored as a
# column, so no helper column ends up in the saved dataset.
df = df.loc[df['Findings'].apply(_word_count) <= 100]

# CLEANING ON THE "ANATOMY" COLUMN--------------------------
# Consolidate synonyms into a single canonical anatomy name. The column is
# replaced through `assign` because `df.Anatomy.replace(..., inplace=True)`
# mutates a temporary Series and is a silent no-op under pandas
# Copy-on-Write. No replacement value is also a key, so one dict-based
# replace matches the one-by-one replaces.
anatomy_synonyms = {
    'Brain and Neuro': 'Brain',
    'Nerve, central': 'Brain',
    'MSK - Musculoskeletal': 'Musculoskeletal',
    'Extremity - Please Select MSK': 'Musculoskeletal',
    'Chest, Pulmonary (ex. Heart)': 'Pulmonary',
    'Breast and Mammography': 'Breast',
    'Abdomen - Generalized': 'Abdomen',
    'Gastrointestinal': 'Abdomen',
    'Head and Neck (ex. orbit)': 'Head and Neck',
    'Eye and Orbit (exclude Ophthalmology)': 'Head and Neck',
    'Vascular': 'Cardiovascular',
    'Cardiovascular (inc. Heart)': 'Cardiovascular',
    'Multisystem': 'Generalized',
}
df = df.assign(Anatomy=df['Anatomy'].replace(anatomy_synonyms))

# Keep rows that have Anatomy values with a frequency higher than 200.
anatomy_counts = df['Anatomy'].value_counts()
valid_index = anatomy_counts.index[anatomy_counts > 200]
df = df.loc[df['Anatomy'].isin(valid_index)]

# CLEANING THE IMPRESSIONS COLUMN---------------------
# Keep impressions with at most 30 words.
df = df.loc[df['Impression'].apply(_word_count) <= 30]

# Make sure no "Number of Words" helper column survives. The earlier
# `df.drop(columns=...).count()` discarded its result, so the column was
# written to the CSV. `errors='ignore'` keeps this a no-op when absent.
df = df.drop(columns='Number of Words', errors='ignore')

# CONVERT THE PATH COLUMN INTO THE COMPLETE PATHS--------------------
# `assign` builds a new frame, so no column is written onto a filtered
# slice of an earlier frame (avoids SettingWithCopyWarning).
prefix = '../data/raw/medpix/Images/'
suffix = '.jpg'
df = df.assign(Path=prefix + df['Path'].astype(str) + suffix)

# CREATE THE FULL CAPTIONS COLUMN-------------------------------------
# Built with vectorised string concatenation instead of a row-wise
# `df.apply(axis=1)`: same text per row, much faster, and it still yields a
# Series when the frame is empty.
# NOTE(review): 'Core Modality:' has no space after the colon, unlike the
# other fields. It is kept byte-for-byte because downstream code may depend
# on the exact caption format -- confirm before normalising.
full_caption = (
    '<start>'
    + ' Core Modality:' + df['Core_Modality'].astype(str)
    + ' Modality: ' + df['Modality'].astype(str)
    + ' Plane: ' + df['Plane'].astype(str)
    + ' Anatomy: ' + df['Anatomy'].astype(str)
    + ' Findings: ' + df['Findings'].astype(str)
    + ' Impression: ' + df['Impression'].astype(str)
    + ' <end>'
)
df = df.assign(Full_Caption=full_caption)

# CHECK THAT WE ARE ABLE TO OPEN IMAGES POINTED BY THE PATH COLUMN------------------
bad_images = check_images(df['Path'].to_list())
print('listing bad images')
print(bad_images)
print(f'there are a total of {len(bad_images)} bad images')

# Eliminate rows with bad images from the dataframe, then persist it.
df = df.loc[~df['Path'].isin(bad_images)]
df.to_csv(save_path)

100%|██████████| 25805/25805 [03:38<00:00, 118.27it/s]


listing bad images
['../data/raw/medpix/Images/258.jpg', '../data/raw/medpix/Images/266.jpg', '../data/raw/medpix/Images/267.jpg', '../data/raw/medpix/Images/268.jpg', '../data/raw/medpix/Images/741.jpg', '../data/raw/medpix/Images/742.jpg', '../data/raw/medpix/Images/903.jpg', '../data/raw/medpix/Images/989.jpg', '../data/raw/medpix/Images/1384.jpg', '../data/raw/medpix/Images/1752.jpg', '../data/raw/medpix/Images/1758.jpg', '../data/raw/medpix/Images/1759.jpg', '../data/raw/medpix/Images/1762.jpg', '../data/raw/medpix/Images/15800.jpg', '../data/raw/medpix/Images/15801.jpg', '../data/raw/medpix/Images/15802.jpg']
there are a total of 16 bad images
