In [1]:
import h5py
import pandas as pd
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
import glob

In [2]:
# Read metadata.
# Dataset root directory. The default is the location this notebook was
# originally run from; set the COVID_CT_MD_DIR environment variable to point
# the notebook at a different copy of the dataset without editing the code.
path = os.environ.get("COVID_CT_MD_DIR", "/media/yesindeed/WD5T/data/COVID-CT-MD/")

# Clinical metadata table; later cells rely on its "Diagnosis" and "Folder"
# columns. os.path.join is used so the root works with or without a trailing
# path separator.
demo_data = pd.read_csv(os.path.join(path, "Clinical-data.csv"))
demo_data

Unnamed: 0,clinical_data,Diagnosis,Folder,Patient Gender,Patient Age,Weight,Clinical characteristics,Surgery,Follow-up,PCR
0,0,COVID-19,P001,M,039Y,80.0,"Dyspnea, Cough, Fever",,,
1,1,COVID-19,P002,F,048Y,75.0,"Dyspnea, Flu-like symptoms",No,Uneventful,
2,2,COVID-19,P003,M,068Y,90.0,Cough,,Hospitalized,
3,3,COVID-19,P004,M,065Y,75.0,"Dyspnea, fever",No,Uneventful,
4,4,COVID-19,P005,M,029Y,78.0,"Cough, Fatigue",No,Uneventful,
...,...,...,...,...,...,...,...,...,...,...
300,302,Normal,normal072,M,031Y,105.0,Dyspnea,,,
301,303,Normal,normal073,M,039Y,96.0,"Headache, Myalgia",Yes,,
302,304,Normal,normal074,M,040Y,97.0,Chest pain,Yes,,
303,305,Normal,normal075,M,056Y,92.0,Dyspnea,,,


In [3]:
# Slice-level label array; it is indexed further down as
# slice_label[row, slice_position].
slice_label = np.load(
    os.path.join(path, "Slice-level-labels-updated-1.npy")
)

In [4]:
# Index table: one row per entry, with "Label Index", "Diagnosis" and
# "Folder/ID" columns that the slice-metadata cell reads.
index_df = pd.read_csv(
    os.path.join(path, "Index.csv")
)
index_df

Unnamed: 0,Label Index,Diagnosis,Relative Path,Folder/ID
0,0,COVID-19,./COVID-19/,P001
1,1,COVID-19,./COVID-19/,P002
2,2,COVID-19,./COVID-19/,P003
3,3,COVID-19,./COVID-19/,P004
4,4,COVID-19,./COVID-19/,P005
...,...,...,...,...
74,75,CAP,./CAP/,cap021
75,76,CAP,./CAP/,cap022
76,77,CAP,./CAP/,cap023
77,78,CAP,./CAP/,cap024


In [28]:
# Build a per-slice table: one row for every *.dcm file found in a patient
# folder, holding the folder name, a synthesized slice file stem (IM0001,
# IM0002, ...) and the slice label.
slice_meta_dict = {"Folder": [], "Slice File": [], "Slice Label": []}

# Part 1: patients listed in Index.csv, whose labels come from `slice_label`.
for i in range(len(index_df)):
    item = index_df.iloc[i]

    # Row of `slice_label` that belongs to this patient. This must come from
    # the "Label Index" column rather than from the row position `i`: the two
    # differ in Index.csv (e.g. the row at position 74 has Label Index 75), so
    # indexing by position would attach another patient's labels.
    label_index = int(item["Label Index"])

    d = item["Diagnosis"]
    folder = item["Folder/ID"]

    # The diagnosis is spelled "CAP" in the table, but the case folder is
    # looked up as "Cap Cases".
    if d == "CAP":
        d = "Cap"

    num_slice = len(glob.glob(os.path.join(
        path, f"{d} Cases", folder, "*.dcm")))

    # NOTE(review): assumes column k of `slice_label` corresponds to the
    # (k+1)-th slice file of the patient - TODO confirm against the dataset
    # documentation and the PNG export ordering.
    for i_slice in range(num_slice):
        slice_meta_dict["Folder"].append(folder)
        slice_meta_dict["Slice File"].append(f"IM{str(i_slice+1).zfill(4)}")
        slice_meta_dict["Slice Label"].append(
            slice_label[label_index, i_slice])

# Part 2: "Normal" patients from the clinical table; every slice gets label 0.
normal_df = demo_data.loc[demo_data["Diagnosis"]
                          == "Normal"].reset_index(drop=True)

for i in normal_df.index.tolist():
    item = normal_df.iloc[i]

    d = item["Diagnosis"]
    folder = item["Folder"]

    num_slice = len(glob.glob(os.path.join(
        path, f"{d} Cases", folder, "*.dcm")))

    for i_slice in range(num_slice):
        slice_meta_dict["Folder"].append(folder)
        slice_meta_dict["Slice File"].append(f"IM{str(i_slice+1).zfill(4)}")
        slice_meta_dict["Slice Label"].append(0)

slice_meta = pd.DataFrame.from_dict(slice_meta_dict)
slice_meta

Unnamed: 0,Folder,Slice File,Slice Label
0,P001,IM0001,0
1,P001,IM0002,0
2,P001,IM0003,0
3,P001,IM0004,0
4,P001,IM0005,0
...,...,...,...
23344,normal076,IM0167,0
23345,normal076,IM0168,0
23346,normal076,IM0169,0
23347,normal076,IM0170,0


In [29]:
# Attach the per-slice rows to the clinical table, keep only patients that
# actually have at least one slice row, and write the result to test.csv.
df_slice = (
    demo_data
    .merge(slice_meta, how="left", left_on="Folder", right_on="Folder")
    .loc[lambda merged: merged["Slice File"].notna()]
    .reset_index(drop=True)
)
df_slice.to_csv(os.path.join(path, "test.csv"), index=False)

In [15]:
import pydicom
import SimpleITK as sitk
import sys
import glob
from PIL import Image

# According to https://github.com/ShahinSHH/COVID-CT-MD, the slice index and slice location may not match.
# Re-sort based on the slice location metadata, and convert DICOM files to PNGs.


def read_resort_dcm(folder):
    """Load the ``*.dcm`` files directly inside ``folder``, ordered by location.

    Every matching file is read with ``pydicom.dcmread``. Datasets that have
    no ``SliceLocation`` attribute are dropped, and a one-line message with
    the number of dropped files is printed when that number is non-zero.

    Returns the remaining datasets as a list sorted by ``SliceLocation`` in
    descending order.
    """
    pattern = os.path.join(folder, "*.dcm")
    datasets = [pydicom.dcmread(fname)
                for fname in glob.glob(pattern, recursive=False)]

    # Keep only datasets that carry a SliceLocation (e.g. scout views do not).
    located = [ds for ds in datasets if hasattr(ds, "SliceLocation")]

    skipcount = len(datasets) - len(located)
    if skipcount > 0:
        print(f"folder {folder}: skipped, no SliceLocation: {skipcount}")

    # Stable in-place sort, largest SliceLocation first.
    located.sort(key=lambda ds: ds.SliceLocation, reverse=True)

    return located


def save_slices(slices, out_folder, window=(-1250, 250)):
    """Write each slice as an 8-bit PNG named ``IM0001.png``, ``IM0002.png``, ...

    For every slice the stored pixel values are rescaled with the slice's
    ``RescaleSlope`` / ``RescaleIntercept``, clipped to ``window`` and mapped
    linearly so that ``window[0]`` becomes 0 and ``window[1]`` becomes 255
    (fractions are truncated by the ``uint8`` cast). File numbering follows
    the order of ``slices``.

    Parameters
    ----------
    slices : iterable
        Objects exposing ``pixel_array``, ``RescaleSlope`` and
        ``RescaleIntercept`` (for example the list returned by
        ``read_resort_dcm``).
    out_folder : str
        Destination directory. It is created only once there is a slice to
        write, so an empty ``slices`` leaves the file system untouched.
    window : sequence of two numbers, optional
        ``(low, high)`` intensity window applied after rescaling. Only the
        first two items are used.

    Raises
    ------
    ValueError
        If ``window[0]`` is not strictly smaller than ``window[1]``; such a
        window would otherwise divide by zero or invert the clipping range
        and silently produce meaningless images.
    """
    low, high = window[0], window[1]
    if not low < high:
        raise ValueError(
            f"window must satisfy low < high, got ({low}, {high})")

    for i, ds in enumerate(slices):
        array = ds.pixel_array.astype(float)
        # Apply the linear rescale stored with the slice before windowing.
        array = array * ds.RescaleSlope + ds.RescaleIntercept
        array = np.clip(array, low, high)
        array = (array - low) / (high - low) * 255
        img = Image.fromarray(array.astype(np.uint8))

        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists test; repeating the call for later slices is a no-op.
        os.makedirs(out_folder, exist_ok=True)

        img.save(os.path.join(out_folder, f"IM{str(i+1).zfill(4)}.png"))


# Convert every patient in the clinical table: read and re-sort the DICOM
# slices of the case folder, then write them as PNGs under <path>/png/.
for d, f in zip(demo_data["Diagnosis"], demo_data["Folder"]):
    # The table says "CAP" while the folder is looked up as "Cap Cases".
    d = "Cap" if d == "CAP" else d
    slices = read_resort_dcm(
        os.path.join(path, f"{d} Cases", f)
    )
    save_slices(
        slices,
        os.path.join(path, "png", f"{d} Cases", f),
    )

# # dcm.SliceLocation
# print(slices[0].SliceLocation)
# print(slices[1].SliceLocation)

In [28]:
# NOTE(review): this binds the name `df_slice` to the pandas module itself,
# replacing the DataFrame assigned to `df_slice` in the merge cell above.
# The statement looks unfinished (perhaps a `pd.read_csv(...)` call was
# intended) - TODO confirm the intent, then complete or delete this cell.
df_slice = pd

{'Folder': ['P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
  'P001',
