In [1]:
import os
from tqdm import tqdm
import pandas as pd
from preprocessing.getUID import *
from preprocessing.getXML import *

# Ignore future warnings
import warnings 
warnings.simplefilter(action='ignore', category=FutureWarning)


In [2]:
# IDs
patientsID = [file_id.split("-")[1] for file_id in sorted(
    list(os.listdir("../manifest-1608669183333/Lung-PET-CT-Dx/")))[1:]]

# DICOM paths
dicom_list = ["../manifest-1608669183333/Lung-PET-CT-Dx/Lung_Dx-" +
              file_id for file_id in patientsID]

# Annotation paths
annot_list = ["../Lung-PET-CT-Dx-Annotations-XML-Files-rev12222020/Annotation/" +
              file_id for file_id in patientsID]


In [3]:
dataframe = pd.DataFrame(
    columns=["patient", "image", "xmin", "ymin", "xmax", "ymax", "class"])


In [4]:
num_classes = 4
class_list = ['A', 'B', 'E', 'G']


In [5]:
def save_image_and_annotations(patient, img_name, img_np, img_data, label_list, dataframe):
   
    for rect in img_data:
        bounding_box = [rect[0], rect[1], rect[2], rect[3]]
        xmin = int(bounding_box[0])
        ymin = int(bounding_box[1])
        xmax = int(bounding_box[2])
        ymax = int(bounding_box[3])

        label_array = rect[4:]
        index = int(np.where(label_array == 1)[0])
        label = label_list[index]

    if not os.path.exists("../data/" + patient):
        os.mkdir("../data/" + patient)

    dataframe = dataframe.append({"patient": patient, "image": img_name, "xmin": xmin,
                                 "ymin": ymin, "xmax": xmax, "ymax": ymax, "class": label}, ignore_index=True)

    cv2.imwrite("../data/" + patient + "/" + img_name + '.jpg', img_np)
    return dataframe


In [6]:
zip_list = zip(patientsID, dicom_list, annot_list)

for patientID, dicom_path, annotation_path in tqdm(zip_list):

    dict = getUID_path(dicom_path)
    
    if os.path.isdir(annotation_path):
        annotations = XML_preprocessor(
            annotation_path, num_classes=num_classes).data

        i = 0
        for (k, v) in list(annotations.items()):
            try:
                dcm_path, dcm_name = dict[k[:-4]]
                matrix, _, _, _, ch = loadFile(os.path.join(dcm_path))
                img_bitmap = MatrixToImage(matrix[0], ch)
                dataframe = save_image_and_annotations(
                    patient=patientID,
                    img_name=str(i),
                    img_np=img_bitmap,
                    img_data=v,
                    label_list=class_list,
                    dataframe=dataframe
                )

                i += 1
                if (i == 10):
                    break
            except:
                pass


114it [01:05,  1.20it/s]

unknown label: Q
unknown label: Q
unknown label: Q
unknown label: Q
unknown label: Q
unknown label: Q
unknown label: Q
unknown label: Q
unknown label: Q
unknown label: Q


183it [02:34,  2.63s/it]

unknown label: A0192
unknown label: A0192
unknown label: A0192
unknown label: A0192
unknown label: A0192
unknown label: A0192
unknown label: A0192
unknown label: A0192
unknown label: A0192


355it [07:23,  1.25s/it]


In [7]:
dataframe.to_csv("../data/annotations.csv", index=False)
dataframe

Unnamed: 0,patient,image,xmin,ymin,xmax,ymax,class
0,A0001,0,288,313,351,380,A
1,A0001,1,290,305,338,378,A
2,A0001,2,286,310,355,402,A
3,A0001,3,282,307,355,388,A
4,A0001,4,298,298,335,374,A
...,...,...,...,...,...,...,...
3344,G0062,5,282,324,380,412,G
3345,G0062,6,316,302,373,366,G
3346,G0062,7,299,325,388,403,G
3347,G0062,8,286,326,376,404,G


In [8]:
stats = pd.read_excel("../statistics-clinical-20201221.xlsx")
stats.to_csv("../data/stats.csv", index=False)
stats

Unnamed: 0,No.,NewPatientID,Sex,Age,weight (kg),T-Stage,N-Stage,Ｍ-Stage,Histopathological grading,Smoking History
0,1,A0001,M,58.0,65.0,2b,3,1b,G3,1
1,2,A0002,F,53.0,55.0,2b,1,0,,0
2,3,A0003,M,60.0,62.0,1c,1,0,G3,0
3,4,A0004,F,48.0,56.0,1c,3,0,,0
4,5,A0005,M,70.0,75.0,1b,0,0,G2,1
...,...,...,...,...,...,...,...,...,...,...
350,351,G0057,M,72.0,67.5,1c,0,0,G2,1
351,352,G0058,M,57.0,55.0,4,0,0,,1
352,353,G0059,M,53.0,62.0,1b,0,0,,1
353,354,G0060,M,57.0,74.0,2a,1,0,,0
