In [1]:
import settings
import helpers
import SimpleITK  # conda install -c https://conda.anaconda.org/simpleitk SimpleITK
import numpy
import pandas
import ntpath
import cv2  # conda install -c https://conda.anaconda.org/menpo opencv3
import shutil
import random
import math
import multiprocessing
from bs4 import BeautifulSoup #  conda install beautifulsoup4, coda install lxml
import os
import glob

# Seed both the stdlib and the numpy random generators with a fixed value so
# that the random draws made further down are repeatable from run to run.
random.seed(1321)
numpy.random.seed(1321)


def find_mhd_file(patient_id):
    """Return the path of the first raw ``.mhd`` file whose path contains *patient_id*.

    For every subset number from ``settings.LUNA_SUBSET_START_INDEX`` up to 9
    two directory layouts below ``settings.LUNA16_RAW_SRC_DIR`` are searched:
    ``subset<N>/`` and ``train_subset<NN>/`` (zero-padded to two digits). The
    second layout is the one the other scanners in this file read from, so
    looking there as well keeps this lookup consistent with them.

    Args:
        patient_id: Text that must occur somewhere in the file path (this is a
            substring test on the whole path, not only on the file name).

    Returns:
        The matching path, or ``None`` when no file matches.
    """
    for subject_no in range(settings.LUNA_SUBSET_START_INDEX, 10):
        candidate_dirs = (
            settings.LUNA16_RAW_SRC_DIR + "subset" + str(subject_no) + "/",
            settings.LUNA16_RAW_SRC_DIR + "train_subset" + str(subject_no).zfill(2) + "/",
        )
        for src_dir in candidate_dirs:
            for src_path in glob.glob(src_dir + "*.mhd"):
                if patient_id in src_path:
                    return src_path
    return None


def load_lidc_xml(xml_path, agreement_threshold=0, only_patient=None, save_nodules=False):
    """Parse one LIDC XML annotation file into nodule / non-nodule label rows.

    The series UID found in the XML is used as the patient id and matched to a
    raw ``.mhd`` scan via ``find_mhd_file``. Every ``unblindedReadNodule`` with
    at least two ROIs and a malignancy rating becomes a positive row; every
    ``nonNodule`` becomes a negative row. Coordinates and diameters are stored
    as fractions of the scan's array size, rounded to 4 decimals. Both row sets
    are written to ``<patient_id>_annos_pos_lidc.csv`` and
    ``<patient_id>_annos_neg_lidc.csv`` under
    ``settings.LUNA16_EXTRACTED_IMAGE_DIR + "_labels/"``.

    Args:
        xml_path: Path of the LIDC XML file.
        agreement_threshold: When greater than 1, a positive row is kept only
            if at least this many other positive rows (with a different nodule
            id) lie closer to it than either row's diameter.
        only_patient: If given, files for any other series UID are skipped.
        save_nodules: Accepted for interface compatibility; not used here.

    Returns:
        ``(pos_lines, neg_lines, extended_lines)``, or ``(None, None, None)``
        when the file has no ``LidcReadMessage`` element, belongs to a patient
        other than ``only_patient``, or no matching ``.mhd`` file is found.
        ``extended_lines`` is not affected by the agreement filter.
    """
    pos_lines = []
    neg_lines = []
    extended_lines = []
    with open(xml_path, 'r') as xml_file:
        markup = xml_file.read()
    xml = BeautifulSoup(markup, features="xml")
    if xml.LidcReadMessage is None:
        return None, None, None
    patient_id = xml.LidcReadMessage.ResponseHeader.SeriesInstanceUid.text

    if only_patient is not None and only_patient != patient_id:
        return None, None, None

    src_path = find_mhd_file(patient_id)
    if src_path is None:
        return None, None, None

    print(patient_id)
    itk_img = SimpleITK.ReadImage(src_path)
    img_array = SimpleITK.GetArrayFromImage(itk_img)
    num_z, height, width = img_array.shape        # height x width is the transverse plane
    origin = numpy.array(itk_img.GetOrigin())      # x,y,z origin in world coordinates (mm)
    spacing = numpy.array(itk_img.GetSpacing())    # voxel spacing in world coordinates (mm)

    reading_sessions = xml.LidcReadMessage.find_all("readingSession")
    for reading_session in reading_sessions:
        for nodule in reading_session.find_all("unblindedReadNodule"):
            nodule_id = nodule.noduleID.text
            rois = nodule.find_all("roi")
            if len(rois) < 2:
                continue

            # Bounding box over all edge points of all ROIs of this nodule.
            x_min = y_min = z_min = 999999
            x_max = y_max = z_max = -999999
            for roi in rois:
                z_pos = float(roi.imageZposition.text)
                z_min = min(z_min, z_pos)
                z_max = max(z_max, z_pos)
                for edge_map in roi.find_all("edgeMap"):
                    x = int(edge_map.xCoord.text)
                    y = int(edge_map.yCoord.text)
                    x_min = min(x_min, x)
                    y_min = min(y_min, y)
                    x_max = max(x_max, x)
                    y_max = max(y_max, y)

            x_diameter = x_max - x_min
            x_center = x_min + x_diameter / 2
            y_diameter = y_max - y_min
            y_center = y_min + y_diameter / 2
            z_diameter = z_max - z_min
            z_center = z_min + z_diameter / 2
            # imageZposition -> slice index, using the scan's z origin and z spacing.
            z_center -= origin[2]
            z_center /= spacing[2]

            x_center_perc = round(x_center / width, 4)
            y_center_perc = round(y_center / height, 4)
            z_center_perc = round(z_center / num_z, 4)
            diameter_perc = round(max(x_diameter / width, y_diameter / height), 4)

            if nodule.characteristics is None:
                print("!!!!Nodule:", nodule_id, " has no charecteristics")
                continue
            if nodule.characteristics.malignancy is None:
                print("!!!!Nodule:", nodule_id, " has no malignacy")
                continue

            malignacy = nodule.characteristics.malignancy.text
            sphericiy = nodule.characteristics.sphericity.text
            margin = nodule.characteristics.margin.text
            spiculation = nodule.characteristics.spiculation.text
            texture = nodule.characteristics.texture.text
            calcification = nodule.characteristics.calcification.text
            internal_structure = nodule.characteristics.internalStructure.text
            lobulation = nodule.characteristics.lobulation.text
            subtlety = nodule.characteristics.subtlety.text

            line = [nodule_id, x_center_perc, y_center_perc, z_center_perc, diameter_perc, malignacy]
            extended_line = [patient_id, nodule_id, x_center_perc, y_center_perc, z_center_perc, diameter_perc,
                             malignacy, sphericiy, margin, spiculation, texture, calcification,
                             internal_structure, lobulation, subtlety]
            pos_lines.append(line)
            extended_lines.append(extended_line)

        for non_nodule in reading_session.find_all("nonNodule"):
            z_center = float(non_nodule.imageZposition.text)
            z_center -= origin[2]
            z_center /= spacing[2]
            x_center = int(non_nodule.locus.xCoord.text)
            y_center = int(non_nodule.locus.yCoord.text)
            nodule_id = non_nodule.nonNoduleID.text
            x_center_perc = round(x_center / width, 4)
            y_center_perc = round(y_center / height, 4)
            z_center_perc = round(z_center / num_z, 4)
            # Non-nodules carry a single locus only, so a fixed size of 6 is used.
            diameter_perc = round(max(6 / width, 6 / height), 4)
            line = [nodule_id, x_center_perc, y_center_perc, z_center_perc, diameter_perc, 0]
            neg_lines.append(line)

    if agreement_threshold > 1:
        filtered_lines = []
        for pos_line1 in pos_lines:
            id1 = pos_line1[0]
            x1 = pos_line1[1]
            y1 = pos_line1[2]
            z1 = pos_line1[3]
            d1 = pos_line1[4]
            overlaps = 0
            for pos_line2 in pos_lines:
                id2 = pos_line2[0]
                if id1 == id2:
                    continue
                x2 = pos_line2[1]
                y2 = pos_line2[2]
                z2 = pos_line2[3]
                # Fixed: this used to read pos_line1[4], so the second
                # nodule's own diameter never took part in the overlap test.
                d2 = pos_line2[4]
                dist = math.sqrt(math.pow(x1 - x2, 2) + math.pow(y1 - y2, 2) + math.pow(z1 - z2, 2))
                if dist < d1 or dist < d2:
                    overlaps += 1
            if overlaps >= agreement_threshold:
                filtered_lines.append(pos_line1)
        pos_lines = filtered_lines

    label_columns = ["anno_index", "coord_x", "coord_y", "coord_z", "diameter", "malscore"]
    df_annos = pandas.DataFrame(pos_lines, columns=label_columns)
    df_annos.to_csv(settings.LUNA16_EXTRACTED_IMAGE_DIR + "_labels/" + patient_id + "_annos_pos_lidc.csv", index=False)
    df_neg_annos = pandas.DataFrame(neg_lines, columns=label_columns)
    df_neg_annos.to_csv(settings.LUNA16_EXTRACTED_IMAGE_DIR + "_labels/" + patient_id + "_annos_neg_lidc.csv", index=False)

    return pos_lines, neg_lines, extended_lines


def normalize(image):
    """Map *image* linearly from the window [-1000, 400] onto [0, 1].

    Values below the window become 0 and values above it become 1. The input
    array is left untouched; a new array is returned.
    """
    lower_bound = -1000.0
    upper_bound = 400.0
    scaled = (image - lower_bound) / (upper_bound - lower_bound)
    return numpy.clip(scaled, 0., 1.)


def process_image(src_path):
    """Convert one raw ``.mhd`` scan into per-slice PNG images and lung masks.

    The patient id is the file name without ``.mhd``. The scan is resampled
    with ``helpers.rescale_patient_images`` to ``settings.TARGET_VOXEL_MM`` and
    then, for every slice ``i`` of the resampled volume, two files are written
    to ``settings.LUNA16_EXTRACTED_IMAGE_DIR + patient_id + "/"``:

    * ``img_<iiii>_i.png`` - the slice after ``normalize``, scaled by 255
    * ``img_<iiii>_m.png`` - the mask from ``helpers.get_segmented_lungs``,
      scaled by 255

    Args:
        src_path: Path of the ``.mhd`` file to convert.
    """
    patient_id = ntpath.basename(src_path).replace(".mhd", "")
    print("Patient: ", patient_id)

    dst_dir = settings.LUNA16_EXTRACTED_IMAGE_DIR + patient_id + "/"
    if not os.path.exists(dst_dir):
        os.mkdir(dst_dir)

    itk_img = SimpleITK.ReadImage(src_path)
    img_array = SimpleITK.GetArrayFromImage(itk_img)
    print("Img array: ", img_array.shape)

    origin = numpy.array(itk_img.GetOrigin())      # x,y,z origin in world coordinates (mm)
    print("Origin (x,y,z): ", origin)

    direction = numpy.array(itk_img.GetDirection())  # flattened direction matrix of the image
    print("Direction: ", direction)

    spacing = numpy.array(itk_img.GetSpacing())    # voxel spacing in world coordinates (mm)
    print("Spacing (x,y,z): ", spacing)
    rescale = spacing / settings.TARGET_VOXEL_MM
    print("Rescale: ", rescale)

    img_array = helpers.rescale_patient_images(img_array, spacing, settings.TARGET_VOXEL_MM)

    for i, img in enumerate(img_array):
        # Only the mask is saved; the segmented slice itself is discarded
        # right away instead of being collected in a list nobody reads.
        _, mask = helpers.get_segmented_lungs(img.copy())
        img = normalize(img)
        file_prefix = dst_dir + "img_" + str(i).rjust(4, '0')
        cv2.imwrite(file_prefix + "_i.png", img * 255)
        cv2.imwrite(file_prefix + "_m.png", mask * 255)


def process_pos_annotations_patient(src_path, patient_id):
    """Write the relative-coordinate positive label CSV for one patient.

    Rows of ``annotations.csv`` (in ``settings.EXTRA_DATA_DIR``) whose
    ``seriesuid`` equals *patient_id* are converted from world coordinates to
    fractions of the extracted image volume and saved as
    ``<patient_id>_annos_pos.csv`` in the ``_labels/`` directory. If the x or
    y direction of the scan is -1, the matching origin component and the
    annotation coordinate are negated before the conversion.

    Args:
        src_path: Path of the patient's raw ``.mhd`` file.
        patient_id: Series UID of the patient.

    Returns:
        ``[patient_id, spacing_x, spacing_y, spacing_z]``.
    """
    all_annotations = pandas.read_csv(settings.EXTRA_DATA_DIR + "annotations.csv")
    labels_dir = settings.LUNA16_EXTRACTED_IMAGE_DIR + "_labels/"
    for required_dir in (labels_dir, labels_dir + patient_id + "/"):
        if not os.path.exists(required_dir):
            os.mkdir(required_dir)

    itk_img = SimpleITK.ReadImage(src_path)
    img_array = SimpleITK.GetArrayFromImage(itk_img)
    print("Img array: ", img_array.shape)
    patient_annotations = all_annotations[all_annotations["seriesuid"] == patient_id]
    print("Annos: ", len(patient_annotations))

    # The unpacking fails unless the scan array has exactly three axes.
    num_z, height, width = img_array.shape
    origin = numpy.array(itk_img.GetOrigin())      # x,y,z origin in world coordinates (mm)
    print("Origin (x,y,z): ", origin)
    spacing = numpy.array(itk_img.GetSpacing())    # voxel spacing in world coordinates (mm)
    print("Spacing (x,y,z): ", spacing)
    print("Rescale: ", spacing / settings.TARGET_VOXEL_MM)

    direction = numpy.array(itk_img.GetDirection())
    print("Direction: ", direction)
    flip_x = round(direction[0]) == -1
    if flip_x:
        origin[0] *= -1
        direction[0] = 1
        print("Swappint x origin")
    flip_y = round(direction[4]) == -1
    if flip_y:
        origin[1] *= -1
        direction[4] = 1
        print("Swappint y origin")
    print("Direction: ", direction)
    assert abs(sum(direction) - 3) < 0.01

    patient_imgs = helpers.load_patient_images(patient_id, settings.LUNA16_EXTRACTED_IMAGE_DIR, "*_i.png")

    label_rows = []
    for anno_index, (_, annotation) in enumerate(patient_annotations.iterrows()):
        node_x = -annotation["coordX"] if flip_x else annotation["coordX"]
        node_y = -annotation["coordY"] if flip_y else annotation["coordY"]
        node_z = annotation["coordZ"]
        diam_mm = annotation["diameter_mm"]
        print("Node org (x,y,z,diam): ", (round(node_x, 2), round(node_y, 2), round(node_z, 2), round(diam_mm, 2)))

        world_center = numpy.array([node_x, node_y, node_z])
        voxel_center = numpy.rint((world_center - origin) / spacing)
        print("Node tra (x,y,z,diam): ", (voxel_center[0], voxel_center[1], voxel_center[2]))

        rescaled_center = (world_center - origin) / settings.TARGET_VOXEL_MM
        # swapaxes turns the (z, y, x) image shape into (x, y, z) order.
        relative_center = rescaled_center / patient_imgs.swapaxes(0, 2).shape
        print("Node sca (x,y,z,diam): ", (rescaled_center[0], rescaled_center[1], rescaled_center[2]))

        relative_diameter = (diam_mm / settings.TARGET_VOXEL_MM) / float(patient_imgs.shape[1])
        label_rows.append([
            anno_index,
            round(relative_center[0], 4),
            round(relative_center[1], 4),
            round(relative_center[2], 4),
            round(relative_diameter, 4),
            1,
        ])

    df_annos = pandas.DataFrame(label_rows, columns=["anno_index", "coord_x", "coord_y", "coord_z", "diameter", "malscore"])
    df_annos.to_csv(settings.LUNA16_EXTRACTED_IMAGE_DIR + "_labels/" + patient_id + "_annos_pos.csv", index=False)
    return [patient_id, spacing[0], spacing[1], spacing[2]]


def process_excluded_annotations_patient(src_path, patient_id):
    """Write the label CSV of excluded annotations for one patient.

    Rows of ``annotations_excluded.csv`` belonging to *patient_id* are
    converted to fractions of the extracted image volume. A row is dropped
    when it lies closer than ``diameter + 64`` to an entry of the patient's
    ``_annos_pos.csv`` or closer than ``diameter + 72`` to an entry of the
    optional manual label file. The surviving rows are saved as
    ``<patient_id>_annos_excluded.csv`` with a fixed diameter of 6 and a
    ``malscore`` of 1.

    Args:
        src_path: Path of the patient's raw ``.mhd`` file.
        patient_id: Series UID of the patient.

    Returns:
        ``[patient_id, spacing_x, spacing_y, spacing_z]``.
    """
    def distance_to(center, other_x, other_y, other_z):
        return math.sqrt(math.pow(other_x - center[0], 2) + math.pow(other_y - center[1], 2) + math.pow(other_z - center[2], 2))

    excluded_all = pandas.read_csv(settings.EXTRA_DATA_DIR + "/annotations_excluded.csv")
    labels_dir = settings.LUNA16_EXTRACTED_IMAGE_DIR + "_labels/"
    for required_dir in (labels_dir, labels_dir + patient_id + "/"):
        if not os.path.exists(required_dir):
            os.mkdir(required_dir)

    pos_annos_df = pandas.read_csv(settings.LUNA16_EXTRACTED_IMAGE_DIR + "_labels/" + patient_id + "_annos_pos.csv")
    manual_path = settings.EXTRA_DATA_DIR + "luna16_manual_labels/" + patient_id + ".csv"
    manual_annos = None
    if os.path.exists(manual_path):
        manual_annos = pandas.read_csv(manual_path)
        manual_annos["dmm"]  # column lookup only: fails early if "dmm" is absent

    itk_img = SimpleITK.ReadImage(src_path)
    img_array = SimpleITK.GetArrayFromImage(itk_img)
    print("Img array: ", img_array.shape)
    patient_rows = excluded_all[excluded_all["seriesuid"] == patient_id]
    print("Annos: ", len(patient_rows))

    # The unpacking fails unless the scan array has exactly three axes.
    num_z, height, width = img_array.shape
    origin = numpy.array(itk_img.GetOrigin())      # x,y,z origin in world coordinates (mm)
    print("Origin (x,y,z): ", origin)
    spacing = numpy.array(itk_img.GetSpacing())    # voxel spacing in world coordinates (mm)
    print("Spacing (x,y,z): ", spacing)
    print("Rescale: ", spacing / settings.TARGET_VOXEL_MM)

    direction = numpy.array(itk_img.GetDirection())
    print("Direction: ", direction)
    flip_x = round(direction[0]) == -1
    if flip_x:
        origin[0] *= -1
        direction[0] = 1
        print("Swappint x origin")
    flip_y = round(direction[4]) == -1
    if flip_y:
        origin[1] *= -1
        direction[4] = 1
        print("Swappint y origin")
    print("Direction: ", direction)
    assert abs(sum(direction) - 3) < 0.01

    patient_imgs = helpers.load_patient_images(patient_id, settings.LUNA16_EXTRACTED_IMAGE_DIR, "*_i.png")

    kept_rows = []
    for _, annotation in patient_rows.iterrows():
        node_x = -annotation["coordX"] if flip_x else annotation["coordX"]
        node_y = -annotation["coordY"] if flip_y else annotation["coordY"]
        node_z = annotation["coordZ"]
        world_center = numpy.array([node_x, node_y, node_z])
        rescaled_center = (world_center - origin) / settings.TARGET_VOXEL_MM
        relative_center = rescaled_center / patient_imgs.swapaxes(0, 2).shape
        relative_diameter = (6 / settings.TARGET_VOXEL_MM) / float(patient_imgs.shape[1])

        keep = True

        # Reject anything too close to a known positive annotation.
        for _, pos_row in pos_annos_df.iterrows():
            pos_x = pos_row["coord_x"] * patient_imgs.shape[2]
            pos_y = pos_row["coord_y"] * patient_imgs.shape[1]
            pos_z = pos_row["coord_z"] * patient_imgs.shape[0]
            pos_diameter = pos_row["diameter"] * patient_imgs.shape[2]
            print((pos_x, pos_y, pos_z))
            print(rescaled_center)
            if distance_to(rescaled_center, pos_x, pos_y, pos_z) < (pos_diameter + 64):
                keep = False
                print("################### Too close", rescaled_center)
                break

        # Same test against the manual labels, with a slightly larger margin.
        if keep and manual_annos is not None:
            for _, manual_row in manual_annos.iterrows():
                pos_x = manual_row["x"] * patient_imgs.shape[2]
                pos_y = manual_row["y"] * patient_imgs.shape[1]
                pos_z = manual_row["z"] * patient_imgs.shape[0]
                pos_diameter = manual_row["d"] * patient_imgs.shape[2]
                print((pos_x, pos_y, pos_z))
                print(rescaled_center)
                if distance_to(rescaled_center, pos_x, pos_y, pos_z) < (pos_diameter + 72):
                    keep = False
                    print("################### Too close", rescaled_center)
                    break

        if keep:
            kept_rows.append([
                len(kept_rows),
                round(relative_center[0], 4),
                round(relative_center[1], 4),
                round(relative_center[2], 4),
                round(relative_diameter, 4),
                1,
            ])

    df_annos = pandas.DataFrame(kept_rows, columns=["anno_index", "coord_x", "coord_y", "coord_z", "diameter", "malscore"])
    df_annos.to_csv(settings.LUNA16_EXTRACTED_IMAGE_DIR + "_labels/" + patient_id + "_annos_excluded.csv", index=False)
    return [patient_id, spacing[0], spacing[1], spacing[2]]


def process_luna_candidates_patient(src_path, patient_id):
    """Write the negative candidate CSV for one patient from ``candidates_V2.csv``.

    Only rows of ``candidates_V2.csv`` with the patient's ``seriesuid`` and
    ``class == 0`` are used. A candidate is dropped when it lies closer than
    ``diameter + 64`` to an entry of ``<patient_id>_annos_pos_lidc.csv`` or
    closer than ``diameter + 72`` to an entry of the optional manual label
    file. The rest is saved as ``<patient_id>_candidates_luna.csv`` with a
    ``malscore`` of 0.

    Args:
        src_path: Path of the patient's raw ``.mhd`` file.
        patient_id: Series UID of the patient.
    """
    def distance_to(center_x, center_y, center_z, other_x, other_y, other_z):
        return math.sqrt(math.pow(other_x - center_x, 2) + math.pow(other_y - center_y, 2) + math.pow(other_z - center_z, 2))

    dst_dir = settings.LUNA16_EXTRACTED_IMAGE_DIR + "/_labels/"
    df_pos_annos = pandas.read_csv(dst_dir + patient_id + "_annos_pos_lidc.csv")
    if not os.path.exists(dst_dir):
        os.mkdir(dst_dir)

    manual_path = settings.EXTRA_DATA_DIR + "luna16_manual_labels/" + patient_id + ".csv"
    manual_annos = pandas.read_csv(manual_path) if os.path.exists(manual_path) else None

    itk_img = SimpleITK.ReadImage(src_path)
    img_array = SimpleITK.GetArrayFromImage(itk_img)
    print("Img array: ", img_array.shape)
    print("Pos annos: ", len(df_pos_annos))

    # The unpacking fails unless the scan array has exactly three axes.
    num_z, height, width = img_array.shape
    origin = numpy.array(itk_img.GetOrigin())      # x,y,z origin in world coordinates (mm)
    print("Origin (x,y,z): ", origin)
    spacing = numpy.array(itk_img.GetSpacing())    # voxel spacing in world coordinates (mm)
    print("Spacing (x,y,z): ", spacing)
    print("Rescale: ", spacing / settings.TARGET_VOXEL_MM)

    direction = numpy.array(itk_img.GetDirection())
    print("Direction: ", direction)
    flip_x = round(direction[0]) == -1
    if flip_x:
        origin[0] *= -1
        direction[0] = 1
        print("Swappint x origin")
    flip_y = round(direction[4]) == -1
    if flip_y:
        origin[1] *= -1
        direction[4] = 1
        print("Swappint y origin")
    print("Direction: ", direction)
    assert abs(sum(direction) - 3) < 0.01

    candidates = pandas.read_csv(settings.EXTRA_DATA_DIR + "candidates_V2.csv")
    candidates = candidates[candidates["seriesuid"] == patient_id]
    candidates = candidates[candidates["class"] == 0]
    patient_imgs = helpers.load_patient_images(patient_id, settings.LUNA16_EXTRACTED_IMAGE_DIR, "*_i.png")

    candidate_diameter = 6
    kept_candidates = []
    for _, candidate in candidates.iterrows():
        node_x = -candidate["coordX"] if flip_x else candidate["coordX"]
        node_y = -candidate["coordY"] if flip_y else candidate["coordY"]
        node_z = candidate["coordZ"]
        world_center = numpy.array([node_x, node_y, node_z])
        rescaled_center = (world_center - origin) / settings.TARGET_VOXEL_MM
        relative_center = rescaled_center / patient_imgs.swapaxes(0, 2).shape
        coord_x, coord_y, coord_z = rescaled_center[0], rescaled_center[1], rescaled_center[2]

        keep = True

        # Reject anything too close to a known positive annotation.
        for _, pos_row in df_pos_annos.iterrows():
            pos_x = pos_row["coord_x"] * patient_imgs.shape[2]
            pos_y = pos_row["coord_y"] * patient_imgs.shape[1]
            pos_z = pos_row["coord_z"] * patient_imgs.shape[0]
            pos_diameter = pos_row["diameter"] * patient_imgs.shape[2]
            if distance_to(coord_x, coord_y, coord_z, pos_x, pos_y, pos_z) < (pos_diameter + 64):
                keep = False
                print("################### Too close", (coord_x, coord_y, coord_z))
                break

        # Same test against the manual labels, with a slightly larger margin.
        if keep and manual_annos is not None:
            for _, manual_row in manual_annos.iterrows():
                pos_x = manual_row["x"] * patient_imgs.shape[2]
                pos_y = manual_row["y"] * patient_imgs.shape[1]
                pos_z = manual_row["z"] * patient_imgs.shape[0]
                pos_diameter = manual_row["d"] * patient_imgs.shape[2]
                print((pos_x, pos_y, pos_z))
                print(rescaled_center)
                if distance_to(coord_x, coord_y, coord_z, pos_x, pos_y, pos_z) < (pos_diameter + 72):
                    keep = False
                    print("################### Too close", rescaled_center)
                    break

        if keep:
            # NOTE(review): the diameter is divided by shape[0] here, while the
            # overlap tests above scale diameters by shape[2] - confirm intent.
            kept_candidates.append([
                len(kept_candidates),
                round(relative_center[0], 4),
                round(relative_center[1], 4),
                round(relative_center[2], 4),
                round(candidate_diameter / patient_imgs.shape[0], 4),
                0,
            ])

    df_candidates = pandas.DataFrame(kept_candidates, columns=["anno_index", "coord_x", "coord_y", "coord_z", "diameter", "malscore"])
    df_candidates.to_csv(dst_dir + patient_id + "_candidates_luna.csv", index=False)


def process_auto_candidates_patient(src_path, patient_id, sample_count=1000, candidate_type="white"):
    """Randomly sample negative candidate positions for one patient.

    Candidate maps are the patient's ``*_c.png`` images when *candidate_type*
    is ``"white"`` and the ``*_m.png`` images otherwise; for ``"edge"`` the
    chosen map is additionally passed through ``cv2.Canny``. On every try a
    slice is drawn from a normal distribution centred on the middle of the
    stack (clamped to valid indices) and one non-zero pixel of its map is
    picked. The pick is rejected when it lies closer than ``diameter + 48`` to
    an entry of ``<patient_id>_annos_pos_lidc.csv`` or closer than
    ``diameter + 72`` to an entry of the optional manual label file. Sampling
    stops after *sample_count* accepted picks or 10000 tries. The result is
    written to ``<patient_id>_candidates_<candidate_type>.csv``.

    Args:
        src_path: Path of the patient's raw ``.mhd`` file (read for the
            diagnostic printouts).
        patient_id: Series UID of the patient.
        sample_count: Number of candidates to collect.
        candidate_type: ``"white"``, ``"edge"`` or any other value (see above).
    """
    dst_dir = settings.LUNA16_EXTRACTED_IMAGE_DIR + "/_labels/"
    img_dir = settings.LUNA16_EXTRACTED_IMAGE_DIR + patient_id + "/"
    df_pos_annos = pandas.read_csv(dst_dir + patient_id + "_annos_pos_lidc.csv")

    pos_annos_manual = None
    manual_path = settings.EXTRA_DATA_DIR + "luna16_manual_labels/" + patient_id + ".csv"
    if os.path.exists(manual_path):
        pos_annos_manual = pandas.read_csv(manual_path)

    if not os.path.exists(dst_dir):
        os.mkdir(dst_dir)

    itk_img = SimpleITK.ReadImage(src_path)
    img_array = SimpleITK.GetArrayFromImage(itk_img)
    print("Img array: ", img_array.shape)
    print("Pos annos: ", len(df_pos_annos))

    origin = numpy.array(itk_img.GetOrigin())      # x,y,z origin in world coordinates (mm)
    print("Origin (x,y,z): ", origin)
    spacing = numpy.array(itk_img.GetSpacing())    # voxel spacing in world coordinates (mm)
    print("Spacing (x,y,z): ", spacing)
    rescale = spacing / settings.TARGET_VOXEL_MM
    print("Rescale: ", rescale)

    if candidate_type == "white":
        wildcard = "*_c.png"
    else:
        wildcard = "*_m.png"

    src_files = glob.glob(img_dir + wildcard)
    src_files.sort()
    src_candidate_maps = [cv2.imread(src_file, cv2.IMREAD_GRAYSCALE) for src_file in src_files]

    # Edge maps depend only on the slice, so each one is computed at most once
    # instead of on every one of the (up to 10000) sampling tries.
    edge_maps = {}

    candidate_diameter = 6
    candidate_list = []
    tries = 0
    while len(candidate_list) < sample_count and tries < 10000:
        tries += 1
        coord_z = int(numpy.random.normal(len(src_files) / 2, len(src_files) / 6))
        coord_z = max(coord_z, 0)
        coord_z = min(coord_z, len(src_files) - 1)
        candidate_map = src_candidate_maps[coord_z]
        if candidate_type == "edge":
            if coord_z not in edge_maps:
                edge_maps[coord_z] = cv2.Canny(candidate_map.copy(), 100, 200)
            candidate_map = edge_maps[coord_z]

        non_zero_indices = numpy.nonzero(candidate_map)
        if len(non_zero_indices[0]) == 0:
            continue
        nonzero_index = random.randint(0, len(non_zero_indices[0]) - 1)
        coord_y = non_zero_indices[0][nonzero_index]
        coord_x = non_zero_indices[1][nonzero_index]

        ok = True
        for _, row in df_pos_annos.iterrows():
            pos_coord_x = row["coord_x"] * src_candidate_maps[0].shape[1]
            pos_coord_y = row["coord_y"] * src_candidate_maps[0].shape[0]
            pos_coord_z = row["coord_z"] * len(src_files)
            diameter = row["diameter"] * src_candidate_maps[0].shape[1]
            dist = math.sqrt(math.pow(pos_coord_x - coord_x, 2) + math.pow(pos_coord_y - coord_y, 2) + math.pow(pos_coord_z - coord_z, 2))
            if dist < (diameter + 48):  # make sure we have a big margin
                ok = False
                print("# Too close", (coord_x, coord_y, coord_z))
                break

        # Skip the manual labels once the sample is already rejected, as the
        # other candidate functions in this file do: the verdict cannot change
        # and the rejection would only be reported a second time.
        if pos_annos_manual is not None and ok:
            for _, row in pos_annos_manual.iterrows():
                pos_coord_x = row["x"] * src_candidate_maps[0].shape[1]
                pos_coord_y = row["y"] * src_candidate_maps[0].shape[0]
                pos_coord_z = row["z"] * len(src_files)
                diameter = row["d"] * src_candidate_maps[0].shape[1]
                dist = math.sqrt(math.pow(pos_coord_x - coord_x, 2) + math.pow(pos_coord_y - coord_y, 2) + math.pow(pos_coord_z - coord_z, 2))
                if dist < (diameter + 72):  # make sure we have a big margin
                    ok = False
                    print("#Too close",  (coord_x, coord_y, coord_z))
                    break

        if not ok:
            continue

        map_height, map_width = src_candidate_maps[coord_z].shape[0], src_candidate_maps[coord_z].shape[1]
        perc_x = round(coord_x / map_width, 4)
        perc_y = round(coord_y / map_height, 4)
        perc_z = round(coord_z / len(src_files), 4)
        candidate_list.append([len(candidate_list), perc_x, perc_y, perc_z, round(candidate_diameter / map_width, 4), 0])

    if tries > 9999:
        print("****** WARING!! TOO MANY TRIES ************************************")
    df_candidates = pandas.DataFrame(candidate_list, columns=["anno_index", "coord_x", "coord_y", "coord_z", "diameter", "malscore"])
    df_candidates.to_csv(dst_dir + patient_id + "_candidates_" + candidate_type + ".csv", index=False)


def process_images(delete_existing=False, only_process_patient=None):
    """Run ``process_image`` on the ``.mhd`` files of every ``train_subset<NN>/`` directory.

    Args:
        delete_existing: When true, ``settings.LUNA16_EXTRACTED_IMAGE_DIR`` is
            removed completely before processing starts.
        only_process_patient: When ``None``, the files of each subset are
            processed in parallel by a pool of 6 worker processes. Otherwise
            files are handled one after another and only those whose path
            contains this text are processed.
    """
    if delete_existing and os.path.exists(settings.LUNA16_EXTRACTED_IMAGE_DIR):
        print("Removing old stuff..")
        shutil.rmtree(settings.LUNA16_EXTRACTED_IMAGE_DIR)

    # Create the output directory and its "_labels/" sub-directory
    # independently, so a pre-existing output directory without "_labels/"
    # still ends up with one.
    for required_dir in (settings.LUNA16_EXTRACTED_IMAGE_DIR, settings.LUNA16_EXTRACTED_IMAGE_DIR + "_labels/"):
        if not os.path.exists(required_dir):
            os.mkdir(required_dir)

    for subject_no in range(settings.LUNA_SUBSET_START_INDEX, 10):
        src_dir = settings.LUNA16_RAW_SRC_DIR + "train_subset" + str(subject_no).zfill(2) + "/"
        src_paths = glob.glob(src_dir + "*.mhd")

        if only_process_patient is None:
            pool = multiprocessing.Pool(6)
            try:
                pool.map(process_image, src_paths)
            finally:
                # Shut the workers down before moving on; otherwise every
                # subset leaves its pool of processes behind.
                pool.close()
                pool.join()
        else:
            for src_path in src_paths:
                print(src_path)
                if only_process_patient not in src_path:
                    continue
                process_image(src_path)

def process_pos_annotations_patient2():
    """Run ``process_pos_annotations_patient`` for every ``.mhd`` file of every subset.

    The patient id passed on is the file name without ``.mhd``. A running
    counter and the patient id are printed before each patient is processed.
    """
    only_patient = None
    processed = 0
    for subject_no in range(settings.LUNA_SUBSET_START_INDEX, 10):
        subset_dir = settings.LUNA16_RAW_SRC_DIR + "train_subset" + str(subject_no).zfill(2) + "/"
        for src_path in glob.glob(subset_dir + "*.mhd"):
            if only_patient is None or only_patient in src_path:
                patient_id = ntpath.basename(src_path).replace(".mhd", "")
                print(processed, " patient: ", patient_id)
                process_pos_annotations_patient(src_path, patient_id)
                processed += 1

def process_excluded_annotations_patients(only_patient=None):
    """Run ``process_excluded_annotations_patient`` for the ``.mhd`` files of every subset.

    Args:
        only_patient: If given, only files whose path contains this text are
            processed.
    """
    processed = 0
    for subject_no in range(settings.LUNA_SUBSET_START_INDEX, 10):
        subset_dir = settings.LUNA16_RAW_SRC_DIR + "train_subset" + str(subject_no).zfill(2) + "/"
        for src_path in glob.glob(subset_dir + "*.mhd"):
            if only_patient is None or only_patient in src_path:
                patient_id = ntpath.basename(src_path).replace(".mhd", "")
                print(processed, " patient: ", patient_id)
                process_excluded_annotations_patient(src_path, patient_id)
                processed += 1


def process_auto_candidates_patients():
    """Sample 200 ``"edge"`` candidates for every ``.mhd`` file of every subset.

    The index printed for each patient restarts at 0 within every subset.
    """
    for subject_no in range(settings.LUNA_SUBSET_START_INDEX, 10):
        subset_dir = settings.LUNA16_RAW_SRC_DIR + "train_subset" + str(subject_no).zfill(2) + "/"
        patient_index = 0
        for src_path in glob.glob(subset_dir + "*.mhd"):
            patient_id = ntpath.basename(src_path).replace(".mhd", "")
            print("Patient: ", patient_index, " ", patient_id)
            process_auto_candidates_patient(src_path, patient_id, sample_count=200, candidate_type="edge")
            patient_index += 1


def process_luna_candidates_patients(only_patient_id=None):
    """Run ``process_luna_candidates_patient`` for the ``.mhd`` files of every subset.

    Args:
        only_patient_id: If given, only the file whose name (without
            ``.mhd``) equals this id is processed.
    """
    for subject_no in range(settings.LUNA_SUBSET_START_INDEX, 10):
        subset_dir = settings.LUNA16_RAW_SRC_DIR + "train_subset" + str(subject_no).zfill(2) + "/"
        # The printed index counts every file of the subset, skipped or not.
        for patient_index, src_path in enumerate(glob.glob(subset_dir + "*.mhd")):
            patient_id = ntpath.basename(src_path).replace(".mhd", "")
            if only_patient_id is None or patient_id == only_patient_id:
                print("Patient: ", patient_index, " ", patient_id)
                process_luna_candidates_patient(src_path, patient_id)


def process_lidc_annotations(only_patient=None, agreement_threshold=0):
    """Parse all LIDC XML annotation files and write them to one CSV.

    Every ``*.xml`` file in each sub-directory of
    ``settings.EXTRA_DATA_DIR + "luna16_annotations/"`` is passed to
    ``load_lidc_xml``.  Files for which it returns ``None`` are skipped; the
    "extended" rows of all other files are collected and saved as
    ``lidc_annotations.csv`` under ``settings.BASE_DIR``.

    Args:
        only_patient: forwarded to ``load_lidc_xml`` to restrict parsing to
            a single patient.
        agreement_threshold: forwarded to ``load_lidc_xml``.
    """
    parsed_files = 0
    total_pos = 0
    total_neg = 0
    rows = []
    for anno_dir in glob.glob(settings.EXTRA_DATA_DIR + "luna16_annotations/*"):
        if not os.path.isdir(anno_dir):
            continue
        for xml_path in glob.glob(anno_dir + "/*.xml"):
            print(parsed_files, ": ",  xml_path)
            pos, neg, extended = load_lidc_xml(xml_path=xml_path, only_patient=only_patient, agreement_threshold=agreement_threshold)
            if pos is None:
                # Nothing usable in this file; it is not counted.
                continue
            total_pos += len(pos)
            total_neg += len(neg)
            print("Pos: ", total_pos, " Neg: ", total_neg)
            parsed_files += 1
            rows.extend(extended)

    # Column names (including their historical spelling) are part of the CSV contract.
    column_names = ["patient_id", "anno_index", "coord_x", "coord_y", "coord_z", "diameter", "malscore", "sphericiy", "margin", "spiculation", "texture", "calcification", "internal_structure", "lobulation", "subtlety"]
    df_annos = pandas.DataFrame(rows, columns=column_names)
    df_annos.to_csv(settings.BASE_DIR + "lidc_annotations.csv", index=False)

('User: ', 'mahui')


In [2]:
# Notebook cell: start the image processing run; the captured output follows below.
# NOTE(review): process_images is defined outside this excerpt; presumably it
# consults only_process_patient (None = no patient filter) -- confirm.
only_process_patient = None
process_images()

('Patient: ', 'LKDS-00004')
('Patient: ', 'LKDS-00001')
('Patient: ', 'LKDS-00020')
('Patient: ', 'LKDS-00016')
('Patient: ', 'LKDS-00007')
('Patient: ', 'LKDS-00013')
('Img array: ', (221, 512, 512))
('Origin (x,y,z): ', array([-170.5, -170. ,   -5. ]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.66406202,  0.66406202,  1.25      ]))
('Rescale: ', array([ 0.66406202,  0.66406202,  1.25      ]))
('Img array: ', (281, 512, 512))
('Origin (x,y,z): ', array([-207.5, -206.5,   49.5]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.80664098,  0.80664098,  1.25      ]))
('Rescale: ', array([ 0.80664098,  0.80664098,  1.25      ]))
('Img array: ', (325, 512, 512))
('Origin (x,y,z): ', array([-174.,  -12.,   23.]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.68359399,  0.68359399,  1.        ]))
('Rescale: ', array([ 0.683593

('Patient: ', 'LKDS-00047')
('Patient: ', 'LKDS-00051')
('Img array: ', (336, 512, 512))
('Origin (x,y,z): ', array([-171. ,  -10. ,  299.2]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.68359399,  0.68359399,  1.        ]))
('Rescale: ', array([ 0.68359399,  0.68359399,  1.        ]))
('Img array: ', (461, 512, 512))
('Origin (x,y,z): ', array([-164., -180.,   37.]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.703125,  0.703125,  0.625   ]))
('Rescale: ', array([ 0.703125,  0.703125,  0.625   ]))
('Patient: ', 'LKDS-00053')
('Img array: ', (291, 512, 512))
('Origin (x,y,z): ', array([-178. ,  -25. ,  261.8]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.68359399,  0.68359399,  1.        ]))
('Rescale: ', array([ 0.68359399,  0.68359399,  1.        ]))
('Patient: ', 'LKDS-00042')
('Img array: ', (564, 512, 512))
('O

('Patient: ', 'LKDS-00090')
('Img array: ', (314, 512, 512))
('Origin (x,y,z): ', array([-175. ,  -23. ,  893.3]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.68359399,  0.68359399,  1.        ]))
('Rescale: ', array([ 0.68359399,  0.68359399,  1.        ]))
('Img array: ', (307, 512, 512))
('Origin (x,y,z): ', array([ -175.641,  -306.141,  1661.   ]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.71875,  0.71875,  1.     ]))
('Rescale: ', array([ 0.71875,  0.71875,  1.     ]))
('Patient: ', 'LKDS-00092')
('Img array: ', (263, 512, 512))
('Origin (x,y,z): ', array([-193.5,  -14.5,  775.6]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.72070301,  0.72070301,  1.        ]))
('Rescale: ', array([ 0.72070301,  0.72070301,  1.        ]))
('Patient: ', 'LKDS-00095')
('Patient: ', 'LKDS-00099')
('Img array: ', (133, 512, 512

('Img array: ', (497, 512, 512))
('Origin (x,y,z): ', array([-166. ,  -22. ,  430.8]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.68359399,  0.68359399,  0.625     ]))
('Rescale: ', array([ 0.68359399,  0.68359399,  0.625     ]))
('Patient: ', 'LKDS-00126')
('Img array: ', (577, 512, 512))
('Origin (x,y,z): ', array([-199.5 , -199.5 ,  -25.75]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.77929699,  0.77929699,  0.625     ]))
('Rescale: ', array([ 0.77929699,  0.77929699,  0.625     ]))
('Patient: ', 'LKDS-00135')
('Patient: ', 'LKDS-00137')
('Patient: ', 'LKDS-00142')
('Patient: ', 'LKDS-00133')
('Patient: ', 'LKDS-00127')
('Patient: ', 'LKDS-00147')
('Img array: ', (145, 512, 512))
('Origin (x,y,z): ', array([-207.9 , -210.  , -375.75]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.82031202,  0.82031202,  2.5    

('Img array: ', (425, 512, 512))
('Origin (x,y,z): ', array([-145.73, -278.73, -361.5 ]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.53906298,  0.53906298,  0.69999999]))
('Rescale: ', array([ 0.53906298,  0.53906298,  0.69999999]))
('Patient: ', 'LKDS-00173')
('Img array: ', (375, 512, 512))
('Origin (x,y,z): ', array([-153.3 , -160.  , -214.75]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.625,  0.625,  0.625]))
('Rescale: ', array([ 0.625,  0.625,  0.625]))
('Patient: ', 'LKDS-00165')
('Img array: ', (406, 512, 512))
('Origin (x,y,z): ', array([-159.707, -286.207,  -50.5  ]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.58593798,  0.58593798,  0.69999999]))
('Rescale: ', array([ 0.58593798,  0.58593798,  0.69999999]))
('Patient: ', 'LKDS-00176')
('Img array: ', (471, 512, 512))
('Origin (x,y,z): ', array([-154.7

('Img array: ', (605, 512, 512))
('Origin (x,y,z): ', array([-175.3 , -168.1 , -348.56]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.703125,  0.703125,  0.625   ]))
('Rescale: ', array([ 0.703125,  0.703125,  0.625   ]))
('Patient: ', 'LKDS-00219')
('Patient: ', 'LKDS-00213')
('Img array: ', (260, 512, 512))
('Origin (x,y,z): ', array([-167. ,  -17. , -725.8]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.64843798,  0.64843798,  1.        ]))
('Rescale: ', array([ 0.64843798,  0.64843798,  1.        ]))
('Img array: ', (510, 512, 512))
('Origin (x,y,z): ', array([-175.641, -320.641,  433.4  ]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.71875   ,  0.71875   ,  0.60000002]))
('Rescale: ', array([ 0.71875   ,  0.71875   ,  0.60000002]))
('Patient: ', 'LKDS-00223')
('Img array: ', (123, 512, 512))
('Origin (x,y,z): '

('Img array: ', (449, 512, 512))
('Origin (x,y,z): ', array([-175.161, -306.661, -109.   ]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.67773402,  0.67773402,  0.69999999]))
('Rescale: ', array([ 0.67773402,  0.67773402,  0.69999999]))
('Patient: ', 'LKDS-00264')
('Img array: ', (272, 512, 512))
('Origin (x,y,z): ', array([-184.    ,  -48.2291, -585.8   ]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.67578101,  0.67578101,  1.        ]))
('Rescale: ', array([ 0.67578101,  0.67578101,  1.        ]))
('Patient: ', 'LKDS-00261')
('Img array: ', (268, 512, 512))
('Origin (x,y,z): ', array([-162.7 , -145.5 , -338.51]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.56835902,  0.56835902,  1.25      ]))
('Rescale: ', array([ 0.56835902,  0.56835902,  1.25      ]))
('Patient: ', 'LKDS-00269')
('Img array: ', (429, 512, 512)

('Patient: ', 'LKDS-00302')
('Patient: ', 'LKDS-00297')
('Patient: ', 'LKDS-00304')
('Img array: ', (269, 512, 512))
('Origin (x,y,z): ', array([-189.,  -27., -615.]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.68359399,  0.68359399,  1.        ]))
('Rescale: ', array([ 0.68359399,  0.68359399,  1.        ]))
('Patient: ', 'LKDS-00307')
('Img array: ', (633, 512, 512))
('Origin (x,y,z): ', array([-139.229, -268.229, -344.5  ]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.54101598,  0.54101598,  0.5       ]))
('Rescale: ', array([ 0.54101598,  0.54101598,  0.5       ]))
('Img array: ', (356, 512, 512))
('Img array: ', (516, 512, 512))
('Origin (x,y,z): ', array([-190.5,  -79.5,  806. ]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Origin (x,y,z): ', array([-167.654, -326.654,   14.   ]))
('Spacing (x,y,z): ', array([ 0.80664098,  0.80664098,  1

('Rescale: ', array([ 0.66406298,  0.66406298,  1.79999995]))
('Patient: ', 'LKDS-00348')
('Patient: ', 'LKDS-00337')
('Img array: ', (268, 512, 512))
('Origin (x,y,z): ', array([-169.2  , -203.5  , -338.985]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.74218798,  0.74218798,  1.25      ]))
('Rescale: ', array([ 0.74218798,  0.74218798,  1.25      ]))
('Img array: ', (448, 512, 512))
('Origin (x,y,z): ', array([-235., -114.,  779.]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.93359399,  0.93359399,  1.        ]))
('Rescale: ', array([ 0.93359399,  0.93359399,  1.        ]))
('Patient: ', 'LKDS-00335')
('Patient: ', 'LKDS-00357')
('Img array: ', (357, 512, 512))
('Origin (x,y,z): ', array([-217.5,  -39.5,  717.6]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.75976598,  0.75976598,  1.        ]))
('Rescale: ', arra

('Rescale: ', array([ 0.42968801,  0.42968801,  1.        ]))
('Patient: ', 'LKDS-00405')


error: /Users/jenkins/miniconda/0/2.7/conda-bld/work/opencv-2.4.11/modules/core/src/matrix.cpp:116: error: (-215) s >= 0 in function setSize


('Patient: ', 'LKDS-00398')
('Img array: ', (568, 512, 512))
('Origin (x,y,z): ', array([-213.607, -340.107,  -81.   ]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.78515601,  0.78515601,  0.69999999]))
('Rescale: ', array([ 0.78515601,  0.78515601,  0.69999999]))
('Img array: ', (320, 512, 512))
('Origin (x,y,z): ', array([-211.456,  -49.5  ,  381.58 ]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.80664098,  0.80664098,  1.        ]))
('Rescale: ', array([ 0.80664098,  0.80664098,  1.        ]))
('Patient: ', 'LKDS-00407')
('Img array: ', (590, 512, 512))
('Origin (x,y,z): ', array([ -177.146,  -288.646,  1478.9  ]))
('Direction: ', array([ 1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.]))
('Spacing (x,y,z): ', array([ 0.70898402,  0.70898402,  0.60000002]))
('Rescale: ', array([ 0.70898402,  0.70898402,  0.60000002]))
('Patient: ', 'LKDS-00403')
('Img array: ', (236, 512, 5

In [3]:
# Notebook cell: parse every LIDC XML file (no patient filter) and write lidc_annotations.csv.
process_lidc_annotations(only_patient=None, agreement_threshold=0)

In [7]:
# Notebook cell: process the excluded annotations of all patients (no patient filter).
# The positive-annotation step below is disabled.
#process_pos_annotations_patient2()
process_excluded_annotations_patients(only_patient=None)

(0, ' patient: ', 'LKDS-00001')


IOError: File /Volumes/solo/ali/resources//annotations_excluded.csv does not exist

In [6]:
# Notebook cell: generate the automatic "edge" candidates for all patients.
# The LUNA candidate step below is disabled.
#process_luna_candidates_patients(only_patient_id=None)
process_auto_candidates_patients()

('Patient: ', 0, ' ', 'LKDS-00001')


IOError: File /Volumes/solo/ali/pic/luna16_extracted_images//_labels/LKDS-00001_annos_pos_lidc.csv does not exist

In [24]:
import settings
import helpers

import os
import glob
import random
import ntpath
import cv2
import numpy
from typing import List, Tuple
from keras.optimizers import Adam, SGD
from keras.layers import Input, Convolution2D, MaxPooling2D, UpSampling2D, merge, BatchNormalization, SpatialDropout2D
from keras.models import Model
from keras import backend as K
from keras.callbacks import ModelCheckpoint, Callback
from scipy.ndimage.interpolation import map_coordinates
from scipy.ndimage.filters import gaussian_filter
import pandas
import shutil

# NOTE(review): MEAN_FRAME_COUNT is not referenced by the code visible in this cell -- confirm it is still needed.
MEAN_FRAME_COUNT = 1
# Size of the last (channel) axis of the network input built in get_unet.
CHANNEL_COUNT = 1


def random_scale_img(img, xy_range, lock_xy=False):
    """Randomly rescale one image or a list of images, keeping the original size.

    With probability ``xy_range.chance`` every image is resized by a random
    factor drawn from ``[x_min, x_max]`` / ``[y_min, y_max]`` and then
    brought back to the size of the first image: results that became smaller
    are padded with a constant border, results that became larger are
    centre-cropped.  The factors used are stored in ``xy_range.last_x`` and
    ``xy_range.last_y``.

    Args:
        img: a single image array or a list of image arrays.  All images are
            scaled relative to the size of the first one.
        xy_range: object exposing ``chance``, ``x_min``, ``x_max``, ``y_min``
            and ``y_max`` (e.g. an ``XYRange``).
        lock_xy: when true the x factor is reused for y (aspect ratio kept).

    Returns:
        ``img`` itself, untouched, when the random draw decides not to scale;
        otherwise a list of scaled images (a list even for a single input).
    """
    if random.random() > xy_range.chance:
        return img

    import cv2

    images = img if isinstance(img, list) else [img]

    scale_x = random.uniform(xy_range.x_min, xy_range.x_max)
    scale_y = random.uniform(xy_range.y_min, xy_range.y_max)
    if lock_xy:
        scale_y = scale_x

    org_height, org_width = images[0].shape[:2]
    xy_range.last_x = scale_x
    xy_range.last_y = scale_y

    # cv2.resize rejects empty target sizes, so never go below one pixel.
    target_width = max(1, int(org_width * scale_x))
    target_height = max(1, int(org_height * scale_y))

    res = []
    for img_inst in images:
        scaled_width = target_width
        scaled_height = target_height
        scaled_img = cv2.resize(img_inst, (scaled_width, scaled_height), interpolation=cv2.INTER_CUBIC)

        # Border sizes and slice offsets must be integers: use floor division
        # ("/" yields floats on Python 3 and breaks both OpenCV and indexing).
        if scaled_width < org_width:
            extend_left = (org_width - scaled_width) // 2
            extend_right = org_width - extend_left - scaled_width
            scaled_img = cv2.copyMakeBorder(scaled_img, 0, 0, extend_left, extend_right, borderType=cv2.BORDER_CONSTANT)
            scaled_width = org_width

        if scaled_height < org_height:
            extend_top = (org_height - scaled_height) // 2
            extend_bottom = org_height - extend_top - scaled_height
            scaled_img = cv2.copyMakeBorder(scaled_img, extend_top, extend_bottom, 0, 0, borderType=cv2.BORDER_CONSTANT)
            scaled_height = org_height

        # Centre-crop whatever is larger than the original frame.
        start_x = (scaled_width - org_width) // 2
        start_y = (scaled_height - org_height) // 2
        res.append(scaled_img[start_y: start_y + org_height, start_x: start_x + org_width])

    return res


class XYRange:
    """Value range for a random x/y augmentation plus the values last drawn.

    ``chance`` is the probability with which the augmentation is applied;
    ``x_min``/``x_max`` and ``y_min``/``y_max`` bound the random draw.
    ``last_x`` and ``last_y`` start at 0 and are overwritten by the
    augmentation functions with the values they actually used.
    """

    def __init__(self, x_min, x_max, y_min, y_max, chance=1.0):
        self.x_min, self.x_max = x_min, x_max
        self.y_min, self.y_max = y_min, y_max
        self.chance = chance
        self.last_x = self.last_y = 0

    @staticmethod
    def _as_tag(value):
        # Percent value as text; a minus sign is spelled "m" to stay file-name friendly.
        return str(int(value * 100)).replace("-", "m")

    def get_last_xy_txt(self):
        """Return the last drawn values as a tag such as ``x_120-y_m25``."""
        return "x_{0}-y_{1}".format(self._as_tag(self.last_x), self._as_tag(self.last_y))


def random_translate_img(img, xy_range, border_mode="constant"):
    """Shift one image or a list of images by a random whole-pixel offset.

    With probability ``xy_range.chance`` a single offset is drawn from
    ``[x_min, x_max]`` x ``[y_min, y_max]`` and applied to every image; the
    output keeps the size of the first image.  The offset used is stored in
    ``xy_range.last_x`` / ``xy_range.last_y``.

    Args:
        img: a single image array or a list of image arrays.
        xy_range: object exposing ``chance`` and integer ``x_min``, ``x_max``,
            ``y_min``, ``y_max`` bounds.
        border_mode: "reflect" mirrors the image into the uncovered area; any
            other value fills it with a constant border.

    Returns:
        ``img`` untouched when no translation is applied; otherwise the
        shifted image, or a list of them when more than one was given.
    """
    if random.random() > xy_range.chance:
        return img
    import cv2

    frames = img if isinstance(img, list) else [img]
    height, width = frames[0].shape[:2]

    shift_x = random.randint(xy_range.x_min, xy_range.x_max)
    shift_y = random.randint(xy_range.y_min, xy_range.y_max)
    # 2x3 affine matrix of a pure translation.
    affine = numpy.float32([[1, 0, shift_x], [0, 1, shift_y]])

    border_flag = cv2.BORDER_REFLECT if border_mode == "reflect" else cv2.BORDER_CONSTANT
    shifted = [cv2.warpAffine(frame, affine, (width, height), borderMode=border_flag) for frame in frames]

    xy_range.last_x = shift_x
    xy_range.last_y = shift_y
    return shifted[0] if len(shifted) == 1 else shifted


def random_rotate_img(img, chance, min_angle, max_angle):
    """Rotate one image or a list of images by a random whole-degree angle.

    With probability ``chance`` one angle is drawn from
    ``[min_angle, max_angle]`` and every image is rotated by it around the
    centre of the first image; uncovered pixels get a constant border.

    Args:
        img: a single image array or a list of image arrays.
        chance: probability (0..1) of applying the rotation.
        min_angle: smallest angle in degrees (integer, inclusive).
        max_angle: largest angle in degrees (integer, inclusive).

    Returns:
        ``img`` untouched when no rotation is applied; otherwise the rotated
        image, or a list of them when more than one was given (the same
        convention as ``random_translate_img``).
    """
    if random.random() > chance:
        return img
    import cv2

    images = img if isinstance(img, list) else [img]

    angle = random.randint(min_angle, max_angle)
    height, width = images[0].shape[:2]
    # OpenCV expects the centre as (x, y), i.e. (column, row).
    center = (width / 2, height / 2)
    rot_matrix = cv2.getRotationMatrix2D(center, angle, scale=1.0)

    res = []
    for img_inst in images:
        # dsize is (width, height), the reverse of the numpy shape order.
        dsize = (img_inst.shape[1], img_inst.shape[0])
        res.append(cv2.warpAffine(img_inst, rot_matrix, dsize=dsize, borderMode=cv2.BORDER_CONSTANT))

    # Hand a single image back as an image, not as a one-element list.
    if len(res) == 1:
        return res[0]
    return res


def random_flip_img(img, horizontal_chance=0, vertical_chance=0):
    """Randomly mirror one image or a list of images.

    Two independent draws decide whether to flip horizontally and/or
    vertically; the same decision is applied to every image.

    Args:
        img: a single image array or a list of image arrays.
        horizontal_chance: probability of a horizontal flip.
        vertical_chance: probability of a vertical flip.

    Returns:
        ``img`` untouched when neither flip is selected; otherwise the
        flipped image, or a list of flipped images for list input.
    """
    import cv2
    # Both draws always happen, horizontal first, so the random stream is stable.
    flip_horizontal = random.random() < horizontal_chance
    flip_vertical = random.random() < vertical_chance

    if not (flip_horizontal or flip_vertical):
        return img

    # cv2.flip codes: 0 = around the X axis, 1 = around the Y axis, -1 = both.
    if flip_horizontal and flip_vertical:
        flip_code = -1
    elif flip_vertical:
        flip_code = 0
    else:
        flip_code = 1

    if isinstance(img, list):
        return [cv2.flip(item, flip_code) for item in img]
    return cv2.flip(img, flip_code)


# Cached sampling coordinates for elastic_transform.  Building them is the
# expensive part, so they are reused until image_generator resets this to None.
ELASTIC_INDICES = None
# Image shape the cached coordinates were built for.
_ELASTIC_SHAPE = None


def elastic_transform(image, alpha, sigma, random_state=None):
    """Apply an elastic deformation to a 2-D image.

    A random displacement field is smoothed with a Gaussian of width
    ``sigma``, scaled by ``alpha`` and used to resample ``image`` with
    linear interpolation.  The resulting sampling coordinates are cached in
    the module-level ``ELASTIC_INDICES`` so that an image and its overlay
    receive the same deformation; on a cache hit ``alpha``, ``sigma`` and
    ``random_state`` are not consulted.  The cache is rebuilt when it has
    been reset to ``None`` or when an image of a different shape arrives.

    Args:
        image: 2-D array (rows, columns); it does not have to be square.
        alpha: scale factor of the displacement field, in pixels.
        sigma: standard deviation of the Gaussian smoothing.
        random_state: optional ``numpy.random.RandomState``.  When omitted a
            generator seeded with 1301 is used, so the field is reproducible.

    Returns:
        The deformed image as an array with the same shape as ``image``.
    """
    global ELASTIC_INDICES, _ELASTIC_SHAPE
    shape = image.shape

    if ELASTIC_INDICES is None or _ELASTIC_SHAPE != shape:
        if random_state is None:
            random_state = numpy.random.RandomState(1301)

        dx = gaussian_filter((random_state.rand(*shape) * 2 - 1), sigma, mode="constant", cval=0) * alpha
        dy = gaussian_filter((random_state.rand(*shape) * 2 - 1), sigma, mode="constant", cval=0) * alpha
        # meshgrid's default "xy" indexing returns arrays of shape
        # (len(second), len(first)): pass columns first so that both grids
        # have the image's own (rows, columns) shape.
        cols, rows = numpy.meshgrid(numpy.arange(shape[1]), numpy.arange(shape[0]))
        ELASTIC_INDICES = numpy.reshape(rows + dy, (-1, 1)), numpy.reshape(cols + dx, (-1, 1))
        _ELASTIC_SHAPE = shape
    return map_coordinates(image, ELASTIC_INDICES, order=1).reshape(shape)


def prepare_image_for_net(img):
    """Convert an 8-bit image into the float layout expected by the network.

    The pixel values are converted to float64 and divided by 255.  The input
    array itself is not modified.

    Args:
        img: image array.  A 3-D array keeps its shape; any other input is
            treated as a single 2-D frame.

    Returns:
        A new float64 array: the scaled input for 3-D arrays, otherwise the
        frame reshaped to ``(1, height, width, 1)``.
    """
    # numpy.float was only an alias of the builtin float and has been removed
    # from NumPy; float64 is the dtype it always produced.
    img = img.astype(numpy.float64)
    img /= 255.
    if img.ndim == 3:
        return img
    return img.reshape(1, img.shape[-2], img.shape[-1], 1)


def get_train_holdout_files(model_type, holdout, train_percentage=80, frame_count=8):
    """Split the segmenter training images into a train and a holdout list.

    Looks for ``*_1.png`` files in ``resources/segmenter_traindata/`` (in
    sorted order) and pairs each with its ``_o.png`` overlay.  Files whose
    name contains "1.3.6.1.4", "spie" or "TIME" always go to the train set.
    For every other file the text before the first underscore is taken as
    patient id, and the file is held out when
    ``helpers.get_patient_fold(patient_id) % 3`` equals ``holdout``.

    Args:
        model_type: not used by this function.
        holdout: fold number (0..2) to keep apart for validation.
        train_percentage: not used by this function.
        frame_count: not used by this function.

    Returns:
        ``(train_res, holdout_res)``: two lists of
        ``(image_path, overlay_path)`` tuples.
    """
    print("Get train/holdout files.")
    always_train_markers = ("1.3.6.1.4", "spie", "TIME")
    train_res = []
    holdout_res = []
    for image_path in sorted(glob.glob("resources/segmenter_traindata/" + "*_1.png")):
        image_name = ntpath.basename(image_path)
        pair = (image_path, image_path.replace("_1.png", "_o.png"))
        if any(marker in image_name for marker in always_train_markers):
            in_train = True
        else:
            patient_id = image_name.split("_")[0]
            in_train = helpers.get_patient_fold(patient_id) % 3 != holdout
        target = train_res if in_train else holdout_res
        target.append(pair)
    print("Train count: ", len(train_res), ", holdout count: ", len(holdout_res))
    return train_res, holdout_res


def dice_coef(y_true, y_pred):
    """Smoothed Dice coefficient of two tensors, built from Keras backend ops.

    Both tensors are flattened; a constant of 100 is added to numerator and
    denominator, so two all-zero tensors score exactly 1.
    """
    smooth = 100
    truth_flat = K.flatten(y_true)
    pred_flat = K.flatten(y_pred)
    overlap = K.sum(truth_flat * pred_flat)
    numerator = 2. * overlap + smooth
    denominator = K.sum(truth_flat) + K.sum(pred_flat) + smooth
    return numerator / denominator


def dice_coef_np(y_true, y_pred):
    """NumPy counterpart of ``dice_coef``: smoothed Dice score of two arrays.

    Both arrays are flattened; a constant of 100 is added to numerator and
    denominator, so two all-zero arrays score exactly 1.
    """
    smooth = 100
    truth_flat = y_true.flatten()
    pred_flat = y_pred.flatten()
    overlap = numpy.sum(truth_flat * pred_flat)
    numerator = 2. * overlap + smooth
    denominator = numpy.sum(truth_flat) + numpy.sum(pred_flat) + smooth
    return numerator / denominator


def dice_coef_loss(y_true, y_pred):
    """Loss to minimise: the negated smoothed Dice coefficient."""
    overlap_score = dice_coef(y_true, y_pred)
    return -overlap_score


class DumpPredictions(Callback):
    """Keras callback that runs the model on a few holdout images after each epoch.

    On construction ``workdir/segmenter/`` is created if necessary and
    emptied.  After every epoch ten samples are predicted and converted to
    8-bit images; writing them to disk is currently disabled (see the
    commented ``cv2.imwrite`` calls).
    """

    def __init__(self, dump_filelist: List[Tuple[str, str]], model_type):
        """Remember the files to predict and clean the dump directory.

        Args:
            dump_filelist: ``(image_path, overlay_path)`` tuples handed to
                ``image_generator``.
            model_type: forwarded to ``image_generator``.
        """
        super(DumpPredictions, self).__init__()
        self.dump_filelist = dump_filelist
        self.batch_count = 0
        if not os.path.exists("workdir/segmenter/"):
            # makedirs also creates "workdir/" itself when it is missing.
            os.makedirs("workdir/segmenter/")
        for file_path in glob.glob("workdir/segmenter/*.*"):
            os.remove(file_path)
        self.model_type = model_type

    def on_epoch_end(self, epoch, logs=None):
        """Predict ten samples from the dump list and convert them to uint8 images."""
        model = self.model  # type: Model
        generator = image_generator(self.dump_filelist, 1, train_set=False, model_type=self.model_type)
        for i in range(0, 10):
            x, y = next(generator)
            y_pred = model.predict(x, batch_size=1)

            x = x.swapaxes(0, 3)
            x = x[0]
            x *= 255.
            # Collapse to (height, width); the original used shape[0] twice,
            # which is only correct for square images.
            x = x.reshape((x.shape[0], x.shape[1])).astype(numpy.uint8)
            y *= 255.
            y = y.reshape((y.shape[1], y.shape[2])).astype(numpy.uint8)
            y_pred *= 255.
            y_pred = y_pred.reshape((y_pred.shape[1], y_pred.shape[2])).astype(numpy.uint8)
            # Dumping is disabled; re-enable to inspect input, overlay and prediction:
            # cv2.imwrite("workdir/segmenter/img_{0:03d}_{1:02d}_i.png".format(epoch, i), x)
            # cv2.imwrite("workdir/segmenter/img_{0:03d}_{1:02d}_o.png".format(epoch, i), y)
            # cv2.imwrite("workdir/segmenter/img_{0:03d}_{1:02d}_p.png".format(epoch, i), y_pred)


def image_generator(batch_files, batch_size, train_set, model_type):
    """Endlessly yield ``(images, overlays)`` batches for the segmenter.

    Each entry of ``batch_files`` is an ``(image_path, overlay_path)`` pair
    that is read as grayscale.  For the training set the files are shuffled
    on every pass and each pair is augmented (elastic deformation in about
    half of the cases, then random rotation, flip and translation, applied
    identically to image and overlay).  Samples that do not fill a complete
    batch at the end of a pass are dropped.

    Args:
        batch_files: sequence of ``(image_path, overlay_path)`` tuples.  It
            is copied, so the caller's list is never reordered.
        batch_size: number of samples per yielded batch.
        train_set: enables shuffling and augmentation.
        model_type: not used by this function.

    Yields:
        ``(x, y)``: the stacked network inputs and the stacked overlays.

    Raises:
        ValueError: if there are fewer files than ``batch_size``; the
            generator could then never produce a batch.
        IOError: if an image or overlay file cannot be read.
    """
    global ELASTIC_INDICES
    if not batch_files or len(batch_files) < batch_size:
        # Without this check the loop below would spin forever without yielding.
        raise ValueError("image_generator needs at least batch_size (%d) files, got %d" % (batch_size, len(batch_files)))

    # Work on a private copy so shuffling does not reorder the caller's list.
    batch_files = list(batch_files)
    while True:
        if train_set:
            random.shuffle(batch_files)

        img_list = []
        overlay_list = []
        # Drop the cached coordinates so elastic_transform rebuilds them for this pass.
        ELASTIC_INDICES = None
        for batch_file in batch_files:
            img = cv2.imread(batch_file[0], cv2.IMREAD_GRAYSCALE)
            overlay = cv2.imread(batch_file[1], cv2.IMREAD_GRAYSCALE)
            if img is None or overlay is None:
                # cv2.imread signals failure by returning None.
                raise IOError("Could not read image pair: %s / %s" % (batch_file[0], batch_file[1]))
            images = [img]

            if train_set:
                if random.randint(0, 100) > 50:
                    images = [elastic_transform(frame, 128, 15) for frame in images]
                    overlay = elastic_transform(overlay, 128, 15)

                # Augment images and overlay together so they stay aligned.
                augmented = images + [overlay]
                augmented = random_rotate_img(augmented, 0.8, -20, 20)
                augmented = random_flip_img(augmented, 0.5, 0.5)
                augmented = random_translate_img(augmented, XYRange(-30, 30, -30, 30, 0.8))
                images = augmented[:-1]
                overlay = augmented[-1]

            images = [prepare_image_for_net(frame) for frame in images]
            overlay = prepare_image_for_net(overlay)

            # Stack the frames and move the frame axis to the last position.
            images3d = numpy.vstack(images)
            images3d = images3d.swapaxes(0, 3)

            img_list.append(images3d)
            overlay_list.append(overlay)
            if len(img_list) >= batch_size:
                x = numpy.vstack(img_list)
                y = numpy.vstack(overlay_list)
                yield x, y
                img_list = []
                overlay_list = []


def get_unet(learn_rate, load_weights_path=None) -> Model:
    """Build and compile the U-Net style segmentation model.

    The input is a square image of side ``settings.SEGMENTER_IMG_SIZE`` with
    ``CHANNEL_COUNT`` channels.  The encoder has five max-pooling stages
    with 32, 64, 96, 128 and 128 filters, followed by a 128-filter
    bottleneck; the decoder up-samples five times and concatenates the
    encoder outputs conv5, conv4, conv3 and conv2 (conv1 has no skip
    connection).  The output is a single sigmoid channel.

    Args:
        learn_rate: learning rate of the SGD optimizer (momentum 0.9, Nesterov).
        load_weights_path: currently unused; the ``load_weights`` call below
            is commented out, so callers have to load weights themselves.

    Returns:
        The compiled model; its summary is printed as a side effect.
    """
    inputs = Input((settings.SEGMENTER_IMG_SIZE, settings.SEGMENTER_IMG_SIZE, CHANNEL_COUNT))
    filter_size = 32
    growth_step = 32
    # Encoder stage 1: 32 filters.
    x = BatchNormalization()(inputs)
    conv1 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(x)
    conv1 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(conv1)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)

    # Encoder stage 2: 64 filters.
    pool1 = BatchNormalization()(pool1)
    filter_size += growth_step
    conv2 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(pool1)
    conv2 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(conv2)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
    pool2 = BatchNormalization()(pool2)

    # Encoder stage 3: 96 filters.
    filter_size += growth_step
    conv3 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(pool2)
    conv3 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(conv3)
    pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)
    pool3 = BatchNormalization()(pool3)

    # Encoder stage 4: 128 filters.
    filter_size += growth_step
    conv4 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(pool3)
    conv4 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(conv4)
    pool4 = MaxPooling2D(pool_size=(2, 2))(conv4)
    pool4 = BatchNormalization()(pool4)

    # Encoder stage 5: the filter count stays at 128 from here on.
    conv5 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(pool4)
    conv5 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same', name="conv5b")(conv5)
    pool5 = MaxPooling2D(pool_size=(2, 2), name="pool5")(conv5)
    pool5 = BatchNormalization()(pool5)

    # Bottleneck at the lowest resolution.
    conv6 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(pool5)
    conv6 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same', name="conv6b")(conv6)

    # Decoder: up-sample and concatenate the matching encoder output on the channel axis.
    up6 = UpSampling2D(size=(2, 2), name="up6")(conv6)
    up6 = merge([up6, conv5], mode='concat', concat_axis=3)
    up6 = BatchNormalization()(up6)

    # up6 = SpatialDropout2D(0.1)(up6)
    filter_size -= growth_step
    conv66 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(up6)
    conv66 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(conv66)

    up7 = merge([UpSampling2D(size=(2, 2))(conv66), conv4], mode='concat', concat_axis=3)
    up7 = BatchNormalization()(up7)
    # up7 = SpatialDropout2D(0.1)(up7)

    filter_size -= growth_step
    conv7 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(up7)
    conv7 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(conv7)

    up8 = merge([UpSampling2D(size=(2, 2))(conv7), conv3], mode='concat', concat_axis=3)
    up8 = BatchNormalization()(up8)
    filter_size -= growth_step
    conv8 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(up8)
    conv8 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(conv8)


    # filter_size is not reduced again: conv9 keeps the 32 filters of conv8.
    up9 = merge([UpSampling2D(size=(2, 2))(conv8), conv2], mode='concat', concat_axis=3)
    up9 = BatchNormalization()(up9)
    conv9 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(up9)
    conv9 = Convolution2D(filter_size, 3, 3, activation='relu', border_mode='same')(conv9)
    # conv9 = BatchNormalization()(conv9)

    # Final up-sampling (no skip connection) and a 1x1 convolution to one sigmoid channel.
    up10 = UpSampling2D(size=(2, 2))(conv9)
    conv10 = Convolution2D(1, 1, 1, activation='sigmoid')(up10)

    model = Model(input=inputs, output=conv10)
    # model.load_weights(load_weights_path)
    # model.compile(optimizer=Adam(lr=1.0e-5), loss=dice_coef_loss, metrics=[dice_coef])
    model.compile(optimizer=SGD(lr=learn_rate, momentum=0.9, nesterov=True), loss=dice_coef_loss, metrics=[dice_coef])

    model.summary()
    return model


def train_model(holdout, model_type, continue_from=None):
    """Train the segmenter for one holdout fold and publish the best weights.

    The best checkpoint (lowest ``val_loss``) is written to ``workdir/`` and,
    once training has finished, copied to ``models/``.

    Args:
        holdout: fold number that is kept apart for validation.
        model_type: model name; selects the epoch count (200 for "masses",
            otherwise 50) and is part of all output file names.
        continue_from: optional weights file.  When given, training resumes
            from it with a learning rate of 0.0001 instead of 0.001.
    """
    batch_size = 4
    train_percentage = 80 if model_type == "masses" else 90
    train_files, holdout_files = get_train_holdout_files(model_type, holdout, train_percentage, frame_count=CHANNEL_COUNT)

    # Smoke test: pull a few augmented batches and verify that images and
    # overlays have the size the network expects before the real run starts.
    tmp_gen = image_generator(train_files[:2], 2, True, model_type)
    for _ in range(10):
        x = next(tmp_gen)
        x[0][0].reshape((settings.SEGMENTER_IMG_SIZE, settings.SEGMENTER_IMG_SIZE))
        x[1][0].reshape((settings.SEGMENTER_IMG_SIZE, settings.SEGMENTER_IMG_SIZE))

    train_gen = image_generator(train_files, batch_size, True, model_type)
    holdout_gen = image_generator(holdout_files, batch_size, False, model_type)

    if continue_from is None:
        model = get_unet(0.001)
    else:
        model = get_unet(0.0001)
        model.load_weights(continue_from)

    # Checkpoints and the final copy need their target directories to exist.
    os.makedirs("workdir", exist_ok=True)
    os.makedirs("models", exist_ok=True)

    name_prefix = model_type + "_model_h" + str(holdout)
    checkpoint1 = ModelCheckpoint("workdir/" + name_prefix + "_{epoch:02d}-{val_loss:.2f}.hd5", monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', period=1)
    checkpoint2 = ModelCheckpoint("workdir/" + name_prefix + "_best.hd5", monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', period=1)
    # Every tenth holdout file is used for the per-epoch prediction dump.
    dumper = DumpPredictions(holdout_files[::10], model_type)
    epoch_div = 1
    epoch_count = 200 if model_type == "masses" else 50
    # Sample counts must be integers; "/" would produce floats on Python 3.
    model.fit_generator(train_gen, len(train_files) // epoch_div, epoch_count, validation_data=holdout_gen, nb_val_samples=len(holdout_files) // epoch_div, callbacks=[checkpoint1, checkpoint2, dumper])
    shutil.copy("workdir/" + name_prefix + "_best.hd5", "models/" + name_prefix + "_best.hd5")

def predict_patients(patients_dir, model_path, holdout, patient_predictions, model_type):
    """Run the segmenter over the slices of every patient and record the totals.

    For each patient directory the matching slice images are predicted, the
    thresholded prediction is blended over the source slice and saved next
    to it, and the summed prediction is appended to ``patient_predictions``.
    The accumulated list is written to ``<model_type>_predictions.csv`` under
    ``settings.BASE_DIR`` after every patient.

    Args:
        patients_dir: directory with one sub-directory per patient; the
            path is used as a prefix and must end with a separator.
        model_path: weights file to load into the network.
        holdout: when >= 0 only patients of this fold are processed
            (patients with a negative fold count as fold 0); a negative
            value processes everybody.
        patient_predictions: list that receives ``(patient_id, mass)``
            tuples; it is modified in place.
        model_type: "masses" reads ``*_i.png`` slices, any other type reads
            ``*_c.png``; "emphysema" only uses the second half of the slices.
    """
    net = get_unet(0.001)
    net.load_weights(model_path)
    img_type = "_i" if model_type == "masses" else "_c"

    for item_name in os.listdir(patients_dir):
        if not os.path.isdir(patients_dir + item_name):
            continue
        patient_id = item_name

        if holdout >= 0:
            fold = helpers.get_patient_fold(patient_id, submission_set_neg=True)
            # Negative folds are treated as fold 0, all others are taken modulo 3.
            effective_fold = 0 if fold < 0 else fold % 3
            if effective_fold != holdout:
                continue

        print(patient_id)
        slice_paths = glob.glob(patients_dir + patient_id + "/" + "*" + img_type + ".png")
        if model_type == "emphysema":
            slice_paths = slice_paths[int(len(slice_paths) / 2):]

        mass = 0
        for slice_path in slice_paths:
            gray = cv2.imread(slice_path, cv2.IMREAD_GRAYSCALE)
            gray = cv2.resize(gray, dsize=(settings.SEGMENTER_IMG_SIZE, settings.SEGMENTER_IMG_SIZE))
            net_input = prepare_image_for_net(gray)

            prediction = net.predict(net_input, batch_size=1)
            # Everything below 0.5 counts as background.
            prediction[prediction < 0.5] = 0
            mass += prediction.sum()
            mask = prediction[0, :, :, 0] * 255

            background = net_input.reshape((settings.SEGMENTER_IMG_SIZE, settings.SEGMENTER_IMG_SIZE))
            background *= 255
            # Overlay: 20% prediction mask on top of 80% source slice.
            blended = cv2.addWeighted(mask.astype(numpy.uint8), 0.2, background.astype(numpy.uint8), 1 - 0.2, 0)
            cv2.imwrite(slice_path.replace(img_type + ".png", "_" + model_type + "o.png"), blended)

        if mass > 1:
            print(model_type + ": ", mass)
        patient_predictions.append((patient_id, mass))
        # Written after every patient, so partial results are on disk if the run stops.
        df = pandas.DataFrame(patient_predictions, columns=["patient_id", "prediction"])
        df.to_csv(settings.BASE_DIR + model_type + "_predictions.csv", index=False)


if __name__ == "__main__":
    continue_from = None

    # Stage 1: train one model per holdout fold.
    for model_type_name in ["masses"]:
        for holdout_no in (0, 1, 2):
            train_model(holdout=holdout_no, model_type=model_type_name, continue_from=continue_from)

    # Stage 2: predict each fold with the model that did not see it during training.
    for model_type_name in ["masses"]:
        patient_predictions_global = []
        for holdout_no in [0, 1, 2]:
            patient_base_dir = settings.NDSB3_EXTRACTED_IMAGE_DIR
            best_model_path = "models/" + model_type_name + "_model_h" + str(holdout_no) + "_best.hd5"
            predict_patients(patients_dir=patient_base_dir, model_path=best_model_path, holdout=holdout_no, patient_predictions=patient_predictions_global, model_type=model_type_name)




SyntaxError: invalid syntax (<ipython-input-24-64ca89b38aa7>, line 227)

In [12]:
from types import ListType,TupleType

In [18]:
from typing import List, Tuple

In [20]:
import settings
import helpers

import os
import glob
import random
import ntpath
import cv2
import numpy
from typing import List, Tuple
from keras.optimizers import Adam, SGD
from keras.layers import Input, Convolution2D, MaxPooling2D, UpSampling2D, merge, BatchNormalization, SpatialDropout2D
from keras.models import Model
from keras import backend as K
from keras.callbacks import ModelCheckpoint, Callback
from scipy.ndimage.interpolation import map_coordinates
from scipy.ndimage.filters import gaussian_filter
import pandas
import shutil

# Same constants as in the earlier segmenter cell (re-run of its import section).
MEAN_FRAME_COUNT = 1
CHANNEL_COUNT = 1

ImportError: cannot import name Model