In [1]:
import os
from pathlib import Path

# Directory that holds annotations.csv and candidates.csv.
# Set the LUNA_DATA_DIR environment variable to point the notebook at another
# copy of the data; when it is unset (or empty) the original local path is used.
DATA_PATH = Path(os.environ.get("LUNA_DATA_DIR") or "/mnt/e/dev/data/luna")
ANNOTATIONS_PATH = DATA_PATH / "annotations.csv"
CANDIDATES_PATH = DATA_PATH / "candidates.csv"

In [2]:
import pandas as pd

# Read the annotations table and preview its first five rows.
annotations_df = pd.read_csv(ANNOTATIONS_PATH)
annotations_df.head(n=5)

Unnamed: 0,seriesuid,coordX,coordY,coordZ,diameter_mm
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-128.699421,-175.319272,-298.387506,5.651471
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,103.783651,-211.925149,-227.12125,4.224708
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793...,69.639017,-140.944586,876.374496,5.786348
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,-24.013824,192.102405,-391.081276,8.143262
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,2.441547,172.464881,-405.493732,18.54515


In [3]:
# Read the candidates table and preview its first five rows.
candidates_df = pd.read_csv(CANDIDATES_PATH)
candidates_df.head(n=5)

Unnamed: 0,seriesuid,coordX,coordY,coordZ,class
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-56.08,-67.85,-311.92,0
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,53.21,-244.41,-245.17,0
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,103.66,-121.8,-286.62,0
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-33.66,-72.75,-308.41,0
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-32.25,-85.36,-362.51,0


In [4]:
# Sizes of the two raw tables, followed by a separator line.
print(
    f"annotations.shape={annotations_df.shape}",
    f"candidates.shape={candidates_df.shape}",
    sep="\n",
)
print("=" * 30)

# Split the candidates on the `class` flag:
#   1 -> nodule (not necessarily malignant), 0 -> non-nodule.
candidates_nodule_df = candidates_df.loc[candidates_df["class"].eq(1)]
candidates_non_nodule_df = candidates_df.loc[candidates_df["class"].eq(0)]

print(
    f"candidates_nodule_df.shape={candidates_nodule_df.shape}",
    f"candidates_non_nodule_df.shape={candidates_non_nodule_df.shape}",
    sep="\n",
)

annotations.shape=(1186, 5)
candidates.shape=(551065, 5)
candidates_nodule_df.shape=(1351, 5)
candidates_non_nodule_df.shape=(549714, 5)


In [5]:
# Single series used for the side-by-side comparison below; kept in one place
# so both filters are guaranteed to look at the same scan.
example_series_uid = "1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222365663678666836860"

# Candidates of that series that are flagged as nodules (class == 1).
candidates = candidates_df[
    (candidates_df["seriesuid"] == example_series_uid)
    & (candidates_df["class"] == 1)
]

# Annotations recorded for the same series.
annotated_candidates = annotations_df[
    annotations_df["seriesuid"] == example_series_uid
]

# Doctor-annotated data from annotations.csv has different X, Y, Z coordinates
# than the automatic nodule detection data in candidates.csv.
#
# annotations.csv: ground truth, manually annotated by radiologists
# candidates.csv: output of an automatic nodule detection algorithm; class == 1
#                 if the candidate falls within the doctor-annotated nodule diameter
#
# Show the second nodule candidate of the series next to its first annotation.
print(candidates.iloc[1])
print(annotated_candidates.iloc[0])

seriesuid    1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...
coordX                                                 -128.94
coordY                                                 -175.04
coordZ                                                 -297.87
class                                                        1
Name: 78, dtype: object
seriesuid      1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...
coordX                                               -128.699421
coordY                                               -175.319272
coordZ                                               -298.387506
diameter_mm                                             5.651471
Name: 0, dtype: object


In [10]:
from collections import namedtuple
import math

# Record describing one candidate location.
# Instances compare as plain tuples, so the field order below is also the
# order in which they are ranked when a list of them is sorted.
CandidateInfoTuple = namedtuple(
    "CandidateInfoTuple",
    ("is_nodule", "diameter_mm", "series_uid", "center_xyz"),
)

def _coord_distance(first_coord, second_coord):
    return math.sqrt(
        (first_coord[0] - second_coord[0]) ** 2 +
        (first_coord[1] - second_coord[1]) ** 2 +
        (first_coord[2] - second_coord[2]) ** 2
    )

def getCandidateInfo(annotations_path: str, candidates_path: str) -> list[CandidateInfoTuple]:
    """Build one ``CandidateInfoTuple`` per row of the candidates CSV.

    Every candidate is compared against the annotations that share its series
    UID. The first such annotation (in annotations-file order) whose center is
    closer than a quarter of that annotation's diameter supplies the
    candidate's ``diameter_mm``; candidates without such an annotation keep
    ``diameter_mm = 0``.

    Columns are read by name (``seriesuid``, ``coordX``, ``coordY``,
    ``coordZ`` plus ``diameter_mm`` for annotations and ``class`` for
    candidates), so the result does not depend on the column order of the
    files.

    Args:
        annotations_path: Location of the annotations CSV; forwarded
            unchanged to ``pd.read_csv``.
        candidates_path: Location of the candidates CSV; forwarded unchanged
            to ``pd.read_csv``.

    Returns:
        A list with one entry per candidates row, in file order.

    Raises:
        KeyError: If one of the required columns is missing from a file.
    """
    annotations_df = pd.read_csv(filepath_or_buffer=annotations_path)
    candidates_df = pd.read_csv(filepath_or_buffer=candidates_path)

    # series UID -> [(annotation center (x, y, z), annotation diameter), ...]
    annotations_by_series: dict[str, list[tuple[tuple[float, float, float], float]]] = {}
    for series_uid, coord_x, coord_y, coord_z, diameter in zip(
        annotations_df["seriesuid"],
        annotations_df["coordX"],
        annotations_df["coordY"],
        annotations_df["coordZ"],
        annotations_df["diameter_mm"],
    ):
        annotations_by_series.setdefault(series_uid, []).append(
            ((coord_x, coord_y, coord_z), diameter)
        )

    candidate_info_list: list[CandidateInfoTuple] = []
    # `class` is a Python keyword, so it cannot be read as a row attribute;
    # selecting the column by name avoids relying on its position in the file.
    for series_uid, coord_x, coord_y, coord_z, is_nodule in zip(
        candidates_df["seriesuid"],
        candidates_df["coordX"],
        candidates_df["coordY"],
        candidates_df["coordZ"],
        candidates_df["class"],
    ):
        center_xyz = (coord_x, coord_y, coord_z)
        diameter_mm = 0

        for annotation_xyz, annotation_diameter in annotations_by_series.get(series_uid, ()):
            # Match when the candidate lies within a quarter of the annotated
            # diameter (i.e. half the radius) of the annotation center.
            if _coord_distance(annotation_xyz, center_xyz) < annotation_diameter / 4:
                diameter_mm = annotation_diameter
                break

        candidate_info_list.append(
            CandidateInfoTuple(
                is_nodule=bool(is_nodule),
                diameter_mm=diameter_mm,
                series_uid=series_uid,
                center_xyz=center_xyz,
            )
        )

    return candidate_info_list

# Build the candidate records from the two CSV files.
candidates_info = getCandidateInfo(ANNOTATIONS_PATH, CANDIDATES_PATH)

# Descending tuple order: nodules first, then larger diameters first.
candidates_info.sort(reverse=True)
print(candidates_info[:10])

# Number of candidates with a positive diameter_mm, i.e. those that were
# matched to an annotation.
print(sum(info.diameter_mm > 0 for info in candidates_info))

[CandidateInfoTuple(is_nodule=True, diameter_mm=32.27003025, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.287966244644280690737019247886', center_xyz=(67.61451718, 85.02525992, -109.8084416)), CandidateInfoTuple(is_nodule=True, diameter_mm=30.61040636, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.112740418331256326754121315800', center_xyz=(47.90350511, 37.60442008, -99.93417567)), CandidateInfoTuple(is_nodule=True, diameter_mm=27.44242293, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.943403138251347598519939390311', center_xyz=(-45.29440163, 74.86925386, -97.52812481)), CandidateInfoTuple(is_nodule=True, diameter_mm=27.07544345, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.481278873893653517789960724156', center_xyz=(-102.571208, -5.186558766, -205.1033412)), CandidateInfoTuple(is_nodule=True, diameter_mm=26.83708074, series_uid='1.3.6.1.4.1.14519.5.2.1.6279.6001.487268565754493433372433148666', center_xyz=(121.152909372, 12.9136003304, -159.399497186)), CandidateInfoTuple(is_nodu