In [14]:
import pandas as pd
import ast
from tqdm import tqdm
import os, pydicom

series_root = "/kaggle/input/rsna-intracranial-aneurysm-detection/series"
localizers_csv = "/kaggle/input/rsna-intracranial-aneurysm-detection/train_localizers.csv"
out_csv = "/kaggle/working/aneurysm_slices_with_z.csv"

In [8]:
df = pd.read_csv(localizers_csv)
df["coords"] = df["coordinates"].apply(ast.literal_eval)

In [13]:
rows = []
for _, r in tqdm(df.iterrows(), total=len(df), desc="Extraction des localisations"):
    uid = r["SeriesInstanceUID"]
    sop = r["SOPInstanceUID"]

    # --- lire InstanceNumber ---
    dcm_path = os.path.join(series_root, uid, f"{sop}.dcm")
    if os.path.exists(dcm_path):
        ds = pydicom.dcmread(dcm_path, stop_before_pixels=True, force=True)
        slice_num = getattr(ds, "InstanceNumber", None)
    else:
        slice_num = None

    rows.append({
        "series_uid": uid,
        "sop_uid": sop,
        "slice_index": slice_num,
        "aneurysm": 1,
        "x": r["coords"]["x"],
        "y": r["coords"]["y"],
        "locality": r["location"]
    })

Extraction des localisations: 100%|██████████| 2254/2254 [00:24<00:00, 93.05it/s] 


In [15]:
out_df = pd.DataFrame(rows)
out_df.to_csv(out_csv, index=False)


✅ Exporté vers /kaggle/working/aneurysm_slices_with_z.csv (2254 lignes)


In [17]:
import pandas as pd

out_csv = "aneurysm_slices_with_z.csv"
df = pd.read_csv(out_csv)

print("=== Aperçu général ===")
print(f"Total d’annotations : {len(df):,}")
print(f"Séries uniques : {df['series_uid'].nunique():,}")
print(f"Slices uniques : {df['sop_uid'].nunique():,}")

print("\n=== Distribution du nombre d’anévrismes par série ===")
count_per_series = df["series_uid"].value_counts()
print(count_per_series.describe())
print("\nTop 10 séries les plus annotées :")
print(count_per_series.head(10))

print("\n=== Localisations les plus fréquentes ===")
print(df["locality"].value_counts().head(10))




=== Aperçu général ===
Total d’annotations : 2,254
Séries uniques : 1,863
Slices uniques : 2,214

=== Distribution du nombre d’anévrismes par série ===
count    1863.000000
mean        1.209877
std         0.553739
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         5.000000
Name: count, dtype: float64

Top 10 séries les plus annotées :
series_uid
1.2.826.0.1.3680043.8.498.31629979420404800139928339434297456334    5
1.2.826.0.1.3680043.8.498.11292203154407642658894712229998766945    5
1.2.826.0.1.3680043.8.498.99028068919105186302294079606577228686    5
1.2.826.0.1.3680043.8.498.88662334466087798807484415780594176763    5
1.2.826.0.1.3680043.8.498.76928456732082261565048056589908832861    5
1.2.826.0.1.3680043.8.498.11527986509512933171256788651291467752    5
1.2.826.0.1.3680043.8.498.89343864244736172393080011763302720900    5
1.2.826.0.1.3680043.8.498.12903717039792869467373823954631668258    4
1.2.826.0.1.3680043.8.498.40603714009813152796

In [26]:
import pandas as pd

train_csv = "/kaggle/input/rsna-intracranial-aneurysm-detection/train.csv"

train = pd.read_csv(train_csv)
slices = pd.read_csv(out_csv)

train_pos = train[train["Aneurysm Present"] == 1].copy()

exclude = ["SeriesInstanceUID","PatientAge","PatientSex","Modality","Aneurysm Present"]
anatomy_cols = [c for c in train.columns if c not in exclude]

for c in anatomy_cols:
    train_pos[c] = pd.to_numeric(train_pos[c], errors="coerce").fillna(0).astype(int)

train_pos["n_localisations"] = train_pos[anatomy_cols].sum(axis=1)
multi_train = set(train_pos.loc[train_pos["n_localisations"] > 1, "SeriesInstanceUID"])

count_slices = slices["series_uid"].value_counts()
multi_slices = set(count_slices[count_slices > 1].index)

only_in_train = multi_train - multi_slices
only_in_slices = multi_slices - multi_train

print(f"Séries multi dans train mais pas dans localizers : {len(only_in_train)}")
print(f"Séries multi dans localizers mais pas dans train : {len(only_in_slices)}\n")

print("Exemples (train mais pas localizers):")
print(list(only_in_train)[:10])

print("\nExemples (localizers mais pas train):")
print(list(only_in_slices)[:10])


Séries multi dans train mais pas dans localizers : 1
Séries multi dans localizers mais pas dans train : 44

Exemples (train mais pas localizers):
['1.2.826.0.1.3680043.8.498.75798029534455454939797323020706657426']

Exemples (localizers mais pas train):
['1.2.826.0.1.3680043.8.498.13128656559176299272467358793386537400', '1.2.826.0.1.3680043.8.498.66886574765471860867952209867524090563', '1.2.826.0.1.3680043.8.498.11431853092322033942451801825977553068', '1.2.826.0.1.3680043.8.498.12780687841924878965940656634052376723', '1.2.826.0.1.3680043.8.498.11639720015527164474926997755882681707', '1.2.826.0.1.3680043.8.498.12600056406312244714678292907491453656', '1.2.826.0.1.3680043.8.498.77219897788345305448159915113352253551', '1.2.826.0.1.3680043.8.498.92038706147499683228381933293532888541', '1.2.826.0.1.3680043.8.498.50369188120242587742908379292729868174', '1.2.826.0.1.3680043.8.498.46532404637743026850058394297074115571']


Multi dans train mais pas dans out : erreurs sur les données
Multi dans out mais pas dans train : erreurs sur les données OU plusieurs anévrismes dans la même zone