#!/usr/bin/env python
"""
checker.py:
This script compares anonymized and non-anonymized DICOM files to identify matching files based on the `SOPInstanceUID` metadata.
It is designed for large datasets: non-anonymized files are indexed once by `SOPInstanceUID`, and anonymized files are then processed in parallel batches to keep memory use and runtime manageable.

Key Functions:
- Compare anonymized and non-anonymized DICOM files in batches.
- Index non-anonymized files by `SOPInstanceUID` for fast lookup.
- Move files to the `checked` or `unsolvable` directories based on the comparison results.

Expected Input:
- Anonymized and non-anonymized DICOM files in separate directories.
- A CSV file containing mappings from real patient IDs to anonymized patient IDs (its path is defined below for reference; this script does not read it directly).

Output:
- The script moves anonymized files to the `checked` directory if a match is found.
- If no match is found, the files are moved to the `unsolvable` directory.
- The script logs the progress and results of the comparison.

Intended Use Case:
- This script is useful for validating the anonymization process and ensuring that the correct files are anonymized.
- It can be used as part of a data curation pipeline to verify the integrity of DICOM files.

Customization & Flexibility:
- The script can be extended to support additional metadata fields for comparison.
- The batch size can be adjusted to optimize performance based on the available system resources.
- The script can be adapted to handle other types of medical imaging data or metadata.

Performance & Compatibility:
- The script is designed for performance and efficiency when processing large datasets.
- It uses multiprocessing to parallelize the comparison of DICOM files and optimize resource utilization.

Best Practices & Maintenance:
- The script follows best practices for error handling, logging, and code readability.
- It is well-documented and can be easily maintained or extended by other developers.
- The script is designed to be robust and reliable for long-term use in data curation workflows.

Notes:
- This script is part of a larger data curation pipeline for multimodal breast imaging data.
- It is optimized for processing DICOM files but can be adapted for other types of medical imaging data.
- The script is designed to be run from the command line or as part of an automated workflow.
"""
__author__ = "Francisco Maria Calisto"
__maintainer__ = "Francisco Maria Calisto"
__email__ = "francisco.calisto@tecnico.ulisboa.pt"
__license__ = "ACADEMIC & COMMERCIAL"
__version__ = "1.0.3"
__status__ = "Development"
__credits__ = ["Carlos Santiago",
"Catarina Barata",
"Jacinto C. Nascimento",
"Diogo Araújo"]
import os
import csv
import logging
import warnings
import pydicom
import shutil
from urllib3.exceptions import NotOpenSSLWarning
from multiprocessing import Pool, cpu_count
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Suppress warnings for clean output
warnings.filterwarnings("ignore", category=NotOpenSSLWarning)
# Define supported modalities (the DICOM Modality tag (0008,0060) uses 'MR' for magnetic resonance, not 'MRI')
SUPPORTED_MODALITIES = ['US', 'MG', 'MR']
# Mapping file name
mapping_fn = "mamo_patients_mapping_data.csv"
# Define paths
root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
anonymized_dir = os.path.join(root_dir, "dataset-multimodal-breast", "data", "curation", "unchecked")
checked_dir = os.path.join(root_dir, "dataset-multimodal-breast", "data", "curation", "checked")
unsolvable_dir = os.path.join(root_dir, "dataset-multimodal-breast", "data", "curation", "unsolvable")
non_anonymized_dir = os.path.join(root_dir, "dicom-images-breast", "known", "raw")
output_csv_file = os.path.join(root_dir, "dicom-images-breast", "data", "checking", mapping_fn)
BATCH_SIZE = 1000 # Define the size of each batch for processing large datasets

def is_supported_modality(dicom_file):
    """Check if the DICOM file's modality is supported (see SUPPORTED_MODALITIES)."""
    dicom_modality = dicom_file.get("Modality", "").upper()
    return dicom_modality in SUPPORTED_MODALITIES

def is_dicom_file(filepath):
    """Check if a file is a valid DICOM file by attempting to read it."""
    try:
        pydicom.dcmread(filepath, stop_before_pixels=True)
        return True
    except Exception as e:
        logging.warning(f"Not a DICOM file: {filepath} - {e}")
        return False
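
# Note: reading with stop_before_pixels=True above keeps this validation cheap; only the
# DICOM header is parsed, so pixel data never has to be loaded just to confirm the file type.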

def index_non_anonymized_files(non_anonymized_files):
    """Index non-anonymized files by SOPInstanceUID for fast lookup."""
    sop_uid_index = {}
    for non_anonymized_file in non_anonymized_files:
        try:
            metadata = pydicom.dcmread(non_anonymized_file, stop_before_pixels=True)
            if hasattr(metadata, 'SOPInstanceUID'):
                sop_uid = metadata.SOPInstanceUID
                sop_uid_index[sop_uid] = non_anonymized_file
        except Exception as e:
            logging.warning(f"Failed to read non-anonymized file: {non_anonymized_file} - {e}")
    logging.info(f"Indexed {len(sop_uid_index)} non-anonymized files.")
    return sop_uid_index
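
# Illustrative shape of the index built above (hypothetical entry, for clarity only):
#   sop_uid_index = {"<SOPInstanceUID>": "/.../dicom-images-breast/known/raw/.../image.dcm", ...}
# Keying the dict on SOPInstanceUID makes each anonymized-file lookup O(1) instead of a
# scan over every non-anonymized file.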

def batch_process_dicom_files(anonymized_files_batch, sop_uid_index):
    """Process a batch of anonymized DICOM files and compare them with the indexed non-anonymized files."""
    for anonymized_filepath in anonymized_files_batch:
        if not is_dicom_file(anonymized_filepath):
            continue
        # Only metadata is needed for the comparison, so skip pixel data for speed
        anonymized_dicom = pydicom.dcmread(anonymized_filepath, stop_before_pixels=True)
        if not hasattr(anonymized_dicom, 'SOPInstanceUID'):
            logging.warning(f"SOPInstanceUID not found in anonymized DICOM file: {anonymized_filepath}")
            move_to_unsolvable(anonymized_filepath)
            continue
        if not is_supported_modality(anonymized_dicom):
            logging.info(f"Skipping unsupported modality for file: {anonymized_filepath}")
            continue
        anonymized_sop_uid = anonymized_dicom.SOPInstanceUID
        match_filepath = sop_uid_index.get(anonymized_sop_uid)
        if match_filepath:
            move_to_checked(anonymized_filepath)
        else:
            move_to_unsolvable(anonymized_filepath)
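
# Note on the parallel dispatch below: with multiprocessing.Pool, the sop_uid_index dict is
# pickled and sent to a worker process for every submitted batch, so a very large index adds
# per-batch serialization overhead. This is acceptable for a metadata-only dict but worth
# keeping in mind when scaling up.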

def compare_dicom_files_in_batches(anonymized_path, non_anonymized_path):
    """Compare anonymized and non-anonymized DICOM files in batches to handle large datasets efficiently."""
    anonymized_files = get_all_files(anonymized_path)
    non_anonymized_files = get_all_files(non_anonymized_path)
    # Index non-anonymized files by SOPInstanceUID for fast lookup
    sop_uid_index = index_non_anonymized_files(non_anonymized_files)
    # Use multiprocessing to process batches in parallel (ceiling division avoids an empty trailing batch)
    num_batches = (len(anonymized_files) + BATCH_SIZE - 1) // BATCH_SIZE
    pool = Pool(cpu_count())
    results = []
    for i in range(num_batches):
        batch = anonymized_files[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]
        results.append(pool.apply_async(batch_process_dicom_files, args=(batch, sop_uid_index)))
    pool.close()
    pool.join()
    # Surface exceptions raised inside worker processes instead of silently dropping them
    for result in results:
        try:
            result.get()
        except Exception as e:
            logging.error(f"A batch failed during processing: {e}")
    logging.info("All batches processed.")

def get_all_files(directory):
    """Get all files recursively from a directory."""
    file_list = []
    for root, _, files in os.walk(directory):
        for file in files:
            file_list.append(os.path.join(root, file))
    return file_list

def move_to_checked(filepath):
    """Move the file to the 'checked' directory."""
    filename = os.path.basename(filepath)
    dest_path = os.path.join(checked_dir, filename)
    move_file(filepath, dest_path)
    logging.info(f"File {filename} moved to checked directory.")

def move_to_unsolvable(filepath):
    """Move the file to the 'unsolvable' directory."""
    filename = os.path.basename(filepath)
    dest_path = os.path.join(unsolvable_dir, filename)
    move_file(filepath, dest_path)
    logging.info(f"File {filename} moved to unsolvable directory.")

def move_file(src, dest):
    """Move a file from src to dest."""
    if os.path.exists(src):
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        shutil.move(src, dest)
        logging.info(f"File moved to {dest}")
    else:
        logging.warning(f"File not found: {src}")

if __name__ == '__main__':
    compare_dicom_files_in_batches(anonymized_dir, non_anonymized_dir)
    logging.info("Comparison complete!")
# End of file