From 2827aff4ed6b8b28e58207fc7f2dd83f31959369 Mon Sep 17 00:00:00 2001 From: Chinmay Vibhute Date: Thu, 1 Jul 2021 23:05:42 +0530 Subject: [PATCH 1/2] refactor meta extractor for unit tests --- modules/meta-extraction/MetadataExtractor.py | 180 +++++++++++-------- 1 file changed, 104 insertions(+), 76 deletions(-) diff --git a/modules/meta-extraction/MetadataExtractor.py b/modules/meta-extraction/MetadataExtractor.py index 9afb39b..8c98d3e 100644 --- a/modules/meta-extraction/MetadataExtractor.py +++ b/modules/meta-extraction/MetadataExtractor.py @@ -28,59 +28,22 @@ import json -with open('service/system.json', 'r') as f: - niffler = json.load(f) - -# Get constants from system.json -DCM4CHE_BIN = niffler['DCM4CHEBin'] -STORAGE_FOLDER = niffler['StorageFolder'] -FILE_PATH = niffler['FilePath'] -QUERY_AET = niffler['QueryAet'] - -# Global Constants: Configurations and folder locations -EXTRACTION_RUNNING = False -IS_DCM4CHE_NOT_RUNNING = True -logging.basicConfig(level=logging.INFO) - -FEATURES_FOLDER = os.getcwd() + "/conf/" -PICKLE_FOLDER = os.getcwd() + "/pickles/" - - -# Variables to track progress between iterations. -processed_series_but_yet_to_delete = list() -processed_and_deleted_series = list() - - -# Get features of a txt file -features_lists = list() -feature_files = list() - -# All processed series is saved between iterations as pickle files. -try: - with open(PICKLE_FOLDER + 'processed_series_but_yet_to_delete.pickle', 'rb') as f: - processed_series_but_yet_to_delete = pickle.load(f) -except: - logging.info("Unable to load a valid pickle file. Initialized with empty value for processed_series_but_yet_to_delete") - -try: - with open(PICKLE_FOLDER + 'processed_and_deleted_series.pickle', 'rb') as f: - processed_and_deleted_series = pickle.load(f) -except: - logging.info("Unable to load a valid pickle file. Initialized with empty value for processed_and_deleted_series") - -# Read the txt file which includes the features, then extract them -os.chdir(FEATURES_FOLDER) -txt_files = glob.glob('*.txt') -logging.info('Number of files consisting of the features to extract: %s', str(len(txt_files))) - -for file in txt_files: - filename, file_extension = os.path.splitext(file) - text_file = open(file, "r") - feature_list = text_file.read().split('\n') - del feature_list[-1] - features_lists.append(feature_list) - feature_files.append(filename) - +# Define Global Vars +# (Initialization outside of __main__ needed for mocking in unit tests) +# These are overwritten in __main__ +DCM4CHE_BIN = None +STORAGE_FOLDER = None +FILE_PATH = None +QUERY_AET = None +EXTRACTION_RUNNING = None +IS_DCM4CHE_NOT_RUNNING = None +FEATURES_FOLDER = None +PICKLE_FOLDER = None +DB = None +processed_series_but_yet_to_delete = None +processed_and_deleted_series = None +features_lists = None +feature_files = None # Function for getting tuple for field, val pairs for this file # plan is instance of dicom class, the data for single mammo file @@ -133,6 +96,7 @@ def get_dict_fields(bigdict, features): # The core method of extracting metadata def extract(): + global STORAGE_FOLDER os.chdir(STORAGE_FOLDER) if len([name for name in os.listdir(".") if os.path.isdir(name)]) == 0: # Print once if the storage folder is empty. @@ -156,6 +120,7 @@ def extract_metadata(): global EXTRACTION_RUNNING global features_lists global feature_files + global DB extracted_in_this_iteration = 0 first_inst_of_series = list() @@ -221,10 +186,11 @@ def extract_metadata(): # Delete the processed DICOM objects from the storage. 
 def clear_storage():
-    os.chdir(STORAGE_FOLDER)
+    global STORAGE_FOLDER
     global processed_series_but_yet_to_delete
     global processed_and_deleted_series
 
+    os.chdir(STORAGE_FOLDER)
     deleted_in_this_iteration = 0
 
     logging.info('Clean up process initiated at series level at: %s', str(datetime.datetime.now()))
@@ -260,7 +226,8 @@ def clear_storage():
 def update_pickle():
     global processed_series_but_yet_to_delete
     global processed_and_deleted_series
-
+    global PICKLE_FOLDER
+
     # Pickle using the highest protocol available.
     with open(PICKLE_FOLDER + 'processed_series_but_yet_to_delete.pickle', 'wb') as f:
         pickle.dump(processed_series_but_yet_to_delete, f, pickle.HIGHEST_PROTOCOL)
@@ -272,6 +239,7 @@ def update_pickle():
 
 # Measure storage folder disk space utilization
 def measure_diskutil():
+    global STORAGE_FOLDER
     total, used, free = shutil.disk_usage(STORAGE_FOLDER)
     logging.info("Disk space used by the Storage Folder: %d GB" % (used // (2**30)))
 
@@ -279,6 +247,10 @@ def measure_diskutil():
 # Run DCM4CHE only once, when the extraction script starts, and keep it running in a separate thread.
 def run_dcm4che():
     global IS_DCM4CHE_NOT_RUNNING
+    global STORAGE_FOLDER
+    global FILE_PATH
+    global QUERY_AET
+
     if IS_DCM4CHE_NOT_RUNNING:
         IS_DCM4CHE_NOT_RUNNING = False
         logging.info('Starting DCM4CHE..')
@@ -291,31 +263,87 @@ def run_threaded(job_func):
     job_thread.start()
 
 
-logging.info('The execution started at: %s', str(datetime.datetime.now()))
+if __name__ == "__main__":
 
-logging.debug('Debug logs enabled.')
+    with open('service/system.json', 'r') as f:
+        niffler = json.load(f)
+
+    # Set Niffler Config.
+    DCM4CHE_BIN = niffler['DCM4CHEBin']
+    STORAGE_FOLDER = niffler['StorageFolder']
+    FILE_PATH = niffler['FilePath']
+    QUERY_AET = niffler['QueryAet']
 
-# Create connections for communicating with Mongo DB server, to store metadata
-try:
-    if os.environ['MONGO_URI'] != "":
-        mongo_uri = 'mongodb://' + os.environ['MONGO_URI']
-    else:
+
+    # Global Constants: Configurations and folder locations
+    EXTRACTION_RUNNING = False
+    IS_DCM4CHE_NOT_RUNNING = True
+    logging.basicConfig(level=logging.INFO)
+
+    FEATURES_FOLDER = os.getcwd() + "/conf/"
+    PICKLE_FOLDER = os.getcwd() + "/pickles/"
+
+
+    # Variables to track progress between iterations.
+    processed_series_but_yet_to_delete = list()
+    processed_and_deleted_series = list()
+
+
+    # Feature lists parsed from the txt files in FEATURES_FOLDER
+    features_lists = list()
+    feature_files = list()
+
+    # All processed series are saved between iterations as pickle files.
+    try:
+        with open(PICKLE_FOLDER + 'processed_series_but_yet_to_delete.pickle', 'rb') as f:
+            processed_series_but_yet_to_delete = pickle.load(f)
+    except:
+        logging.info("Unable to load a valid pickle file. Initialized with empty value for processed_series_but_yet_to_delete")
+
+    try:
+        with open(PICKLE_FOLDER + 'processed_and_deleted_series.pickle', 'rb') as f:
+            processed_and_deleted_series = pickle.load(f)
+    except:
+        logging.info("Unable to load a valid pickle file. Initialized with empty value for processed_and_deleted_series")
+
+    # Read the txt files that list the features to extract
+    os.chdir(FEATURES_FOLDER)
+    txt_files = glob.glob('*.txt')
+    logging.info('Number of files consisting of the features to extract: %s', str(len(txt_files)))
+
+    for file in txt_files:
+        filename, file_extension = os.path.splitext(file)
+        text_file = open(file, "r")
+        feature_list = text_file.read().split('\n')
+        del feature_list[-1]
+        features_lists.append(feature_list)
+        feature_files.append(filename)
+
+    logging.info('The execution started at: %s', str(datetime.datetime.now()))
+
+    logging.debug('Debug logs enabled.')
+
+    # Create connections for communicating with Mongo DB server, to store metadata
+    try:
+        if os.environ['MONGO_URI'] != "":
+            mongo_uri = 'mongodb://' + os.environ['MONGO_URI']
+        else:
+            mongo_uri = 'mongodb://' + sys.argv[1]
+    except KeyError:
         mongo_uri = 'mongodb://' + sys.argv[1]
-except KeyError:
-    mongo_uri = 'mongodb://' + sys.argv[1]
 
-client = pymongo.MongoClient(mongo_uri)
-DB = client.ScannersInfo
+    client = pymongo.MongoClient(mongo_uri)
+    DB = client.ScannersInfo
 
-# The thread scheduling
-schedule.every(5).minutes.do(run_threaded, run_dcm4che)
-schedule.every(1).hours.do(run_threaded, measure_diskutil)
-schedule.every(1).day.at("23:59").do(run_threaded, clear_storage)
-schedule.every(20).minutes.do(run_threaded, update_pickle)
-schedule.every(10).minutes.do(run_threaded, extract)
+    # The thread scheduling
+    schedule.every(5).minutes.do(run_threaded, run_dcm4che)
+    schedule.every(1).hours.do(run_threaded, measure_diskutil)
+    schedule.every(1).day.at("23:59").do(run_threaded, clear_storage)
+    schedule.every(20).minutes.do(run_threaded, update_pickle)
+    schedule.every(10).minutes.do(run_threaded, extract)
 
-# Keep running in a loop.
-while True:
-    schedule.run_pending()
-    time.sleep(1)
+    # Keep running in a loop.
+    while True:
+        schedule.run_pending()
+        time.sleep(1)

From 2dd50e7ef6fdb6284b89b6ab65d921872fd22b84 Mon Sep 17 00:00:00 2001
From: Chinmay Vibhute
Date: Thu, 1 Jul 2021 23:06:24 +0530
Subject: [PATCH 2/2] meta extraction unit tests

---
 .gitignore                                  |   3 +-
 tests/README.md                             |   9 +-
 tests/data/meta-extraction/input/.gitkeep   |   0
 tests/data/meta-extraction/input/no-img.dcm |   0
 tests/unit/test_meta_extraction.py          | 189 ++++++++++++++++++++
 5 files changed, 198 insertions(+), 3 deletions(-)
 create mode 100644 tests/data/meta-extraction/input/.gitkeep
 create mode 100644 tests/data/meta-extraction/input/no-img.dcm
 create mode 100644 tests/unit/test_meta_extraction.py

diff --git a/.gitignore b/.gitignore
index f4858d0..85fa31e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,5 @@ htmlcov
 coverage.xml
 /tests/data/tmp
 /tests/data/**/*.dcm
-!/tests/data/**/no-img.dcm
\ No newline at end of file
+/tests/data/**/*.pickle
+!/tests/data/**/no-img.dcm
diff --git a/tests/README.md b/tests/README.md
index 3580cfe..504ff51 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -10,12 +10,18 @@ pip install -r requirements-dev.txt
 
 Add the test data in `/tests/data//input` for respective tests.
 
-### PNG Extraction Data Setup
+### PNG Extraction Test Data Setup
 
 Test data in `/tests/data/png-extraction/input`.
 
 For unit tests, add a valid dcm file, with name `test-img.dcm`.
 
+### Meta Extraction Test Data Setup
+
+Test data in `/tests/data/meta-extraction/input`.
+
+For unit tests, add a valid dcm file, with name `test-img.dcm`.
+
 ## Running Tests
 
 Initialize the required data, and run tests from ``.
@@ -31,4 +37,3 @@ pytest ./tests --cov=./modules --cov-report=html ``` and open the `/htmlcov/index.html` - diff --git a/tests/data/meta-extraction/input/.gitkeep b/tests/data/meta-extraction/input/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/meta-extraction/input/no-img.dcm b/tests/data/meta-extraction/input/no-img.dcm new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/test_meta_extraction.py b/tests/unit/test_meta_extraction.py new file mode 100644 index 0000000..52bb649 --- /dev/null +++ b/tests/unit/test_meta_extraction.py @@ -0,0 +1,189 @@ +import glob +import os +import pickle +import pytest +import pydicom +import sys + +from pathlib import Path, PurePath +from pytest_mock import MockerFixture + +niffler_modules_path = Path.cwd() / 'modules' +meta_extraction_path = niffler_modules_path / 'meta-extraction' +sys.path.append(str(meta_extraction_path)) +import MetadataExtractor + + +@pytest.fixture(autouse=True) +def mock_logger(mocker: MockerFixture): + return mocker.patch('MetadataExtractor.logging') + + +@pytest.fixture(autouse=True) +def mock_global_config(mocker: MockerFixture): + STORAGE_FOLDER = pytest.data_dir / 'meta-extraction' / 'storage_folder' + PICKLE_FOLDER = pytest.out_dir / 'meta-extraction' / 'pickles' + pytest.create_dirs(STORAGE_FOLDER, PICKLE_FOLDER) + return mocker.patch.multiple( + MetadataExtractor, + STORAGE_FOLDER=str(STORAGE_FOLDER), + PICKLE_FOLDER=str(PICKLE_FOLDER) + "/" + ) + + +@pytest.fixture +def features_lists(): + features_lists = list() + txt_files = glob.glob( + str(meta_extraction_path / 'conf') + '/*.txt' + ) + for file in txt_files: + text_file = open(file, "r") + feature_list = text_file.read().split('\n') + del feature_list[-1] + features_lists.append(feature_list) + return features_lists + + +@pytest.fixture +def feature_files(): + feature_files = list() + txt_files = glob.glob( + str(meta_extraction_path / 'conf') + '/*.txt' + ) + for file in txt_files: + filename, ext = os.path.splitext(file) + feature_files.append(filename) + return feature_files + + +class TestGetTuples: + test_dcm_file = str( + pytest.data_dir / 'meta-extraction' / 'input' / 'test-img.dcm') + test_valid_plan = pydicom.dcmread(test_dcm_file) + + def test_correct_output(self, features_lists): + for features in features_lists: + first_key = features[0] + tuple_list = MetadataExtractor.get_tuples( + self.test_valid_plan, features) + assert tuple_list[0][0] == first_key + + def test_correct_output_with_key(self, features_lists): + key = "rand_key" + for features in features_lists: + first_key = key + "_" + features[0] + tuple_list = MetadataExtractor.get_tuples( + self.test_valid_plan, features, key=key) + assert tuple_list[0][0] == first_key + + def test_feature_error(self, mock_logger): + invalid_features = ["some_invalid_feature"] + MetadataExtractor.get_tuples(self.test_valid_plan, invalid_features) + MetadataExtractor.logging.debug.assert_called_once() + + # TODO image with feature value of type pydicom.sequence.Sequence + # TODO minor code coverage + + +class TestGetDictFields: + test_dcm_file = str( + pytest.data_dir / 'meta-extraction' / 'input' / 'test-img.dcm') + test_valid_plan = pydicom.dcmread(test_dcm_file) + + def test_success(self, features_lists): + for features in features_lists: + tuple_list = MetadataExtractor.get_tuples( + self.test_valid_plan, features) + dict_fields = MetadataExtractor.get_dict_fields( + dict(tuple_list), features) + assert dict_fields[tuple_list[0][0]] == tuple_list[0][1] + + +class 
TestMeasureDiskUtil: + + def test_measure_diskutil(self, mock_logger, mock_global_config): + MetadataExtractor.measure_diskutil() + MetadataExtractor.logging.info.assert_called_once() + + +class TestUpdatePickle: + + def setup_method(self): + self.processed_series_but_yet_to_delete = [] + self.processed_and_deleted_series = [] + + def test_update_success(self, mocker, mock_logger, mock_global_config): + mocker.patch.multiple( + MetadataExtractor, + processed_series_but_yet_to_delete=self.processed_series_but_yet_to_delete, + processed_and_deleted_series=self.processed_and_deleted_series, + ) + # Function Completes write without error + MetadataExtractor.update_pickle() + MetadataExtractor.logging.debug.assert_called_once_with( + 'dumping complete') + + +class TestClearStorage: + + def setup_method(self): + self.processed_and_deleted_series = [] + self.processed_series_but_yet_to_delete = ['yet_to_delete'] + + @pytest.fixture(autouse=True) + def mock_series(self, mocker: MockerFixture): + return mocker.patch.multiple( + MetadataExtractor, + processed_series_but_yet_to_delete=self.processed_series_but_yet_to_delete, + processed_and_deleted_series=self.processed_and_deleted_series, + ) + + @pytest.fixture(autouse=True) + def mock_shutil(self, mocker: MockerFixture): + return mocker.patch('MetadataExtractor.shutil') + + def test_file_not_found_error(self, mock_shutil): + + mock_shutil.rmtree.side_effect = FileNotFoundError + + MetadataExtractor.clear_storage() + MetadataExtractor.logging.debug.assert_any_call( + 'The series of id %s was not found. Hence, not deleted in this iteration. Probably it was already deleted in a previous iteration without being tracked or by an external process', + self.processed_series_but_yet_to_delete[0] + ) + + def test_conn_reset_error(self, mock_shutil): + + mock_shutil.rmtree.side_effect = ConnectionResetError + + MetadataExtractor.clear_storage() + MetadataExtractor.logging.debug.assert_any_call( + 'The connection was reset during the deletion') + + def test_os_error(self, mock_shutil): + + mock_shutil.rmtree.side_effect = OSError + + MetadataExtractor.clear_storage() + MetadataExtractor.logging.debug.assert_any_call( + 'New images arriving for the series. Therefore, do not delete the series: %s', + self.processed_series_but_yet_to_delete[0] + ) + + def test_success(self): + MetadataExtractor.clear_storage() + MetadataExtractor.logging.debug.assert_any_call( + 'Deleted: %s %s %s %s', str(1), + ' out of ', + str(len(self.processed_series_but_yet_to_delete)), + ' remaining extraction completed series.' + ) + + +class TestRunThreaded: + + def test_thread_start_success(self, mocker): + stub = mocker.stub(name='job_func_stub') + MetadataExtractor.run_threaded(stub) + stub.assert_called_once()
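
Note on test prerequisites: the tests above reference `pytest.data_dir`, `pytest.out_dir`, and `pytest.create_dirs`, which are not pytest built-ins; they are presumably attached to the `pytest` namespace by the suite's shared `tests/conftest.py`, which is not part of these patches. A minimal sketch of such a conftest, assuming those exact names and repository-root-relative paths, could look like:

```python
# tests/conftest.py -- hypothetical sketch, not part of this patch series.
# The attribute names (data_dir, out_dir, create_dirs) are assumptions
# inferred from how tests/unit/test_meta_extraction.py uses them.
from pathlib import Path

import pytest


def pytest_configure(config):
    # Shared locations the unit tests read as pytest.data_dir / pytest.out_dir.
    # Assumes the suite is run from the repository root, as the README says.
    pytest.data_dir = Path.cwd() / 'tests' / 'data'
    pytest.out_dir = Path.cwd() / 'tests' / 'data' / 'tmp'

    def create_dirs(*dirs):
        # Create each requested directory tree if it does not already exist.
        for d in dirs:
            Path(d).mkdir(parents=True, exist_ok=True)

    pytest.create_dirs = create_dirs
```

With attributes like these in place, the `mock_global_config` fixture can point `STORAGE_FOLDER` and `PICKLE_FOLDER` at directories under the test tree, so `MetadataExtractor` never touches a real storage folder during unit tests.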