From 2827aff4ed6b8b28e58207fc7f2dd83f31959369 Mon Sep 17 00:00:00 2001 From: Chinmay Vibhute Date: Thu, 1 Jul 2021 23:05:42 +0530 Subject: [PATCH 1/2] refactor meta extractor for unit tests --- modules/meta-extraction/MetadataExtractor.py | 180 +++++++++++-------- 1 file changed, 104 insertions(+), 76 deletions(-) diff --git a/modules/meta-extraction/MetadataExtractor.py b/modules/meta-extraction/MetadataExtractor.py index 9afb39b..8c98d3e 100644 --- a/modules/meta-extraction/MetadataExtractor.py +++ b/modules/meta-extraction/MetadataExtractor.py @@ -28,59 +28,22 @@ import json -with open('service/system.json', 'r') as f: - niffler = json.load(f) - -# Get constants from system.json -DCM4CHE_BIN = niffler['DCM4CHEBin'] -STORAGE_FOLDER = niffler['StorageFolder'] -FILE_PATH = niffler['FilePath'] -QUERY_AET = niffler['QueryAet'] - -# Global Constants: Configurations and folder locations -EXTRACTION_RUNNING = False -IS_DCM4CHE_NOT_RUNNING = True -logging.basicConfig(level=logging.INFO) - -FEATURES_FOLDER = os.getcwd() + "/conf/" -PICKLE_FOLDER = os.getcwd() + "/pickles/" - - -# Variables to track progress between iterations. -processed_series_but_yet_to_delete = list() -processed_and_deleted_series = list() - - -# Get features of a txt file -features_lists = list() -feature_files = list() - -# All processed series is saved between iterations as pickle files. -try: - with open(PICKLE_FOLDER + 'processed_series_but_yet_to_delete.pickle', 'rb') as f: - processed_series_but_yet_to_delete = pickle.load(f) -except: - logging.info("Unable to load a valid pickle file. Initialized with empty value for processed_series_but_yet_to_delete") - -try: - with open(PICKLE_FOLDER + 'processed_and_deleted_series.pickle', 'rb') as f: - processed_and_deleted_series = pickle.load(f) -except: - logging.info("Unable to load a valid pickle file. Initialized with empty value for processed_and_deleted_series") - -# Read the txt file which includes the features, then extract them -os.chdir(FEATURES_FOLDER) -txt_files = glob.glob('*.txt') -logging.info('Number of files consisting of the features to extract: %s', str(len(txt_files))) - -for file in txt_files: - filename, file_extension = os.path.splitext(file) - text_file = open(file, "r") - feature_list = text_file.read().split('\n') - del feature_list[-1] - features_lists.append(feature_list) - feature_files.append(filename) - +# Define Global Vars +# (Initialization outside of __main__ needed for mocking in unit tests) +# These are overwritten in __main__ +DCM4CHE_BIN = None +STORAGE_FOLDER = None +FILE_PATH = None +QUERY_AET = None +EXTRACTION_RUNNING = None +IS_DCM4CHE_NOT_RUNNING = None +FEATURES_FOLDER = None +PICKLE_FOLDER = None +DB = None +processed_series_but_yet_to_delete = None +processed_and_deleted_series = None +features_lists = None +feature_files = None # Function for getting tuple for field, val pairs for this file # plan is instance of dicom class, the data for single mammo file @@ -133,6 +96,7 @@ def get_dict_fields(bigdict, features): # The core method of extracting metadata def extract(): + global STORAGE_FOLDER os.chdir(STORAGE_FOLDER) if len([name for name in os.listdir(".") if os.path.isdir(name)]) == 0: # Print once if the storage folder is empty. @@ -156,6 +120,7 @@ def extract_metadata(): global EXTRACTION_RUNNING global features_lists global feature_files + global DB extracted_in_this_iteration = 0 first_inst_of_series = list() @@ -221,10 +186,11 @@ def extract_metadata(): # Delete the processed DICOM objects from the storage. 
 def clear_storage():
-    os.chdir(STORAGE_FOLDER)
+    global STORAGE_FOLDER
     global processed_series_but_yet_to_delete
     global processed_and_deleted_series
 
+    os.chdir(STORAGE_FOLDER)
     deleted_in_this_iteration = 0
 
     logging.info('Clean up process initiated at series level at: %s', str(datetime.datetime.now()))
@@ -260,7 +226,8 @@ def clear_storage():
 def update_pickle():
     global processed_series_but_yet_to_delete
     global processed_and_deleted_series
-
+    global PICKLE_FOLDER
+
     # Pickle using the highest protocol available.
     with open(PICKLE_FOLDER + 'processed_series_but_yet_to_delete.pickle', 'wb') as f:
         pickle.dump(processed_series_but_yet_to_delete, f, pickle.HIGHEST_PROTOCOL)
@@ -272,6 +239,7 @@ def update_pickle():
 
 # Measure storage folder disk space utilization
 def measure_diskutil():
+    global STORAGE_FOLDER
     total, used, free = shutil.disk_usage(STORAGE_FOLDER)
     logging.info("Disk space used by the Storage Folder: %d GB" % (used // (2**30)))
 
@@ -279,6 +247,10 @@ def measure_diskutil():
 # Run DCM4CHE only once, when the extraction script starts, and keep it running in a separate thread.
 def run_dcm4che():
     global IS_DCM4CHE_NOT_RUNNING
+    global STORAGE_FOLDER
+    global FILE_PATH
+    global QUERY_AET
+
     if IS_DCM4CHE_NOT_RUNNING:
         IS_DCM4CHE_NOT_RUNNING = False
         logging.info('Starting DCM4CHE..')
@@ -291,31 +263,87 @@ def run_threaded(job_func):
     job_thread.start()
 
 
-logging.info('The execution started at: %s', str(datetime.datetime.now()))
+if __name__ == "__main__":
 
-logging.debug('Debug logs enabled.')
+    with open('service/system.json', 'r') as f:
+        niffler = json.load(f)
+
+    # Set Niffler Config.
+    DCM4CHE_BIN = niffler['DCM4CHEBin']
+    STORAGE_FOLDER = niffler['StorageFolder']
+    FILE_PATH = niffler['FilePath']
+    QUERY_AET = niffler['QueryAet']
 
-# Create connections for communicating with Mongo DB server, to store metadata
-try:
-    if os.environ['MONGO_URI'] != "":
-        mongo_uri = 'mongodb://' + os.environ['MONGO_URI']
-    else:
+
+    # Global Constants: Configurations and folder locations
+    EXTRACTION_RUNNING = False
+    IS_DCM4CHE_NOT_RUNNING = True
+    logging.basicConfig(level=logging.INFO)
+
+    FEATURES_FOLDER = os.getcwd() + "/conf/"
+    PICKLE_FOLDER = os.getcwd() + "/pickles/"
+
+
+    # Variables to track progress between iterations.
+    processed_series_but_yet_to_delete = list()
+    processed_and_deleted_series = list()
+
+
+    # Feature lists parsed from the txt files in FEATURES_FOLDER
+    features_lists = list()
+    feature_files = list()
+
+    # All processed series are saved between iterations as pickle files.
+    try:
+        with open(PICKLE_FOLDER + 'processed_series_but_yet_to_delete.pickle', 'rb') as f:
+            processed_series_but_yet_to_delete = pickle.load(f)
+    except:
+        logging.info("Unable to load a valid pickle file. Initialized with empty value for processed_series_but_yet_to_delete")
+
+    try:
+        with open(PICKLE_FOLDER + 'processed_and_deleted_series.pickle', 'rb') as f:
+            processed_and_deleted_series = pickle.load(f)
+    except:
+        logging.info("Unable to load a valid pickle file. Initialized with empty value for processed_and_deleted_series")
+
+    # Read the txt files that list the features to extract
+    os.chdir(FEATURES_FOLDER)
+    txt_files = glob.glob('*.txt')
+    logging.info('Number of files consisting of the features to extract: %s', str(len(txt_files)))
+
+    for file in txt_files:
+        filename, file_extension = os.path.splitext(file)
+        text_file = open(file, "r")
+        feature_list = text_file.read().split('\n')
+        del feature_list[-1]
+        features_lists.append(feature_list)
+        feature_files.append(filename)
+
+    logging.info('The execution started at: %s', str(datetime.datetime.now()))
+
+    logging.debug('Debug logs enabled.')
+
+    # Create connections for communicating with Mongo DB server, to store metadata
+    try:
+        if os.environ['MONGO_URI'] != "":
+            mongo_uri = 'mongodb://' + os.environ['MONGO_URI']
+        else:
+            mongo_uri = 'mongodb://' + sys.argv[1]
+    except KeyError:
         mongo_uri = 'mongodb://' + sys.argv[1]
-except KeyError:
-    mongo_uri = 'mongodb://' + sys.argv[1]
 
-client = pymongo.MongoClient(mongo_uri)
-DB = client.ScannersInfo
+    client = pymongo.MongoClient(mongo_uri)
+    DB = client.ScannersInfo
 
-# The thread scheduling
-schedule.every(5).minutes.do(run_threaded, run_dcm4che)
-schedule.every(1).hours.do(run_threaded, measure_diskutil)
-schedule.every(1).day.at("23:59").do(run_threaded, clear_storage)
-schedule.every(20).minutes.do(run_threaded, update_pickle)
-schedule.every(10).minutes.do(run_threaded, extract)
+    # The thread scheduling
+    schedule.every(5).minutes.do(run_threaded, run_dcm4che)
+    schedule.every(1).hours.do(run_threaded, measure_diskutil)
+    schedule.every(1).day.at("23:59").do(run_threaded, clear_storage)
+    schedule.every(20).minutes.do(run_threaded, update_pickle)
+    schedule.every(10).minutes.do(run_threaded, extract)
 
-# Keep running in a loop.
-while True:
-    schedule.run_pending()
-    time.sleep(1)
+    # Keep running in a loop.
+    while True:
+        schedule.run_pending()
+        time.sleep(1)

From 2dd50e7ef6fdb6284b89b6ab65d921872fd22b84 Mon Sep 17 00:00:00 2001
From: Chinmay Vibhute
Date: Thu, 1 Jul 2021 23:06:24 +0530
Subject: [PATCH 2/2] meta extraction unit tests

---
 .gitignore                                  |   3 +-
 tests/README.md                             |   9 +-
 tests/data/meta-extraction/input/.gitkeep   |   0
 tests/data/meta-extraction/input/no-img.dcm |   0
 tests/unit/test_meta_extraction.py          | 189 ++++++++++++++++++++
 5 files changed, 198 insertions(+), 3 deletions(-)
 create mode 100644 tests/data/meta-extraction/input/.gitkeep
 create mode 100644 tests/data/meta-extraction/input/no-img.dcm
 create mode 100644 tests/unit/test_meta_extraction.py

diff --git a/.gitignore b/.gitignore
index f4858d0..85fa31e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,5 @@ htmlcov
 coverage.xml
 /tests/data/tmp
 /tests/data/**/*.dcm
-!/tests/data/**/no-img.dcm
\ No newline at end of file
+/tests/data/**/*.pickle
+!/tests/data/**/no-img.dcm
diff --git a/tests/README.md b/tests/README.md
index 3580cfe..504ff51 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -10,12 +10,18 @@ pip install -r requirements-dev.txt
 
 Add the test data in `/tests/data//input` for respective tests.
 
-### PNG Extraction Data Setup
+### PNG Extraction Test Data Setup
 
 Test data in `/tests/data/png-extraction/input`.
 
 For unit tests, add a valid dcm file, with name `test-img.dcm`.
 
+### Meta Extraction Test Data Setup
+
+Test data in `/tests/data/meta-extraction/input`.
+
+For unit tests, add a valid dcm file, with name `test-img.dcm`.
+
 ## Running Tests
 
 Initialize the required data, and run tests from ``.
@@ -31,4 +37,3 @@ pytest ./tests --cov=./modules --cov-report=html ``` and open the `/htmlcov/index.html` - diff --git a/tests/data/meta-extraction/input/.gitkeep b/tests/data/meta-extraction/input/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/meta-extraction/input/no-img.dcm b/tests/data/meta-extraction/input/no-img.dcm new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/test_meta_extraction.py b/tests/unit/test_meta_extraction.py new file mode 100644 index 0000000..52bb649 --- /dev/null +++ b/tests/unit/test_meta_extraction.py @@ -0,0 +1,189 @@ +import glob +import os +import pickle +import pytest +import pydicom +import sys + +from pathlib import Path, PurePath +from pytest_mock import MockerFixture + +niffler_modules_path = Path.cwd() / 'modules' +meta_extraction_path = niffler_modules_path / 'meta-extraction' +sys.path.append(str(meta_extraction_path)) +import MetadataExtractor + + +@pytest.fixture(autouse=True) +def mock_logger(mocker: MockerFixture): + return mocker.patch('MetadataExtractor.logging') + + +@pytest.fixture(autouse=True) +def mock_global_config(mocker: MockerFixture): + STORAGE_FOLDER = pytest.data_dir / 'meta-extraction' / 'storage_folder' + PICKLE_FOLDER = pytest.out_dir / 'meta-extraction' / 'pickles' + pytest.create_dirs(STORAGE_FOLDER, PICKLE_FOLDER) + return mocker.patch.multiple( + MetadataExtractor, + STORAGE_FOLDER=str(STORAGE_FOLDER), + PICKLE_FOLDER=str(PICKLE_FOLDER) + "/" + ) + + +@pytest.fixture +def features_lists(): + features_lists = list() + txt_files = glob.glob( + str(meta_extraction_path / 'conf') + '/*.txt' + ) + for file in txt_files: + text_file = open(file, "r") + feature_list = text_file.read().split('\n') + del feature_list[-1] + features_lists.append(feature_list) + return features_lists + + +@pytest.fixture +def feature_files(): + feature_files = list() + txt_files = glob.glob( + str(meta_extraction_path / 'conf') + '/*.txt' + ) + for file in txt_files: + filename, ext = os.path.splitext(file) + feature_files.append(filename) + return feature_files + + +class TestGetTuples: + test_dcm_file = str( + pytest.data_dir / 'meta-extraction' / 'input' / 'test-img.dcm') + test_valid_plan = pydicom.dcmread(test_dcm_file) + + def test_correct_output(self, features_lists): + for features in features_lists: + first_key = features[0] + tuple_list = MetadataExtractor.get_tuples( + self.test_valid_plan, features) + assert tuple_list[0][0] == first_key + + def test_correct_output_with_key(self, features_lists): + key = "rand_key" + for features in features_lists: + first_key = key + "_" + features[0] + tuple_list = MetadataExtractor.get_tuples( + self.test_valid_plan, features, key=key) + assert tuple_list[0][0] == first_key + + def test_feature_error(self, mock_logger): + invalid_features = ["some_invalid_feature"] + MetadataExtractor.get_tuples(self.test_valid_plan, invalid_features) + MetadataExtractor.logging.debug.assert_called_once() + + # TODO image with feature value of type pydicom.sequence.Sequence + # TODO minor code coverage + + +class TestGetDictFields: + test_dcm_file = str( + pytest.data_dir / 'meta-extraction' / 'input' / 'test-img.dcm') + test_valid_plan = pydicom.dcmread(test_dcm_file) + + def test_success(self, features_lists): + for features in features_lists: + tuple_list = MetadataExtractor.get_tuples( + self.test_valid_plan, features) + dict_fields = MetadataExtractor.get_dict_fields( + dict(tuple_list), features) + assert dict_fields[tuple_list[0][0]] == tuple_list[0][1] + + +class 
TestMeasureDiskUtil: + + def test_measure_diskutil(self, mock_logger, mock_global_config): + MetadataExtractor.measure_diskutil() + MetadataExtractor.logging.info.assert_called_once() + + +class TestUpdatePickle: + + def setup_method(self): + self.processed_series_but_yet_to_delete = [] + self.processed_and_deleted_series = [] + + def test_update_success(self, mocker, mock_logger, mock_global_config): + mocker.patch.multiple( + MetadataExtractor, + processed_series_but_yet_to_delete=self.processed_series_but_yet_to_delete, + processed_and_deleted_series=self.processed_and_deleted_series, + ) + # Function Completes write without error + MetadataExtractor.update_pickle() + MetadataExtractor.logging.debug.assert_called_once_with( + 'dumping complete') + + +class TestClearStorage: + + def setup_method(self): + self.processed_and_deleted_series = [] + self.processed_series_but_yet_to_delete = ['yet_to_delete'] + + @pytest.fixture(autouse=True) + def mock_series(self, mocker: MockerFixture): + return mocker.patch.multiple( + MetadataExtractor, + processed_series_but_yet_to_delete=self.processed_series_but_yet_to_delete, + processed_and_deleted_series=self.processed_and_deleted_series, + ) + + @pytest.fixture(autouse=True) + def mock_shutil(self, mocker: MockerFixture): + return mocker.patch('MetadataExtractor.shutil') + + def test_file_not_found_error(self, mock_shutil): + + mock_shutil.rmtree.side_effect = FileNotFoundError + + MetadataExtractor.clear_storage() + MetadataExtractor.logging.debug.assert_any_call( + 'The series of id %s was not found. Hence, not deleted in this iteration. Probably it was already deleted in a previous iteration without being tracked or by an external process', + self.processed_series_but_yet_to_delete[0] + ) + + def test_conn_reset_error(self, mock_shutil): + + mock_shutil.rmtree.side_effect = ConnectionResetError + + MetadataExtractor.clear_storage() + MetadataExtractor.logging.debug.assert_any_call( + 'The connection was reset during the deletion') + + def test_os_error(self, mock_shutil): + + mock_shutil.rmtree.side_effect = OSError + + MetadataExtractor.clear_storage() + MetadataExtractor.logging.debug.assert_any_call( + 'New images arriving for the series. Therefore, do not delete the series: %s', + self.processed_series_but_yet_to_delete[0] + ) + + def test_success(self): + MetadataExtractor.clear_storage() + MetadataExtractor.logging.debug.assert_any_call( + 'Deleted: %s %s %s %s', str(1), + ' out of ', + str(len(self.processed_series_but_yet_to_delete)), + ' remaining extraction completed series.' + ) + + +class TestRunThreaded: + + def test_thread_start_success(self, mocker): + stub = mocker.stub(name='job_func_stub') + MetadataExtractor.run_threaded(stub) + stub.assert_called_once()
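
Note on test prerequisites: the tests above reference `pytest.data_dir`, `pytest.out_dir`, and `pytest.create_dirs`, which are not pytest built-ins; they are presumably attached to the `pytest` namespace by the suite's shared `tests/conftest.py`, which is not part of these patches. A minimal sketch of such a conftest, assuming those exact names and repository-root-relative paths, could look like:

```python
# tests/conftest.py -- hypothetical sketch, not part of this patch series.
# The attribute names (data_dir, out_dir, create_dirs) are assumptions
# inferred from how tests/unit/test_meta_extraction.py uses them.
from pathlib import Path

import pytest


def pytest_configure(config):
    # Shared locations the unit tests read as pytest.data_dir / pytest.out_dir.
    # Assumes the suite is run from the repository root, as the README says.
    pytest.data_dir = Path.cwd() / 'tests' / 'data'
    pytest.out_dir = Path.cwd() / 'tests' / 'data' / 'tmp'

    def create_dirs(*dirs):
        # Create each requested directory tree if it does not already exist.
        for d in dirs:
            Path(d).mkdir(parents=True, exist_ok=True)

    pytest.create_dirs = create_dirs
```

With attributes like these in place, the `mock_global_config` fixture can point `STORAGE_FOLDER` and `PICKLE_FOLDER` at directories under the test tree, so `MetadataExtractor` never touches a real storage folder during unit tests.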