In [1]:
import pandas as pd
import numpy as np
import os
import json
import operator
import math

In [2]:
# Set paths
# The project root is taken to be the parent of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(os.curdir))

# Standard sub-folders, all resolved relative to the project root.
DATA_DIR, RESULTS_DIR, CONFIG_DIR, SOURCE_DIR = (
    os.path.join(BASE_DIR, folder)
    for folder in ('data', 'results', 'config', 'source')
)

In [3]:
# Output from the identify conditions macro
conditions_report_file = os.path.join(RESULTS_DIR, 'conditions_full.csv')

# Read the report, parsing every date-like column up front so the
# comparisons performed later operate on datetimes rather than strings.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which
# suggests the CSV was written together with its index -- confirm whether
# index_col=0 would be more appropriate here.
report_df = pd.read_csv(
    conditions_report_file,
    sep=',',
    quotechar='"',
    header=0,
    index_col=None,
    low_memory=False,
    parse_dates=[
        'birthdate',
        'age_valid_date',
        'coverage_start',
        'coverage_end',
        'study_period_start',
        'study_period_end',
        'chron_diab_date',
        'chron_card_date',
    ],
)

report_df.head()

Unnamed: 0,subject_id,gender,birthdate,age,coverage_start,coverage_end,age_valid_date,study_period_start,study_period_end,valid_participant,chron_diab_date,chron_card_date
0,1000001,1,1960-04-03,60,1970-12-31,1987-09-27,2000-04-03,1995-01-01,2004-12-31,0,NaT,NaT
1,1000002,1,1930-02-13,90,1970-12-31,2020-03-25,1970-02-13,1995-01-01,2004-12-31,1,NaT,NaT
2,1000003,1,1936-06-06,84,1970-12-31,1995-11-15,1976-06-06,1995-01-01,2004-12-31,1,1995-01-10,1995-01-20
3,1000004,1,1960-07-08,60,1987-08-05,2020-03-25,2000-07-08,1995-01-01,2004-12-31,1,2000-07-14,2000-07-23
4,1000005,0,1942-05-15,78,1970-12-31,2020-03-25,1982-05-15,1995-01-01,2004-12-31,1,NaT,NaT


In [4]:
# Config file specifying what tests to perform
test_config_file = os.path.join(CONFIG_DIR, 'config_testing.json')

# Parse the JSON document into `tests`.
with open(test_config_file) as json_file:
    tests = json.loads(json_file.read())

In [5]:
print("Tests include:")

# One indented line per configured test.
for test in tests["testing_items"]:
    print("   ", test["description"], sep="")

Tests include:
   Date of condition is after the coverage start date
   Date of condition is after the study period start date
   Date of condition is after 40 years old
   Date of condition is before the study period end date
   Date of condition is before the coverage end date


In [6]:
def run_test(tests, target_df):
    """Run each configured column-comparison test on ``target_df`` and print the verdict.

    For every test, each column in ``target_vars`` is compared against each
    column in ``compared_vars`` using the configured operator. A row satisfies
    a single comparison when the comparison holds or when the target value is
    missing. A test passes only if every row satisfies every comparison.

    Parameters
    ----------
    tests : iterable of dict
        Each item must provide the keys ``"description"``, ``"target_vars"``,
        ``"compared_vars"`` and ``"operation"`` (``">="`` or ``"<="``).
    target_df : pandas.DataFrame
        Frame containing all columns named in the tests.

    Returns
    -------
    None
        The result of each test is printed as "Passed!" or "Not passed!".

    Raises
    ------
    KeyError
        If a test names an unsupported operation, lacks a required key, or
        references a column that is not present in ``target_df``.
    """
    ops = {
        ">=": operator.ge,
        "<=": operator.le,
    }

    for test in tests:
        op = ops[test["operation"]]

        # Every row starts out passing; each comparison can only turn it off.
        test_result = pd.Series(True, index=target_df.index, dtype=bool)

        for target_var in test["target_vars"]:
            # Read from the frame passed in, not from a notebook-level global.
            missing = target_df[target_var].isna()

            for compared_var in test["compared_vars"]:
                correct = op(target_df[target_var], target_df[compared_var])
                # A missing target value excuses only this comparison; it must
                # never revive a row that already failed for another column.
                test_result &= correct | missing

        print("Testing: ", test["description"])
        print("         ", "Passed!" if test_result.all() else "Not passed!")

In [7]:
# Execute every configured test against the loaded conditions report.
run_test(
    tests=tests["testing_items"],
    target_df=report_df,
)

Testing:  Date of condition is after the coverage start date
          Passed!
Testing:  Date of condition is after the study period start date
          Passed!
Testing:  Date of condition is after 40 years old
          Passed!
Testing:  Date of condition is before the study period end date
          Passed!
Testing:  Date of condition is before the coverage end date
          Passed!
