# Data Validation

### Table of Contents

- [Imports and Setup](#Imports-and-Setup)
- [Parameters Check](#Parameters-Check)
- [Inputs Check](#Inputs-Check)
- [Outputs Check](#Outputs-Check)
    - [Pre_Processing.py Outputs Check](#Pre_Processing.py-Outputs-Check)
    - [Daily_Input_Read.py Outputs Check](#Daily_Input_Read.py-Outputs-Check)
    - [ClientPharmacyMacOptimization.py Output Check](#CleintPharmacyMacOptimization.py--Output-Check)

Use this notebook to investigate differences between optimization pipeline runs. Note that differences can occur for many reasons, including changes in user input parameters, changes in input data, and changes in code functionality. They can also occur due to differences in the environments where the code is run; for example, a different version of Python or of the Pandas library is likely to lead to some changes in the outputs (such as a different order of Python dictionary outputs, which is never guaranteed anyway). Environment differences may be subtle, but they can have a large impact. For example, case-insensitivity on Windows means that `myfile.csv` and `MyFile.csv` are the same reference; however, they are not the same on Linux, and so this can lead to different results, for example on a Windows laptop vs. a Linux server on GCP.

To set things up, you will need to have two separate runs of the code pipeline, and access to the code, inputs, and outputs from both runs. The notebook is designed to make it easier to compare the resources from these separate runs.

### Imports and Setup

In [None]:
import os, re, shutil, hashlib, difflib
import datetime as dt
import pandas as pd
import numpy as np

from IPython.core.display import HTML

**File Paths**

Paths in cloud storage should start with 'gs://'; paths to files saved on the local VM should start with '/home/'. Note that each path should end with the name of the target directory, without a trailing '/'.

In [None]:
# Code directory of the original run; as written, a cloud-storage ('gs://') location.
orig_code_dir = 'gs://pbm-mac-lp-prod-ai-bucket/LP_MI/validation/UNC'
# Code directory of the run to compare against; as written, a directory on the local VM.
compare_code_dir = '/home/jupyter/clientpharmacymacoptimization/GER_LP_Code'
# Inputs of the original run are taken from the 'Input' folder under its code directory.
orig_input_dir = os.path.join(orig_code_dir, 'Input')
# As written, the comparison run points at the very same input directory as the original run.
compare_input_dir = orig_input_dir
# NOTE(review): both output directories are identical as written, so the output
# checks below compare a run with itself -- point one of them at the other run's outputs.
orig_output_dir = '/home/jupyter/Output'
compare_output_dir = '/home/jupyter/Output'

**Functions**

In [None]:
def _hash_directory(directory, ignore_case=False, ignore_nums_n_dates=False):
    '''
    Return `(hashes, names)` for the regular files directly inside `directory`.

    Both are dicts keyed by the normalized file name: `hashes` maps it to the
    SHA-1 hex digest of the file content, `names` to the file name on disk.
    '''
    hashes, names = {}, {}
    for f in os.listdir(directory):
        fpath = os.path.join(directory, f)
        if not os.path.isfile(fpath):
            continue  # sub-directories are not compared
        key = f
        if ignore_case:
            key = key.lower()
        if ignore_nums_n_dates:
            # Collapse every run of digits (optionally joined by '/' or '-', as in
            # dates) into a single '*'.  Applied to `key`, not `f`, so that it
            # composes with `ignore_case` instead of undoing it.
            key = re.sub(r'\d[\d/-]*', '*', key)
        digest = hashlib.sha1()
        # Read in chunks inside a context manager: the handle is always closed and
        # large files are never loaded into memory in one piece.
        with open(fpath, 'rb') as fh:
            for chunk in iter(lambda: fh.read(1 << 20), b''):
                digest.update(chunk)
        hashes[key] = digest.hexdigest()
        names[key] = f  # keep track of original filename
    return hashes, names


def hashcompare(d1, d2, ignore_case=False, ignore_nums_n_dates=False, verbose=False):
    '''
    Compares the files of two directories (d1 and d2) by their SHA-1 hash.
    This function is used in combination with `compareDFs` to determine if two data frames are the same.

    Only regular files directly inside each directory are hashed.  Files are matched
    between the directories by their (optionally normalized) file name.

    Parameters
    ----------
    d1, d2 : str
        Local directory paths.  They are also used as column names of the result, so
        passing the same path twice yields a single hash column.
    ignore_case : bool
        Match file names case-insensitively (names are lower-cased).
    ignore_nums_n_dates : bool
        Match file names regardless of numbers/dates: every run of digits, '/' and '-'
        that starts with a digit is replaced by '*'.
    verbose : bool
        Print the two hash series before building the comparison table.

    Returns
    -------
    pd.DataFrame
        Indexed by the normalized file name, with columns 'Equal' (hashes match),
        `d1` and `d2` (the hashes), 'filename1' and 'filename2' (names on disk).
        A file present in only one directory has NaN in the other directory's columns
        and 'Equal' set to False.
    '''
    hashes1, names1 = _hash_directory(d1, ignore_case, ignore_nums_n_dates)
    hashes2, names2 = _hash_directory(d2, ignore_case, ignore_nums_n_dates)
    h1 = pd.Series(hashes1, dtype=str)
    h2 = pd.Series(hashes2, dtype=str)
    h1_names = pd.Series(names1, dtype=str)
    h2_names = pd.Series(names2, dtype=str)
    if verbose:
        print(h1, end='\n')
        print(h2)
    # create comparison dataframe; the series are aligned on the normalized file name
    df = pd.DataFrame({d1: h1, d2: h2, 'filename1': h1_names, 'filename2': h2_names})
    df.insert(0, 'Equal', df[d1] == df[d2])
    return df

In [None]:
def compareDFs(compare_df):
    '''
    Uses the output of `hashcompare` as a way to compare the outputs of two different runs.
    It uses `pd.testing.assert_frame_equal` to test if two data frames are equal or not.
    In case they are different it will display the reason why.

    Parameters
    ----------
    compare_df : pd.DataFrame
        Table as returned by `hashcompare`: the first column is 'Equal', followed by the
        two directory columns and the two file-name columns.  When both directories are
        the same path, `hashcompare` produces a single directory column; that layout is
        accepted as well.

    Returns
    -------
    None
        Results are printed.  Entries that are not '.csv'/'.xlsx' files, or that are
        missing in one of the directories, are reported and skipped.
    '''
    cols = list(compare_df.columns[1:])
    if len(cols) == 3:
        # identical directories collapse into one column -- use it for both sides
        cols.insert(0, cols[0])
    dirpath1, dirpath2, fname1, fname2 = cols
    readers = {'.csv': pd.read_csv, '.xlsx': pd.read_excel}
    rule = '-'*80
    for f in compare_df.index:
        name1 = compare_df.loc[f, fname1]
        name2 = compare_df.loc[f, fname2]
        if pd.isna(name1) or pd.isna(name2):
            print(f'Skipping {f}: file is missing in one of the directories')
            continue
        reader = readers.get(os.path.splitext(str(f))[1].lower())
        if reader is None:
            # without this guard the frames of the previous file would be compared again
            print(f'Skipping {f}: not a csv or xlsx file')
            continue
        # build dataframes
        df1 = reader(os.path.join(dirpath1, name1))
        df2 = reader(os.path.join(dirpath2, name2))
        try:
            pd.testing.assert_frame_equal(df1, df2)
            print(rule+'\n'+f'Frames equal for: {f}', end='\n'+rule + '\n')
        except AssertionError as e:
            print(rule+'\n'+f'Frames NOT equal for: {f}')
            print(e, end='\n'+rule + '\n')
            # pandas names the offending column as: (column name="<name>")
            found = re.search(r'column name="([^"]*)"', f'{e}')
            if found is None:
                continue
            var = found.group(1)
            if var not in df1.columns or var not in df2.columns:
                continue
            try:
                # numeric column: show the rows whose values are not (almost) equal
                tf = np.isclose(df1[var], df2[var], equal_nan=True)
                print(df1.loc[~tf, var])
                print(df2.loc[~tf, var])
            except TypeError:
                # non-numeric column: show values of the first frame missing in the second
                print('SET DIFF:', set(df1[var])-set(df2[var]))


### Parameters Check

In [None]:
orig_param_file = os.path.join(orig_code_dir, 'CPMO_parameters.py')
compare_param_file = os.path.join(compare_code_dir, 'CPMO_parameters.py')

# if dirs are cloud storage, download to compare files
if orig_param_file[:5] == 'gs://':
    tstamp = dt.datetime.now().strftime("%Y-%m-%d_%s%f")
    orig_f = f'TEMP_ORIG_params_{tstamp}.py'
    !gsutil cp {orig_param_file} {orig_f}
else:
    orig_f = orig_param_file
if compare_param_file[:5] == 'gs://':
    comp_f = f'TEMP_COMP_params_{tstamp}.py'
    !gsutil cp {compare_param_file} {comp_f}
else:
    comp_f = compare_param_file

In [None]:
!diff --minimal --minimal --ignore-trailing-space --ignore-space-change --suppress-common-lines --color='always' \
    {orig_f} {comp_f}

In [None]:
# Cleanup
if orig_param_file[:5] == 'gs://':
    os.remove(orig_f)
if compare_param_file[:5] == 'gs://':
    os.remove(comp_f)

### Inputs Check

In [None]:
# if dirs are cloud storage, download to compare files
if orig_input_dir[:5] == 'gs://':
    tstamp = dt.datetime.now().strftime("%Y-%m-%d_%s%f")
    orig_d = f'TEMP_ORIG_{tstamp}'
    os.makedirs(orig_d)
    !gsutil -m cp -r {orig_input_dir}/* {orig_d}
else:
    orig_d = orig_input_dir
if compare_input_dir[:5] == 'gs://':
    comp_d = f'TEMP_COMP_{tstamp}'
    os.makedirs(comp_d)
    !gsutil -m cp -r {compare_input_dir}/* {comp_d}
else:
    comp_d = comp_input_dir

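Which input files differ? In the table returned by `hashcompare`, the `Equal` column is `True` when the hashes of the two files are the same, and `False` when they are different (or when the file exists in only one of the directories).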

In [None]:
# Hash every input file of both runs and show, per file, whether the hashes match
inputs_df = hashcompare(d1=orig_d, d2=comp_d)
inputs_df.loc[:, 'Equal']

In [None]:
# If the hashes differ for some files, load the files and report where they differ.
# `not` is used instead of `~`: bitwise inversion of a plain Python bool is always truthy.
if not inputs_df['Equal'].all():
    # drop files that exist in only one directory -- there is nothing to load for them
    compareDFs(inputs_df.dropna())

In [None]:
# cleanup
if 'orig_temp' in locals().keys():
    shutil.rmtree(orig_d)
if 'compare_temp' in locals().keys():
    shutil.rmtree(compare_d)    

### Outputs Check

#### Pre_Processing.py Outputs Check

In [None]:
orig_pp_dir = os.path.join(orig_output_dir, 'Dynamic_Input')
compare_pp_dir = os.path.join(compare_output_dir, 'Dynamic_Input')

if orig_pp_dir[:5] == 'gs://':
    tstamp = dt.datetime.now().strftime("%Y-%m-%d_%s%f")
    orig_pp_d = f'TEMP_ORIG_PREP_{tstamp}'
    os.makedirs(orig_pp_d)
    !gsutil -m cp -r {orig_pp_dir}/* {orig_pp_d}
else:
    orig_pp_d = orig_pp_dir
if compare_pp_dir[:5] == 'gs://':
    comp_pp_d = f'TEMP_COMP_PREP_{tstamp}'
    os.makedirs(comp_pp_d)
    ! gsutil -m cp -r {compare_pp_dir}/* {comp_pp_d}
else:
    comp_pp_d = compare_pp_dir

Compare File Hashes

In [None]:
# Match the files of both runs case-insensitively and hash them
# (ignore_nums_n_dates is left at its default, False)
pp_df = hashcompare(orig_pp_d, comp_pp_d, ignore_case=True)
# Show only the files that are present in both directories
pp_df.dropna(axis=0)

Compare Dataframes

In [None]:
# Load and compare every file that is present in both directories (rows with a missing file are dropped)
compareDFs(pp_df.dropna())

In [None]:
# cleanup
if 'orig_pp_d' in locals().keys():
    shutil.rmtree(orig_pp_d)
if 'compare_pp_d' in locals().keys():
    shutil.rmtree(compare_pp_d)    

#### Daily_Input_Read.py Outputs Check

In [None]:
orig_daily_dir = os.path.join(orig_output_dir, 'Dynamic_Input')
compare_daily_dir = os.path.join(compare_output_dir, 'Dynamic_Input')

if orig_daily_dir[:5] == 'gs://':
    tstamp = dt.datetime.now().strftime("%Y-%m-%d_%s%f")
    orig_daily_d = f'TEMP_ORIG_PREP_{tstamp}'
    os.makedirs(orig_daily_d)
    !gsutil -m cp -r {orig_daily_dir}/* {orig_daily_d}
else:
    orig_daily_d = orig_daily_dir
if compare_daily_dir[:5] == 'gs://':
    comp_daily_d = f'TEMP_COMP_PREP_{tstamp}'
    os.makedirs(comp_daily_d)
    ! gsutil -m cp -r {compare_daily_dir}/* {comp_daily_d}
else:
    comp_daily_d = compare_daily_dir

Compare File Hashes

In [None]:
# Match the files of both runs case-insensitively and hash them
# (ignore_nums_n_dates is left at its default, False)
daily_df = hashcompare(orig_daily_d, comp_daily_d, ignore_case=True)
# Show only the files that are present in both directories
daily_df.dropna(axis=0)

Compare Dataframes

In [None]:
# Load and compare every file that is present in both directories (rows with a missing file are dropped)
compareDFs(daily_df.dropna())

In [None]:
# cleanup
if 'orig_daily_d' in locals().keys():
    shutil.rmtree(orig_daily_d)
if 'comp_daily_d' in locals().keys():
    shutil.rmtree(comp_daily_d)    

#### CleintPharmacyMacOptimization.py  Output Check

In [None]:
orig_out_d = os.path.join(orig_output_dir, 'Output')
comp_out_d = os.path.join(compare_output_dir, 'Output')

if orig_out_d[:5] == 'gs://':
    tstamp = dt.datetime.now().strftime("%Y-%m-%d_%s%f")
    orig_opt_d = f'TEMP_ORIG_OPT_{tstamp}'
    os.makedirs(orig_opt_d)
    !gsutil -m cp -r {orig_out_d}/* {orig_opt_d}
else:
    orig_opt_d = orig_out_d
if comp_out_d[:5] == 'gs://':
    comp_opt_d = f'TEMP_COMP_OPT_{tstamp}'
    os.makedirs(comp_opt_d)
    ! gsutil -m cp -r {comp_out_d}/* {comp_opt_d}
else:
    comp_opt_d = comp_out_d

Compare File Hashes

In [None]:
# Match the files of both runs case-insensitively and hash them
# (ignore_nums_n_dates is left at its default, False); show the full table
opt_df = hashcompare(orig_opt_d, comp_opt_d, ignore_case=True)
opt_df

Compare Dataframes

In [None]:
# Load and compare every file that is present in both directories (rows with a missing file are dropped)
compareDFs(opt_df.dropna())

In [None]:
# cleanup
if 'orig_opt_d' in locals().keys():
    shutil.rmtree(orig_opt_d)
if 'comp_opt_d' in locals().keys():
    shutil.rmtree(comp_opt_d)    