# Data Validation

Compare the data produced by the classic and modern pipelines. The current scope of validation is to ensure completeness and integrity. 

In [6]:
import os
from getpass import getpass

import dataframe_image as dfi
import numpy as np
import pandas as pd
import pyodbc
import sqlalchemy as sa
from sqlalchemy.engine import URL

In [38]:
# Connection settings. Each credential is read from an environment variable
# when it is set and prompted for otherwise, so nothing secret is saved in the
# notebook itself.
host = os.environ.get("RDB_HOST") or input("SQL Server host: ")  # Provide host
user = os.environ.get("RDB_USER") or input("SQL Server username: ")  # Provide username
password = os.environ.get("RDB_PASSWORD") or getpass("SQL Server password: ")  # Provide password
database = 'rdb_modern'

In [8]:
# SQLAlchemy engine for SQL Server (pyodbc driver).
# The credentials are passed as separate URL fields instead of being
# concatenated into a raw ODBC string, so a password containing ';' or braces
# cannot corrupt the connection string.
connection_url = URL.create(
    "mssql+pyodbc",
    username=user,
    password=password,
    host=host,
    database=database,
    query={"driver": "ODBC Driver 17 for SQL Server"},
)
engine = sa.create_engine(connection_url)

In [9]:
# Fetch the same patient's HEPATITIS_DATAMART rows from the modern and the
# classic database. The UID is defined once and sent as a bound parameter so
# both queries are guaranteed to target the same patient.
PATIENT_UID = '101831376'

with engine.begin() as conn:
    sql_rdb_mod = pd.read_sql_query(
        sa.text("Select * from rdb_modern.dbo.HEPATITIS_DATAMART hd where hd.PATIENT_UID  = :patient_uid"),
        conn,
        params={"patient_uid": PATIENT_UID},
    )
    sql_rdb = pd.read_sql_query(
        sa.text("Select * from rdb.dbo.HEPATITIS_DATAMART hd where hd.PATIENT_UID  = :patient_uid"),
        conn,
        params={"patient_uid": PATIENT_UID},
    )


# Highlight Dataframe

In [10]:
# Put the classic rows (key 'Q1') and the modern rows (key 'Q2') side by side
# under a two-level column index.
df_combine = pd.concat(
    [sql_rdb, sql_rdb_mod],
    keys=['Q1', 'Q2'],
    axis=1,
)

In [11]:
# Swap the two column levels so the field name becomes the outer level, then
# select every field except the first column of sql_rdb.
df_axis = df_combine.swaplevel(i=-2, j=-1, axis=1)[sql_rdb.columns[1:]]

In [2]:
# Display the combined Q1/Q2 comparison frame.
df_axis

In [12]:
# Replace missing values with the literal marker 'NAN' (presumably so that two
# missing cells compare as equal in the highlight step; NaN != NaN otherwise).
df_axis = df_axis.fillna('NAN')

In [13]:
def highlight_id(data, color='yellow'):
    """Return the IDs of the rows in which any cell differs from its 'Q1' value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with MultiIndex columns whose last level contains the label
        'Q1'; the 'Q1' columns are the reference side of the comparison.
    color : str, default 'yellow'
        Accepted for signature compatibility; it has no influence on which
        rows are reported.

    Returns
    -------
    list
        Values of index level 0 for every row holding at least one mismatch.
    """
    reference = data.xs('Q1', axis='columns', level=-1)
    # Compare every column with the Q1 column that shares its level-0 label.
    mismatch = data.ne(reference, level=0)
    # Use the boolean mask itself to pick rows. Searching the rendered CSS for
    # the literal 'yellow' returned nothing for any other colour.
    rows_with_diff = mismatch.any(axis=1).to_numpy()
    return data.index[rows_with_diff].get_level_values(0).to_list()

# Restrict the comparison frame to the rows that highlight_id reports.
id_l = highlight_id(df_axis)
has_diff = df_axis.index.get_level_values(0).isin(id_l)
df = df_axis[has_diff]

df

In [14]:
def highlight_diff(data, color='yellow'):
    """Build the CSS frame that shades every cell differing from its 'Q1' value.

    Intended to be passed to ``Styler.apply(..., axis=None)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with MultiIndex columns whose last level contains the label
        'Q1'; the 'Q1' columns are the reference side of the comparison.
    color : str, default 'yellow'
        CSS colour used as the background of mismatching cells.

    Returns
    -------
    pandas.DataFrame
        'background-color: <color>' for mismatching cells and '' elsewhere,
        limited to the rows that contain at least one mismatch.
    """
    attr = 'background-color: {}'.format(color)
    reference = data.xs('Q1', axis='columns', level=-1)
    # Compare every column with the Q1 column that shares its level-0 label.
    mismatch = data.ne(reference, level=0)
    styles = pd.DataFrame(np.where(mismatch, attr, ''),
                          index=data.index, columns=data.columns)
    # Filter rows with the mask, not by searching for the literal 'yellow',
    # so that any requested colour is honoured.
    return styles[mismatch.any(axis=1).to_numpy()]

# Render df with a background colour on every cell that differs from its Q1 value.
df.style.apply(highlight_diff, axis=None)

# Column View

View only the differences side by side. 
Identify differences between two pandas DataFrames

In [24]:
def diff_func(df1, df2):
    """List the cells that differ between two identically-labelled DataFrames.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Reference frame; its values are reported in the 'rdb' column.
    df2 : pandas.DataFrame
        Frame compared against df1; its values are reported in the
        'rdb_modern' column. It is cast to df1's dtypes when they differ.

    Returns
    -------
    pandas.DataFrame or None
        None when ``df1.equals(df2)``. Otherwise one row per differing cell,
        indexed by ('ID', 'Column') = (row label, column label). Cells that
        are null in both frames are not reported.

    Raises
    ------
    AssertionError
        If the two frames do not have the same column labels.
    """
    # Index.equals also handles frames with a different number of columns;
    # an element-wise == would raise ValueError there instead of failing the
    # assertion with its message.
    assert df1.columns.equals(df2.columns), \
        "DataFrame column names are different"
    if (df1.dtypes != df2.dtypes).any():
        # This message used to be a bare string expression and was never shown.
        print("Data Types are different, trying to convert")
        df2 = df2.astype(df1.dtypes)
    if df1.equals(df2):
        return None

    # A cell differs when the values are unequal and not both null.
    diff_mask = (df1 != df2) & ~(df1.isnull() & df2.isnull())
    row_pos, col_pos = np.where(diff_mask)
    # Build the (row label, column label) index from the same positions that
    # are used to pick the values, so labels and values cannot get out of step.
    locations = pd.MultiIndex.from_arrays(
        [df1.index[row_pos], df1.columns[col_pos]],
        names=['ID', 'Column'],
    )
    return pd.DataFrame({'rdb': df1.values[row_pos, col_pos],
                         'rdb_modern': df2.values[row_pos, col_pos]},
                        index=locations)

In [25]:
# Cell-by-cell differences between the classic (rdb) and modern (rdb_modern) rows.
df = diff_func(df1=sql_rdb, df2=sql_rdb_mod)

In [15]:
# Number of differing cells; diff_func returns None when the frames are identical.
0 if df is None else len(df)

In [23]:
# diff_func returns None when the frames are identical, so guard before formatting.
print("No differences found" if df is None else df.to_string())

                                                 rdb                  rdb_modern
ID Column                                                                       
0  HEP_MEDS_RECVD_IND                        Unknown                        None
   CNTRY_USUAL_RESIDENCE               UNITED STATES                        None
   HEP_CONTACT_IND                           Unknown                        None
   HEP_CONTACT_EVER_IND                      Unknown                        None
   STREET_DRUG_INJECTED                      Unknown                        None
   STREET_DRUG_USED                          Unknown                        None
   SEX_PREF                                  Unknown                        None
   INV_CASE_STATUS                        Not a Case                    Probable
   HCV_RNA                                  Positive                        None
   HCV_RNA_DT                             2021-01-03                        None
   PREV_NEG_HEP_TEST_IND    

In [69]:
##Print log file 

## Case 2: New UIDs
### TODO: UIDs that have been introduced by ELR. Not ingested by classic pipeline yet. 