In [0]:
%run "./NB_Configuration"

In [0]:
# Make `catalog` the current catalog for this Spark session.
# NOTE(review): `catalog` is not defined in this notebook — presumably it is
# supplied by the `%run "./NB_Configuration"` cell above; confirm.
spark.sql(f"use catalog {catalog}")

In [0]:
# Source_list = ['Silver.Financial_Account','Silver.Financial_Account_Party','Silver.Financial_Account_Balance']
# Destination_list = ['crm.Financial_Account','crm.Financial_Account_Party','crm.Financial_Account_Balance']

In [0]:
# Each entry pairs a Silver source table with the CRM table it is validated
# against. The driver loop matches Source_list[i] with Destination_list[i],
# so keeping the pairs side by side makes that positional mapping explicit.
_table_pairs = [
    ("Silver.financial_account_party", "CRM.financial_account_party"),
    ("Silver.service_xref", "CRM.service_xref"),
    ("Silver.customer_master", "CRM.account"),
    ("Silver.financial_account_address", "CRM.financial_account_address"),
    ("Silver.financial_account", "CRM.financial_account"),
    ("Silver.branch", "CRM.branch"),
    ("Silver.financial_account_service", "CRM.financial_account_service"),
    ("Silver.financial_account_fee", "CRM.financial_account_fee"),
    ("Silver.product", "CRM.product"),
    ("Silver.financial_account_balance", "CRM.financial_account_balance"),
]

### Silver Tables
Source_list = [source_name for source_name, _ in _table_pairs]

### CRM Tables
Destination_list = [destination_name for _, destination_name in _table_pairs]

In [0]:
def columns_match(source_table, destination_table, source_table_info, destination_table_info):
    """Check whether two tables expose exactly the same set of column names.

    Args:
        source_table: Name of the source table (used only for reporting).
        destination_table: Name of the destination table (used only for reporting).
        source_table_info: DataFrame-like object with a ``column_name`` column
            describing the source table.
        destination_table_info: Same, for the destination table.

    Returns:
        A one-element list holding the result tuple
        ``(source_table, destination_table, 'Column Existence Check', 'NA',
        n_only_in_source, n_only_in_destination, status, description)``.
        ``status`` is True only when neither side has a column the other lacks.
    """
    def distinct_names(table_info):
        rows = table_info.select("column_name").distinct().collect()
        return {row["column_name"] for row in rows}

    source_names = distinct_names(source_table_info)
    destination_names = distinct_names(destination_table_info)

    only_in_source = source_names - destination_names
    only_in_destination = destination_names - source_names

    # The check passes only when both differences are empty.
    same_columns = not only_in_source and not only_in_destination
    if same_columns:
        description = f"Columns in {source_table} are present in {destination_table}"
    else:
        description = f"Columns in {source_table} are not present in {destination_table}"

    return [(
        source_table,
        destination_table,
        'Column Existence Check',
        'NA',
        len(only_in_source),
        len(only_in_destination),
        same_columns,
        description,
    )]

In [0]:
# SQL templates, keyed by lower-cased Spark type name, that count the rows in
# which a column holds a "populated" value. Placeholders: {system1} (label
# echoed in the System column), {table1}, {column_name}.
#   * string            -> not NULL, not '' and not a single space
#   * timestamp / date  -> not NULL
#   * numeric types     -> not NULL and strictly greater than zero
_populated_not_null = (
    "select '{system1}' as System, count(*) as {column_name}_count "
    "from {table1} where {column_name} is not null"
)
_populated_positive = _populated_not_null + " and {column_name} >0"

datatype_dict = {
    'string': _populated_not_null + " and {column_name} != '' and {column_name} != ' ' ",
    'timestamp': _populated_not_null,
    'date': _populated_not_null,
    'integer': _populated_positive,
    'float': _populated_positive,
    'double': _populated_positive,
    'decimal': _populated_positive,
}

In [0]:
# SQL templates, keyed by lower-cased Spark type name, that count the rows in
# which a column is "empty". Placeholders: {system} (label echoed in the
# System column), {table}, {column_name}.
#   * string            -> NULL, '' or a single space
#   * timestamp / date  -> NULL
#   * numeric types     -> NULL or zero
# The alternatives are joined with OR: a value cannot be NULL and equal to
# something at the same time, so an AND-joined predicate never matches any row
# and the count would always be 0.
blanks_nulls = {'string':'''select '{system}' as System, count(*) as {column_name}_count from {table} where {column_name} is  null or {column_name} == '' or {column_name} == ' ' ''',
'timestamp':  '''select '{system}' as System, count(*) as {column_name}_count from {table} where {column_name} is null''',
'date':  '''select '{system}' as System, count(*) as {column_name}_count from {table} where {column_name} is  null''',
'integer': '''select '{system}' as System, count(*) as {column_name}_count from {table} where {column_name} is null or {column_name} ==0''',
'float': '''select '{system}' as System, count(*) as {column_name}_count from {table} where {column_name} is null or {column_name} ==0''',
'double': '''select '{system}' as System, count(*) as {column_name}_count from {table} where {column_name} is null or {column_name} ==0''',
'decimal': '''select '{system}' as System, count(*) as {column_name}_count from {table} where {column_name} is null or {column_name} ==0'''}

In [0]:
def datavalidation(source_table, destination_table, source_table_info, destination_table_info):
    """Compare, column by column, how many populated values each table holds.

    For every destination column that also exists in the source, the
    ``datatype_dict`` template for the column's type is run against both
    tables and the two counts are compared for exact equality.

    Args:
        source_table: Name of the source table.
        destination_table: Name of the destination table.
        source_table_info: DataFrame-like object with a ``column_name`` column
            describing the source table.
        destination_table_info: Same, for the destination table.

    Returns:
        A list of result tuples
        ``(source_table, destination_table, 'Data Validation', column,
        source_count, destination_count, status, description)``. Columns that
        are missing from the source or whose type has no template are reported
        with ``None`` counts and ``status`` False.
    """
    findings = []

    def record(column, source_value, destination_value, status, description):
        findings.append((
            source_table, destination_table, 'Data Validation', column,
            source_value, destination_value, status, description,
        ))

    source_columns = [
        row["column_name"]
        for row in source_table_info.select("column_name").distinct().collect()
    ]
    destination_columns = [
        row["column_name"]
        for row in destination_table_info.select("column_name").distinct().collect()
    ]

    source_types = {field.name: field.dataType for field in spark.table(source_table).schema}
    destination_types = {field.name: field.dataType for field in spark.table(destination_table).schema}

    for col in destination_columns:
        if col not in source_columns:
            record(col, None, None, False, f"Column {col} not found in {source_table}")
            continue

        # e.g. "DecimalType(10,2)" -> "decimal": the text before "Type", lower-cased.
        source_key = str(source_types[col]).split('Type')[0].lower()
        destination_key = str(destination_types[col]).split('Type')[0].lower()

        if not (source_key in datatype_dict and destination_key in datatype_dict):
            record(col, None, None, False, f"Data type for column {col} is not supported")
            continue

        source_query = datatype_dict[source_key].format(system1='source', column_name=col, table1=source_table)
        destination_query = datatype_dict[destination_key].format(system1='destination', column_name=col,table1=destination_table)

        # Each query returns one row: (System label, count).
        source_count = spark.sql(source_query).collect()[0][1]
        destination_count = spark.sql(destination_query).collect()[0][1]

        if source_count == destination_count:
            record(col, source_count, destination_count, True,
                   f"Row count for column {col} matches between {source_table} and {destination_table}")
        else:
            record(col, source_count, destination_count, False,
                   f"Row count for column {col} does not match between {source_table} and {destination_table}")

    return findings

In [0]:
def _within_tolerance(source_count, destination_count, tolerance):
    """Return True when the counts differ by at most `tolerance` (a fraction) of the source count."""
    return abs(destination_count - source_count) <= tolerance * source_count


def datavalidation_nulls_blanks(source_table, destination_table, source_table_info, destination_table_info, tolerance=0.10):
    """Compare populated and blank/null/zero counts per column, with a tolerance.

    For every destination column that also exists in the source, two pairs of
    counts are collected (the ``datatype_dict`` "populated" query and the
    ``blanks_nulls`` "empty" query, each run on both tables) and every pair
    yields one result row.

    A pair passes when the destination count is within ``tolerance`` of the
    source count in EITHER direction. A one-sided test
    (``destination >= 0.9 * source``) would also pass a destination holding
    far more rows, or far more blanks/nulls, than the source.

    Args:
        source_table: Name of the source table.
        destination_table: Name of the destination table.
        source_table_info: DataFrame-like object with a ``column_name`` column
            describing the source table.
        destination_table_info: Same, for the destination table.
        tolerance: Maximum allowed difference as a fraction of the source
            count (default 0.10, i.e. 10%). With a source count of 0 only a
            destination count of 0 passes.

    Returns:
        A list of result tuples
        ``(source_table, destination_table, 'Data Validation', column,
        source_count, destination_count, status, description)``. Columns that
        are missing from the source or whose type has no template are reported
        once, with ``None`` counts and ``status`` False.
    """
    result_df = []

    # Only membership is needed on the source side, so a set is enough.
    source_columns = {
        row["column_name"]
        for row in source_table_info.select("column_name").distinct().collect()
    }
    destination_columns = [
        row["column_name"]
        for row in destination_table_info.select("column_name").distinct().collect()
    ]

    source_columns_types = {field.name: field.dataType for field in spark.table(source_table).schema}
    destination_columns_types = {field.name: field.dataType for field in spark.table(destination_table).schema}

    def run_count(query):
        # Each template returns a single row: (System label, count).
        return spark.sql(query).collect()[0][1]

    for col in destination_columns:
        if col not in source_columns:
            description = f"Column {col} not found in {source_table}"
            result_df.append((source_table, destination_table, 'Data Validation', col, None, None, False, description))
            continue

        # e.g. "DecimalType(10,2)" -> "decimal": the text before "Type", lower-cased.
        source_type_str = str(source_columns_types[col]).split('Type')[0].lower()
        destination_type_str = str(destination_columns_types[col]).split('Type')[0].lower()

        # Both template dictionaries are indexed below, so both must know the type.
        supported = all(
            key in datatype_dict and key in blanks_nulls
            for key in (source_type_str, destination_type_str)
        )
        if not supported:
            description = f"Data type for column {col} is not supported"
            result_df.append((source_table, destination_table, 'Data Validation', col, None, None, False, description))
            continue

        source_count = run_count(
            datatype_dict[source_type_str].format(system1='source', column_name=col, table1=source_table)
        )
        destination_count = run_count(
            datatype_dict[destination_type_str].format(system1='destination', column_name=col, table1=destination_table)
        )
        source_blank_null_count = run_count(
            blanks_nulls[source_type_str].format(system='source', column_name=col, table=source_table)
        )
        destination_blank_null_count = run_count(
            blanks_nulls[destination_type_str].format(system='destination', column_name=col, table=destination_table)
        )

        comparisons = (
            ("", source_count, destination_count),
            (" blank, null, and zero", source_blank_null_count, destination_blank_null_count),
        )
        for label, source_value, destination_value in comparisons:
            passed = _within_tolerance(source_value, destination_value, tolerance)
            verdict = "matches" if passed else "does not match"
            description = f"Row count for column {col}{label} {verdict} between {source_table} and {destination_table}"
            result_df.append((source_table, destination_table, 'Data Validation', col, source_value, destination_value, passed, description))

    return result_df

In [0]:
def decimal_quality(source_type, destination_type):
    """Report whether either type is a decimal with precision below 6.

    Args:
        source_type: Data type object exposing ``typeName()`` (and
            ``precision`` when it is a decimal), or None.
        destination_type: Same, for the destination column.

    Returns:
        True if at least one of the two types is a decimal whose precision is
        less than 6; False otherwise. A None type is never flagged.
    """
    for candidate in (source_type, destination_type):
        if candidate is None:
            continue
        # `precision` is only read once the type is known to be a decimal.
        if candidate.typeName() == 'decimal' and candidate.precision < 6:
            return True
    return False

In [0]:
def DataTypeComparison(source_table, destination_table):
    """Compare the data type of every destination column with the source.

    Args:
        source_table: Name of the source table.
        destination_table: Name of the destination table.

    Returns:
        A list of result tuples
        ``(source_table, destination_table, 'Data Type Check', column,
        source_type, destination_type, status, description)``. Each destination
        column yields one row (missing in source / type mismatch / match) and
        an additional failing row when ``decimal_quality`` flags either type.
    """
    # Column name -> data type, for each side.
    source_types = {field.name: field.dataType for field in spark.table(source_table).schema}
    destination_types = {field.name: field.dataType for field in spark.table(destination_table).schema}

    findings = []
    for col in destination_types:
        dest_type = destination_types[col]
        source_type = source_types.get(col)

        if source_type is None:
            shown_source, status = None, False
            description = f"{col} is not present in {source_table}"
        elif source_type != dest_type:
            shown_source, status = str(source_type), False
            description = f"Data Type in {source_table}.{col} is not matching with {destination_table}.{col}"
        else:
            shown_source, status = str(source_type), True
            description = "Data Type Matched"
        findings.append((
            source_table, destination_table, 'Data Type Check', col,
            shown_source, str(dest_type), status, description,
        ))

        if decimal_quality(source_type, dest_type):
            # This row always stringifies the source type, so a missing
            # source column is reported as the text 'None' here.
            findings.append((
                source_table, destination_table, 'Data Type Check', col,
                str(source_type), str(dest_type), False,
                "Source and/or Destination Decimal Type has bad precision",
            ))

    return findings

In [0]:
result_list = []

# The two lists are paired by position, so a length mismatch means the
# source -> destination mapping is broken. Fail up front rather than hitting
# an IndexError part-way through or silently ignoring trailing tables.
if len(Source_list) != len(Destination_list):
    raise ValueError(
        f"Source_list has {len(Source_list)} tables but Destination_list has {len(Destination_list)}"
    )


def _information_schema_columns(qualified_table):
    """Return the information_schema.columns rows for a `schema.table` name (matched in lower case)."""
    name_parts = qualified_table.split('.')
    return spark.sql(
        f"SELECT table_schema, table_name, column_name, full_data_type FROM information_schema.columns WHERE table_schema = '{name_parts[0].lower()}' AND table_name = '{name_parts[1].lower()}'"
    )


def _as_text(value):
    """Render a count or type value as text, keeping None as None."""
    return None if value is None else str(value)


for source_table, destination_table in zip(Source_list, Destination_list):
    # e.g. source_table = silver.customer_master, destination_table = crm.account

    source_column_df = _information_schema_columns(source_table)
    destination_column_df = _information_schema_columns(destination_table)

    #---------Column existence check---------
    column_existence_check = columns_match(source_table, destination_table, source_column_df, destination_column_df)
    result_list.extend(column_existence_check)

    #--------- Data Type Check ----------
    data_type_check = DataTypeComparison(source_table, destination_table)
    result_list.extend(data_type_check)

    #--------- Data validation ----------
    # Populated and blank/null/zero counts per column; the pass/fail tolerance
    # is applied inside datavalidation_nulls_blanks.
    data_validation = datavalidation_nulls_blanks(source_table, destination_table, source_column_df, destination_column_df)
    result_list.extend(data_validation)

# Every element of result_list is an 8-tuple, but positions 4 and 5 hold
# integer counts for some checks and type names (or None) for others. Schema
# inference over such mixed values cannot settle on one column type, so the
# two value columns are normalized to text and the schema is declared
# explicitly instead of inferred.
result_rows = [
    row[:4] + (_as_text(row[4]), _as_text(row[5])) + row[6:]
    for row in result_list
]

result_df = spark.createDataFrame(
    result_rows,
    schema=(
        "Source_Table string, Destination_Table string, Check_Type string, "
        "Column_Name string, Source_DataType string, Destination_DataType string, "
        "Status boolean, Description string"
    ),
)

# The processing timestamp is added, and the rows are appended to
# config.regression_testresults, in the next cell.

In [0]:
from pyspark.sql.functions import current_timestamp

# Tag every result row with current_timestamp(), then append the batch to the
# results table. Re-running this cell appends the same rows again.
result_df = result_df.withColumn("current_processed_date", current_timestamp())
(
    result_df.write
    .mode("append")
    .saveAsTable("config.regression_testresults")
)