Import the required packages and modules

Installing Requirements

In [43]:
#Connector Packages:
from snowflake.snowpark.session import Session
import openpyxl

#ML Packages:
from config import *
import numpy as np
import pandas as pd
from scipy.stats import norm
from datetime import datetime

#Other Packages:
import os

Initialising the required variables

In [44]:
summary = {}
excel_columns = ["COLUMN_NAME",]

db_name = input("DATABASE NAME : ").upper()
schema_name = input("SCHEMA NAME : ").upper()

workbook = openpyxl.Workbook()

Connect with the Snowflake to read the data

In [45]:
def snowpark_session():
  conn_params = {
      "account" : "pr91731-production_northeurope",
      "user" : input("Username : "),
      "password" : input("Password : "),
      "database" : db_name,     
      "role" : "DATAENGINEER",
      "warehouse" : "INGESTION_WH"
  }

  session = Session.builder.configs(conn_params).create()

  return session

In [46]:
snowpark = snowpark_session()

In [47]:
def write_to_excel_file(worksheet,summary):
    row = 2
    
    #Adding heading
    for excel_column_name in excel_columns:
        # print(excel_column_name,end=" ")
        worksheet.cell(row = 1, column = excel_columns.index(excel_column_name)+1, value = excel_column_name)
    # print()
    for column in summary.keys():
        # print(column,end=" > ")
        worksheet.cell(
            row = row, 
            column = 1,
            value = column.lower()
        )
        
        calculations = summary[column]
        
        for metric in calculations.keys():
            column = excel_columns.index(metric) + 1
            # print("\n\t",column," - ",calculations[metric],end="")
            if(isinstance(calculations[metric],str)):
                cal = calculations[metric].lower()
            else:
                cal = calculations[metric]
            
            worksheet.cell(
                row = row, 
                column = column, 
                value = cal
            )
        row += 1
        # print()
    return worksheet

In [48]:
def show(report,indent = 0,start_delimiter = "[",end_delimiter = "]",seperator = " -> "):    
    if(isinstance(report,list)):
        val = ""
        val += start_delimiter
        for value in report:
            if(report.index(value) == len(report)-1):
                val += str(value)
            else:
                val += str(value)+", "
        val+= end_delimiter
        print(indent+val)
    
    elif(isinstance(report,str)):
        print('"'+report+'"')
    
    
    elif(isinstance(report,dict)):
        for key,value in report.items():
            space = indent*'\t'
            print(f"{space}{key} {seperator} ",end="")
            if(isinstance(value,dict) or isinstance(value,list) or isinstance(value,tuple) or isinstance(value,set)):
                indent += 1
                print("")
                show(indent = indent,report = value)
                indent -=1
            elif(isinstance(value,str)):
                show(indent = indent,report = value)
            else:
                print(f"{value}")

In [49]:
#
def data_quality_check(table_name):
  try:
    #collecting the data structure(Datatype)/schematics of the table for identifying the 
    ddl = snowpark.sql(f"DESCRIBE TABLE {db_name}.{schema_name}.{table_name}").collect()
    data = snowpark.sql(f'SELECT * FROM {db_name}.{schema_name}.{table_name}')
    data.show()

    for row in ddl: 
      
      #Initialising column name for future reference
      col = '"'+row["name"]+'"'
      summary[col] = {}
      
      print(data[col])
      
      #FINDING THE DATA TYPE OF THE COLUMN
      summary[col]["DATA_TYPE"] = row["type"]
      if("DATA_TYPE" not in excel_columns):
        excel_columns.append("DATA_TYPE")
      
      #FINDING THE NULL COUNT IN EACH COLUMN
      summary[col]["NULL_COUNT"] = data.filter(data[col].isNull()).count()
      if("NULL_COUNT" not in excel_columns):
        excel_columns.append("NULL_COUNT")
      
      #FINDING THE TOTAL COUNT IN EACH COLUMN
      summary[col]["TOTAL_COUNT"] = data.count()
      if("TOTAL_COUNT" not in excel_columns):
        excel_columns.append("TOTAL_COUNT")
      
      #FINDING THE NOT-NULL COUNT IN EACH COLUMN
      summary[col]["NOT_NULL_COUNT"] = summary[col]["TOTAL_COUNT"] - summary[col]["NULL_COUNT"]
      if("NOT_NULL_COUNT" not in excel_columns):
        excel_columns.append("NOT_NULL_COUNT")
      
      #FINDING THE UNIQUE(DISTINCT) VALUE COUNT IN EACH COLUMN
      summary[col]["UNIQUE_COUNT"] = data.select(col).distinct().count() - 1
      if("UNIQUE_COUNT" not in excel_columns):
        excel_columns.append("UNIQUE_COUNT")
        
      #FINDING THE REPEARING/DUPLICATE VALUE COUNT IN EACH COLUMN EXCLUDING THE UNIQUE COUNT
      summary[col]["DUPLICATE"] = summary[col]["NOT_NULL_COUNT"] - summary[col]["UNIQUE_COUNT"]
      if("DUPLICATE" not in excel_columns):
        excel_columns.append("DUPLICATE")
      
      if("NUMBER" in summary[col]["DATA_TYPE"] or "FLOAT" in summary[col]["DATA_TYPE"]):
        try:
          quartile = data.approx_quantile(col,[0.25,0.75])

          summary[col]["Q1"] = Q1 = quartile[0]
          if("Q1" not in excel_columns):
            excel_columns.append("Q1")
          summary[col]["Q3"] = Q3 = quartile[1]
          if("Q3" not in excel_columns):
            excel_columns.append("Q3")

          summary[col]["IQR"] = IQR = Q3-Q1
          if("IQR" not in excel_columns):
            excel_columns.append("IQR")

          summary[col]["LOWER_LIMIT"] = lower_limit = Q1 - 1.5 * IQR
          if("LOWER_LIMIT" not in excel_columns):
            excel_columns.append("LOWER_LIMIT")
          summary[col]["UPPER_LIMIT"] = upper_limit = Q3 + 1.5 * IQR
          if("UPPER_LIMIT" not in excel_columns):
            excel_columns.append("UPPER_LIMIT")

          summary[col]["OUTLIERS"] = outliers = data.filter((data[col] < lower_limit) | (data[col] > upper_limit)).count()
          if("OUTLIERS" not in excel_columns):
            excel_columns.append("OUTLIERS")
        
        except Exception as e:
          print("Error in calculating the Number data type",e)
          
      elif("BOOLEAN" in summary[col]["DATA_TYPE"]):
        try:
          if(summary[col]["UNIQUE_COUNT"] > 2):
            summary[col]["MISCELLANEOUS_COUNT"] = summary[col]["UNIQUE_COUNT"] - 2
            if("MISCELLANEOUS_COUNT" not in excel_columns):
              excel_columns.append("MISCELLANEOUS_COUNT")
          
          else:
            summary[col]["MISCELLANEOUS_COUNT"] = 0
            if("MISCELLANEOUS_COUNT" not in excel_columns):
              excel_columns.append("MISCELLANEOUS_COUNT")
        
        except Exception as e:
          print("Error in calculating the Boolean data type",e)

  except Exception as e:
    print("Error in calculating common data types - ",e)
  
  return summary

In [50]:
all_tables = snowpark.sql(f"SHOW TABLES IN {db_name}.{schema_name}").collect()

table_names = [row["name"] for row in all_tables]

download_folder_path = os.path.expanduser("~" + os.path.sep + "Downloads")
report_path = os.path.join(download_folder_path, 'Data_Quality_Report.xlsx')

for table_name in table_names:
    if(table_name == 'COHORT_STAGING_MASTER_DATA'):
        
        if(len(workbook.sheetnames) == 1):
            new_worksheet = workbook.active
            new_worksheet.title = table_name
        else:
            new_worksheet = workbook.create_sheet(title = table_name)
        
        report_summary = data_quality_check(table_name)
    
    # show(report = report_summary)
    
    new_worksheet = write_to_excel_file(new_worksheet,report_summary,)
    
downloads_folder = os.path.expanduser("~" + os.path.sep + "Downloads")
file_path = os.path.join(downloads_folder,'Data_Quality_Report_'+schema_name+"("+str(datetime.now().strftime("%d-%m-%Y"))+').xlsx')
workbook.save(file_path)

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"Company"                                  |"Client Account"                      |"SF/FF Client Account ID"  |"Entity Name"                               |"FF Entity Code"  |"Customer"  |"Client Group Name"  |"Region"    |"GLA"                           |"Acquired date"  |"DIM 1"  |"Year"  |"Currency"  |"Jan"     |"Feb"     |"Mar"     |"Apr"  |"May"  |"Jun"  |"Jul"  |"Aug"  |"Sep"  |"Oct"  |"Nov"  |"Dec"  |"Total Revenue"  |"CompanyCode"  |
----------------------------------------------------------------------------------------------------------

Validation

In [51]:
#Validation
from openpyxl import load_workbook

workbook = load_workbook(file_path)

# Get the number of sheets
number_of_sheets = len(workbook.sheetnames)

snowpark.sql("USE {db_name};")
snowflake_tables = list(snowpark.sql("SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = '{schema_name}';").collect())

excel_sheets = list(workbook.sheetnames)

print(len(snowflake_tables),len(excel_sheets))

print("Worksheet not created for : ",set(snowflake_tables)-set(excel_sheets))
print("Unwanted worksheet :", set(excel_sheets)-set(snowflake_tables))

0 1
Worksheet not created for :  set()
Unwanted worksheet : {'COHORT_STAGING_MASTER_DATA'}
