## Ingest raw SQL File for Views

### Summary

This notebook processes the file in the following steps:
1. Split the script on each "; GO" and treat every piece as a separate SQL statement
2. Go through each line and remove line comments (--)
3. Find block comments and remove them (/* */)
4. Look for keywords such as FROM and JOIN, then grab the next word (Schema.Name)
5. Concatenate the results into a single DataFrame and output it to CSV

### Import Packages

In [1]:
import pandas as pd
import numpy as np
import re
import math
from tqdm import tqdm
import nbformat

In [2]:
%run "0_Configuration.ipynb"

input_script_view =  1.INPUT/DATAWAREHOUSE/ViewScript.sql
input_script_sp =  1.INPUT/DATAWAREHOUSE/SPsScript.sql
input_script_table =  1.INPUT/DATAWAREHOUSE/tableScript.sql
graph_ingestion_view =  3.OUTPUT_GRAPH/graph_input_vw.csv
graph_ingestion_sp =  3.OUTPUT_GRAPH/graph_input_sp.csv
graph_ingestion_sp2 =  3.OUTPUT_GRAPH/graph_input_sp2.csv
graph_ingestion_table =  3.OUTPUT_GRAPH/graph_input_table.csv


### Import Files
#### We will use 1 file from the 1.INPUT folder
1A. VIEW SQL files

In [3]:
def read_raw_sql_view(sql_input, regex_str):
    """Locate view definitions inside one SQL statement.

    Args:
        sql_input: Text of a single SQL statement.
        regex_str: Pattern handed to ``re.findall``; it has to yield
            (schema, name) pairs, i.e. contain exactly two capture groups.

    Returns:
        DataFrame with one row per match and the columns ``VW_SCHEMA``,
        ``VW_NAME`` and ``SYNTAX`` (the complete ``sql_input`` repeated on
        every row). The frame has no rows when nothing matches.
    """
    view_matches = re.findall(regex_str, sql_input)
    views = pd.DataFrame(view_matches, columns=['VW_SCHEMA', 'VW_NAME'])
    # Attach the statement text so later steps can analyse it per view.
    return views.assign(SYNTAX=sql_input)

def final_table_name_cleansing(text):
    """Normalise a captured table/view identifier to plain ``A.B`` form.

    Removes a leading ``FROM``/``JOIN`` keyword, all square brackets and all
    spaces.

    Args:
        text: Identifier such as ``"[SCHEMA].[TABLE]"`` or
            ``"FROM SCHEMA.TABLE"``. May also be NaN/None.

    Returns:
        The cleaned identifier, or ``""`` when ``text`` is not a string
        (NaN, None, numbers, ...).
    """
    if not isinstance(text, str):
        return ""
    # Only a keyword at the very start is stripped. Removing every "FROM" /
    # "JOIN" substring would corrupt identifiers that merely contain those
    # letters (e.g. DATA_FROM_SAP, JOINT_VENTURE).
    without_keyword = re.sub(r'^\s*(?:FROM|JOIN)(?=[\s\[])', '', text)
    return without_keyword.replace(']', '').replace('[', '').replace(' ', '')

In [4]:
# Read the whole view script into one string; the context manager closes the
# file even if reading or decoding fails.
with open(input_script_view, 'r', encoding="utf-16") as fd:
    sqlFile = fd.read()

# Pattern locating "CREATE VIEW <schema>.<name>"; either part may be bracketed.
CREATE_VIEW_REGEX = r"(?ims)\b(?:CREATE\s+VIEW)\s+(\[?\w+\]?)\.(\[?\w+\]?)"

# Scanning left to right, this matches a quoted string literal (group 1, kept
# untouched), a /* ... */ block comment, or a -- line comment. Handling all
# three in one pass means "--" or "/*" inside a string is not mistaken for a
# comment, and a comment marker inside another comment is ignored.
SQL_COMMENT_REGEX = re.compile(r"('(?:[^']|'')*')|/\*.*?\*/|--[^\n]*", re.S)

# Convert the raw SQL file into a DataFrame for the later regex analysis:
# one row per CREATE VIEW statement found.
view_frames = []

# Each "; GO" batch separator ends one SQL statement. The split pattern has no
# capturing groups, so the separators themselves are not returned as pieces.
for sql_statement in re.split(r';\s+GO\b', sqlFile):
    sql_statement = sql_statement.upper()

    # Comments are removed from the whole statement rather than line by line,
    # so multi-line block comments disappear completely and lines that merely
    # contain a "/" (division, 'N/A', inline comments) keep their SQL.
    concat_sql = SQL_COMMENT_REGEX.sub(lambda match: match.group(1) or " ", sql_statement)

    # Flatten the statement onto a single line with single spaces.
    concat_sql = re.sub(r"\s+", " ", concat_sql).strip()

    views_in_statement = read_raw_sql_view(concat_sql, CREATE_VIEW_REGEX)
    if not views_in_statement.empty:
        view_frames.append(views_in_statement)

# Concatenate once at the end instead of growing the frame inside the loop.
if view_frames:
    df = pd.concat(view_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['VW_SCHEMA', 'VW_NAME', 'SYNTAX'])

# Strip brackets from the captured schema and view names.
df['VW_SCHEMA'] = df['VW_SCHEMA'].apply(final_table_name_cleansing)
df['VW_NAME'] = df['VW_NAME'].apply(final_table_name_cleansing)

In [5]:
# Preview the extracted views: one row per CREATE VIEW match, with the cleaned statement text in SYNTAX.
df

Unnamed: 0,VW_SCHEMA,VW_NAME,SYNTAX
0,PUBLISH,VW_FACC_GL_TRANSACTION_FIGURES,SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER ON...
1,PUBLISH,VW_FACC_GL_STATISTICAL_KEY_FIGURES,SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER ON...
2,PUBLISH,VW_FACC_GL_INTERFACE,SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER ON...
3,ANALYSE,DBT_STG_V_BASE_SITE_ADDRESS_HKSE,SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER ON...
4,ANALYSE,DBT_STG_V_BASE_SITE_HKSE,SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER ON...
...,...,...,...
557,PUBLISH_TAB_SGHNB_VW,VW_SGHNB_TX_POS_SALES,SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER ON...
558,PUBLISH_TAB_SGHNB_VW,VW_SGHNB_TX_PURCHASE,SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER ON...
559,PUBLISH_TAB_SGHNB_VW,VW_SGHNB_TX_SALES_TRANSACTION_COUNT,SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER ON...
560,PUBLISH_TAB_SGHNB_VW,VW_SGHNB_TX_SHRINKAGE,SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER ON...


### 2. Use regex to identify the View-Table relationships
#### We will use the View DataFrame built in the previous step

In [6]:

#This function is used to loop complex pattern like "INSERT INTO FROM SELECT * FROM xxx"
#Since there can be multiple SELECT FROM tables, Will start from the most outer shell, then move inner
#If There is a CREATE TABLE/INSERT TABLE statement, then will get from there
def regex_part_analyser(sql_input, regex_type, regex_str, df_master):
    """Collect the captures of ``regex_str`` in ``sql_input`` by repeated peeling.

    On every pass all matches are recorded, then every occurrence of the first
    match is deleted from the text and the search is repeated on what is left.
    The same name can therefore be recorded more than once; callers are
    expected to de-duplicate.

    Args:
        sql_input: SQL text to search.
        regex_type: Label copied into the ``REGEX_TYPE`` column of every row.
        regex_str: Pattern for ``re.findall``. It should contain at most one
            capturing group so that each match is a plain string.
        df_master: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with the columns ``TABLE_FULL_NAME`` (one row per recorded
        match, in discovery order) and ``REGEX_TYPE``.
    """
    pattern = re.compile(regex_str)
    remaining_sql = sql_input
    captured = []

    while True:
        # Empty captures are skipped: deleting "" leaves the text unchanged,
        # so looping on them would never terminate.
        matches = [match for match in pattern.findall(remaining_sql) if match]
        if not matches:
            break
        captured.extend(matches)
        # Deleting a non-empty match shortens the text, which guarantees progress.
        remaining_sql = remaining_sql.replace(matches[0], '')

    df = pd.DataFrame(captured, columns=['TABLE_FULL_NAME'])
    df['REGEX_TYPE'] = regex_type
    return df
        
def isNaN(string):
    """Return the result of comparing ``string`` as unequal to itself.

    A float NaN is not equal to itself, so the result is True for NaN and
    False for ordinary values such as strings.
    """
    # Self-inequality is the NaN test; no type check is performed on the argument.
    return string != string

def regex_main_analyser( index, rowInput, sql_Input):
    """Extract the tables referenced after FROM/JOIN in one view's SQL text.

    The text is stripped of block comments and double quotes, upper-cased and
    split on ';'. Every piece is then run through each table-reference pattern
    with ``regex_part_analyser`` and the hits are de-duplicated.

    Args:
        index: Value stored in the ``VW_INDEX`` column of every result row.
        rowInput: Row providing the ``'VW_SCHEMA'`` and ``'VW_NAME'`` values
            that are copied onto every result row.
        sql_Input: SQL text to analyse.

    Returns:
        DataFrame with the columns ``TABLE_FULL_NAME``, ``REGEX_TYPE``,
        ``VW_SCHEMA``, ``VW_NAME``, ``VW_INDEX`` and ``SYNTAX_WORDCOUNT``
        (number of whitespace-separated tokens in the cleaned text).
    """
    # Applied in order: complete /* ... */ comments, then text ending in a
    # dangling "*/", then a dangling "/*" running to the end of the text.
    # NOTE(review): the second pattern also removes whatever precedes the
    # dangling "*/" - confirm that this is intended.
    comment_patterns = (
        r"/\*[^*]*\*+(?:[^*/][^*]*\*+)*/",
        r"[^*]*\*+(?:[^*/][^*]*\*+)*/",
        r"\/\*[^,]*$",
    )

    # TODO (carried over from the original "To be fixed" note): a greedy
    # SELECT ... FROM|JOIN|UPDATE pattern is still disabled.
    table_ref_patterns = (
        # Two-part name (schema.table) followed by ',', ')', ';', a space or end of line.
        r"(?ims)\b(?:FROM|JOIN)\s+([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?))(?:,|\)|;| |$){1}",
        # Two-part name ending in ']' with no delimiter required, e.g. [SCHEMA].[TABLE]IT.
        # NOTE(review): with no trailing delimiter this looks like it would also
        # match the leading [A].[B] of a bracketed three-part name - verify.
        r"(?ims)\b(?:FROM|JOIN)\s+([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]))",
        # Second two-part name of a comma-separated list: FROM a.b alias, c.d
        r"(?ims)\b(?:FROM|JOIN)\s+(?:[\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?))\s+\w*\s*,\s*([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?))",
        # Three-part name (database.schema.table) followed by a delimiter.
        r"(?ims)\b(?:FROM|JOIN)\s+([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?\.[\[]?\w+[\]]?))(?:,|\)|;| |$){1}",
        # Three-part name ending in ']' with no delimiter required, e.g. [DATABASE].[SCHEMA].[TABLE]IT.
        r"(?ims)\b(?:FROM|JOIN)\s+([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?\.[\[]?\w+[\]]))",
        # Second three-part name of a comma-separated list.
        r"(?ims)\b(?:FROM|JOIN)\s+(?:[\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?\.[\[]?\w+[\]]?))\s+\w*\s*,\s*([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?\.[\[]?\w+[\]]?))",
    )

    cleaned_sql = sql_Input
    for comment_pattern in comment_patterns:
        cleaned_sql = re.sub(comment_pattern, "", cleaned_sql)
    cleaned_sql = cleaned_sql.replace('"', '').upper()

    table_refs = pd.DataFrame()

    # The source is split into individual SQL statements on ';' and every
    # statement is checked against every pattern, in the order listed above.
    for statement in cleaned_sql.split(';'):
        for table_ref_pattern in table_ref_patterns:
            table_refs = pd.concat([
                table_refs,
                regex_part_analyser(statement, "SELECT_FROM", table_ref_pattern, table_refs),
            ])

    # The same table is usually reported several times; keep one row each and
    # tag the rows with the view they belong to.
    return table_refs.drop_duplicates().assign(
        VW_SCHEMA=rowInput['VW_SCHEMA'],
        VW_NAME=rowInput['VW_NAME'],
        VW_INDEX=index,
        SYNTAX_WORDCOUNT=len(cleaned_sql.split()),
    )

### Run the analysis and export the results to the 3.OUTPUT_GRAPH folder

In [7]:
# Every row of the view DataFrame is run through the regex analyser.

print(f"Total count of {len(df)} VW in the file")

# Gather the per-view results in a list and concatenate once at the end,
# instead of re-copying a growing DataFrame on every iteration.
view_dependency_frames = []
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    # Feed the SQL syntax into the main regex analyser to identify table names.
    if not isNaN(row['SYNTAX']):
        view_dependency_frames.append(regex_main_analyser(index, row, row['SYNTAX']))

df_cat = pd.concat(view_dependency_frames) if view_dependency_frames else pd.DataFrame()

if not df_cat.empty:
    # Clean up the full table name and split it into database, schema and name.
    cleaned_names = df_cat['TABLE_FULL_NAME'].apply(final_table_name_cleansing)

    # The split yields as many columns as the longest name has parts. Reindexing
    # to exactly three columns keeps the code working when no name has a
    # database part (only two columns would be produced otherwise).
    name_parts = cleaned_names.str.split(pat=".", expand=True).reindex(columns=range(3))
    has_database = name_parts[2].notna()

    # Three-part names are database.schema.table; two-part names are schema.table.
    df_cat['TABLE_DATABASE'] = np.select([has_database], [name_parts[0]], default=None)
    df_cat['TABLE_SCHEMA'] = np.select([has_database], [name_parts[1]], default=name_parts[0])
    df_cat['TABLE_NAME'] = np.select([has_database], [name_parts[2]], default=name_parts[1])

df_cat.to_csv(graph_ingestion_view)

# Report the path that was actually written.
print(f"COMPLETED : Graph Import File saved to {graph_ingestion_view}")

Total count of 562 VW in the file


100%|██████████| 562/562 [00:53<00:00, 10.60it/s] 

COMPLETED : Graph Import File saved to output_vw.csv





In [8]:
# Preview the view-to-table dependency rows that were written to the CSV.
df_cat

Unnamed: 0,TABLE_FULL_NAME,REGEX_TYPE,VW_SCHEMA,VW_NAME,VW_INDEX,SYNTAX_WORDCOUNT,TABLE_DATABASE,TABLE_SCHEMA,TABLE_NAME
0,ANALYSE.GL_ACCOUNTS_TRANSACTION_FIGURES,SELECT_FROM,PUBLISH,VW_FACC_GL_TRANSACTION_FIGURES,0,340,,ANALYSE,GL_ACCOUNTS_TRANSACTION_FIGURES
1,PREPARE.PROFIT_CTR_TEXT,SELECT_FROM,PUBLISH,VW_FACC_GL_TRANSACTION_FIGURES,0,340,,PREPARE,PROFIT_CTR_TEXT
2,[PREPARE].[COSTCENTER_TEXT],SELECT_FROM,PUBLISH,VW_FACC_GL_TRANSACTION_FIGURES,0,340,,PREPARE,COSTCENTER_TEXT
3,PREPARE.PROFIT_CTR_HIER_FACC,SELECT_FROM,PUBLISH,VW_FACC_GL_TRANSACTION_FIGURES,0,340,,PREPARE,PROFIT_CTR_HIER_FACC
4,PREPARE.GL_ACCOUNT_TEXT,SELECT_FROM,PUBLISH,VW_FACC_GL_TRANSACTION_FIGURES,0,340,,PREPARE,GL_ACCOUNT_TEXT
...,...,...,...,...,...,...,...,...,...
0,[ANALYSE].[SGHNB_TX_MARGIN_DTL],SELECT_FROM,PUBLISH_TAB_SGHNB_VW,VW_SGHNB_TX_MARGIN_DTL,556,48,,ANALYSE,SGHNB_TX_MARGIN_DTL
0,[ANALYSE].[SGHNB_TX_POS_SALES],SELECT_FROM,PUBLISH_TAB_SGHNB_VW,VW_SGHNB_TX_POS_SALES,557,62,,ANALYSE,SGHNB_TX_POS_SALES
0,[ANALYSE].[SGHNB_TX_SALES_TRANSACTION_COUNT],SELECT_FROM,PUBLISH_TAB_SGHNB_VW,VW_SGHNB_TX_SALES_TRANSACTION_COUNT,559,28,,ANALYSE,SGHNB_TX_SALES_TRANSACTION_COUNT
0,[ANALYSE].[SGHNB_TX_SHRINKAGE],SELECT_FROM,PUBLISH_TAB_SGHNB_VW,VW_SGHNB_TX_SHRINKAGE,560,41,,ANALYSE,SGHNB_TX_SHRINKAGE
