### Summary
#### This notebook works through the following steps:
1. Loop through the table-name list and check whether each table name appears in each stored-procedure SQL script.
2. No SQL syntax patterns are analysed; the check is simply whether the TABLE name is present in the script.

### Import Packages

In [1]:
import pandas as pd
import re
import math
from tqdm import tqdm
import glob
import shutil

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [2]:
# Run the configuration notebook first. It presumably defines the input_script_* and
# graph_ingestion_* paths echoed in this cell's output and used by later cells — TODO confirm.
%run "0_Configuration.ipynb"

input_script_view =  1.INPUT/DATAWAREHOUSE/ViewScript.sql
input_script_sp =  1.INPUT/DATAWAREHOUSE/SPsScript.sql
input_script_table =  1.INPUT/DATAWAREHOUSE/tableScript.sql
graph_ingestion_view =  3.OUTPUT_GRAPH/graph_input_vw.csv
graph_ingestion_sp =  3.OUTPUT_GRAPH/graph_input_sp.csv
graph_ingestion_sp2 =  3.OUTPUT_GRAPH/graph_input_sp2.csv
graph_ingestion_table =  3.OUTPUT_GRAPH/graph_input_table.csv


### 1. Import Files
#### We will use 2 files from the 1.INPUT folder
1A. Stored Procedure SQL file

1B. Table DDL SQL file (source of the table names)

In [3]:
def read_raw_sql_view(sql_input, regex_str):
    """Build a DataFrame of (schema, name) pairs matched in one SQL statement.

    ``regex_str`` is applied to ``sql_input`` with ``re.findall`` and must yield
    two-element matches; they become the ``SP_SCHEMA`` and ``SP_NAME`` columns.
    The complete ``sql_input`` text is attached to every row as ``SYNTAX``.
    A statement without any match produces an empty frame with the same columns.
    """
    matches = re.findall(regex_str, sql_input)
    frame = pd.DataFrame(matches, columns=['SP_SCHEMA', 'SP_NAME'])
    return frame.assign(SYNTAX=sql_input)

def read_raw_sql_table(sql_input, regex_str):
    """Build a DataFrame of (schema, table) pairs matched in one SQL statement.

    ``regex_str`` is applied to ``sql_input`` with ``re.findall`` and must yield
    two-element matches; they become the ``TABLE_SCHEMA`` and ``TABLE_NAME`` columns.
    The complete ``sql_input`` text is attached to every row as ``SYNTAX``.
    A statement without any match produces an empty frame with the same columns.
    """
    matches = re.findall(regex_str, sql_input)
    frame = pd.DataFrame(matches, columns=['TABLE_SCHEMA', 'TABLE_NAME'])
    return frame.assign(SYNTAX=sql_input)


def sql_syntax_cleansing(text):
    """Normalise a SQL text so that object names can be matched in it.

    Steps, in order: remove closed ``/* ... */`` block comments, remove an
    unclosed ``/*`` together with everything up to the end of the text (only
    when no comma follows it), strip double quotes and square brackets, and
    upper-case the result.

    Parameters
    ----------
    text : str or missing value
        The SQL text. Any non-string value (NaN, None, pd.NA, ...) is treated
        as "no SQL".

    Returns
    -------
    str
        The cleansed, upper-cased text, or "" for non-string input.
    """
    # The previous `text == text` guard only caught NaN; None and pd.NA fell
    # through and crashed inside re.sub.
    if not isinstance(text, str):
        return ""

    # Closed block comments.
    q = re.sub(r"/\*[^*]*\*+(?:[^*/][^*]*\*+)*/", "", text)
    # A dangling "/*" that is never closed: drop it through the end of the text.
    q = re.sub(r"\/\*[^,]*$", "", q)
    # Quoting characters would otherwise prevent SCHEMA.NAME comparisons.
    q = q.replace('"', '').replace('[', '').replace(']', '')
    return q.upper()

def final_table_name_cleansing(text):
    """Reduce a matched name fragment to a bare, upper-cased identifier.

    Removes the literal tokens ``FROM`` and ``JOIN``, square brackets and
    spaces, then upper-cases what is left.

    Note: ``FROM`` / ``JOIN`` are removed case-sensitively, before the
    upper-casing, and wherever they occur in the text (not only as leading
    keywords).

    Parameters
    ----------
    text : str or missing value
        The fragment to clean. Any non-string value (NaN, None, pd.NA, ...)
        is treated as missing.

    Returns
    -------
    str
        The cleansed name, or "" for non-string input.
    """
    # The previous `text == text` guard only caught NaN; None and pd.NA fell
    # through and crashed on `.replace`.
    if not isinstance(text, str):
        return ""

    cleaned = text
    for token in ('FROM', 'JOIN', ']', '[', ' '):
        cleaned = cleaned.replace(token, '')
    return cleaned.upper()

#### 1A. Load the Stored Procedure SQL file into a DataFrame

In [4]:
# Read the stored-procedure script as a single buffer. The context manager
# closes the handle even when reading or decoding fails.
with open(input_script_sp, 'r', encoding="utf-16") as fd:
    sqlFile = fd.read()

# Parse one chunk per CREATE PROC statement. re.split also yields the text that
# precedes the first CREATE PROC; it contributes rows only if the name regex
# happens to match it.
sp_frames = []

for sql_statement in re.split(r'CREATE\s+PROC', sqlFile):
    sql_statement = sql_statement.upper()

    # Drop everything to the right of a "--" line comment and flatten the
    # statement onto one line (each source line is prefixed with one space).
    concat_sql = "".join(" " + line.split("--")[0] for line in sql_statement.split("\n"))

    # Tabs become spaces, then runs of spaces are squeezed by repeated halving.
    concat_sql = concat_sql.replace('\t', ' ').replace("  ", " ").replace("  ", " ").replace("  ", " ").replace("  ", " ").replace("  ", " ")
    # Remove closed /* ... */ block comments.
    concat_sql = re.sub(r"/\*[^*]*\*+(?:[^*/][^*]*\*+)*/", "", concat_sql)

    # The split consumed the keyword, so it is put back before extracting SCHEMA.NAME.
    sp_frames.append(
        read_raw_sql_view("CREATE PROC " + concat_sql, r"(?ims)\b(?:CREATE\s+PROC)\s+(\[?\w+\]?)\.(\[?\w+\]?)")
    )

# Concatenate once instead of growing the frame inside the loop (which is quadratic).
df = pd.concat(sp_frames, ignore_index=True)

# Normalise names and SQL text so they can be compared with the table list.
df['SP_SCHEMA'] = df['SP_SCHEMA'].apply(final_table_name_cleansing)
df['SP_NAME'] = df['SP_NAME'].apply(final_table_name_cleansing)
df['SYNTAX'] = df['SYNTAX'].apply(sql_syntax_cleansing)



#### 1B. Load the Table DDL SQL file into a DataFrame

In [5]:
# Table names are taken from the CREATE TABLE DDL script. (A disabled alternative
# that loaded them from 1.INPUT/DATAWAREHOUSE/tableList.csv was removed from this cell.)
# The context manager closes the handle even when reading or decoding fails.
with open(input_script_table, 'r', encoding="utf-8") as fd:
    sqlFile = fd.read()

# Parse one chunk per CREATE TABLE statement. re.split also yields the text that
# precedes the first CREATE TABLE; it contributes rows only if the name regex
# happens to match it.
table_frames = []

for sql_statement in re.split(r'CREATE\s+TABLE', sqlFile):
    sql_statement = sql_statement.upper()

    # Drop everything to the right of a "--" line comment and flatten the
    # statement onto one line (each source line is prefixed with one space).
    concat_sql = "".join(" " + line.split("--")[0] for line in sql_statement.split("\n"))

    # Tabs become spaces, then runs of spaces are squeezed by repeated halving.
    concat_sql = concat_sql.replace('\t', ' ').replace("  ", " ").replace("  ", " ").replace("  ", " ").replace("  ", " ").replace("  ", " ")
    # Remove closed /* ... */ block comments.
    concat_sql = re.sub(r"/\*[^*]*\*+(?:[^*/][^*]*\*+)*/", "", concat_sql)

    # The split consumed the keyword, so it is put back before extracting SCHEMA.NAME.
    table_frames.append(
        read_raw_sql_table("CREATE TABLE " + concat_sql, r"(?ims)\b(?:CREATE\s+TABLE)\s+(\[?\w+\]?)\.(\[?\w+\]?)")
    )

# Concatenate once instead of growing the frame inside the loop (which is quadratic).
table_name_df = pd.concat(table_frames, ignore_index=True)

# Strip brackets/spaces and upper-case the extracted identifiers.
table_name_df['TABLE_SCHEMA'] = table_name_df['TABLE_SCHEMA'].apply(final_table_name_cleansing)
table_name_df['TABLE_NAME'] = table_name_df['TABLE_NAME'].apply(final_table_name_cleansing)

# A former `table_name_df.apply(... .str.upper())` statement was removed here:
# its result was never assigned, so it had no effect on the frame.

# Last underscore-separated token of the table name, e.g. "X_Y_BAK" -> "BAK".
table_name_df['last_element'] = table_name_df['TABLE_NAME'].str.split('_').str[-1]


# This is where you set the logic to exclude certain tables by name
exclusion_list = ['TMP', 'TEMP',
                  'BCK', 'BKP', 'BACK',
                  'TEST', 'RSV']

# Same keywords with a leading underscore, e.g. "_TMP".
exclusion_list_v2 = ["_" + sub for sub in exclusion_list]

# v1: an underscore-prefixed keyword occurs anywhere in the table name.
table_name_df['EXCLUSION_v1'] = table_name_df['TABLE_NAME'].apply(lambda x: any(k in x for k in exclusion_list_v2))
# v2: the last name token contains one of the keywords.
table_name_df['EXCLUSION_v2'] = table_name_df['last_element'].apply(lambda x: any(k in x for k in exclusion_list))

def find_suffix_digits(stringInput):
    """Return True when ``stringInput`` ends with "_" followed by 4 to 8 digits."""
    suffix_match = re.search(r"_[\d]{4,8}$", stringInput)
    return suffix_match is not None

# Names ending in "_<4-8 digits>" (e.g. a date stamp) are flagged.
table_name_df['EXCLUSION_digit'] = table_name_df['TABLE_NAME'].apply(find_suffix_digits)
# TABLE_SCHEMA was upper-cased by final_table_name_cleansing, so a case-sensitive
# search for lower-case "util" could never match; compare case-insensitively.
table_name_df['EXCLUSION_utils'] = table_name_df['TABLE_SCHEMA'].str.contains("util", case=False)

# A table is excluded when any individual rule fires.
table_name_df['EXCLUSION'] = (
    table_name_df['EXCLUSION_v1']
    | table_name_df['EXCLUSION_v2']
    | table_name_df['EXCLUSION_digit']
    | table_name_df['EXCLUSION_utils']
)

# Keep only the identifying columns and the combined flag.
table_name_df = table_name_df[['TABLE_SCHEMA', 'TABLE_NAME', 'EXCLUSION']]

#### What the 2 DataFrames should look like:

In [6]:
df

Unnamed: 0,SP_SCHEMA,SP_NAME,SYNTAX
0,ANALYSE,ADF_RAISE_EXCEPTION,CREATE PROC ANALYSE.ADF_RAISE_EXCEPTION @ERRO...
1,ANALYSE,ANFIELD_TO_INGESTION_SUMMARY,CREATE PROC ANALYSE.ANFIELD_TO_INGESTION_SUMM...
2,ANALYSE,ANFIELD_TO_INGESTION_SUMMARY_ALP,CREATE PROC ANALYSE.ANFIELD_TO_INGESTION_SUMM...
3,ANALYSE,ANFIELD_TO_INGESTION_SUMMARY_IKEA,CREATE PROC ANALYSE.ANFIELD_TO_INGESTION_SUMM...
4,ANALYSE,ANFIELD_TO_INGESTION_SUMMARY_MA,CREATE PROC ANALYSE.ANFIELD_TO_INGESTION_SUMM...
...,...,...,...
1226,PUBLISH_YUU,YUUTB_RFM_TAB_4_SP_MONTHENDUPDATE_AGGR,CREATE PROC PUBLISH_YUU.YUUTB_RFM_TAB_4_SP_MO...
1227,UTIL,ANFIELD_LOY_PARAMETERS,CREATE PROC UTIL.ANFIELD_LOY_PARAMETERS @PREV...
1228,UTIL,CREATE_OR_REPLACE_TBL,CREATE PROC UTIL.CREATE_OR_REPLACE_TBL @FROM_...
1229,UTIL,DROP_TBL_IF_EXISTS,CREATE PROC UTIL.DROP_TBL_IF_EXISTS @TBL NVAR...


In [7]:
table_name_df

Unnamed: 0,TABLE_SCHEMA,TABLE_NAME,EXCLUSION
0,ANALYSE,HKMN_MD_ITEM,False
1,ANALYSE,HKMN_MD_MMDS_PROMOTION,False
2,ANALYSE,HKMN_MD_TIME,False
3,ANALYSE,HKMN_MD_STORE,False
4,ANALYSE,HKMN_TX_SALES_DETAIL,False
...,...,...,...
3315,PUBLISH_YUU,YUUTB_RFM_TAB_4_DATA,False
3316,PUBLISH_YUU,YUUTB_RFM_TAB_4_DATA_AGGR,False
3317,PUBLISH_YUU,YUUTB_RFM_TAB_4_DATA_AGGR_BAK_20220629,True
3318,PUBLISH_YUU,YUUTB_RFM_TAB_4_DATA2,False


### 2. Now using REGEX to identify the SP-Table relationships
#### We will use the 2 DataFrames we built earlier


In [8]:
def regex_part_analyser(sql_input, regex_type, regex_str, df_master):
    """Collect regex matches repeatedly, peeling off the first match each pass.

    Author's intent (from the original note): handle nested patterns such as
    "INSERT INTO ... SELECT * FROM xxx" by starting at the outermost match and
    moving inwards.

    On every pass all matches of ``regex_str`` are recorded, then every
    occurrence of the *first* match is deleted from the working text and the
    text is searched again. Matches other than the first are therefore
    recorded again on later passes.

    Parameters
    ----------
    sql_input : str
        SQL text to search.
    regex_type : object
        Label written to the ``REGEX_TYPE`` column of every row.
    regex_str : str
        Pattern for ``re.findall``; it must produce string matches (at most
        one capture group), because each match is passed to ``str.replace``.
    df_master : object
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``TABLE_FULL_NAME`` and ``REGEX_TYPE``.
    """
    remaining = sql_input
    collected = []
    matches = re.findall(regex_str, remaining)

    while matches:
        collected.extend(matches)
        reduced = remaining.replace(matches[0], '')
        if reduced == remaining:
            # Only an empty first match leaves the text unchanged. Searching
            # again would return the same matches forever, so stop here.
            break
        remaining = reduced
        matches = re.findall(regex_str, remaining)

    df = pd.DataFrame(collected, columns=['TABLE_FULL_NAME'])
    df['REGEX_TYPE'] = regex_type
    return df

def regex_manipulation(sql_input, regex_type, regex_str, opt_df=None, opt_col=None):
    """Return the distinct regex matches in ``sql_input`` as a labelled DataFrame.

    Parameters
    ----------
    sql_input : str
        SQL text to search.
    regex_type : object
        Label written to the ``REGEX_TYPE`` column of every row.
    regex_str : str
        Pattern for ``re.findall``.
    opt_df : pandas.DataFrame, optional
        Earlier results with ``REGEX_TYPE`` and ``TABLE_FULL_NAME`` columns.
        When given, matches already listed there under ``opt_col`` are dropped.
    opt_col : object, optional
        The ``REGEX_TYPE`` value in ``opt_df`` whose names are to be excluded.

    Returns
    -------
    pandas.DataFrame
        Columns ``TABLE_FULL_NAME`` (first-seen order, no duplicates) and
        ``REGEX_TYPE``.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order.
    matches = list(dict.fromkeys(re.findall(regex_str, sql_input)))

    if opt_df is not None and matches:
        # Compute the exclusion list once; it was previously rebuilt from
        # opt_df for every single candidate.
        already_listed = opt_df.loc[opt_df.REGEX_TYPE == opt_col, 'TABLE_FULL_NAME'].tolist()
        matches = [name for name in matches if name not in already_listed]

    df = pd.DataFrame(matches, columns=['TABLE_FULL_NAME'])
    df['REGEX_TYPE'] = regex_type
    return df

def isNaN(string):
    """Report whether ``string`` compares unequal to itself (true for float NaN)."""
    differs_from_itself = string != string
    return differs_from_itself


def list_main_analyser(sql_df, table_name):
    """Return the rows of ``sql_df`` whose SYNTAX text references ``table_name``.

    A reference is the literal ``table_name`` (case-insensitive) preceded by
    whitespace and followed by ``,``, ``)``, ``;``, a space, or the end of the
    text.

    Parameters
    ----------
    sql_df : pandas.DataFrame
        Must contain a ``SYNTAX`` column of SQL text. It is not modified.
    table_name : str
        Name to look for, e.g. ``"SCHEMA.TABLE"``.

    Returns
    -------
    pandas.DataFrame
        The matching rows (original index kept) with an added ``TABLE_NAME``
        column set to ``table_name``. When nothing matches, the empty selection
        is returned without that column.
    """
    # re.escape keeps the "." in SCHEMA.TABLE literal; unescaped it matched any
    # character and produced false positives. Raw strings avoid invalid escapes.
    pattern = r'\s+' + re.escape(table_name) + r'(?:,|\)|;| |$){1}'
    # na=False: a missing SYNTAX value is simply "no match".
    mask = sql_df['SYNTAX'].str.contains(pat=pattern, case=False, regex=True, na=False)
    matched = sql_df.loc[mask]

    if matched.empty:
        return matched
    # assign() returns a new frame, so nothing is written into a filtered slice.
    return matched.assign(TABLE_NAME=table_name)

In [9]:
import numpy as np  # not referenced in this cell; import left in place

print(f"Total count of {len(table_name_df)} Table in the file")
print(f"Will loop through each table against the SQL file")

# Search every SCHEMA.TABLE name in the stored-procedure texts. The per-table
# matches are collected in a list and concatenated once, instead of growing a
# DataFrame inside the loop (which is quadratic).
matched_frames = []
table_pairs = zip(table_name_df['TABLE_SCHEMA'], table_name_df['TABLE_NAME'])

# total= gives tqdm a real progress bar with an ETA rather than a bare counter.
for table_schema, table_name in tqdm(table_pairs, total=len(table_name_df)):
    table_string = table_schema + "." + table_name

    df_new = list_main_analyser(df, table_string)

    if not df_new.empty:
        matched_frames.append(df_new)

# Original row indices are kept (no ignore_index), as before.
df_cat = pd.concat(matched_frames) if matched_frames else pd.DataFrame()

# NOTE(review): nothing is written to disk in this cell; the CSV export happens
# in a later cell. The message text is left unchanged.
print("COMPLETED : Graph Import File saved to graph_input_sp.csv")

Total count of 3320 Table in the file
Will loop through each table against the SQL file


3320it [05:49,  9.49it/s]

COMPLETED : Graph Import File saved to graph_input_sp.csv





In [10]:
df_cat

Unnamed: 0,SP_SCHEMA,SP_NAME,SYNTAX,TABLE_NAME
13,ANALYSE,HKMN_TO_TX_AGGR_CONSIGNMENT_SALES_BY_DATE_STOR...,CREATE PROC ANALYSE.HKMN_TO_TX_AGGR_CONSIGNME...,ANALYSE.HKMN_MD_ITEM
14,ANALYSE,HKMN_TO_TX_AGGR_SALES_BY_DATE_STORE_CATEGORY,CREATE PROC ANALYSE.HKMN_TO_TX_AGGR_SALES_BY_...,ANALYSE.HKMN_MD_ITEM
15,ANALYSE,HKMN_TO_TX_SALES_TRANSACTION_COUNT,CREATE PROC ANALYSE.HKMN_TO_TX_SALES_TRANSACT...,ANALYSE.HKMN_MD_ITEM
49,ANALYSE,PCDT2_HK_SUPPLIER_DEAL_TRACKING,CREATE PROC ANALYSE.PCDT2_HK_SUPPLIER_DEAL_TR...,ANALYSE.HKMN_MD_ITEM
50,ANALYSE,PCDT2_HK_SUPPLIER_DEAL_TRACKING_BAK,CREATE PROC ANALYSE.PCDT2_HK_SUPPLIER_DEAL_TR...,ANALYSE.HKMN_MD_ITEM
...,...,...,...,...
833,PUBLISH,ANFIELD_TO_LOY_POINT_ACT_AND_LIA,CREATE PROC PUBLISH.ANFIELD_TO_LOY_POINT_ACT_...,UTIL.ANFIELD_LOY_COMMON_PARAMETERS
835,PUBLISH,ANFIELD_TO_LOY_REWARD_STATUS_ANALYTICS,CREATE PROC PUBLISH.ANFIELD_TO_LOY_REWARD_STA...,UTIL.ANFIELD_LOY_COMMON_PARAMETERS
836,PUBLISH,ANFIELD_TO_LOY_REWARD_USE,CREATE PROC PUBLISH.ANFIELD_TO_LOY_REWARD_USE...,UTIL.ANFIELD_LOY_COMMON_PARAMETERS
837,PUBLISH,ANFIELD_TO_LOY_SALES_METRICS,CREATE PROC PUBLISH.ANFIELD_TO_LOY_SALES_METR...,UTIL.ANFIELD_LOY_COMMON_PARAMETERS


### Export the results into 3. OUTPUT_GRAPH

#### We will add back the remaining SP names that have not been matched to any table into the output CSV

In [11]:
# Left-join every stored procedure to its matched tables, so that procedures
# without any match still appear once with an empty TABLE_NAME.
# SYNTAX is not exported, so it is dropped before the merge rather than being
# carried through, upper-cased, and discarded afterwards.
sp_keys = df.drop(columns=["SYNTAX"])
if df_cat.empty:
    # No table matched any procedure: merge against an empty, correctly shaped frame.
    sp_matches = pd.DataFrame(columns=['SP_SCHEMA', 'SP_NAME', 'TABLE_NAME'])
else:
    sp_matches = df_cat.drop(columns=["SYNTAX"])

delta_df = pd.merge(sp_keys, sp_matches, how='left', on=['SP_SCHEMA', 'SP_NAME'])
delta_df['TABLE_FULL_NAME'] = delta_df['TABLE_NAME'].apply(final_table_name_cleansing)

# Upper-case every string cell. Series.map is used per column because
# DataFrame.applymap is deprecated in recent pandas releases.
delta_df = delta_df.apply(lambda col: col.map(lambda s: s.upper() if isinstance(s, str) else s))

delta_df.to_csv(graph_ingestion_sp2)

In [12]:
delta_df

Unnamed: 0,SP_SCHEMA,SP_NAME,TABLE_NAME,TABLE_FULL_NAME
0,ANALYSE,ADF_RAISE_EXCEPTION,,
1,ANALYSE,ANFIELD_TO_INGESTION_SUMMARY,ANALYSE.ANFIELD_INGESTION_SUMMARY,ANALYSE.ANFIELD_INGESTION_SUMMARY
2,ANALYSE,ANFIELD_TO_INGESTION_SUMMARY,ANALYSE.ANFIELD_MD_LOCATION,ANALYSE.ANFIELD_MD_LOCATION
3,ANALYSE,ANFIELD_TO_INGESTION_SUMMARY,ANALYSE.ANFIELD_MD_MEMBER_ACCOUNT,ANALYSE.ANFIELD_MD_MEMBER_ACCOUNT
4,ANALYSE,ANFIELD_TO_INGESTION_SUMMARY,ANALYSE.ANFIELD_MD_MEMBER_CLUB,ANALYSE.ANFIELD_MD_MEMBER_CLUB
...,...,...,...,...
6212,PUBLISH_YUU,YUUTB_RFM_TAB_4_SP_MONTHENDUPDATE_AGGR,PUBLISH_YUU.YUUTB_RFM_TAB_4_DATA_AGGR,PUBLISH_YUU.YUUTB_RFM_TAB_4_DATA_AGGR
6213,UTIL,ANFIELD_LOY_PARAMETERS,,
6214,UTIL,CREATE_OR_REPLACE_TBL,,
6215,UTIL,DROP_TBL_IF_EXISTS,,


In [13]:
# Write the table list to the configured graph-ingestion CSV path.
table_name_df.to_csv(path_or_buf=graph_ingestion_table)