### Summary

This notebook breaks the stored-procedure script down in the following steps:
1. Each stored procedure is treated as a unique SQL unit (the script is split at every "CREATE PROC", so a unit normally runs up to its closing "END GO")
2. Look at each line and remove line comments (--)
3. Look for comment blocks and remove them (/* */)
4. Look for keywords such as FROM, JOIN, UPDATE, INTO and CREATE TABLE, then grab the next word (Database.Schema.Name or Schema.Name)
    (It is not necessary to distinguish relationship types, so every link simply uses the RELY_ON relationship.)
5. Concatenate the results into a single DataFrame and output it to CSV
6. Ingest the CSV into the Graph Database

### Import Packages

In [1]:
import pandas as pd
import numpy as np
import re
import math
from tqdm import tqdm

### Import Files
#### We will use 1 file from the 1.INPUT folder
1A. Stored Procedure SQL files

In [2]:
def read_raw_sql_view(sql_input, regex_str):
    """Extract stored-procedure schema/name pairs from one SQL text.

    Parameters
    ----------
    sql_input : str
        SQL text to scan.
    regex_str : str
        Pattern with two capture groups: schema first, then name.

    Returns
    -------
    pandas.DataFrame
        One row per match with columns SP_SCHEMA, SP_NAME and SYNTAX;
        SYNTAX repeats the complete ``sql_input`` on every row.
    """
    matches = re.findall(regex_str, sql_input)
    name_pairs = pd.DataFrame(matches, columns=['SP_SCHEMA', 'SP_NAME'])
    return name_pairs.assign(SYNTAX=sql_input)


def final_table_name_cleansing(text):
    """Normalise a captured object name to plain ``schema.name`` form.

    Removes stray FROM / JOIN keywords, square brackets and spaces.
    The keywords are removed only as whole words, so identifiers that merely
    contain those letters (for example ``LOAD_FROM_CRM``) are left intact.

    Parameters
    ----------
    text : str or missing value
        Raw name as captured by the regexes. NaN and None are accepted.

    Returns
    -------
    str
        The cleaned name, or "" when ``text`` is not a string.
    """
    # NaN (from left merges) and None are not strings: map them to "".
    if not isinstance(text, str):
        return ""
    # \b keeps the match to stand-alone keywords; "_" counts as a word
    # character, so FROM/JOIN inside an identifier is not touched.
    without_keywords = re.sub(r'\b(?:FROM|JOIN)\b', '', text)
    return re.sub(r'[\[\] ]', '', without_keywords)

In [3]:
# Read the whole stored-procedure script into memory as a single string.
# The context manager closes the file handle even if reading/decoding fails.
with open('1.INPUT/DATAWAREHOUSE/SPsScript.sql', 'r', encoding="utf-16") as fd:
    sqlFile = fd.read()

In [4]:
# Convert the raw SQL script into a DataFrame with one row per stored procedure,
# ready for the regex engine further down.

sp_frames = []

# Split case-insensitively and accept both PROC and PROCEDURE so the split agrees
# with the case-insensitive name regex below. A case-sensitive split leaves
# lower-case headers glued to the previous procedure, and "CREATE PROCEDURE"
# would leave an "EDURE" fragment that the name regex cannot match.
for sql_statement in re.split(r'(?i)CREATE\s+PROC(?:EDURE)?\b', sqlFile):

    # Remove anything to the right of a line comment (--) and flatten to one line
    code_parts = [line.split("--")[0] for line in sql_statement.split("\n")]
    concat_sql = " " + " ".join(code_parts)

    # Collapse every run of tabs/spaces into a single space
    concat_sql = re.sub(r"[ \t]+", " ", concat_sql)
    # Remove complete /* ... */ block comments
    concat_sql = re.sub(r"/\*[^*]*\*+(?:[^*/][^*]*\*+)*/", "", concat_sql)

    # Re-attach the header consumed by the split so the name regex can anchor on it
    sp_frames.append(
        read_raw_sql_view("CREATE PROC " + concat_sql,
                          r"(?ims)\b(?:CREATE\s+PROC)\s+(\[?\w+\]?)\.(\[?\w+\]?)")
    )

# One concat at the end instead of growing the frame inside the loop
df = pd.concat(sp_frames, ignore_index=True)

# Strip brackets/spaces and upper-case the names. The previous
# `df.apply(... .str.upper())` discarded its result, so the upper-case
# exclusion tokens below never matched lower-case procedure names.
for name_col in ('SP_SCHEMA', 'SP_NAME'):
    df[name_col] = df[name_col].apply(final_table_name_cleansing).str.upper()

df['last_element'] = df['SP_NAME'].str.split('_').str[-1]


# This is where you set logics to exclude certain SP by name
exclusion_list = ['TMP', 'TEMP',
                  'BCK', 'BKP', 'BACK',
                  'TEST', 'RSV']

exclusion_list_v2 = ["_" + sub for sub in exclusion_list]

# v1: an "_<token>" appears anywhere in the name; v2: the last "_"-separated part contains a token
df['EXCLUSION_v1'] = df['SP_NAME'].apply(lambda x: any(k in x for k in exclusion_list_v2))
df['EXCLUSION_v2'] = df['last_element'].apply(lambda x: any(k in x for k in exclusion_list))

def find_suffix_digits(stringInput):
    """Return True when the string ends with "_" followed by 4 to 8 digits."""
    return bool(re.search(r"_[\d]{4,8}$", stringInput))

# NOTE(review): this tests SP_SCHEMA, while the comment above talks about excluding
# by SP name - confirm whether SP_NAME was intended here.
df['EXCLUSION_digit'] = df['SP_SCHEMA'].apply(find_suffix_digits)
# case=False because the schema names are upper-cased above
df['EXCLUSION_utils'] = df.SP_SCHEMA.str.contains("util", case=False)

df['EXCLUSION'] = df.EXCLUSION_v1 | df.EXCLUSION_v2 | df.EXCLUSION_digit | df.EXCLUSION_utils

df = df[['SP_SCHEMA', 'SP_NAME', 'SYNTAX', 'EXCLUSION']]

#### This is what the output dataframe should look like to feed into the Regex Engine

In [5]:
df

Unnamed: 0,SP_SCHEMA,SP_NAME,SYNTAX,EXCLUSION
0,PREPARE,SAMPLE_SP_1,CREATE PROC [PREPARE].[SAMPLE_SP_1] AS BEGIN ...,False
1,PREPARE,SAMPLE_SP_2,CREATE PROC [PREPARE].[SAMPLE_SP_2] AS BEGIN ...,False


### 2. Main ETL Codes
#### Now using REGEX to identify the SP-Table relationships
#### We will use only the SP Dataframe we got earlier

In [6]:

def regex_part_analyser(sql_input, regex_type, regex_str, df_master):
    """Collect every match of one pattern in one SQL statement.

    Used for layered statements such as "INSERT INTO ... SELECT * FROM xxx",
    where several table references can occur; matches are returned in the
    order ``re.findall`` reports them and are not de-duplicated here.

    Parameters
    ----------
    sql_input : str
        SQL statement to scan.
    regex_type : str
        Label written to the REGEX_TYPE column of every returned row.
    regex_str : str
        Pattern whose matches become TABLE_FULL_NAME values.
    df_master : any
        Not read by this function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns TABLE_FULL_NAME and REGEX_TYPE, one row per match.
    """
    matches = re.findall(regex_str, sql_input)
    found = pd.DataFrame(matches, columns=['TABLE_FULL_NAME'])
    return found.assign(REGEX_TYPE=regex_type)

def regex_manipulation(sql_input, regex_type, regex_str, opt_df=None, opt_col=None):
    """Collect the distinct matches of one pattern, optionally minus known names.

    Parameters
    ----------
    sql_input : str
        SQL statement to scan.
    regex_type : str
        Label written to the REGEX_TYPE column of every returned row.
    regex_str : str
        Pattern whose matches become TABLE_FULL_NAME values.
    opt_df : pandas.DataFrame, optional
        Frame with REGEX_TYPE and TABLE_FULL_NAME columns. When given, names
        already present in it under ``opt_col`` are dropped from the result.
    opt_col : str, optional
        REGEX_TYPE value that selects the rows of ``opt_df`` to exclude.

    Returns
    -------
    pandas.DataFrame
        Columns TABLE_FULL_NAME and REGEX_TYPE, one row per distinct match,
        in first-seen order.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order
    output = list(dict.fromkeys(re.findall(regex_str, sql_input)))

    if opt_df is not None:
        # Build the exclusion set once; it was previously re-filtered from
        # opt_df for every single candidate inside the comprehension.
        already_seen = set(opt_df.loc[opt_df.REGEX_TYPE == opt_col, 'TABLE_FULL_NAME'])
        output = [x for x in output if x not in already_seen]

    df = pd.DataFrame(output, columns=['TABLE_FULL_NAME'])
    df['REGEX_TYPE'] = regex_type

    return df

def isNaN(string):
    """Report whether a value is NaN, using the fact that NaN is unequal to itself.

    Returns the raw result of ``string != string``: False for ordinary strings
    and numbers, True for a float NaN.
    """
    differs_from_itself = string != string
    return differs_from_itself



def regex_main_analyser(index, rowInput, sql_Input):
    """Find the tables and procedures referenced by one stored procedure.

    The SQL text is stripped of block-comment remnants and double quotes,
    upper-cased, split on ';', and every statement is scanned with a fixed
    set of regexes.

    Parameters
    ----------
    index : hashable
        Row label of the procedure; copied to the VW_INDEX column.
    rowInput : mapping
        Row providing 'SP_SCHEMA' and 'SP_NAME', copied to every result row.
    sql_Input : str
        Source text of the stored procedure.

    Returns
    -------
    pandas.DataFrame
        De-duplicated (TABLE_FULL_NAME, REGEX_TYPE) rows plus SP_SCHEMA,
        SP_NAME, VW_INDEX and SYNTAX_WORDCOUNT (whitespace-separated token
        count of the cleaned text).
    """
    # Remove complete /* ... */ comments
    q = re.sub(r"/\*[^*]*\*+(?:[^*/][^*]*\*+)*/", "", sql_Input)

    # Remove half-open comment remnants: text that runs up to a dangling "*/"
    # (this also consumes whatever precedes it in the match), then an
    # unterminated "/*" through the end of the text when no comma follows it.
    q = re.sub(r"[^*]*\*+(?:[^*/][^*]*\*+)*/", "", q)
    q = re.sub(r"\/\*[^,]*$", "", q)
    q = q.replace('"', '')
    q = q.upper()

    # Patterns whose matches are all kept (duplicates removed at the end).
    # Names are schema.table (two-part) unless noted otherwise.
    part_patterns = [
        # table directly after FROM / JOIN
        ("SELECT_FROM",
         r"(?ims)\b(?:FROM|JOIN)\s+([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?))(?:,|\)|;| |$){1}"),
        # second three-part name in a comma-separated FROM / JOIN list
        ("SELECT_FROM",
         r"(?ims)\b(?:FROM|JOIN)\s+(?:[\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?\.[\[]?\w+[\]]?))\s*\w*\s*,\s*([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?\.[\[]?\w+[\]]?))(?:,|\)|;| |$){1}"),
        # target directly after UPDATE / INTO / TABLE
        ("INSERT_UPDATE",
         r"(?ims)\b(?:UPDATE|INTO|TABLE)\s+([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?))(?:,|\)|;| |$){1}"),
        # second three-part name in a comma-separated UPDATE / INTO / TABLE list
        ("INSERT_UPDATE",
         r"(?ims)\b(?:UPDATE|INTO|TABLE)\s+(?:[\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?\.[\[]?\w+[\]]?))\s*\w*\s*,\s*([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?\.[\[]?\w+[\]]?))(?:,|\)|;| |$){1}"),
        # two-part name after a comma. The name is now captured on its own;
        # without a capture group findall returned the whole match, i.e. the
        # name prefixed with the comma and whitespace.
        # NOTE(review): this pattern is not anchored to FROM/JOIN, so it also
        # matches "alias.column" entries in select lists - confirm that is acceptable.
        ("SELECT_FROM",
         r"(?ims)\b(?:,)\s+([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?))"),
    ]

    # Patterns de-duplicated per statement by regex_manipulation
    distinct_patterns = [
        ("INSERT_UPDATE",
         r"(?ims)(?:UPDATE|STATISTICS)\s+([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?))"),
        ("EXEC",
         r"(?ims)\b(?:EXEC|EXECUTE)\s+([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?))"),
    ]

    # Collect the per-pattern frames and concatenate once, instead of
    # re-copying the growing frame for every pattern of every statement.
    frames = []

    # Splitting the source code into multiple SQL statements by ';'
    for sql_statement in q.split(';'):
        for regex_type, regex_str in part_patterns:
            # regex_part_analyser never reads its last argument, so None is passed
            frames.append(regex_part_analyser(sql_statement, regex_type, regex_str, None))
        for regex_type, regex_str in distinct_patterns:
            frames.append(regex_manipulation(sql_statement, regex_type, regex_str))

    df = pd.concat(frames).drop_duplicates()
    df['SP_SCHEMA'] = rowInput['SP_SCHEMA']
    df['SP_NAME'] = rowInput['SP_NAME']
    df['VW_INDEX'] = index
    df['SYNTAX_WORDCOUNT'] = len(q.split())

    return df

In [7]:

print(f"Total count of {len(df)} SP in the file")

# Collect one result frame per stored procedure and concatenate once at the end;
# concatenating inside the loop re-copies every earlier row on each iteration.
sp_results = []

# total= lets tqdm show a real progress bar (iterrows() has no length of its own)
for index, row in tqdm(df.iterrows(), total=len(df)):

    # Put the SQL syntax into the main regex analyser to identify table names;
    # rows whose SYNTAX is NaN are skipped.
    if not isNaN(row['SYNTAX']):
        sp_results.append(regex_main_analyser(index, row, row['SYNTAX']))

# An empty frame is kept as the result when no procedure produced output
df_cat = pd.concat(sp_results) if sp_results else pd.DataFrame()

# NOTE: nothing is written to disk here; the CSV is saved by a later cell.
print("COMPLETED : Graph Import File saved to graph_input_sp.csv")

Total count of 2 SP in the file


2it [00:00, 27.20it/s]

COMPLETED : Graph Import File saved to graph_input_sp.csv





### Export the results into 3. OUTPUT_GRAPH
#### Stored procedures that were not matched to any table are added back into the output CSV

In [8]:
df_cat2 = df_cat.copy()

# Score each reference: writes (INSERT_UPDATE) count 1, reads and EXEC calls count 0
df_cat2['regex_type_value'] = df_cat2['REGEX_TYPE'].map({'SELECT_FROM': 0, 'EXEC': 0,
                                                        'INSERT_UPDATE': 1})

# Total score per (table, procedure). The string alias 'sum' is used because
# passing np.sum to groupby.transform is deprecated in pandas 2.x.
df_cat2['regex_sum_value'] = df_cat2.groupby(['TABLE_FULL_NAME', 'VW_INDEX']).regex_type_value.transform('sum')

df_cat3 = df_cat2.copy()
# Pairs with a total of 1: keep only the INSERT_UPDATE row
df_cat2 = df_cat2[(df_cat2.regex_sum_value == 1) & (df_cat2.REGEX_TYPE == 'INSERT_UPDATE')]
# Pairs with a total of 0 (never written): keep every row
df_cat3 = df_cat3[(df_cat3.regex_sum_value == 0)]

df_final = pd.concat([df_cat2, df_cat3], ignore_index=True).drop_duplicates()

df_final

Unnamed: 0,TABLE_FULL_NAME,REGEX_TYPE,SP_SCHEMA,SP_NAME,VW_INDEX,SYNTAX_WORDCOUNT,regex_type_value,regex_sum_value
0,[SCHEMA_MALL].[SAMPLE_TB_1],INSERT_UPDATE,PREPARE,SAMPLE_SP_1,0,198,1,1
1,[SCHEMA_MALL].[SAMPLE_TB_2],INSERT_UPDATE,PREPARE,SAMPLE_SP_1,0,198,1,1
2,[SCHEMA_MALL].[SAMPLE_TB_5],INSERT_UPDATE,PREPARE,SAMPLE_SP_2,1,92,1,1
3,[SCHEMA_MALL].[SAMPLE_TB_2],INSERT_UPDATE,PREPARE,SAMPLE_SP_2,1,92,1,1
4,[PREPARE].[SAMPLE_TB_3],SELECT_FROM,PREPARE,SAMPLE_SP_1,0,198,0,0
5,[PREPARE].[SAMPLE_TB_6],SELECT_FROM,PREPARE,SAMPLE_SP_2,1,92,0,0


In [9]:
# Left-join the SP list with the matches so that stored procedures without any
# detected table still appear in the output (their table columns stay empty).
delta_df = pd.merge(df, df_final, how='left', on=['SP_SCHEMA', 'SP_NAME'])
delta_df = delta_df.drop(columns=["SYNTAX"])
# Strip brackets/spaces from the table names; missing names (NaN) become ""
delta_df['TABLE_FULL_NAME'] = delta_df['TABLE_FULL_NAME'].apply(final_table_name_cleansing)
# Upper-case every string cell. Series.map per column replaces
# DataFrame.applymap, which is deprecated since pandas 2.1.
delta_df = delta_df.apply(lambda col: col.map(lambda s: s.upper() if isinstance(s, str) else s))

delta_df.to_csv('3.OUTPUT_GRAPH/graph_input_sp.csv')


In [10]:
delta_df

Unnamed: 0,SP_SCHEMA,SP_NAME,EXCLUSION,TABLE_FULL_NAME,REGEX_TYPE,VW_INDEX,SYNTAX_WORDCOUNT,regex_type_value,regex_sum_value
0,PREPARE,SAMPLE_SP_1,False,SCHEMA_MALL.SAMPLE_TB_1,INSERT_UPDATE,0,198,1,1
1,PREPARE,SAMPLE_SP_1,False,SCHEMA_MALL.SAMPLE_TB_2,INSERT_UPDATE,0,198,1,1
2,PREPARE,SAMPLE_SP_1,False,PREPARE.SAMPLE_TB_3,SELECT_FROM,0,198,0,0
3,PREPARE,SAMPLE_SP_2,False,SCHEMA_MALL.SAMPLE_TB_5,INSERT_UPDATE,1,92,1,1
4,PREPARE,SAMPLE_SP_2,False,SCHEMA_MALL.SAMPLE_TB_2,INSERT_UPDATE,1,92,1,1
5,PREPARE,SAMPLE_SP_2,False,PREPARE.SAMPLE_TB_6,SELECT_FROM,1,92,0,0
