### Summary
#### This notebook processes the input files in the following steps:
1. Loop through the table name file and check whether each table name appears in the SQL script.
2. SQL syntax patterns are not analysed; the check is simply whether the TABLE name exists in the script text.

### Import Packages

In [1]:
import pandas as pd
import re
import math
from tqdm import tqdm
import glob
import shutil

pd.options.mode.chained_assignment = None

### 1. Import Files
#### We will use 2 files from the 1.INPUT folder
1A. Stored Procedure SQL file

1B. Table Name file (CSV)

In [2]:
def read_raw_sql_view(sql_input, regex_str):
    """Build a DataFrame of the (schema, name) pairs found in ``sql_input``.

    ``regex_str`` is expected to yield two capture groups per match; they
    become the ``SP_SCHEMA`` and ``SP_NAME`` columns. Every row also carries
    the complete ``sql_input`` text in a ``SYNTAX`` column. When nothing
    matches, an empty DataFrame with the same three columns is returned.
    """
    name_pairs = re.compile(regex_str).findall(sql_input)
    result = pd.DataFrame(name_pairs, columns=['SP_SCHEMA', 'SP_NAME'])
    result['SYNTAX'] = sql_input
    return result


def sql_syntax_cleansing(text):
    """Normalise a SQL fragment so identifiers can be compared as plain text.

    The text has its ``/* ... */`` block comments removed, then a trailing
    unterminated ``/*`` comment is dropped (only when no comma follows it),
    double quotes and square brackets are stripped, and the result is
    upper-cased.

    Missing values (``None`` or NaN) yield an empty string.
    """
    # NaN is the only value that is not equal to itself. None is checked
    # explicitly because it would otherwise reach re.sub and raise TypeError.
    if text is None or text != text:
        return ""

    # Remove complete /* ... */ comments.
    cleaned = re.sub(r"/\*[^*]*\*+(?:[^*/][^*]*\*+)*/", "", text)
    # Remove a dangling "/*" that runs to the end of the text.
    cleaned = re.sub(r"\/\*[^,]*$", "", cleaned)
    # Drop the quoting characters in a single pass.
    cleaned = cleaned.translate(str.maketrans('', '', '"[]'))
    return cleaned.upper()

def final_table_name_cleansing(text):
    """Reduce a captured table reference to a bare upper-case name.

    Standalone ``FROM`` / ``JOIN`` keywords (any letter case), square
    brackets and spaces are removed, and the result is upper-cased.

    Missing values (``None`` or NaN) yield an empty string.
    """
    # NaN is the only value that is not equal to itself; None is handled
    # explicitly so it does not fail on the string operations below.
    if text is None or text != text:
        return ""

    # Word boundaries keep identifiers that merely contain the letters
    # (e.g. JOINT_VENTURE, FROM_DATE) intact; a plain str.replace would
    # cut "JOIN"/"FROM" out of the middle of such names.
    without_keywords = re.sub(r"\b(?:FROM|JOIN)\b", "", text, flags=re.IGNORECASE)
    return without_keywords.translate(str.maketrans('', '', '[] ')).upper()

#### 1A. Load the Stored Procedure SQL file into a DataFrame

In [3]:
# Read the stored-procedure script as a single buffer. The context manager
# closes the file even if reading or decoding fails.
with open('1.INPUT/DATAWAREHOUSE/SPsScript.sql', 'r', encoding="utf-16") as fd:
    sqlFile = fd.read()

# Statement header that separates one procedure from the next. It is matched
# in any letter case and accepts the PROC / PROCEDURE / "OR ALTER" spellings,
# so a "CREATE PROCEDURE" script is no longer split into "EDURE ..." chunks
# that fail the name lookup below.
proc_header = re.compile(r"\bCREATE\s+(?:OR\s+ALTER\s+)?PROC(?:EDURE)?\b", re.IGNORECASE)
block_comment = re.compile(r"/\*[^*]*\*+(?:[^*/][^*]*\*+)*/")
sp_name_pattern = r"(?ims)\b(?:CREATE\s+PROC)\s+(\[?\w+\]?)\.(\[?\w+\]?)"

# One small DataFrame per procedure; they are concatenated once at the end
# instead of growing a DataFrame inside the loop.
sp_frames = []

# Element 0 of the split is the text before the first procedure header, so
# it cannot hold a procedure definition and is skipped.
for sql_statement in proc_header.split(sqlFile)[1:]:
    # Remove anything to the right of a "--" line comment, then flatten the
    # statement onto one line.
    # NOTE(review): a literal "--" inside a quoted SQL string is cut as well.
    concat_sql = " ".join(line.split("--")[0] for line in sql_statement.split("\n"))
    # Collapse every run of tabs/spaces to a single space.
    concat_sql = re.sub(r"[ \t]+", " ", concat_sql)
    # Remove /* ... */ comments after the line comments are gone.
    concat_sql = block_comment.sub("", concat_sql)

    # Re-attach a canonical header so the name pattern can read schema.name.
    sp_frames.append(read_raw_sql_view("CREATE PROC " + concat_sql.strip(), sp_name_pattern))

if sp_frames:
    df = pd.concat(sp_frames, ignore_index=True)
else:
    # No procedure header found: keep the expected columns so later cells run.
    df = pd.DataFrame(columns=['SP_SCHEMA', 'SP_NAME', 'SYNTAX'])

# Then we clean the Stored Procedure DataFrame.
df['SP_SCHEMA'] = df['SP_SCHEMA'].apply(final_table_name_cleansing)
df['SP_NAME'] = df['SP_NAME'].apply(final_table_name_cleansing)
df['SYNTAX'] = df['SYNTAX'].apply(sql_syntax_cleansing)



#### 1B. Load the Table Name file into a DataFrame

In [4]:
# Open and read the table list, then rename its columns to the names used by
# the rest of the notebook.
table_name_df = pd.read_csv('1.INPUT/DATAWAREHOUSE/tableList.csv', header=0)
table_name_df = table_name_df.rename(columns={"database": "TABLE_DATABASE",
                                              "schema": "TABLE_SCHEMA",
                                              "name": "TABLE_NAME"})

# Keep only the three identifying columns; copy so the assignments below
# write to an independent frame.
table_name_df = table_name_df[["TABLE_DATABASE", 'TABLE_SCHEMA', 'TABLE_NAME']].copy()

# Clean the schema and table names first ...
table_name_df['TABLE_SCHEMA'] = table_name_df['TABLE_SCHEMA'].apply(sql_syntax_cleansing)
table_name_df['TABLE_NAME'] = table_name_df['TABLE_NAME'].apply(sql_syntax_cleansing)

# ... and de-duplicate afterwards: rows that differ only by brackets, quotes
# or letter case become identical after cleansing and would otherwise be
# matched (and reported) more than once.
table_name_df = table_name_df.drop_duplicates(ignore_index=True)





#### What the 2 DataFrames should look like:

In [5]:
df

Unnamed: 0,SP_SCHEMA,SP_NAME,SYNTAX
0,PREPARE,SAMPLE_SP_1,CREATE PROC PREPARE.SAMPLE_SP_1 AS BEGIN SET ...
1,PREPARE,SAMPLE_SP_2,CREATE PROC PREPARE.SAMPLE_SP_2 AS BEGIN SET ...


In [6]:
table_name_df

Unnamed: 0,TABLE_DATABASE,TABLE_SCHEMA,TABLE_NAME
0,DATABASE1,PREPARE,SAMPLE_TB_6
1,DATABASE1,SCHEMA_MALL,SAMPLE_TB_5
2,DATABASE1,PREPARE,SAMPLE_TB_3
3,DATABASE1,SCHEMA_MALL,SAMPLE_TB_2
4,DATABASE1,SCHEMA_MALL,SAMPLE_TB_1


### 2. Use REGEX to identify the SP-Table relationships
#### We will use the 2 DataFrames built earlier


In [7]:
# This function handles nested patterns such as
# "INSERT INTO ... SELECT * FROM xxx". Because one statement can contain
# several SELECT ... FROM tables, it works from the outermost match inwards.
def regex_part_analyser(sql_input, regex_type, regex_str, df_master):
    """Collect regex matches pass by pass, removing the first match each time.

    On every pass all current matches are recorded, the text of the first
    match is deleted from the working copy of ``sql_input`` and the pattern
    is searched again, until nothing matches. A match that is still found on
    a later pass is therefore recorded again.

    Parameters:
        sql_input: SQL text to scan.
        regex_type: label written to the ``REGEX_TYPE`` column of every row.
        regex_str: pattern passed to ``re.findall``.
        df_master: not used by this function; kept so existing callers
            keep working.

    Returns:
        DataFrame with columns ``TABLE_FULL_NAME`` and ``REGEX_TYPE``.
    """
    remaining_sql = sql_input
    collected = []
    matches = re.findall(regex_str, remaining_sql)

    while matches:
        collected.extend(matches)
        reduced_sql = remaining_sql.replace(matches[0], '')
        # An empty first match removes nothing, so the same matches would be
        # found forever; stop as soon as a pass makes no progress.
        if reduced_sql == remaining_sql:
            break
        remaining_sql = reduced_sql
        matches = re.findall(regex_str, remaining_sql)

    df = pd.DataFrame(collected, columns=['TABLE_FULL_NAME'])
    df['REGEX_TYPE'] = regex_type
    return df

def regex_manipulation(sql_input, regex_type, regex_str, opt_df=None, opt_col=None):
    """Return the distinct regex matches in ``sql_input`` as a DataFrame.

    Matches keep their first-seen order. When ``opt_df`` is given, any match
    already listed in its ``TABLE_FULL_NAME`` column for rows whose
    ``REGEX_TYPE`` equals ``opt_col`` is left out.

    Parameters:
        sql_input: SQL text to scan.
        regex_type: label written to the ``REGEX_TYPE`` column of every row.
        regex_str: pattern passed to ``re.findall``.
        opt_df: optional DataFrame with ``TABLE_FULL_NAME`` and
            ``REGEX_TYPE`` columns used as an exclusion list.
        opt_col: ``REGEX_TYPE`` value selecting the rows of ``opt_df`` to
            exclude.

    Returns:
        DataFrame with columns ``TABLE_FULL_NAME`` and ``REGEX_TYPE``.
    """
    # dict.fromkeys removes duplicates while preserving order.
    matches = list(dict.fromkeys(re.findall(regex_str, sql_input)))

    if opt_df is not None:
        # Build the exclusion set once instead of re-filtering the DataFrame
        # for every candidate match.
        excluded = set(opt_df.loc[opt_df.REGEX_TYPE == opt_col, 'TABLE_FULL_NAME'].tolist())
        matches = [match for match in matches if match not in excluded]

    df = pd.DataFrame(matches, columns=['TABLE_FULL_NAME'])
    df['REGEX_TYPE'] = regex_type
    return df

def isNaN(string):
    """Report whether ``string`` is NaN, using the fact that NaN != NaN."""
    differs_from_itself = string != string
    return differs_from_itself


def list_main_analyser(sql_df, table_name):
    """Return the rows of ``sql_df`` whose ``SYNTAX`` text mentions ``table_name``.

    A mention is the table name preceded by whitespace and followed by a
    comma, closing parenthesis, semicolon, space or the end of the text;
    the comparison ignores letter case.

    Parameters:
        sql_df: DataFrame with a string ``SYNTAX`` column.
        table_name: literal table name to look for (e.g. ``SCHEMA.TABLE``).

    Returns:
        The matching rows (original index preserved) with a ``TABLE_NAME``
        column set to ``table_name``. When nothing matches, the empty
        selection is returned without that column.
    """
    # re.escape makes the name literal: without it the "." in SCHEMA.TABLE
    # matches any character, and names containing regex metacharacters
    # could match wrongly or fail to compile.
    pattern = r'\s+' + re.escape(table_name) + r'(?:,|\)|;| |$)'
    # na=False treats a missing SYNTAX value as "no match" rather than
    # producing a mask that cannot be used for selection.
    mask = sql_df['SYNTAX'].str.contains(pat=pattern, case=False, regex=True, na=False)
    matched = sql_df.loc[mask]

    if matched.empty:
        return matched

    # assign() works on a copy, so the caller's DataFrame is never modified.
    return matched.assign(TABLE_NAME=table_name)

In [8]:
import numpy as np

print(f"Total count of {len(table_name_df)} Table in the file")
print("Will loop through each table against the SQL file")

# Check every table name against the stored-procedure SQL text. The
# per-table results are collected in a list and concatenated once at the
# end, rather than growing a DataFrame inside the loop.
matched_frames = []
for table_name_index, table_name_row in tqdm(table_name_df.iterrows(), total=len(table_name_df)):
    table_string = table_name_row['TABLE_SCHEMA'] + "." + table_name_row['TABLE_NAME']

    df_new = list_main_analyser(df, table_string)

    if not df_new.empty:
        matched_frames.append(df_new)

if matched_frames:
    df_cat = pd.concat(matched_frames)
else:
    # No table matched: keep the expected columns so the later merge on
    # SP_SCHEMA / SP_NAME still works.
    df_cat = pd.DataFrame(columns=[*df.columns, 'TABLE_NAME'])

# Nothing is written to disk in this cell, so the message reports only that
# the matching step has finished.
print("COMPLETED : SP-Table matching finished, results are in df_cat")

Total count of 5 Table in the file
Will loop through each table against the SQL file


5it [00:00, 1000.07it/s]

COMPLETED : Graph Import File saved to graph_input_sp.csv





In [9]:
df_cat

Unnamed: 0,SP_SCHEMA,SP_NAME,SYNTAX,TABLE_NAME
1,PREPARE,SAMPLE_SP_2,CREATE PROC PREPARE.SAMPLE_SP_2 AS BEGIN SET ...,PREPARE.SAMPLE_TB_6
1,PREPARE,SAMPLE_SP_2,CREATE PROC PREPARE.SAMPLE_SP_2 AS BEGIN SET ...,SCHEMA_MALL.SAMPLE_TB_5
0,PREPARE,SAMPLE_SP_1,CREATE PROC PREPARE.SAMPLE_SP_1 AS BEGIN SET ...,PREPARE.SAMPLE_TB_3
0,PREPARE,SAMPLE_SP_1,CREATE PROC PREPARE.SAMPLE_SP_1 AS BEGIN SET ...,SCHEMA_MALL.SAMPLE_TB_2
1,PREPARE,SAMPLE_SP_2,CREATE PROC PREPARE.SAMPLE_SP_2 AS BEGIN SET ...,SCHEMA_MALL.SAMPLE_TB_2
0,PREPARE,SAMPLE_SP_1,CREATE PROC PREPARE.SAMPLE_SP_1 AS BEGIN SET ...,SCHEMA_MALL.SAMPLE_TB_1


### Export the results into 3. OUTPUT_GRAPH

#### Stored procedures that matched no table are added back, so every SP name appears in the output CSV

In [10]:
# Left-join every stored procedure onto its matched tables, so procedures
# that matched no table still appear (with an empty TABLE_NAME).
# Only the key columns and TABLE_NAME are merged: the SYNTAX text is not
# part of the exported file, so it is left out instead of being merged,
# upper-cased and then dropped.
sp_keys = ['SP_SCHEMA', 'SP_NAME']
delta_df = pd.merge(df[sp_keys], df_cat[sp_keys + ['TABLE_NAME']], how='left', on=sp_keys)
delta_df['TABLE_FULL_NAME'] = delta_df['TABLE_NAME'].apply(final_table_name_cleansing)

# Upper-case every string cell and leave non-strings (e.g. NaN) untouched.
# Done per column with Series.apply because DataFrame.applymap is
# deprecated in recent pandas releases.
for column_name in delta_df.columns:
    delta_df[column_name] = delta_df[column_name].apply(
        lambda s: s.upper() if isinstance(s, str) else s
    )

delta_df.to_csv('3.OUTPUT_GRAPH/graph_input_sp2.csv')

In [11]:
delta_df

Unnamed: 0,SP_SCHEMA,SP_NAME,TABLE_NAME,TABLE_FULL_NAME
0,PREPARE,SAMPLE_SP_1,PREPARE.SAMPLE_TB_3,PREPARE.SAMPLE_TB_3
1,PREPARE,SAMPLE_SP_1,SCHEMA_MALL.SAMPLE_TB_2,SCHEMA_MALL.SAMPLE_TB_2
2,PREPARE,SAMPLE_SP_1,SCHEMA_MALL.SAMPLE_TB_1,SCHEMA_MALL.SAMPLE_TB_1
3,PREPARE,SAMPLE_SP_2,PREPARE.SAMPLE_TB_6,PREPARE.SAMPLE_TB_6
4,PREPARE,SAMPLE_SP_2,SCHEMA_MALL.SAMPLE_TB_5,SCHEMA_MALL.SAMPLE_TB_5
5,PREPARE,SAMPLE_SP_2,SCHEMA_MALL.SAMPLE_TB_2,SCHEMA_MALL.SAMPLE_TB_2
