## Ingest raw SQL File for Views

### Summary

This notebook breaks the file down in the following steps:
1. Split the script on each "; GO" and treat every piece as a separate SQL statement
2. Go through each line and remove line comments (--)
3. Remove block comments (/* */)
4. Look for keywords such as FROM and JOIN, then grab the next word (Schema.Name)
5. Concatenate the results into a single DataFrame and output it to CSV

### Import Packages

In [8]:
import pandas as pd
import numpy as np
import re
import math
from tqdm import tqdm

### Import Files
#### We will use 1 file from the 1.INPUT folder
1A. VIEW SQL files

In [11]:
def read_raw_sql_view(sql_input, regex_str):
    """Build a DataFrame of the views declared in one SQL statement.

    Parameters
    ----------
    sql_input : str
        SQL text to scan.
    regex_str : str
        Pattern with exactly two capture groups (schema, view name); each
        ``re.findall`` hit becomes one row.

    Returns
    -------
    pandas.DataFrame
        Columns ``VW_SCHEMA``, ``VW_NAME`` and ``SYNTAX``; ``SYNTAX`` repeats
        ``sql_input`` on every row. The frame is empty when nothing matches.
    """
    view_matches = re.findall(regex_str, sql_input)
    views = pd.DataFrame(view_matches, columns=['VW_SCHEMA', 'VW_NAME'])
    # Carry the full statement alongside each view so later steps can parse it.
    return views.assign(SYNTAX=sql_input)

def final_table_name_cleansing(text):
    """Normalise a captured object name to plain ``part.part[.part]`` form.

    A leading ``FROM``/``JOIN`` keyword (if the capture included one), square
    brackets and spaces are removed.

    Parameters
    ----------
    text : str or missing value
        Raw captured name, e.g. ``"[dbo].[Orders]"`` or ``"FROM [dbo].[Orders]"``.

    Returns
    -------
    str
        The cleaned name, or ``""`` when ``text`` is not a string
        (NaN, ``None``, ``pd.NA`` ...).
    """
    # Anything that is not a string has no name to clean; this also covers
    # None, which would pass a NaN-style ``text == text`` check and then crash.
    if not isinstance(text, str):
        return ""

    # Strip the keyword only when it leads the text and is followed by
    # whitespace or an opening bracket. Removing "FROM"/"JOIN" anywhere would
    # corrupt identifiers that merely contain them (e.g. JOINED_ORDERS).
    without_keyword = re.sub(
        r'^\s*(?:FROM|JOIN)(?:\s+|(?=\[))', '', text, flags=re.IGNORECASE
    )
    return without_keyword.replace(']', '').replace('[', '').replace(' ', '')

In [12]:
# Open and read the file as a single buffer. The context manager closes the
# handle even if reading/decoding raises.
with open('1.INPUT/DATAWAREHOUSE/ViewScript.sql', 'r', encoding="utf-16") as fd:
    sqlFile = fd.read()

# This converts the raw SQL file into a DataFrame (one row per CREATE VIEW found)
# for the later REGEX manipulation.

# A batch ends at ";" + whitespace + "GO". No capturing groups are used, so
# re.split returns only the statements (captured separators would otherwise be
# returned as extra list items), and \b keeps words such as GOTO from splitting.
BATCH_SEPARATOR_REGEX = r';\s+GO\b'

# Captures (schema, view name) from "CREATE VIEW [schema].[name]".
VIEW_NAME_REGEX = r"(?ims)\b(?:CREATE\s+VIEW)\s+(\[?\w+\]?)\.(\[?\w+\]?)"

# Single left-to-right pass over the statement: whichever construct starts first
# wins. That keeps a "--" inside /* ... */ from cutting off the closing "*/",
# and keeps comment markers inside '...' string literals (group 1) untouched.
SQL_COMMENT_REGEX = re.compile(r"('(?:[^']|'')*')|/\*.*?\*/|--[^\n]*", re.DOTALL)

view_frames = []
for sql_statement in re.split(BATCH_SEPARATOR_REGEX, sqlFile):
    # String literals are put back unchanged; comments become a single space.
    no_comments = SQL_COMMENT_REGEX.sub(lambda m: m.group(1) or " ", sql_statement)

    # Flatten the statement onto one line (each source line is prefixed by a space).
    concat_sql = "".join(" " + line for line in no_comments.split("\n"))

    # Tabs become spaces, then every run of spaces collapses to one.
    concat_sql = re.sub(r" {2,}", " ", concat_sql.replace('\t', ' '))

    view_frames.append(read_raw_sql_view(concat_sql, VIEW_NAME_REGEX))

# Concatenate once instead of growing the frame inside the loop.
df = pd.concat(view_frames, ignore_index=True)

# Strip brackets/spaces from the captured schema and view names.
df['VW_SCHEMA'] = df['VW_SCHEMA'].apply(final_table_name_cleansing)
df['VW_NAME'] = df['VW_NAME'].apply(final_table_name_cleansing)

In [17]:
df

Unnamed: 0,VW_SCHEMA,VW_NAME,SYNTAX
0,prepare,SAMPLE_TB_6_VW,SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER O...
1,schema_mall,SAMPLE_TB_1_temp_view,SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER O...


### 2. Use REGEX to identify the View-Table relationships
#### We will use the View Dataframe we got earlier

In [14]:

#This function is used to loop complex pattern like "INSERT INTO FROM SELECT * FROM xxx"
#Since there can be multiple SELECT FROM tables, Will start from the most outer shell, then move inner
#If There is a CREATE TABLE/INSERT TABLE statement, then will get from there
def regex_part_analyser(sql_input, regex_type, regex_str, df_master):
    """Repeatedly apply ``regex_str`` to ``sql_input`` and collect the captures.

    On every pass all ``re.findall`` hits are recorded, then each occurrence of
    the first hit's text is deleted from the working SQL and the search is
    repeated until nothing matches. Hits that survive a pass are therefore
    recorded again on the next one, so the result can contain repeats.

    Parameters
    ----------
    sql_input : str
        SQL text to scan.
    regex_type : str
        Label stored in the ``REGEX_TYPE`` column of every row.
    regex_str : str
        Pattern with a single capture group.
    df_master : object
        Not used by this function.

    Returns
    -------
    pandas.DataFrame
        Columns ``TABLE_FULL_NAME`` (captured text) and ``REGEX_TYPE``.
    """
    collected = []
    remaining_sql = sql_input

    while True:
        hits = re.findall(regex_str, remaining_sql)
        if not hits:
            break
        collected.extend(hits)
        # Drop the first hit everywhere so the next pass sees the rest of the SQL.
        remaining_sql = remaining_sql.replace(hits[0], '')

    result = pd.DataFrame(collected, columns=['TABLE_FULL_NAME'])
    result['REGEX_TYPE'] = regex_type
    return result
        
def isNaN(string):
    """Return the result of ``string != string``.

    A float NaN is the usual value that compares unequal to itself, so this is
    True for NaN and False for ordinary strings and numbers.
    """
    differs_from_itself = string != string
    return differs_from_itself

def regex_main_analyser( index, rowInput, sql_Input):
    """Find the tables referenced after FROM/JOIN in one view's SQL text.

    The SQL is stripped of block comments and double quotes, upper-cased and
    split on ';'. Every statement is run through each table-name pattern via
    ``regex_part_analyser`` and the de-duplicated hits are returned together
    with the view's metadata.

    Parameters
    ----------
    index : hashable
        Row label of the view; stored in ``VW_INDEX``.
    rowInput : mapping / pandas.Series
        Must provide ``VW_SCHEMA``, ``VW_NAME`` and ``SYNTAX``.
    sql_Input : str
        SQL text of the view.

    Returns
    -------
    pandas.DataFrame
        Columns ``TABLE_FULL_NAME``, ``REGEX_TYPE``, ``VW_SCHEMA``, ``VW_NAME``,
        ``SYNTAX``, ``VW_INDEX`` and ``SYNTAX_WORDCOUNT`` (word count of the
        cleaned, upper-cased SQL).
    """
    # Patterns tried on every statement, in this order.
    # TODO (left disabled by the author, "to be fixed"): a SELECT-anchored variant,
    #   (?ims)\b(?:SELECT)\s.*\b((?:FROM|JOIN|UPDATE)\s+\[?\w+\]?\.\[?\w+\]?)
    table_patterns = (
        # 2-part name (schema.table) followed by ',', ')', ';', a space or end of text
        r"(?ims)\b(?:FROM|JOIN)\s+([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?))(?:,|\)|;| |$){1}",
        # 2-part name glued to what follows, e.g. [SCHEMA].[TABLE]IT: closing bracket required
        r"(?ims)\b(?:FROM|JOIN)\s+([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]))",
        # second 2-part name in a comma-separated list: FROM a.b alias, c.d
        r"(?ims)\b(?:FROM|JOIN)\s+(?:[\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?))\s+\w*\s*,\s*([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?))",
        # 3-part name (database.schema.table) followed by a delimiter
        r"(?ims)\b(?:FROM|JOIN)\s+([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?\.[\[]?\w+[\]]?))(?:,|\)|;| |$){1}",
        # 3-part name glued to what follows, e.g. [DATABASE].[SCHEMA].[TABLE]IT: closing bracket required
        r"(?ims)\b(?:FROM|JOIN)\s+([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?\.[\[]?\w+[\]]))",
        # second 3-part name in a comma-separated list
        r"(?ims)\b(?:FROM|JOIN)\s+(?:[\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?\.[\[]?\w+[\]]?))\s+\w*\s*,\s*([\[]?[\w-]+(?:[\]]?\.[\[]?\w+[\]]?\.[\[]?\w+[\]]?))",
    )

    # Complete /* ... */ block comments.
    cleaned_sql = re.sub(r"/\*[^*]*\*+(?:[^*/][^*]*\*+)*/", "", sql_Input)
    # A dangling "*/" is removed together with the text leading up to it ...
    cleaned_sql = re.sub(r"[^*]*\*+(?:[^*/][^*]*\*+)*/", "", cleaned_sql)
    # ... and a dangling "/*" is removed through the end when no comma follows it.
    cleaned_sql = re.sub(r"\/\*[^,]*$", "", cleaned_sql)
    cleaned_sql = cleaned_sql.replace('"','').upper()

    # Split the source code into individual SQL statements on ';' and scan each one.
    tables = pd.DataFrame()
    for sql_statement in cleaned_sql.split(';'):
        for pattern in table_patterns:
            hits = regex_part_analyser(sql_statement, "SELECT_FROM", pattern, tables)
            tables = pd.concat([tables, hits])

    # The same table is usually hit by several patterns; keep one row per value
    # and attach the metadata of the view being analysed.
    return tables.drop_duplicates().assign(
        VW_SCHEMA=rowInput['VW_SCHEMA'],
        VW_NAME=rowInput['VW_NAME'],
        SYNTAX=rowInput['SYNTAX'],
        VW_INDEX=index,
        SYNTAX_WORDCOUNT=len(cleaned_sql.split()),
    )

### Export the results into 3. OUTPUT_GRAPH

In [15]:
# Each row of the view DataFrame is run through the REGEX analyser, the table
# names found are split into database / schema / name, and the result is
# written to CSV.

OUTPUT_CSV_PATH = '3.OUTPUT_GRAPH/graph_input_vw.csv'

print(f"Total count of {len(df)} VW in the file")

# Collect the per-view frames and concatenate once, instead of re-copying the
# accumulated frame on every iteration.
view_table_frames = []
for index, row in tqdm(df.iterrows(), total=len(df)):

    # Put the SQL syntax through the main regex analyser to identify table names.
    if not isNaN(row['SYNTAX']):
        view_table_frames.append(regex_main_analyser(index, row, row['SYNTAX']))

# pd.concat raises on an empty list, so fall back to an empty frame.
df_cat = pd.concat(view_table_frames) if view_table_frames else pd.DataFrame()

if not df_cat.empty:
    # Clean up the full table name and split it into database, schema and name.
    cleaned_names = df_cat['TABLE_FULL_NAME'].apply(final_table_name_cleansing)

    # Always produce exactly three part columns: when no name in the data has a
    # database prefix the split yields only two columns, which would otherwise
    # fail on assignment to three targets.
    name_parts = (
        cleaned_names.str.split(pat=".", n=2, expand=True).reindex(columns=range(3))
    )
    part1, part2, part3 = (name_parts[i] for i in range(3))

    # A third part exists only for database.schema.table names.
    has_database = part3.notna()

    df_cat['TABLE_DATABASE'] = np.select([has_database], [part1], default=None)
    df_cat['TABLE_SCHEMA'] = np.select([has_database], [part2], default=part1)
    df_cat['TABLE_NAME'] = np.select([has_database], [part3], default=part2)

df_cat.to_csv(OUTPUT_CSV_PATH)

# Report the path that was actually written.
print(f"COMPLETED : Graph Import File saved to {OUTPUT_CSV_PATH}")

Total count of 2 VW in the file


2it [00:00, 129.02it/s]

COMPLETED : Graph Import File saved to output_vw.csv





In [16]:
df_cat

Unnamed: 0,TABLE_FULL_NAME,REGEX_TYPE,VW_SCHEMA,VW_NAME,SYNTAX,VW_INDEX,SYNTAX_WORDCOUNT,TABLE_DATABASE,TABLE_SCHEMA,TABLE_NAME
0,PREPARE.SAMPLE_TB_6,SELECT_FROM,prepare,SAMPLE_TB_6_VW,SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER O...,0,42,,PREPARE,SAMPLE_TB_6
0,DATABASE1.SCHEMA_MALL.SAMPLE_TB_1,SELECT_FROM,schema_mall,SAMPLE_TB_1_temp_view,SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER O...,1,38,DATABASE1,SCHEMA_MALL,SAMPLE_TB_1
