In [1]:
####################################################################################
#
# Preprocessing_Migrationfiles.ipynb - script for extracting and transforming database schemas for the Weberp project.
# Copyright (C) 2023  Sravani Namburi
#
# Preprocessing_Migrationfiles.ipynb program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# 
# Preprocessing_Migrationfiles.ipynb program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License along
# with Preprocessing_Migrationfiles.ipynb program; If not, see <https://www.gnu.org/licenses/>.
#
####################################################################################

In [2]:
import io
import os
import re
from pathlib import Path
import os.path
import networkx as nx
import shutil

In [3]:
#!/usr/bin/env python

# Extracting and transforming database schemas. 
def remove_sngl_comments(Text):
    """Strip single-line SQL comments (``-- ...`` and ``# ...``) from ``Text``.

    On every line, everything from the first ``--`` or ``#`` up to the end of
    that line is removed. The result is then stripped of leading and trailing
    whitespace.

    The removal is done with a single regex substitution. Removing the matched
    comment strings one by one with ``str.replace`` can leave part of a comment
    behind on multi-line input (for example ``"a --y\\nb #x --y"``), because a
    shorter comment may be cut out of a longer one before the longer one is
    processed.

    NOTE: the pattern is not quote-aware, so a ``#`` or ``--`` inside a quoted
    SQL value (e.g. ``DEFAULT '#'``) is also treated as the start of a comment.

    Args:
        Text: SQL text; may be a single line or several lines.

    Returns:
        ``Text`` without single-line comments, stripped of surrounding whitespace.
    """
    return re.sub(r"--.*|#.*", "", Text).strip()

def remove_mult_comments(Text):
    """Remove ``/* ... */`` comments from ``Text``.

    A comment that is opened and closed inside ``Text`` is cut out and the text
    after the closing ``*/`` is kept, so SQL that follows an inline comment
    (including a terminating ``;``) is not lost. If a ``/*`` has no matching
    ``*/`` in ``Text``, everything from that ``/*`` onwards is dropped.

    NOTE: the function keeps no state between calls. main() calls it once per
    line, so continuation lines of a comment opened on an earlier line are not
    recognised as comment text here.

    Args:
        Text: SQL text, typically a single line.

    Returns:
        ``Text`` with block comments removed.
    """
    kept_parts = []
    position = 0
    while True:
        comment_start = Text.find("/*", position)
        if comment_start == -1:
            # No further comment: keep the remainder unchanged.
            kept_parts.append(Text[position:])
            break
        kept_parts.append(Text[position:comment_start])
        # Search past the opener so that "/*/" is not read as a closed comment.
        comment_end = Text.find("*/", comment_start + 2)
        if comment_end == -1:
            # Unclosed comment: discard the rest of the text.
            break
        position = comment_end + 2
    return "".join(kept_parts)


def mod_query(Query):
    """Normalise one SQL statement into a single, whitespace-collapsed line.

    Each line of ``Query`` goes through a fixed sequence of literal
    substitutions (applied in the order listed below), the lines are joined,
    runs of whitespace are collapsed to single spaces, and finally the spacing
    around parentheses is tidied.

    Args:
        Query: SQL statement text, possibly spanning several lines.

    Returns:
        The rewritten statement as a single line without a trailing ``;``.
    """
    # Order matters: tabs become spaces before the multi-word patterns are tried.
    line_substitutions = (
        ("\t", " "),
        ("IF NOT EXISTS", ""),
        # Re-appends "#'," after a truncated href default. Presumably the '#'
        # was removed earlier by the single-line comment stripping -- TODO confirm.
        ("`href` varchar(200) NOT NULL DEFAULT '", "`href` varchar(200) NOT NULL DEFAULT '#',"),
        ("TYPE=InnoDB", "ENGINE=InnoDB"),
        ("TYPE=MyISAM", "ENGINE=MyISAM"),
        ("NULLLL", "NULL"),
        ("''''", ""),
    )
    paren_substitutions = ((", )", ")"), ("( ", "("), (" )", ")"))

    rewritten_lines = []
    for current in Query.split("\n"):
        for old, new in line_substitutions:
            current = current.replace(old, new)
        rewritten_lines.append(current)

    # Joining with spaces and re-splitting collapses every whitespace run.
    flattened = " ".join(" ".join(rewritten_lines).split())
    for old, new in paren_substitutions:
        flattened = flattened.replace(old, new)
    return flattened

def check_folder():
    """Ensure an empty ``Migrationfiles_Mod`` folder exists in the working directory.

    An existing folder is deleted together with its contents before a fresh,
    empty one is created.
    """
    target_dir = os.path.join(os.getcwd(), 'Migrationfiles_Mod')
    # Drop output left over from a previous run.
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    os.makedirs(target_dir)

def main():
    """Extract the CREATE TABLE statements of every ``.sql`` file in ``Migrationfiles``.

    For each ``.sql`` file in ``<cwd>/Migrationfiles`` (processed in name
    order) the function:

    1. strips comments line by line with remove_sngl_comments() and
       remove_mult_comments();
    2. splits the remaining text on ``;`` and keeps the statements that
       contain ``create table`` (case-insensitive);
    3. normalises each kept statement with mod_query();
    4. writes the result to ``<cwd>/Migrationfiles_Mod/<name>_mod.sql``, one
       statement per paragraph, each terminated by ``;``.

    The output folder is emptied/recreated first via check_folder().

    NOTE: statements are split on every ``;``, including any that occur inside
    quoted values.
    """
    cwd = os.getcwd()

    check_folder()

    Migration_Files_Path = os.path.join(cwd, 'Migrationfiles')
    Output_Path = os.path.join(cwd, 'Migrationfiles_Mod')

    # The context manager releases the directory handle even if listing fails.
    with os.scandir(Migration_Files_Path) as Files:
        Files_lst = [
            {"name": File.name, "path": File.path}
            for File in Files
            if File.name.endswith(".sql")
        ]

    Files_dct_lst = sorted(Files_lst, key=lambda k: k["name"])

    for File_dct in Files_dct_lst:
        print(File_dct)
        with io.open(File_dct["path"], mode="r", encoding="utf-8") as f:
            Text = f.read()

        # Comments are removed per line before the text is split into statements.
        Lines_mod = [
            remove_mult_comments(remove_sngl_comments(Line))
            for Line in Text.split("\n")
        ]
        Text = "\n".join(Lines_mod)

        Queries_list = [
            mod_query(Query)
            for Query in Text.split(";")
            if "create table" in Query.lower()
        ]

        Filename_mod = os.path.join(
            Output_Path,
            "{}_mod.sql".format(os.path.splitext(File_dct["name"])[0]),
        )
        # "with" guarantees the output file is closed even if a write fails.
        with io.open(Filename_mod, mode="w", encoding="utf-8") as File_mod:
            for Query in Queries_list:
                File_mod.write("{};\n\n".format(Query))

        print("Extraction and transformation of database schemas completed!")


# Run the extraction when this cell is executed (or when the code is run as a script);
# the guard keeps main() from running if the code is imported as a module.
if __name__ == "__main__":
    main()

{'name': 'Weberp-3.0-demo.sql', 'path': 'C:\\Users\\nambu\\Schema_Evolution\\MySQLDiff\\Database_Migration_Files\\Generate_Migration_Files\\Weberp\\Migrationfiles\\Weberp-3.0-demo.sql'}
Extraction and transformation of database schemas completed!
{'name': 'Weberp-3.01-demo.sql', 'path': 'C:\\Users\\nambu\\Schema_Evolution\\MySQLDiff\\Database_Migration_Files\\Generate_Migration_Files\\Weberp\\Migrationfiles\\Weberp-3.01-demo.sql'}
Extraction and transformation of database schemas completed!
{'name': 'Weberp-3.02-demo.sql', 'path': 'C:\\Users\\nambu\\Schema_Evolution\\MySQLDiff\\Database_Migration_Files\\Generate_Migration_Files\\Weberp\\Migrationfiles\\Weberp-3.02-demo.sql'}
Extraction and transformation of database schemas completed!
{'name': 'Weberp-3.03-demo.sql', 'path': 'C:\\Users\\nambu\\Schema_Evolution\\MySQLDiff\\Database_Migration_Files\\Generate_Migration_Files\\Weberp\\Migrationfiles\\Weberp-3.03-demo.sql'}
Extraction and transformation of database schemas completed!
{'name

Extraction and transformation of database schemas completed!
{'name': 'Weberp-4.15.1-demo.sql', 'path': 'C:\\Users\\nambu\\Schema_Evolution\\MySQLDiff\\Database_Migration_Files\\Generate_Migration_Files\\Weberp\\Migrationfiles\\Weberp-4.15.1-demo.sql'}
Extraction and transformation of database schemas completed!
{'name': 'Weberp-4.15.2-demo.sql', 'path': 'C:\\Users\\nambu\\Schema_Evolution\\MySQLDiff\\Database_Migration_Files\\Generate_Migration_Files\\Weberp\\Migrationfiles\\Weberp-4.15.2-demo.sql'}
Extraction and transformation of database schemas completed!


### Topological Sort

In [4]:
# Extract the table names and their dependencies from the processed SQL files.
# Result: Tables_and_Dependencies_List, a list of (table name, [referenced table names])
# tuples with one entry per CREATE TABLE statement found in each file.

MigrationFiles_Path = "{}/Migrationfiles_Mod".format(os.getcwd())

Tables_and_Dependencies_List = []
# Both patterns are case-sensitive and capture the name exactly as written,
# including any surrounding backticks.
Create_Table = re.compile(r'CREATE TABLE ([^\s\(]+)')
References_pattern = re.compile(r'REFERENCES ([^\s\(]+)')

def extract_dependencies(Content):
    """Return the distinct table names that follow ``REFERENCES`` in ``Content``.

    Names are returned in order of first appearance.
    """
    # dict.fromkeys drops duplicates while preserving first-seen order.
    return list(dict.fromkeys(re.findall(References_pattern, Content)))

# Sorted so the processing order does not depend on the unspecified order of os.listdir().
for Filename in sorted(os.listdir(MigrationFiles_Path)):
    if Filename.endswith('.sql'):
        File_Path = os.path.join(MigrationFiles_Path, Filename)
        # The files are written as UTF-8 by the extraction step, so read them
        # as UTF-8 instead of relying on the platform's default encoding.
        with open(File_Path, 'r', encoding='utf-8') as File:
            Content = File.read()

        Statements = Content.split(';')
        for statement in Statements:
            Table_Name = re.findall(Create_Table, statement)

            if Table_Name:
                Table_Name = Table_Name[0]
                Dependencies = extract_dependencies(statement)
                Tables_and_Dependencies_List.append((Table_Name, Dependencies))

for Table, Dependency in Tables_and_Dependencies_List:
    print(f'Table: {Table}')
    print(f'Dependencies: {Dependency}')

Table: `accountgroups`
Dependencies: []
Table: `accountsection`
Dependencies: []
Table: `areas`
Dependencies: []
Table: `bankaccounts`
Dependencies: []
Table: `banktrans`
Dependencies: []
Table: `bom`
Dependencies: []
Table: `buckets`
Dependencies: []
Table: `chartdetails`
Dependencies: []
Table: `chartmaster`
Dependencies: []
Table: `cogsglpostings`
Dependencies: []
Table: `companies`
Dependencies: []
Table: `config`
Dependencies: []
Table: `contractbom`
Dependencies: []
Table: `contractreqts`
Dependencies: []
Table: `contracts`
Dependencies: []
Table: `currencies`
Dependencies: []
Table: `custallocns`
Dependencies: []
Table: `custbranch`
Dependencies: []
Table: `debtorsmaster`
Dependencies: []
Table: `debtortrans`
Dependencies: []
Table: `discountmatrix`
Dependencies: []
Table: `edi_orders_seg_groups`
Dependencies: []
Table: `edi_orders_segs`
Dependencies: []
Table: `ediitemmapping`
Dependencies: []
Table: `edimessageformat`
Dependencies: []
Table: `freightcosts`
Dependencies: []
Tab

In [5]:
!pip install networkx




[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# Count the distinct tables that are the target of a REFERENCES clause in any
# CREATE TABLE statement of the processed files.

MigrationFiles_Path = "{}/Migrationfiles_Mod".format(os.getcwd())
Tables_with_Dependencies = set()
Create_Table_Pattern = re.compile(r'CREATE TABLE ([^\s\(]+)')
References_Pattern = re.compile(r'REFERENCES ([^\s\(]+)')

# The references are collected inline rather than through a second function named
# extract_dependencies: redefining that name here would silently replace the
# list-returning extract_dependencies defined in the dependency-extraction cell.
for Filename in os.listdir(MigrationFiles_Path):
    if Filename.endswith('.sql'):
        File_Path = os.path.join(MigrationFiles_Path, Filename)
        # The files are written as UTF-8 by the extraction step; read them the same way.
        with open(File_Path, 'r', encoding='utf-8') as file:
            Content = file.read()

        Statements = Content.split(';')
        for statement in Statements:
            # Only statements that create a table contribute references.
            if re.search(Create_Table_Pattern, statement):
                Tables_with_Dependencies.update(re.findall(References_Pattern, statement))

Total_Dependecy_Table_Count = len(Tables_with_Dependencies)
print(f'Total Count Dependency Tables: {Total_Dependecy_Table_Count}')
print(Tables_with_Dependencies)

Total Count Dependency Tables: 4
{'`pickreq`', '`mrpdemandtypes`', '`woitems`', '`salesorders`'}


In [7]:
# Represent the tables and their dependencies as a directed graph and order it topologically.
# An edge "referenced table -> referencing table" places a referenced table before the
# tables that reference it in Sorted_Tables.

Graph = nx.DiGraph()
# Register every created table first so tables without any reference are part of the ordering.
for Table, _ in Tables_and_Dependencies_List:
    Graph.add_node(Table)

for Table, Dependencies in Tables_and_Dependencies_List:
    for dependency in Dependencies:
        # A table that references itself does not constrain the creation order, but the
        # resulting self-loop would make topological_sort raise NetworkXUnfeasible.
        if dependency != Table:
            Graph.add_edge(dependency, Table)

Sorted_Tables = list(nx.topological_sort(Graph))
print("Count of sorted tables:", len(Sorted_Tables))
for Table in Sorted_Tables:
    print(Table)

Count of sorted tables: 163
`accountgroups`
`accountsection`
`areas`
`bankaccounts`
`banktrans`
`bom`
`buckets`
`chartdetails`
`chartmaster`
`cogsglpostings`
`companies`
`config`
`contractbom`
`contractreqts`
`contracts`
`currencies`
`custallocns`
`custbranch`
`debtorsmaster`
`debtortrans`
`discountmatrix`
`edi_orders_seg_groups`
`edi_orders_segs`
`ediitemmapping`
`edimessageformat`
`freightcosts`
`gltrans`
`grns`
`holdreasons`
`lastcostrollup`
`locations`
`locstock`
`loctransfers`
`orderdeliverydifferenceslog`
`paymentmethods`
`paymentterms`
`periods`
`prices`
`purchdata`
`purchorderdetails`
`purchorders`
`recurringsalesorders`
`recurrsalesorderdetails`
`reportcolumns`
`reportheaders`
`salesanalysis`
`salescat`
`salescatprod`
`salesglpostings`
`salesman`
`salesorderdetails`
`salesorders`
`salestypes`
`scripts`
`securitygroups`
`securityroles`
`securitytokens`
`shipmentcharges`
`shipments`
`shippers`
`stockcategory`
`stockcheckfreeze`
`stockcounts`
`stockmaster`
`stockmoves`
`stockseri

In [8]:
# Rewrite every processed schema file so that its CREATE TABLE statements follow the
# topological order in Sorted_Tables, then mark the file as sorted with a "_sort.sql" suffix.
# The cell can be re-run without clearing the folder: a file that already carries the
# suffix is rewritten in place but not renamed again.

MigrationFiles_Path = "{}/Migrationfiles_Mod".format(os.getcwd())
Sorted_Suffix = '_sort.sql'

# sorted() takes a snapshot of the listing, so renaming files inside the loop is safe.
for Filename in sorted(os.listdir(MigrationFiles_Path)):
    if not Filename.endswith('.sql'):
        continue

    File_Path = os.path.join(MigrationFiles_Path, Filename)
    # The files are written as UTF-8 by the extraction step; read and write them the same way.
    with open(File_Path, 'r', encoding='utf-8') as file:
        Content = file.read()

    Mod_Content = ""
    for Table in Sorted_Tables:
        # re.escape keeps characters of the table name from being interpreted as regex syntax.
        Table_Creation_Statement = re.search(
            r'CREATE TABLE {}(.+?);'.format(re.escape(Table)), Content, re.DOTALL
        )
        if Table_Creation_Statement:
            Mod_Content += Table_Creation_Statement.group(0) + "\n"

    with open(File_Path, 'w', encoding='utf-8') as file:
        file.write(Mod_Content)

    if not Filename.endswith(Sorted_Suffix):
        # Swap only the trailing ".sql" so a ".sql" elsewhere in the name is left alone.
        New_Filename = Filename[:-len('.sql')] + Sorted_Suffix
        New_FilePath = os.path.join(MigrationFiles_Path, New_Filename)
        # os.replace overwrites a leftover target on every platform.
        os.replace(File_Path, New_FilePath)

### Copy processed files to the Docker_Mysqldiff folder.

In [9]:
# Copy the processed files to <two levels above cwd>/Docker_Mysqldiff/Weberp/Input,
# replacing whatever that folder contained before.
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
parent_parent_dir = os.path.dirname(parent_dir)

source_dir = os.path.join(cwd, "Migrationfiles_Mod")
destination_dir = os.path.join(parent_parent_dir, "Docker_Mysqldiff", "Weberp", "Input")

# Check the source before touching the destination: otherwise a missing source folder
# would leave the previous copy deleted with nothing put in its place.
if not os.path.isdir(source_dir):
    raise FileNotFoundError("Source folder not found: {}".format(source_dir))

# copytree requires that the destination does not exist yet.
if os.path.exists(destination_dir):
    shutil.rmtree(destination_dir)

shutil.copytree(source_dir, destination_dir)

print("Migration files copied to Docker_Mysqldiff folder successfully.")


Migration files copied to Docker_Mysqldiff folder successfully.


### Copy processed files to the Dataset folder.

In [10]:
# Copy the processed files to <four levels above cwd>/Datasets/Weberp/Input/Migration_Files,
# replacing whatever that folder contained before.
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
pp_dir = os.path.dirname(parent_dir)
ppp_dir = os.path.dirname(pp_dir)
pppp_dir = os.path.dirname(ppp_dir)


source_dir = os.path.join(cwd, "Migrationfiles_Mod")
destination_dir = os.path.join(pppp_dir, "Datasets", "Weberp", "Input", "Migration_Files")

# Check the source before touching the destination: otherwise a missing source folder
# would leave the previous copy deleted with nothing put in its place.
if not os.path.isdir(source_dir):
    raise FileNotFoundError("Source folder not found: {}".format(source_dir))

# copytree requires that the destination does not exist yet.
if os.path.exists(destination_dir):
    shutil.rmtree(destination_dir)

shutil.copytree(source_dir, destination_dir)

print("Migration files copied to Dataset folder successfully.")

Migration files copied to Dataset folder successfully.
