In [1]:
####################################################################################
#
# Preprocessing_Migrationfiles.ipynb - script for pre-processing database migration files for the Roundcube project.
# Copyright (C) 2023  Sravani Namburi
#
# Preprocessing_Migrationfiles.ipynb program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# 
# Preprocessing_Migrationfiles.ipynb program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License along
# with Preprocessing_Migrationfiles.ipynb program; If not, see <https://www.gnu.org/licenses/>.
#
####################################################################################

In [2]:
import io
import os
import re
from pathlib import Path
import os.path
import networkx as nx
import shutil

In [3]:
#!/usr/bin/env python

# Extracting and transforming database schemas. #if already modified files in CT_Mod folder, remove and run this code.

def remove_sngl_comments(Text):
    Comments_lst = re.findall(r"--.*", Text)
    Comments_lst.sort(reverse=True)
    for Comment in Comments_lst:
        Text = Text.replace(Comment, "")
    return Text.strip()

def remove_mult_comments(Text):
    Pattern = r"/\*.*?\*/"
    return re.sub(Pattern, "", Text, flags=re.DOTALL)


def mod_query(Query):
    New_query = ""
    Lines = Query.split("\n")
    for Line in Lines:
        Line = Line.replace("\t", " ")
        Line = Line.replace("DEFAULT CHARSET=utf8mb4 DEFAULT COLLATE=utf8mb4_unicode_ci","DEFAULT CHARSET=utf8mb4 COLLATE utf8mb4_unicode_ci")
        New_query = "{} {}".format(New_query, Line)
    New_query = " ".join(New_query.split())
    New_query = New_query.replace(", )", ")").replace("( ", "(").replace(" )", ")")
    return New_query

def check_folder(): 
    cwd = os.getcwd()
    # Check if the destination folder already exists
    output_folder = os.path.join(cwd, 'Migrationfiles_Mod')
    if os.path.exists(output_folder):
        # If the folder exists, empty it before processing
        shutil.rmtree(output_folder)
        os.makedirs(output_folder)
    else:
        # If the folder doesn't exist, create it
        os.makedirs(output_folder)
        
def main():
    cwd = os.getcwd()
    
    check_folder()
    
    Migration_Files_Path = os.path.join(cwd, 'Migrationfiles')
    Files = os.scandir(Migration_Files_Path)
    Files_lst = list()

    for File in Files:
        if File.name.endswith(".sql"):
            SQL_dct = dict()
            SQL_dct["name"] = File.name
            SQL_dct["path"] = File.path
            Files_lst.append(SQL_dct)

    Files_dct_lst = sorted(Files_lst, key=lambda k: k["name"])

    for File_dct in Files_dct_lst:
        print(File_dct)
        with io.open(File_dct["path"], mode="r", encoding="utf-8") as f:
            Text = f.read()
            Queries = Text.split(";")
            Lines_mod = list()
            Lines = Text.split("\n")
            for Line in Lines:
                Line_mod = remove_sngl_comments(Line)
                Line_mod = remove_mult_comments(Line_mod)
                Lines_mod.append(Line_mod)

            Text = "\n".join(Lines_mod)
            Queries = Text.split(";")
            Queries_list = list()
            for Query in Queries:
                if "create table" in Query.lower():
                    Query_mod = mod_query(Query)
                    Queries_list.append(Query_mod)

        Filename_New = "{}/Migrationfiles_Mod/{}_mod.sql".format(os.getcwd(), os.path.splitext(File_dct["name"])[0])
        File_mod = io.open("{}".format(Filename_New), mode="w", encoding="utf-8")
        for Query in Queries_list:
            File_mod.write("{};\n\n".format(Query))
            
        File_mod.close()

        print("Extraction and transformation of database schemas completed!")


if __name__ == "__main__":
    main()

{'name': 'Roundcube_2007101000.sql', 'path': 'C:\\Users\\nambu\\Schema_Evolution\\MySQLDiff\\Database_Migration_Files\\Generate_Migration_Files\\Roundcube\\Migrationfiles\\Roundcube_2007101000.sql'}
Extraction and transformation of database schemas completed!
{'name': 'Roundcube_2008030300.sql', 'path': 'C:\\Users\\nambu\\Schema_Evolution\\MySQLDiff\\Database_Migration_Files\\Generate_Migration_Files\\Roundcube\\Migrationfiles\\Roundcube_2008030300.sql'}
Extraction and transformation of database schemas completed!
{'name': 'Roundcube_2008040500.sql', 'path': 'C:\\Users\\nambu\\Schema_Evolution\\MySQLDiff\\Database_Migration_Files\\Generate_Migration_Files\\Roundcube\\Migrationfiles\\Roundcube_2008040500.sql'}
Extraction and transformation of database schemas completed!
{'name': 'Roundcube_2008060900.sql', 'path': 'C:\\Users\\nambu\\Schema_Evolution\\MySQLDiff\\Database_Migration_Files\\Generate_Migration_Files\\Roundcube\\Migrationfiles\\Roundcube_2008060900.sql'}
Extraction and transf

### TOPOLOGICAL SORT

In [4]:
# Extract the table names and their dependencies from the SQL file.

MigrationFiles_Path = "{}/Migrationfiles_Mod".format(os.getcwd())
Tables_and_Dependencies_List = []
Create_Table_Pattern = re.compile(r'CREATE TABLE ([^\s\(]+)')
References_Pattern = re.compile(r'REFERENCES ([^\s\(]+)')

def extract_dependencies(Content):
    Dependencies = []
    References = re.findall(References_Pattern, Content)
    for reference in References:
        if reference not in Dependencies:
            Dependencies.append(reference)
    return Dependencies

for Filename in os.listdir(MigrationFiles_Path):
    if Filename.endswith('.sql'):
        File_Path = os.path.join(MigrationFiles_Path, Filename)
        with open(File_Path, 'r') as File:
            Content = File.read()
        
        Statements = Content.split(';')
        for statement in Statements:
            Table_Name = re.findall(Create_Table_Pattern, statement)

            if Table_Name:
                Table_Name = Table_Name[0]
                Dependencies = extract_dependencies(statement)
                Tables_and_Dependencies_List.append((Table_Name, Dependencies))

for Table, Dependency in Tables_and_Dependencies_List:
    print(f'Table: {Table}')
    print(f'Dependencies: {Dependency}')

Table: `cache`
Dependencies: []
Table: `contacts`
Dependencies: []
Table: `identities`
Dependencies: []
Table: `messages`
Dependencies: []
Table: `session`
Dependencies: []
Table: `users`
Dependencies: []
Table: `cache`
Dependencies: []
Table: `contacts`
Dependencies: []
Table: `identities`
Dependencies: []
Table: `messages`
Dependencies: []
Table: `session`
Dependencies: []
Table: `users`
Dependencies: []
Table: `cache`
Dependencies: []
Table: `contacts`
Dependencies: []
Table: `identities`
Dependencies: []
Table: `messages`
Dependencies: []
Table: `session`
Dependencies: []
Table: `users`
Dependencies: []
Table: `cache`
Dependencies: []
Table: `contacts`
Dependencies: []
Table: `identities`
Dependencies: []
Table: `messages`
Dependencies: []
Table: `session`
Dependencies: []
Table: `users`
Dependencies: []
Table: `cache`
Dependencies: []
Table: `contacts`
Dependencies: []
Table: `identities`
Dependencies: []
Table: `messages`
Dependencies: []
Table: `session`
Dependencies: []
Table: 

In [5]:
!pip install networkx




[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





In [6]:
# count of dependency tables
MigrationFiles_Path = "{}/Migrationfiles_Mod".format(os.getcwd())
Tables_with_Dependencies = set()
Create_Table_Pattern = re.compile(r'CREATE TABLE ([^\s\(]+)')
References_Pattern = re.compile(r'REFERENCES ([^\s\(]+)')

def extract_dependencies(Content):
    References = re.findall(References_Pattern, Content)
    for reference in References:
        Tables_with_Dependencies.add(reference)

for Filename in os.listdir(MigrationFiles_Path):
    if Filename.endswith('.sql'):
        File_Path = os.path.join(MigrationFiles_Path, Filename)
        with open(File_Path, 'r') as file:
            Content = file.read()
        
        Statements = Content.split(';')
        for statement in Statements:
            Table_Name = re.findall(Create_Table_Pattern, statement)
            
            if Table_Name:
                Table_Name = Table_Name[0]
                extract_dependencies(statement)

Total_Dependecy_Table_Count = len(Tables_with_Dependencies)
print(f'Total Count Dependency Tables: {Total_Dependecy_Table_Count}')
print(Tables_with_Dependencies)

Total Count Dependency Tables: 3
{'`users`', '`contactgroups`', '`contacts`'}


In [7]:
# Represent the tables and their dependencies as a directed graph.

Graph = nx.DiGraph()
for Table, _ in Tables_and_Dependencies_List:
    Graph.add_node(Table)

for Table, Dependencies in Tables_and_Dependencies_List:
    for dependency in Dependencies:
        Graph.add_edge(dependency, Table)

Sorted_Tables = list(nx.topological_sort(Graph))
print("Count of sorted tables:", len(Sorted_Tables))
for Table in Sorted_Tables:
    print(Table)

Count of sorted tables: 19
`cache`
`contacts`
`identities`
`messages`
`session`
`users`
`system`
`cache_shared`
`uploads`
`contactgroups`
`cache_index`
`cache_messages`
`cache_thread`
`dictionary`
`searches`
`filestore`
`collected_addresses`
`responses`
`contactgroupmembers`


In [8]:
# Perform topological sorting on the graph to obtain the correct order of tables.Output the tables in the sorted order.
#if already files in folder, remove and run this code.

MigrationFiles_Path = "{}/Migrationfiles_Mod".format(os.getcwd())

for Filename in os.listdir(MigrationFiles_Path):
    if Filename.endswith('.sql'):
        File_Path = os.path.join(MigrationFiles_Path, Filename)
        with open(File_Path, 'r') as file:
            Content = file.read()
        
        Mod_Text = ""
        for Table in Sorted_Tables:
            Table_Creation_Statement = re.search(r'CREATE TABLE {}(.+?);'.format(Table), Content, re.DOTALL)
            if Table_Creation_Statement:
                Mod_Text += Table_Creation_Statement.group(0) + "\n"
        with open(File_Path, 'w') as file:
            file.write(Mod_Text)

for Filename in os.listdir(MigrationFiles_Path):
    if Filename.endswith('.sql'):
        File_Path = os.path.join(MigrationFiles_Path, Filename)
        New_Filename = Filename.replace('.sql', '_sort.sql')
        New_FilePath = os.path.join(MigrationFiles_Path, New_Filename)
        os.rename(File_Path, New_FilePath)      

### Copy processed files to the Docker_Mysqldiff folder.

In [9]:
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
parent_parent_dir = os.path.dirname(parent_dir)

source_dir = os.path.join(cwd, "Migrationfiles_Mod")
destination_dir = os.path.join(parent_parent_dir, "Docker_Mysqldiff", "Roundcube", "Input")

if os.path.exists(destination_dir):
    shutil.rmtree(destination_dir)

shutil.copytree(source_dir, destination_dir)

print("Migration files copied to Docker_Mysqldiff folder successfully.")


Migration files copied to Docker_Mysqldiff folder successfully.


### Copy processed files to the Dataset folder.

In [10]:
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
pp_dir = os.path.dirname(parent_dir)
ppp_dir = os.path.dirname(pp_dir)
pppp_dir = os.path.dirname(ppp_dir)


source_dir = os.path.join(cwd, "Migrationfiles_Mod")
destination_dir = os.path.join(pppp_dir, "Datasets", "Roundcube", "Input", "Migration_Files")

if os.path.exists(destination_dir):
    shutil.rmtree(destination_dir)

shutil.copytree(source_dir, destination_dir)

print("Migration files copied to Dataset folder successfully.")

Migration files copied to Dataset folder successfully.
