In [13]:
import pandas as pd
import numpy as np
from tabulate import tabulate

In [1]:
# Relation class
class Relation:
    """A named table: column names, row data, keys and attached dependencies."""

    def __init__(self, colnames, data, prim_key):
        """
        Build an unnamed relation.

        colnames -- list of column names
        data     -- list of rows (each row a list of cell values)
        prim_key -- list of column names forming the primary key

        The name starts out empty; foreign keys and dependencies start out as
        empty lists and are filled in through the adder methods below.
        """
        self.name = ""
        self.colnames = colnames
        self.data = data
        self.prim_key = prim_key
        self.key_foreign = []
        self.dependency = []

    def add_table_name(self, name):
        # Assign (or replace) the table name used by show() and table_formation().
        self.name = name

    def foreignkey_addition(self, key_foreign):
        # Record one foreign key; table_formation() unpacks each entry as
        # (column, referenced_table, referenced_column).
        self.key_foreign.append(key_foreign)

    def dependency_adding(self, dependency):
        # Attach one Dependency instance to this relation.
        self.dependency.append(dependency)

    def show(self):
        # Print the name, a tabulated view of the rows, the key and every dependency.
        print(self.name)
        print(tabulate(self.data, headers=self.colnames, tablefmt='pretty'))
        print("Primary Key:", self.prim_key)
        print("Functional dependencies for tables are:")
        for dep in self.dependency:
            # Anything that is not explicitly an FD is reported as an MVD.
            label = "MVD" if dep.dependency_type != "FD" else "FD"
            print(f"  LHS: {dep.lhs} -> RHS: {dep.rhs} (Type: {label})")

    def table_formation(self):
        # Build the SQL CREATE TABLE statement for this relation.
        # Raises ValueError while the relation has no name.
        if not self.name:
            raise ValueError("Relation is not there, please use add method")

        # Every column is emitted as VARCHAR(255), followed by the key clauses.
        clauses = [f"    {column} VARCHAR(255)" for column in self.colnames]
        clauses.append(f"    PRIMARY KEY ({', '.join(self.prim_key)})")
        clauses.extend(
            f"    FOREIGN KEY ({col}) REFERENCES {reference_tab}({reference_col})"
            for col, reference_tab, reference_col in self.key_foreign
        )

        body = ",\n".join(clauses)
        return f"CREATE TABLE {self.name} (\n{body}\n);"

    

In [2]:
class Dependency:
    """A functional ('->') or multivalued ('-->>') dependency parsed from text."""

    def __init__(self, depen_string, key_candid):
        """
        depen_string -- text such as 'A, B -> C' or 'A -->> B'
        key_candid   -- list of candidate keys (each a list of attribute names)

        Parsing happens immediately; a string without an arrow raises ValueError.
        """
        self.depen_string = depen_string.strip()
        self.key_candid = key_candid
        self.lhs = []
        self.rhs = []
        self.dependency_type = ""
        self.depen_parsing()

    def depen_parsing(self):
        # Split the text on its arrow and fill lhs, rhs and dependency_type.
        text = self.depen_string
        # '-->>' is tried first because it contains '->' as a substring.
        for arrow, kind in (('-->>', 'MVD'), ('->', 'FD')):
            if arrow in text:
                lhs_part, rhs_part = text.split(arrow)
                self.dependency_type = kind
                break
        else:
            raise ValueError("Dependency format is Invalid")

        self.lhs = [name.strip() for name in lhs_part.split(',')]
        self.rhs = [name.strip() for name in rhs_part.split(',')]

    def check_depend_paren(self):
        # True when the LHS is a proper subset of at least one candidate key.
        determinant = set(self.lhs)
        return any(determinant < set(candidate) for candidate in self.key_candid)

    def __str__(self):
        return f"LHS: {self.lhs}, RHS: {self.rhs}, Type: {self.dependency_type}, Partial: {self.check_depend_paren()}"

In [3]:
def is_relation_1nf(rltn):
    """
    Heuristic 1NF check: a string cell containing a comma counts as multi-valued.

    Returns a pair (is_in_1nf, offending_columns) where offending_columns lists
    the names of the columns that hold at least one such cell.
    """
    offending = {
        rltn.colnames[position]
        for record in rltn.data
        for position, cell in enumerate(record)
        if isinstance(cell, str) and ',' in cell
    }
    return not offending, list(offending)


def normalize_1nf_relation(rltn, attr_multi_val):
    """
    Split the multi-valued attributes of a relation into their own tables.

    rltn           -- the Relation to normalise
    attr_multi_val -- names of the attributes that hold comma-separated values

    Returns a dict: "BaseRelation" maps to the relation without the multi-valued
    columns, and each multi-valued attribute present in the relation maps to a
    table of (primary key..., single value) rows whose key columns reference
    the base relation.
    """
    columns = rltn.colnames
    key_positions = [columns.index(key) for key in rltn.prim_key]

    # Only attributes that really exist in the relation get their own table.
    multi_positions = {}
    for attribute in attr_multi_val:
        if attribute in columns:
            multi_positions[attribute] = columns.index(attribute)

    base_columns = [column for column in columns if column not in attr_multi_val]
    base_rows = []
    for record in rltn.data:
        base_rows.append(
            [record[pos] for pos, column in enumerate(columns) if column not in attr_multi_val]
        )

    base_table = Relation(base_columns, base_rows, rltn.prim_key)
    base_table.add_table_name("BaseRelation")

    split_tables = {}
    for attribute, position in multi_positions.items():
        # One output row per comma-separated value, prefixed with the key values.
        exploded_rows = [
            [record[k] for k in key_positions] + [piece.strip()]
            for record in rltn.data
            for piece in str(record[position]).split(',')
        ]

        table = Relation(rltn.prim_key + [attribute], exploded_rows, rltn.prim_key + [attribute])
        table.add_table_name(attribute)

        for key in rltn.prim_key:
            table.foreignkey_addition((key, base_table.name, key))

        split_tables[attribute] = table

    result = {"BaseRelation": base_table}
    result.update(split_tables)
    return result


In [4]:
def if_relation_in_2nf(relation):
    """
    Check whether a relation satisfies 2NF with respect to its primary key.

    A relation fails when it is not in 1NF, or when some dependency has a LHS
    that is a proper subset of the primary key and a RHS containing at least one
    non-prime attribute (a partial dependency). The reason is printed.

    Returns True when the relation is in 2NF, False otherwise.
    """
    # is_relation_1nf returns a (flag, offending_columns) tuple. A non-empty
    # tuple is always truthy, so the flag must be unpacked before testing it.
    in_1nf, _ = is_relation_1nf(relation)
    if not in_1nf:
        print(f"Relation '{relation.name}' is not in 1NF, so it doesnot satisfy 2NF.")
        return False

    prim_key = relation.prim_key

    # If the primary key is a single attribute, it cannot have partial dependency
    if len(prim_key) == 1:
        return True

    key_attrs = set(prim_key)
    attr_non_prime = [a for a in relation.colnames if a not in prim_key]

    for dependency in relation.dependency:
        lhs_set = set(dependency.lhs)

        # Partial dependency: LHS is a proper subset of the key ...
        if lhs_set < key_attrs:
            # ... and it determines at least one non-prime attribute.
            if any(a in attr_non_prime for a in dependency.rhs):
                print(f"Relation '{relation.name}' contains partial dependency: {dependency.lhs} -> {dependency.rhs}")
                return False

    return True

def check_all_relations_2nf(rltn):
    """
    Return True only when every relation in the name -> Relation mapping is in 2NF.

    Checking stops at the first relation that fails.
    """
    return all(if_relation_in_2nf(rel) for rel in rltn.values())


def decompose_to_2nf(relations):
    """
    Decompose relations that contain partial dependencies.

    relations -- dict mapping table name -> Relation

    For each relation, every partial dependency gets its own new table named
    "<table>_Partial_<n>", and the original table is rebuilt without the RHS
    attributes of those dependencies. Relations without partial dependencies
    are passed through unchanged.

    Returns a new dict mapping table name -> Relation.
    """
    new_tableso = {}  # To store the new normalized relations

    for table_name, relation in relations.items():
        prim_key = set(relation.prim_key)
        dependency_partial = []
        dependency_fullfunctional = []
        dependency_trans = []
        dependency_mvd = []

        # Separate partial dependency, full dependency, and potential transitive dependency
        # NOTE(review): an FD whose LHS equals the whole primary key matches none
        # of the three branches below, so it lands in no bucket and is not
        # carried over to the rebuilt relation -- confirm this is intended.
        for d in relation.dependency:
            lhs_set = set(d.lhs)
            if d.dependency_type == 'FD' and lhs_set.issubset(prim_key) and lhs_set != prim_key:
                  # check_depend_paren() tests the LHS against d.key_candid,
                  # which may differ from this relation's primary key.
                  if d.check_depend_paren():
                        dependency_partial.append(d)
                  else:
                        dependency_fullfunctional.append(d)
            elif not lhs_set.issubset(prim_key):
                  # dependency where LHS is not a subset of primary key might be transitive
                  # (this branch also catches MVDs whose LHS is outside the key)
                  dependency_trans.append(d)
            elif d.dependency_type == 'MVD':
                  dependency_mvd.append(d)


        # If there are partial dependency, decompose the relation
        if dependency_partial:
            # Create a new relation for each partial dependency
            for depend_par in dependency_partial:
                # The numeric suffix is based on how many tables have been produced so far.
                new_tab_name = f"{table_name}_Partial_{len(new_tableso) + 1}"
                # list(set(...)) removes duplicates; the resulting column order is
                # whatever the set iteration yields.
                new_record_names = list(set(depend_par.lhs + depend_par.rhs))
                new_tab_d = []
                new_prim_key = depend_par.lhs  # Use the LHS as the new primary key
                prim_key_in_rhs = any(attr in prim_key for attr in depend_par.rhs)
                if prim_key_in_rhs:
                    # The RHS contains an original key attribute: key the new table on LHS + RHS.
                    new_prim_key = list(set(depend_par.lhs + depend_par.rhs))
                else:
                    new_prim_key = depend_par.lhs

                # Project the rows onto the new columns, keeping the first occurrence of each tuple.
                record_exist = set()
                for row in relation.data:
                    new_records = [row[relation.colnames.index(attr)] for attr in new_record_names]
                    if tuple(new_records) not in record_exist:
                        new_tab_d.append(new_records)
                        record_exist.add(tuple(new_records))

                new_tables = Relation(new_record_names, new_tab_d, new_prim_key)
                new_tables.add_table_name(new_tab_name)

                # Re-create the dependency so its candidate key is the new table's key.
                depen_new = Dependency(f"{','.join(depend_par.lhs)} -> {','.join(depend_par.rhs)}", [new_prim_key])
                new_tables.dependency_adding(depen_new)

                # Original key attributes present in the new table but outside its key
                # become foreign keys back to the original table.
                for pk in relation.prim_key:
                    if pk in new_record_names and pk not in new_prim_key:
                        new_tables.foreignkey_addition((pk, table_name, pk))

                # Carry along any "transitive" dependency fully contained in the new table.
                for depen_trans in dependency_trans:
                    if all(a in new_record_names for a in depen_trans.lhs + depen_trans.rhs):
                        new_tables.dependency_adding(depen_trans)

                new_tableso[new_tab_name] = new_tables

            # Create the original table with remaining attributes after decomposition
            # (all columns minus the RHS attributes of every partial dependency)
            fields_remain = list(set(relation.colnames) - set(sum([dep.rhs for dep in dependency_partial], [])))
            data_remain = []

            # Update the primary key for the remaining relation
            prim_key_remain = list(set(prim_key).intersection(fields_remain))
            if not prim_key_remain:
                # If none of the original primary key attributes remain, use the original primary key
                prim_key_remain = relation.prim_key

            # Update the remaining dependency, including transitive ones
            depen_remain = []
            for d in dependency_fullfunctional + dependency_trans + dependency_mvd:
                if all(attr in fields_remain for attr in d.lhs + d.rhs):
                    # Ensure candidate keys are properly updated and add to the remaining dependency
                    # (this mutates the Dependency object shared with the input relation)
                    d.key_candid = [prim_key_remain]
                    depen_remain.append(d)

            # Project the rows onto the remaining columns, dropping duplicate tuples.
            record_exist = set()
            for r in relation.data:
                new_records = [r[relation.colnames.index(a)] for a in fields_remain]
                if tuple(new_records) not in record_exist:
                    data_remain.append(new_records)
                    record_exist.add(tuple(new_records))

            new_tables = Relation(fields_remain, data_remain, prim_key_remain)
            new_tables.add_table_name(table_name)
            new_tables.dependency = depen_remain

            # NOTE(review): this scans every table produced so far, including those
            # created for relations handled earlier in the outer loop.
            for rltn_par_name, par_rel in new_tableso.items():
                for pk in par_rel.prim_key:
                    if pk in fields_remain and pk not in prim_key_remain:
                        new_tables.foreignkey_addition((pk, rltn_par_name, pk))

            new_tableso[table_name] = new_tables

        else:
            # No partial dependency: keep the relation object as it is.
            new_tableso[table_name] = relation

    return new_tableso


In [5]:
def if_relation_in_3nf(relation):
    """
    Check whether a relation satisfies 3NF, treating the primary key as the
    only candidate key.

    A dependency is reported as transitive when its LHS does not contain the
    whole primary key, its RHS holds no key attribute, and its RHS holds at
    least one non-prime column. The reason for a failure is printed.
    """
    if not if_relation_in_2nf(relation):
        print(f"Relation '{relation.name}' is not in 2NF, so the relation won't be in 3NF.")
        return False

    key_attrs = set(relation.prim_key)
    non_prime = [column for column in relation.colnames if column not in key_attrs]

    for d in relation.dependency:
        determinant = set(d.lhs)
        dependents = set(d.rhs)

        # A LHS that covers the key can never be transitive.
        if determinant.issuperset(key_attrs):
            continue
        # A RHS touching the key is tolerated by this check.
        if not dependents.isdisjoint(key_attrs):
            continue
        # Only a RHS that reaches a non-prime column is a violation.
        if dependents.isdisjoint(non_prime):
            continue

        print(f"Relation '{relation.name}' has a transitive dependency: {d.lhs} -> {d.rhs}")
        return False

    return True

def relations_in_3nf(relations):
    """
    Run the 3NF check on every relation of a name -> Relation mapping.

    All relations are checked (no early exit) and a verdict line is printed for
    each one. Returns True only when all of them are in 3NF.
    """
    every_one_passes = True
    for n, rel in relations.items():
        if not if_relation_in_3nf(rel):
            print(f"Relation '{n}' not in 3NF.")
            every_one_passes = False
        else:
            print(f"Relation '{n}' in 3NF.")
    return every_one_passes

def tracking_primary_keys(relations):
    """
    Map every primary-key attribute to the name of a table that uses it.

    When several tables share a key attribute, the table that comes last in
    the mapping wins.
    """
    return {
        pk: table_name
        for table_name, relation in relations.items()
        for pk in relation.prim_key
    }

def is_key_superkey(relation, lhs_set):
    """Return True when lhs_set contains every attribute of the relation's primary key."""
    key_attributes = frozenset(relation.prim_key)
    return lhs_set.issuperset(key_attributes)


def decompose_to_3nf(relations):
    """
    Decompose relations that contain transitive dependencies.

    relations -- dict mapping table name -> Relation

    Every dependency classified as transitive gets its own table named
    "<table>_Transitive_<n>" keyed on the dependency's LHS, and the original
    table is rebuilt without the RHS attributes of those dependencies.
    Relations without such dependencies are passed through unchanged.

    Returns a new dict mapping table name -> Relation.
    """
    # Primary-key attribute -> owning table, computed from the INPUT relations only.
    prim_key_map = tracking_primary_keys(relations)
    new_tableso = {}

    for table_name, relation in relations.items():
        prim_key = set(relation.prim_key)
        dependency_trans = []
        non_dependency_trans = []

        # Classify: "transitive" means the LHS does not cover the primary key and
        # the RHS contains no primary-key attribute.
        # NOTE(review): dependency_type is not consulted here, so an MVD can be
        # classified as transitive as well.
        for d in relation.dependency:
            lhs_set = set(d.lhs)
            rhs_set = set(d.rhs)

            if not lhs_set.issuperset(prim_key) and not any(attr in prim_key for attr in rhs_set):
                dependency_trans.append(d)
            else:
                non_dependency_trans.append(d)

        for depen_trans in dependency_trans:
            # The numeric suffix is based on how many tables have been produced so far.
            new_tab_name = f"{table_name}_Transitive_{len(new_tableso) + 1}"
            # Column order follows set iteration order.
            new_record_names = list(set(depen_trans.lhs + depen_trans.rhs))
            new_tab_d = []
            new_prim_key = depen_trans.lhs

            # Project the rows onto the new columns, keeping the first occurrence of each tuple.
            record_exist = set()
            for row in relation.data:
                new_records = [row[relation.colnames.index(a)] for a in new_record_names]
                if tuple(new_records) not in record_exist:
                    new_tab_d.append(new_records)
                    record_exist.add(tuple(new_records))

            new_tables = Relation(new_record_names, new_tab_d, new_prim_key)
            new_tables.add_table_name(new_tab_name)

            # The dependency is rebuilt from text with '->', i.e. always as an FD.
            depen_new = Dependency(f"{','.join(depen_trans.lhs)} -> {','.join(depen_trans.rhs)}", [new_prim_key])
            new_tables.dependency_adding(depen_new)

            # Columns that are key attributes of some input table (and not part of
            # this table's key) become foreign keys to that table.
            for a in new_record_names:
                if a in prim_key_map and a not in new_prim_key:
                    src_tab = prim_key_map[a]
                    new_tables.foreignkey_addition((a, src_tab, a))

            new_tableso[new_tab_name] = new_tables

        if dependency_trans:
            # Remaining columns: everything except the RHS attributes that were moved out.
            fields_remain = list(set(relation.colnames) - set(sum([dep.rhs for dep in dependency_trans], [])))
            data_remain = []

            prim_key_remain = list(set(prim_key).intersection(fields_remain))
            if not prim_key_remain:
                # No original key attribute survived: fall back to the original key list.
                prim_key_remain = relation.prim_key


            # Keep only dependencies that fit entirely in the remaining columns.
            depen_remain = []
            for d in non_dependency_trans:
                if all(attr in fields_remain for attr in d.lhs + d.rhs):
                    # Mutates the Dependency object shared with the input relation.
                    d.key_candid = [prim_key_remain]
                    depen_remain.append(d)

            # Project the rows onto the remaining columns, dropping duplicate tuples.
            record_exist = set()
            for r in relation.data:
                new_records = [r[relation.colnames.index(a)] for a in fields_remain]
                if tuple(new_records) not in record_exist:
                    data_remain.append(new_records)
                    record_exist.add(tuple(new_records))

            new_tables = Relation(fields_remain, data_remain, prim_key_remain)
            new_tables.add_table_name(table_name)
            new_tables.dependency = depen_remain

            for attr in fields_remain:
                if attr in prim_key_map and attr not in prim_key_remain:
                    src_tab = prim_key_map[attr]
                    new_tables.foreignkey_addition((attr, src_tab, attr))

            new_tableso[table_name] = new_tables

        else:
            # Nothing transitive: keep the relation object as it is.
            new_tableso[table_name] = relation

    return new_tableso




In [6]:
def if_relation_in_bcnf(relation):
    """
    Check whether a relation satisfies BCNF.

    The relation must pass the 3NF check, and every non-MVD dependency must
    have a LHS that contains the whole primary key. The reason for a failure
    is printed.
    """
    if not if_relation_in_3nf(relation):
        print(f"Relation '{relation.name}' is not in 3NF, so it doesnot satisfy BCNF.")
        return False

    for d in relation.dependency:
        determinant = set(d.lhs)
        # Multivalued dependencies are left to the 4NF check.
        if d.dependency_type == "MVD":
            continue
        if is_key_superkey(relation, determinant):
            continue
        print(f"Relation '{relation.name}' violates BCNF: {d.lhs} -> {d.rhs}")
        return False

    return True

def relation_is_in_bcnf(relations):
    """
    Run the BCNF check on every relation of a name -> Relation mapping.

    Every relation is checked (no early exit), so each failing relation gets its
    diagnostic printed. Returns True only when all of them are in BCNF.
    """
    outcomes = [if_relation_in_bcnf(r) for r in relations.values()]
    return all(outcomes)



def remove_duplicate_relations(relations):
    """
    Drop relations that repeat an earlier one.

    Two relations count as duplicates when they have the same set of primary
    key attributes and the same set of column names (order is ignored). The
    first relation encountered is the one that is kept.
    """
    seen_signatures = set()
    unique = {}

    for name, rel in relations.items():
        signature = (tuple(sorted(rel.prim_key)), tuple(sorted(rel.colnames)))
        if signature in seen_signatures:
            continue
        seen_signatures.add(signature)
        unique[name] = rel

    return unique


def bcnf_decomposition(relations):
    """
    Decompose relations whose dependencies violate BCNF.

    relations -- dict mapping table name -> Relation

    When every relation already passes relation_is_in_bcnf the input dict is
    returned unchanged. Otherwise relations are processed from a queue: the
    first violating dependency of a relation is split off into a table named
    "<table>_BCNF_<n>" and the remainder is queued again.

    Returns a dict mapping table name -> Relation.
    """
    if relation_is_in_bcnf(relations):
        print("All relations are in BCNF.")
        return relations


    # Primary-key attribute -> owning table, computed from the INPUT relations only.
    prim_key_map = tracking_primary_keys(relations)
    new_tableso = {}
    rel_que = list(relations.items())  # FIFO work queue of (name, relation) pairs

    while rel_que:
        table_name, relation = rel_que.pop(0)
        prim_key = set(relation.prim_key)
        violate_bcnf = []

        # Collect dependencies whose LHS does not cover the primary key.
        # NOTE(review): unlike if_relation_in_bcnf, MVDs are not excluded here.
        for dep in relation.dependency:
            lhs_set = set(dep.lhs)
            if not is_key_superkey(relation, lhs_set):
                violate_bcnf.append(dep)

        if violate_bcnf:
            # Only the first violation is handled in this pass.
            violate_dependency = violate_bcnf[0]
            lhs_attri = violate_dependency.lhs
            rhs_attri = violate_dependency.rhs

            # The numeric suffix is based on how many tables have been produced so far.
            new_tab_name = f"{table_name}_BCNF_{len(new_tableso) + 1}"
            # Column order follows set iteration order.
            new_record_names = list(set(lhs_attri + rhs_attri))
            new_tab_d = []
            new_prim_key = lhs_attri

            # Project the rows onto the new columns, keeping the first occurrence of each tuple.
            record_exist = set()
            for r in relation.data:
                new_records = [r[relation.colnames.index(attr)] for attr in new_record_names]
                if tuple(new_records) not in record_exist:
                    new_tab_d.append(new_records)
                    record_exist.add(tuple(new_records))

            new_tables = Relation(new_record_names, new_tab_d, new_prim_key)
            new_tables.add_table_name(new_tab_name)
            # The dependency is rebuilt from text with '->', i.e. always as an FD.
            depen_new = Dependency(f"{','.join(lhs_attri)} -> {','.join(rhs_attri)}", [new_prim_key])
            new_tables.dependency_adding(depen_new)

            for a in new_record_names:
                if a in prim_key_map and a not in new_prim_key:
                    src_tab = prim_key_map[a]
                    new_tables.foreignkey_addition((a, src_tab, a))

            new_tableso[new_tab_name] = new_tables

            # '-' binds tighter than '|', so this is (columns - RHS) | LHS:
            # drop the RHS attributes but keep the LHS ones.
            fields_remain = list(set(relation.colnames) - set(rhs_attri) | set(lhs_attri))
            data_remain = []
            prim_key_remain = relation.prim_key

            # Project the rows onto the remaining columns, dropping duplicate tuples.
            record_exist = set()
            for r in relation.data:
                new_records = [r[relation.colnames.index(a)] for a in fields_remain]
                if tuple(new_records) not in record_exist:
                    data_remain.append(new_records)
                    record_exist.add(tuple(new_records))

            # NOTE(review): the rebuilt relation gets no dependencies, so the other
            # dependencies of the original relation are not carried over and the
            # re-queued table is accepted as-is on its next pass.
            update_tab = Relation(fields_remain, data_remain, prim_key_remain)
            update_tab.add_table_name(table_name)

            rel_que.append((table_name, update_tab))

        else:
            new_tableso[table_name] = relation
            print(f"Relation '{table_name}' is already in BCNF.")

    return new_tableso




In [7]:
def if_relation_in_4nf(relation):
    """
    Check whether a relation satisfies 4NF.

    The relation must pass the BCNF check, and no multivalued dependency may
    have a LHS that fails to contain the whole primary key.
    """
    if not if_relation_in_bcnf(relation):
        return False

    has_violating_mvd = any(
        d.dependency_type == "MVD" and not is_key_superkey(relation, set(d.lhs))
        for d in relation.dependency
    )
    return not has_violating_mvd

def verify_relations_in_4nf(relations):
    """
    Run the 4NF check on every relation of a name -> Relation mapping.

    Every relation is checked (no early exit). Returns True only when all of
    them are in 4NF.
    """
    outcomes = [if_relation_in_4nf(rel) for rel in relations.values()]
    return all(outcomes)

def _project_distinct_rows(relation, attributes):
    """Project relation.data onto `attributes`, keeping the first occurrence of each row."""
    positions = [relation.colnames.index(attr) for attr in attributes]
    seen = set()
    projected = []
    for row in relation.data:
        values = [row[pos] for pos in positions]
        marker = tuple(values)
        if marker not in seen:
            seen.add(marker)
            projected.append(values)
    return projected


def rel_4nf_decomposition(relations):
    """
    Decompose relations that carry multivalued dependencies violating 4NF.

    relations -- dict mapping table name -> Relation

    For each violating MVD (its LHS does not cover the primary key), one table
    per RHS attribute is created, holding LHS + that attribute and keyed on all
    of its columns. What is left of the original relation is kept as
    "<table>_Remaining" when it still has more columns than the primary key has
    attributes. Relations without violating MVDs are passed through.

    Returns a dict of table name -> Relation with duplicate tables removed.
    """
    new_tableso = {}

    for table_name, relation in relations.items():
        violate_mvd = [
            d for d in relation.dependency
            if d.dependency_type == "MVD" and not is_key_superkey(relation, set(d.lhs))
        ]

        if not violate_mvd:
            new_tableso[table_name] = relation
            continue

        for mvd in violate_mvd:
            for rhs_attr in mvd.rhs:
                new_tab_name = f"{table_name}_4NF_{len(new_tableso) + 1}_Part_{rhs_attr}"
                new_record_names = mvd.lhs + [rhs_attr]
                new_tab_d = _project_distinct_rows(relation, new_record_names)

                new_tables = Relation(new_record_names, new_tab_d, mvd.lhs + [rhs_attr])
                new_tables.add_table_name(new_tab_name)
                new_tableso[new_tab_name] = new_tables

        # Remove the RHS attributes of EVERY violating MVD. Previously only the
        # RHS of the last MVD (a variable leaked from the loop above) was removed.
        # Column order now follows the original relation instead of set order.
        moved_out = {attr for mvd in violate_mvd for attr in mvd.rhs}
        fields_remain = [column for column in relation.colnames if column not in moved_out]

        # Keep the remainder only if it holds more columns than the key has attributes.
        # NOTE(review): relation.prim_key is reused as-is even if some of its
        # attributes were moved out -- confirm this is intended.
        if len(fields_remain) > len(relation.prim_key):
            uodate_tab_name = f"{table_name}_Remaining"
            data_remain = _project_distinct_rows(relation, fields_remain)
            update_tab = Relation(fields_remain, data_remain, relation.prim_key)
            update_tab.add_table_name(uodate_tab_name)
            new_tableso[uodate_tab_name] = update_tab

    return remove_duplicate_relations(new_tableso)


In [8]:
def extract_5nf_data(relation, subset, set_compli):
    """
    Project a relation onto two attribute lists.

    Returns (data_a, data_b): the distinct rows of the projection onto `subset`
    and onto `set_compli`, each in order of first appearance.
    """
    def distinct_projection(positions):
        # dict.fromkeys keeps the first occurrence of each tuple, in order.
        unique_rows = dict.fromkeys(
            tuple(row[pos] for pos in positions) for row in relation.data
        )
        return [list(values) for values in unique_rows]

    positions_a = [relation.colnames.index(attr) for attr in subset]
    positions_b = [relation.colnames.index(attr) for attr in set_compli]
    return distinct_projection(positions_a), distinct_projection(positions_b)

def verify_rel_in_5nf(relations):
    """
    Run the 5NF check on every relation of a name -> Relation mapping.

    Every relation is checked (no early exit). Returns True only when all of
    them are in 5NF.
    """
    outcomes = [if_rel_in_5nf(rel) for rel in relations.values()]
    return all(outcomes)

def subset_generation(attri):
    """
    List every combination of the attributes that has at least two elements
    and fewer elements than the full attribute list.

    Combinations are grouped by size (smallest first) and each is returned as
    a list that preserves the order of `attri`.
    """
    from itertools import combinations

    subsets = []
    for size in range(2, len(attri)):
        subsets.extend(list(combo) for combo in combinations(attri, size))
    return subsets

def if_rel_in_5nf(relation):
    """Return True when no join dependency is detected for the relation."""
    return not identify_jdepen_join(relation)

def identify_jdepen_join(relation):
    """
    Search for two-way splits of the columns along which the relation can be
    decomposed.

    Returns a list of dicts of the form {'decomposition': (subset, complement)},
    one per split accepted by can_be_decomposed. Both sides of a split must
    have at least two columns.
    """
    found = []

    for subset in subset_generation(relation.colnames):
        complement = list(set(relation.colnames) - set(subset))
        # Skip trivial splits where either side has fewer than two columns.
        if len(subset) < 2 or len(complement) < 2:
            continue
        if can_be_decomposed(subset, relation):
            found.append({'decomposition': (subset, complement)})

    return found

def can_be_decomposed(subset, relation):
    """
    Tell whether the relation equals the join of its projections onto `subset`
    and onto the remaining columns.

    subset   -- list of column names forming one side of the split
    relation -- the Relation being tested

    Returns False for trivial splits (either side has fewer than two columns)
    and for relations without rows; otherwise True exactly when re-joining the
    two projections reproduces the original set of rows.
    """
    columns = list(relation.colnames)
    # Build the complement in column order so the result does not depend on
    # set iteration order.
    set_compli = [column for column in columns if column not in subset]

    # Prevent trivial decomposition (single columns).
    if len(subset) < 2 or len(set_compli) < 2:
        return False
    # Nothing to decompose; also avoids indexing into empty projections.
    if not relation.data:
        return False

    data_a, data_b = extract_5nf_data(relation, subset, set_compli)

    # Simulate a natural join
    red_joined = simulate_natural_join(data_a, data_b, subset, set_compli)

    # Joined rows are laid out as subset columns followed by complement columns.
    # Put them back in the relation's own column order before comparing,
    # otherwise equal rows would look different.
    joined_columns = list(subset) + set_compli
    reorder = [joined_columns.index(column) for column in columns]
    red_joined_set = {tuple(row[pos] for pos in reorder) for row in red_joined}
    d_actual_set = {tuple(row) for row in relation.data}

    return d_actual_set == red_joined_set

def extract_5nf_data(relation, subset, set_compli):
    """
    Project a relation onto two attribute lists.

    NOTE(review): this redefines extract_5nf_data, which is already defined
    earlier in this cell; being the later definition, this is the one in
    effect once the cell has run.

    relation   -- object exposing `colnames` and `data`
    subset     -- column names for the first projection
    set_compli -- column names for the second projection

    Returns (data_a, data_b): the distinct projected rows for each attribute
    list, in order of first appearance.
    """
    # Column positions of the requested attributes.
    idx_subsets = [relation.colnames.index(attr) for attr in subset]
    idx_comple = [relation.colnames.index(attr) for attr in set_compli]

    data_a, data_b = [], []
    # Tuples already emitted, used to drop duplicate projected rows.
    data_a_exist, data_b_exist = set(), set()

    for row in relation.data:
        row_a = tuple(row[idx] for idx in idx_subsets)
        row_b = tuple(row[idx] for idx in idx_comple)

        if row_a not in data_a_exist:
            data_a.append(list(row_a))
            data_a_exist.add(row_a)

        if row_b not in data_b_exist:
            data_b.append(list(row_b))
            data_b_exist.add(row_b)

    return data_a, data_b

def simulate_natural_join(data_a, data_b, subset, set_compli):
    """
    Natural-join two row lists.

    data_a / subset     -- rows and column names of the first table
    data_b / set_compli -- rows and column names of the second table

    Rows are paired when they agree on every shared column. Each result row is
    the first table's row followed by the second table's non-shared columns.
    With no shared column the result is the full cross product.
    """
    shared = list(set(subset).intersection(set(set_compli)))
    # (position in first row, position in second row) for every shared column.
    match_positions = [(subset.index(attr), set_compli.index(attr)) for attr in shared]
    # Positions of the second table's columns that are appended to the output.
    extra_positions = [set_compli.index(attr) for attr in set_compli if attr not in shared]

    d_joined = []
    for rec1 in data_a:
        for rec2 in data_b:
            rows_agree = all(rec1[pos_a] == rec2[pos_b] for pos_a, pos_b in match_positions)
            if not rows_agree:
                continue
            d_joined.append(rec1 + [rec2[pos] for pos in extra_positions])

    return d_joined


def detect_dependency_join_ip(relation):
    """
    Test one fixed join dependency on the relation.

    The relation is projected onto (CostomerID, DrinkName, Quantity) and onto
    (OrderID, CostomerID, DrinkName); the projections are inner-joined on
    CostomerID and DrinkName. When the join, with its columns put back in the
    relation's order, equals the original data frame, the two column lists are
    returned; otherwise (None, None).
    """
    col_rel1 = ['CostomerID', 'DrinkName', 'Quantity']
    col_rel2 = ['OrderID', 'CostomerID', 'DrinkName']

    original = pd.DataFrame(relation.data, columns=relation.colnames)

    projections = [
        original[columns].drop_duplicates().reset_index(drop=True)
        for columns in (col_rel1, col_rel2)
    ]
    rejoined = pd.merge(projections[0], projections[1], on=['CostomerID', 'DrinkName'], how='inner')

    # Compare the aligned merged data to the original data
    if not rejoined[original.columns].equals(original):
        return None, None
    return col_rel1, col_rel2

def rel_5nf_decomposition_relation_ip(base_relation):
    """
    Split a relation along the join dependency found by detect_dependency_join_ip.

    Returns two Relation objects named "Table1" and "Table2", each keyed on all
    of its columns, or (None, None) when no join dependency was detected.
    """
    col_rel1, col_rel2 = detect_dependency_join_ip(base_relation)

    if not (col_rel1 and col_rel2):
        return None, None

    frame = pd.DataFrame(base_relation.data, columns=base_relation.colnames)
    dta_rec1, dta_rec2 = (
        frame[columns].drop_duplicates().values.tolist()
        for columns in (col_rel1, col_rel2)
    )

    # Each decomposed relation is keyed on all of its columns.
    rec1_rel = Relation(col_rel1, dta_rec1, prim_key=['CostomerID', 'DrinkName', 'Quantity'])
    rec2_rel = Relation(col_rel2, dta_rec2, prim_key=['OrderID', 'CostomerID', 'DrinkName'])

    rec1_rel.add_table_name("Table1")
    rec2_rel.add_table_name("Table2")

    return rec1_rel, rec2_rel


def rel_5nf_decomposition(relations):
    """
    Decompose every relation for which a join dependency is detected.

    relations -- iterable of Relation objects

    A relation with no detected join dependency is kept unchanged. Otherwise,
    for each detected split, two relations "<name>_PartA" and "<name>_PartB"
    are appended, each keyed on all of its columns.

    Returns the resulting list of Relation objects.
    """
    rel_decomp = []

    for relation in relations:
        print(f"Checking relation: {relation.name}")

        # Run the (exponential) join-dependency search once per relation. An
        # empty result is exactly the "already in 5NF" condition, so a separate
        # if_rel_in_5nf() call would only repeat the same search.
        dependency_join = identify_jdepen_join(relation)
        if not dependency_join:
            print(f"Relation '{relation.name}' is already in 5NF.")
            rel_decomp.append(relation)
            continue

        # NOTE(review): every detected split emits its own PartA/PartB pair under
        # the same two names -- confirm whether only one split should be applied.
        for jd in dependency_join:
            subset1, subset2 = jd['decomposition']

            # Extract data for each subset
            data_a, data_b = extract_5nf_data(relation, subset1, subset2)

            # Create new Relation instances for the decomposed parts
            rel1 = Relation(subset1, data_a, prim_key=subset1)
            rel1_name = f"{relation.name}_PartA"
            rel1.add_table_name(rel1_name)
            rel_decomp.append(rel1)

            rel2 = Relation(subset2, data_b, prim_key=subset2)
            rel2_name = f"{relation.name}_PartB"
            rel2.add_table_name(rel2_name)
            rel_decomp.append(rel2)

            print(f"Decomposed '{relation.name}' into '{rel1_name}' and '{rel2_name}'.")

    return rel_decomp


In [9]:
def reading_dependency(filename):
    """
    Read a dependency file into a dict of relation name -> list of dependency lines.

    A line ending with ':' starts a new relation (the colon is dropped); every
    other non-blank line is stored, stripped, under the most recent relation.
    """
    dependency = {}
    current_relation = None

    with open(filename, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            if line.endswith(":"):
                # Header line: open a new bucket named after the relation.
                current_relation = line[:-1]
                dependency[current_relation] = []
            else:
                # FD/MVD line belonging to the relation opened last.
                dependency[current_relation].append(line)

    return dependency

def validatinf_dependency(relations, dependency):
    """
    Attach the dependencies read from file to their relations.

    relations  -- dict mapping relation name -> Relation
    dependency -- dict mapping relation name -> list of dependency strings

    Names without a matching relation are reported and skipped. Every string is
    parsed into a Dependency whose only candidate key is the relation's primary
    key. Returns the same `relations` mapping.
    """
    for relation_name, fds in dependency.items():
        relation = relations.get(relation_name)
        if not relation:
            print(f"Relation '{relation_name}' not found.")
            continue

        # Parse and attach one at a time, so an invalid string leaves the
        # previously parsed dependencies in place.
        for fd_text in fds:
            # Assuming prim_key is the only candidate key for simplicity
            relation.dependency_adding(Dependency(fd_text, [relation.prim_key]))

    return relations



In [10]:
def detection_of_max_nf(if_in_1nf_relation):
    """Report the highest normal form as a human-readable string.

    The checks run from the weakest form to the strongest; the first check
    that fails decides the answer.

    Args:
        if_in_1nf_relation: Truthy when the relation already satisfies 1NF.

    Returns:
        One of the "Highest normal form: ..." / "Highest Normal Form: ..."
        strings, from 0NF up to 5NF.

    NOTE(review): ``relations``, ``relations_2nf``, ``relations_3nf``,
    ``relations_bcnf`` and ``relations_4nf`` are not parameters or locals of
    this function, so they have to exist at module level whenever the 1NF
    flag is truthy -- confirm which cell defines them.
    """
    # Un-normalised input: nothing further is evaluated.
    if not if_in_1nf_relation:
        return "Highest normal form: 0NF"

    if not check_all_relations_2nf(dictionaryformat(relations)):
        return "Highest normal form: 1NF"

    if not relations_in_3nf(dictionaryformat(relations_2nf)):
        return "Highest normal form: 2NF"

    if not relation_is_in_bcnf(dictionaryformat(relations_3nf)):
        return "Highest normal form: 3NF"

    if not verify_relations_in_4nf(dictionaryformat(relations_bcnf)):
        return "Highest Normal Form: BCNF"

    if verify_rel_in_5nf(dictionaryformat(relations_4nf)):
        return "Highest Normal Form: 5NF"
    return "Highest Normal Form: 4NF"


In [11]:
import pandas as pd
import numpy as np

def dictionaryformat(rltn):
    """Return ``rltn`` in dictionary form.

    A single ``Relation`` is wrapped as ``{rltn.name: rltn}``; any other
    value is handed back unchanged.
    """
    if not isinstance(rltn, Relation):
        return rltn
    return {rltn.name: rltn}

def _prompt_int(prompt, allowed):
    """Keep asking with ``prompt`` until the user types an integer in ``allowed``."""
    while True:
        answer = input(prompt).strip()
        try:
            value = int(answer)
        except ValueError:
            # A bare int(input()) would abort the whole run on a typo.
            print(f"'{answer}' is not a whole number, please try again.")
            continue
        if value in allowed:
            return value
        print(f"Please enter one of: {', '.join(str(v) for v in allowed)}")


def _report_relations(relations):
    """Show every relation, then print every relation's CREATE TABLE statement."""
    relations = list(relations)
    for rel in relations:
        rel.show()
    for rel in relations:
        print(rel.table_formation())


def main():
    """Interactive driver: load a table, then normalise it stage by stage.

    Reads ``inputtable.xlsx`` into a ``Relation`` named "InputRelation", asks
    for the primary key, optionally prints the detected highest normal form,
    and then walks 1NF -> 2NF -> 3NF -> BCNF -> 4NF -> 5NF.  Each stage either
    reports that the tables already satisfy the form or decomposes them and
    prints the resulting relations and their CREATE TABLE statements.  The
    walk stops after the stage selected by the user (1-6).

    Dependencies are read from ``inputfds.txt`` and attached to the relations
    once the 1NF stage has produced them.

    Returns:
        None.  All results are printed.
    """
    input_file = 'inputtable.xlsx'
    fd_filename = 'inputfds.txt'
    separator = "-----------------------------------------------------------------------------------------------------------------"

    # Read Excel file
    dataf = pd.read_excel(input_file)
    colnames = list(dataf.columns)
    data = dataf.values.tolist()

    # Drop empty entries so "A, B," does not produce a key column named "".
    raw_keys = input("Enter the primary keys, separated by commas: ")
    prim_key = [key.strip() for key in raw_keys.split(',') if key.strip()]

    relation = Relation(colnames, data, prim_key)
    relation.add_table_name("InputRelation")

    # Check 1NF; the offending attributes are needed later if it fails.
    if_in_1nf_relation, not_1nf_attr = is_relation_1nf(relation)

    detect = _prompt_int(" Detect Highest Normal form of table: Yes(1), No(0) :", (0, 1))
    if detect == 1:
        print(detection_of_max_nf(if_in_1nf_relation))

    # Ask the user to select the highest form of normalization
    print("Select the highest level of normalization you want to achieve:")
    print("select 1 - Normalisation upto 1NF")
    print("Select 2 - Normalisation upto 2NF")
    print("Select 3. Normalisation upto 3NF")
    print("Select 4. Normalisation upto BCNF")
    print("Select 5. Normalisation upto 4NF")
    print("Select 6. Normalisation upto 5NF")

    option = _prompt_int("Enter your option (1-6): ", range(1, 7))

    # ---- 1NF -------------------------------------------------------------
    if if_in_1nf_relation:
        print("All Tables are in 1NF")
        dependency = reading_dependency(fd_filename)
        rltns = validatinf_dependency(dictionaryformat(relation), dependency)
    else:
        tables_dictionary_normalised = normalize_1nf_relation(relation, not_1nf_attr)
        dependency = reading_dependency(fd_filename)
        rltns = validatinf_dependency(tables_dictionary_normalised, dependency)
        print("AFTER 1NF NORMALIZATION")
        _report_relations(tables_dictionary_normalised.values())

    if option == 1:
        return

    # ---- 2NF -------------------------------------------------------------
    print("-------------Tables of 2NF-----------------")
    if check_all_relations_2nf(dictionaryformat(rltns)):
        print("TABLES ARE IN 2NF")
        relations_2nf = rltns
    else:
        relations_2nf = decompose_to_2nf(dictionaryformat(rltns))
        print("AFTER 2NF NORMALIZATION")
        _report_relations(relations_2nf.values())

    if option == 2:
        return

    # ---- 3NF -------------------------------------------------------------
    print("-----------Tables of 3NF----------")
    if relations_in_3nf(dictionaryformat(relations_2nf)):
        print("TABLES ARE IN 3NF")
        relations_3nf = relations_2nf
    else:
        relations_3nf = decompose_to_3nf(dictionaryformat(relations_2nf))
        print("AFTER 3NF NORMALIZATION")
        _report_relations(relations_3nf.values())
        print(separator)

    if option == 3:
        return

    # ---- BCNF ------------------------------------------------------------
    print("----------Tables of BCNF--------------")
    if relation_is_in_bcnf(dictionaryformat(relations_3nf)):
        print("TABLES ARE  IN BCNF")
        relations_bcnf = relations_3nf
    else:
        relations_bcnf = bcnf_decomposition(dictionaryformat(relations_3nf))
        print("AFTER BCNF NORMALIZATION")
        _report_relations(relations_bcnf.values())

    if option == 4:
        return

    # ---- 4NF -------------------------------------------------------------
    print("-----------Tables of 4NF------------")
    if verify_relations_in_4nf(dictionaryformat(relations_bcnf)):
        print("TABLES ARE  IN 4NF")
        relations_4nf = relations_bcnf
    else:
        relations_4nf = rel_4nf_decomposition(dictionaryformat(relations_bcnf))
        print("AFTER 4NF NORMALIZATION")
        _report_relations(relations_4nf.values())
        print(separator)

    if option == 5:
        return

    # ---- 5NF -------------------------------------------------------------
    print("-------------5NF----------------")
    if verify_rel_in_5nf(dictionaryformat(relations_4nf)):
        print("TABLES ARE  IN 5NF")
    else:
        # Unlike the earlier stages, the 5NF decomposer is given a list of
        # relations and its result is iterated directly (no .values()).
        relations_5nf = rel_5nf_decomposition(list(dictionaryformat(relations_4nf).values()))
        print("AFTER 5NF NORMALIZATION")
        _report_relations(relations_5nf)

# Entry point: run the interactive workflow as soon as this cell executes.
# (The usual `if __name__ == "__main__":` guard is commented out here.)
main()


Enter the primary keys, separated by commas: OrderID,FoodID,DrinkID
 Detect Highest Normal form of table: Yes(1), No(0) :1
Highest normal form: 0NF
Select the highest level of normalization you want to achieve:
select 1 - Normalisation upto 1NF
Select 2 - Normalisation upto 2NF
Select 3. Normalisation upto 3NF
Select 4. Normalisation upto BCNF
Select 5. Normalisation upto 4NF
Select 6. Normalisation upto 5NF
Enter your option (1-6): 6
AFTER 1NF NORMALIZATION
BaseRelation
+---------+---------------------+-----------+----------------+---------------+------------+--------------+---------+--------------------------+-----------+---------------+------+--------+------------------+--------------+
| OrderID |        Date         | TotalCost | TotalDrinkCost | TotalFoodCost | CustomerID | CustomerName | DrinkID |        DrinkName         | DrinkSize | DrinkQuantity | Milk | FoodID |     FoodName     | FoodQuantity |
+---------+---------------------+-----------+----------------+---------------+--

In [12]:
# Stand-alone 5NF demonstration on a dedicated sample workbook.
input_file = '5nfdata.xlsx'
dataf = pd.read_excel(input_file)

# Column headers and row values as plain Python lists.
colnames = list(dataf.columns)
data = dataf.values.tolist()

# Key attribute names; 'CostomerID' is spelled the way the column appears
# in the printed tables.
prim_key = ['OrderID', 'Quantity', 'DrinkName', 'CostomerID']

# Wrap the sheet in a Relation called "Orders".
relation = Relation(colnames, data, prim_key)
relation.add_table_name("Orders")

# The helper's result is unpacked into two relations; both must be truthy
# for the decomposition to be reported.
rel1, rel2 = rel_5nf_decomposition_relation_ip(relation)

if not (rel1 and rel2):
    print("No valid join dependency . Relations are in 5NF.")
else:
    print("Decomposition is done! Tables are now are in 5NF:")
    rel1.show()
    rel2.show()


Decomposition is done! Tables are now are in 5NF:
Table1
+------------+-----------+----------+
| CostomerID | DrinkName | Quantity |
+------------+-----------+----------+
|     1      |  Coffee   |  Large   |
|     1      |  Coffee   |  Medium  |
|     2      |   Coke    |  Large   |
|     2      |   Coke    |  Small   |
|     3      |  Sprite   |  Medium  |
|     3      |  Sprite   |  Small   |
+------------+-----------+----------+
Primary Key: ['CostomerID', 'DrinkName', 'Quantity']
Functional dependencies for tables are:
Table2
+---------+------------+-----------+
| OrderID | CostomerID | DrinkName |
+---------+------------+-----------+
|  1001   |     1      |  Coffee   |
|  1002   |     2      |   Coke    |
|  1003   |     3      |  Sprite   |
+---------+------------+-----------+
Primary Key: ['OrderID', 'CostomerID', 'DrinkName']
Functional dependencies for tables are:
