In [1]:
import sys
import os
import warnings
import time

# Resolve the project root as the directory three levels above the current
# working directory, so that the `src.*` packages can be imported.
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
)

# Register the root only once so re-running this cell does not grow sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [2]:
from src.database.db_connection import *
from src.database.preparation import *

In [3]:
# Absolute path of the synthetic Students dataset, located under
# <cwd>/../../../queries/FP/synthetic/.
file_path = os.path.abspath(
    os.path.join(
        os.getcwd(),
        "..", "..", "..",
        "queries", "FP", "synthetic", "Students.csv",
    )
)

# Load the CSV through the project helper; the result is used as the
# ground-truth relation in the rest of the notebook.
df_data = load_csv_to_df(file_path)

In [4]:
# Obtain the database handle from the project helper; it is passed as the
# connection argument to the `to_sql` calls in this notebook.
# NOTE(review): presumably a SQLAlchemy engine — confirm in src.database.db_connection.
engine = create_engine_for_db()

In [5]:
# Persist the uncorrupted data as prototype_fp.students_truth, replacing any
# existing table of that name. The call is the cell's last expression, so its
# return value is displayed as the cell output.
df_data.to_sql(
    name='students_truth',
    con=engine,
    schema='prototype_fp',
    index=False,
    if_exists='replace',
)

10

In [6]:
# Functional dependencies as (LHS columns, RHS columns) pairs.
# The single FD here is: (sid, university) -> all remaining student attributes.
fd_constraints = [
    (
        ['sid', 'university'],
        ['sname', 'address', 'postcode', 'emailid', 'phonenum', 'age'],
    ),
]

In [7]:
# Split the first (and only) functional dependency into its left-hand-side and
# right-hand-side column lists.
fd_LHS, fd_RHS = fd_constraints[0]

In [8]:
# Display the left-hand-side columns of the functional dependency.
fd_LHS

['sid', 'university']

In [9]:
# Display the right-hand-side columns of the functional dependency.
fd_RHS

['sname', 'address', 'postcode', 'emailid', 'phonenum', 'age']

In [10]:
# Corruption scenarios handed to generate_corruptions_for_student_data.
# All five levels share target_violation_fraction=0.80 and
# dups_per_cluster=(4, 4); only cells_corrupted changes, going from (1, 1)
# up to (5, 5).
# NOTE(review): the 2-tuples are presumably (min, max) bounds — confirm
# against generate_corruptions_for_student_data.
violation_levels = [
    {
        'target_violation_fraction': 0.80,
        'dups_per_cluster': (4, 4),
        'cells_corrupted': (n_cells, n_cells),
    }
    for n_cells in range(1, 6)
]

# Number of corrupted relations requested per violation level.
num_inconsistent_relations = 10

# Output switches: write each corrupted relation to the database and/or to a
# CSV file in the HoloClean test-data directory.
db_persistence = True
holoclean_persistence = True

# The CSV output directory can be overridden with the HOLOCLEAN_OUTPUT_DIR
# environment variable so the notebook is runnable on other machines; the
# default is the original author's local path.
output_dir_holoclean = os.environ.get(
    "HOLOCLEAN_OUTPUT_DIR",
    r"C:\Users\anush\OneDrive\Documents\Thesis-ProbDatabases\HoloClean\testdata\synthetic",
)

# Only create the directory when CSV output is actually enabled.
if holoclean_persistence:
    os.makedirs(output_dir_holoclean, exist_ok=True)

In [11]:
# For every violation level, generate a batch of corrupted relations and
# persist each one. Table and CSV names follow students_<level>_<relation>,
# where both indices start at 1.
for j, violation_level in enumerate(violation_levels, start=1):

    # Pull this level's corruption parameters out of its config dict.
    target_violation_fraction = violation_level['target_violation_fraction']
    dups_per_cluster = violation_level['dups_per_cluster']
    cells_corrupted = violation_level['cells_corrupted']

    inconsistent_dfs = generate_corruptions_for_student_data(
        df_data,
        fd_LHS,
        fd_RHS,
        num_inconsistent_relations,
        target_violation_frac=target_violation_fraction,
        dups_per_cluster=dups_per_cluster,
        cells_corrupted=cells_corrupted,
    )

    for i, inconsistent_df in enumerate(inconsistent_dfs, start=1):
        table_name = f"students_{j}_{i}"

        # Database copy: replaces any existing table of the same name.
        if db_persistence:
            inconsistent_df.to_sql(
                name=table_name,
                con=engine,
                schema='prototype_fp',
                index=False,
                if_exists='replace',
            )
            print(f"Persisted table {table_name} in database")

        # CSV copy in the HoloClean output directory, without the index column.
        if holoclean_persistence:
            csv_path = os.path.join(output_dir_holoclean, f"{table_name}.csv")
            inconsistent_df.to_csv(csv_path, index=False)
            print(f"Wrote CSV: {csv_path}")
            

Persisted table students_1_1 in database
Wrote CSV: C:\Users\anush\OneDrive\Documents\Thesis-ProbDatabases\HoloClean\testdata\synthetic\students_1_1.csv
Persisted table students_1_2 in database
Wrote CSV: C:\Users\anush\OneDrive\Documents\Thesis-ProbDatabases\HoloClean\testdata\synthetic\students_1_2.csv
Persisted table students_1_3 in database
Wrote CSV: C:\Users\anush\OneDrive\Documents\Thesis-ProbDatabases\HoloClean\testdata\synthetic\students_1_3.csv
Persisted table students_1_4 in database
Wrote CSV: C:\Users\anush\OneDrive\Documents\Thesis-ProbDatabases\HoloClean\testdata\synthetic\students_1_4.csv
Persisted table students_1_5 in database
Wrote CSV: C:\Users\anush\OneDrive\Documents\Thesis-ProbDatabases\HoloClean\testdata\synthetic\students_1_5.csv
Persisted table students_1_6 in database
Wrote CSV: C:\Users\anush\OneDrive\Documents\Thesis-ProbDatabases\HoloClean\testdata\synthetic\students_1_6.csv
Persisted table students_1_7 in database
Wrote CSV: C:\Users\anush\OneDrive\Docume