In [1]:
import sys
import os
import warnings
import time

# Locate the project root (three directory levels above the current working
# directory) and make it importable for the `src.*` imports in the next cell.
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.getcwd(), "..", "..", "..")
)
if sys.path.count(PROJECT_ROOT) == 0:
    # Appended rather than prepended, so existing entries keep lookup priority.
    sys.path.append(PROJECT_ROOT)

# Install a blanket "ignore" filter that applies to all warning categories.
warnings.filterwarnings("ignore")

In [2]:
from src.database.db_connection import *
from src.evaluation.metrics import *
from src.utilities.step1_simplify_fds import *

In [3]:
# Build the database engine with the project helper; the evaluation loop later
# hands it to calculate_expected_values.
# NOTE(review): create_engine_for_db arrives through a star import, presumably
# from src.database.db_connection — confirm.
engine = create_engine_for_db()

In [4]:
# Functional dependencies as (determinant attributes, dependent attributes) pairs.
# A single FD is declared: (sid, university) determines the five listed attributes.
fd_constraints = [
    (
        ['sid', 'university'],
        ['sname', 'address', 'postcode', 'emailid', 'phonenum'],
    ),
]

In [5]:
# Replace the declared FDs with the output of simplify_FDs; the result displayed
# by the next cell has one single-attribute right-hand side per FD.
# NOTE(review): the name is rebound in place, so re-running this cell feeds the
# already simplified list back into simplify_FDs — confirm that is harmless.
fd_constraints = simplify_FDs(fd_constraints)

In [6]:
# Display the current FD list as the cell's output for inspection.
fd_constraints

[(['sid', 'university'], ['sname']),
 (['sid', 'university'], ['address']),
 (['sid', 'university'], ['postcode']),
 (['sid', 'university'], ['emailid']),
 (['sid', 'university'], ['phonenum'])]

In [7]:
# Per-level settings, ordered by increasing target violation fraction.
violation_levels = [
    {'target_violation_fraction': fraction, 'dups_per_cluster': dups, 'cells_corrupted': cells}
    for fraction, dups, cells in (
        (0.20, (1, 1), (1, 2)),
        (0.40, (2, 2), (2, 3)),
        (0.60, (4, 4), (2, 3)),
        (0.80, (6, 6), (3, 4)),
    )
]

# Number of inconsistent relations evaluated at each violation level.
num_inconsistent_relations = 10

# NOTE(review): the cost/probability settings below are not read by the
# evaluation loop that follows; presumably consumed elsewhere — confirm.
delete_cost, update_cost = 2, 1
eps = 1e-9
prob_type = 'COST_BASED'
# One hardness weight of 1.0 per functional dependency.
constraint_hardness = [1.0 for _ in fd_constraints]

# Query files, algorithm label and reference table passed to calculate_expected_values.
query_file_name_expected_CQA = "../../../queries/FP/synthetic/Students_truth.sql"
query_file_name_actual_CQA = "../../../queries/FP/synthetic/Students_CQA.sql"
algorithm_name = 'CQA'
old_table_name = 'prototype_fp.students'

In [8]:
# For every violation level, evaluate CQA on each inconsistent relation, print the
# per-relation metrics, then print the averages over that level.
for j, violation_level in enumerate(violation_levels, start=1):
    # Settings of the current level; the evaluation below does not read them.
    target_violation_fraction = violation_level['target_violation_fraction']
    dups_per_cluster = violation_level['dups_per_cluster']
    cells_corrupted = violation_level['cells_corrupted']

    # Running sums for this level, reset at the start of every level.
    total_time = 0.0
    total_precision_CQA = total_recall_CQA = total_coverage_CQA = total_noise_CQA = 0.0

    for i in range(1, num_inconsistent_relations + 1):
        # Tables are named students_<level>_<relation> in the prototype_fp schema.
        table_name = f"students_{j}_{i}"
        inconsistent_table_name = f"prototype_fp.{table_name}"

        # Only the evaluation call itself is timed.
        start = time.perf_counter()
        results_CQA, results_with_truth_set_CQA = calculate_expected_values(
            query_file_name_expected_CQA,
            query_file_name_actual_CQA,
            algorithm_name,
            engine,
            old_table_name,
            inconsistent_table_name,
        )
        elapsed = time.perf_counter() - start

        precision_CQA, recall_CQA, coverage_CQA, noise_CQA = summarize_results(results_CQA)

        print(f"\nViolation Level {j}: Processed inconsistent relation {i} table name: {table_name}")
        print(f"Time taken {elapsed}")
        print(f"Precision CQA: {precision_CQA:.4f}")
        print(f"Recall CQA: {recall_CQA:.4f}")
        print(f"Coverage CQA: {coverage_CQA:.4f}")
        print(f"Noise CQA: {noise_CQA:.4f}")

        # Fold this relation into the level totals.
        total_time += elapsed
        total_precision_CQA += precision_CQA
        total_recall_CQA += recall_CQA
        total_coverage_CQA += coverage_CQA
        total_noise_CQA += noise_CQA

    # Arithmetic means over the relations of this level.
    avg_time_violation_level = total_time / num_inconsistent_relations
    avg_precision_CQA = total_precision_CQA / num_inconsistent_relations
    avg_recall_CQA = total_recall_CQA / num_inconsistent_relations
    avg_coverage_CQA = total_coverage_CQA / num_inconsistent_relations
    avg_noise_CQA = total_noise_CQA / num_inconsistent_relations

    print(f"\nViolation Level {j}: Average time taken {avg_time_violation_level}")
    print(f"Average Precision CQA for violation level {j}: {avg_precision_CQA:.4f}")
    print(f"Average Recall CQA for violation level {j}: {avg_recall_CQA:.4f}")
    print(f"Average Coverage CQA for violation level {j}: {avg_coverage_CQA:.4f}")
    print(f"Average Noise CQA for violation level {j}: {avg_noise_CQA:.4f}")


Violation Level 1: Processed inconsistent relation 1 table name: students_1_1
Time taken 0.6058723999885842
Precision CQA: 1.0000
Recall CQA: 0.8103
Coverage CQA: 0.8103
Noise CQA: 0.0000

Violation Level 1: Processed inconsistent relation 2 table name: students_1_2
Time taken 0.4809402000391856
Precision CQA: 1.0000
Recall CQA: 0.7932
Coverage CQA: 0.7932
Noise CQA: 0.0000

Violation Level 1: Processed inconsistent relation 3 table name: students_1_3
Time taken 0.263272300013341
Precision CQA: 1.0000
Recall CQA: 0.8017
Coverage CQA: 0.8017
Noise CQA: 0.0000

Violation Level 1: Processed inconsistent relation 4 table name: students_1_4
Time taken 0.4691116000758484
Precision CQA: 1.0000
Recall CQA: 0.7889
Coverage CQA: 0.7889
Noise CQA: 0.0000

Violation Level 1: Processed inconsistent relation 5 table name: students_1_5
Time taken 0.46559120004530996
Precision CQA: 0.9692
Recall CQA: 0.9009
Coverage CQA: 0.9009
Noise CQA: 0.0308

Violation Level 1: Processed inconsistent relation 6 t