In [1]:
import sys
import os
import warnings
import time

# Resolve the directory three levels above the current working directory and
# put it on the import path (once) so the `src.*` imports in the next cell can
# be resolved from there.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), *([os.pardir] * 3)))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

In [2]:
from src.database.db_connection import *
from src.database.preparation import *
from src.utilities.step1_simplify_fds import *
from src.utilities.step2_clustering_and_generating_repairs import *
from src.utilities.step3_convert_back_to_db import *
from src.utilities.step4_group_similar_sentences import *
from src.evaluation.metrics import *
from src.database.db_connection import *

In [3]:
# Build the database engine through the project helper brought in by the star
# imports; later cells use it to read the source table and to write results.
engine = create_engine_for_db()

In [4]:
# One constraint, written as a (left-hand columns, right-hand columns) pair:
# HospitalName on the left, six dependent attributes on the right.
fd_constraints = [
    (
        ['HospitalName'],
        ['ZipCode', 'PhoneNumber', 'Address1', 'HospitalOwner', 'ProviderNumber', 'City'],
    ),
]

In [5]:
# Rebind fd_constraints to the output of simplify_FDs; per the displayed result
# this yields one constraint per right-hand attribute.
# NOTE(review): the cell overwrites its own input, so re-running it feeds the
# already-simplified list back into simplify_FDs - confirm that is harmless.
fd_constraints = simplify_FDs(fd_constraints)

In [6]:
# Show the constraint list as it stands after simplification.
fd_constraints

[(['HospitalName'], ['ZipCode']),
 (['HospitalName'], ['PhoneNumber']),
 (['HospitalName'], ['Address1']),
 (['HospitalName'], ['HospitalOwner']),
 (['HospitalName'], ['ProviderNumber']),
 (['HospitalName'], ['City'])]

In [7]:
# Parameters handed to the repair helpers in the loop cell further down.
delete_cost, update_cost = 2, 1
eps = 1e-9
prob_type = 'UNIFORM'

# One hardness value (1.0) per constraint in fd_constraints.
constraint_hardness = [1.0 for _ in fd_constraints]
print(constraint_hardness)

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [8]:
# Read the full `hospital` table from the prototype_fp schema into a DataFrame
# and print how many rows were loaded.
table_name = 'hospital'
hospital_df = pd.read_sql_table(table_name=table_name, con=engine, schema="prototype_fp")
print(hospital_df.shape[0])

1000


In [9]:
# Preview the first five rows (the default for head()).
hospital_df.head()

Unnamed: 0,tid,ProviderNumber,HospitalName,Address1,City,State,ZipCode,CountyName,PhoneNumber,HospitalType,HospitalOwner
0,0,10018,callahan eye foundation hospital,1720 university blvd,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private
1,1,10018,callahan eye foundation hospital,1720 university blvd,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private
2,2,10018,callahan eye foundation hospital,1720 university blvd,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private
3,3,10018,callahan eye foundation hospital,1720 university blvd,birminghxm,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private
4,4,10018,callahan eye foundation hospital,1720 university blvd,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private


In [10]:
# Split the table into one independent copy per HospitalName value.
# dropna=False keeps rows with a missing HospitalName as a group of their own.
dfs_by_hospital = [
    group_df.copy()
    for _hospital_name, group_df in hospital_df.groupby("HospitalName", dropna=False)
]

In [11]:
# Number of per-HospitalName partitions produced by the groupby.
print(len(dfs_by_hospital))

69


In [12]:
# Reset the outputs that the repair loop appends to, so re-running the notebook
# does not duplicate rows or dictionary entries.
#
# The loop writes to prototype_fp.hospital_prob_100 and to the _dict row named
# 'hospital_dict_100'. This cell previously cleared hospital_prob /
# hospital_dict instead, which left the real targets untouched and wiped two
# objects this notebook never repopulates.
with engine.begin() as connection:
    # The loop's to_sql(if_exists='append') creates the table on a first run,
    # and TRUNCATE has no IF EXISTS form, so only truncate when it is there.
    prob_table_exists = connection.execute(
        text("SELECT to_regclass(:qualified_name) IS NOT NULL;"),
        {"qualified_name": "prototype_fp.hospital_prob_100"},
    ).scalar()
    if prob_table_exists:
        connection.execute(text("""
        TRUNCATE TABLE prototype_fp.hospital_prob_100
        RESTART IDENTITY
        CASCADE;
        """))
    # Reset the matching dictionary row to an empty dictionary.
    # NOTE(review): this is an UPDATE, so it is a no-op if no row named
    # 'hospital_dict_100' exists yet - confirm the row is provisioned elsewhere.
    connection.execute(text("""
    UPDATE prototype_fp."_dict"
    SET dict = dictionary('')
    WHERE name = :name;
    """), {"name": "hospital_dict_100"})

In [13]:
# Repair each per-hospital partition, append the repaired rows to
# prototype_fp.hospital_prob_100, and append the partition's probability
# entries to the 'hospital_dict_100' dictionary row. Prints the elapsed
# time for every partition.

# The statement text does not change between iterations, so build it once.
# Values are passed as bind parameters rather than interpolated into the SQL:
# a quote in the payload would break a string-built statement, and text()
# would treat a ':token' that follows a non-word character as a bind parameter.
dict_append_stmt = text("""
    UPDATE prototype_fp._dict
    SET dict = add(COALESCE(dict, ''), :entries)
    WHERE name = :name;
""")

for i, inconsistent_df in enumerate(dfs_by_hospital, start=1):
    start = time.perf_counter()

    # The repair helpers are called with uuid_col="uuid", so expose `tid` under
    # that name. Done in place: the frames stored in dfs_by_hospital keep the
    # renamed column after the loop.
    inconsistent_df.rename(columns={"tid": "uuid"}, inplace=True)

    actions, cluster_ids, rv_probs = compute_actions_batch(
        inconsistent_df, fd_constraints, constraint_hardness, i,
        delete_cost, update_cost, eps, prob_type,
    )
    all_actions, cartesian_possible, cartesian_computed = min_cost_actions_pruned(
        cluster_ids, actions, delete_cost, update_cost,
    )

    # apply_combined_actions returns a collection of frames; stack them into one.
    repaired_versions = apply_combined_actions(inconsistent_df, all_actions, uuid_col="uuid")
    repaired_versions = pd.concat(repaired_versions, ignore_index=True)

    # The probabilities from compute_actions_batch are used as-is; the
    # filter_and_condition_min_cost_rvs(rv_probs, repaired_versions) step is
    # intentionally not applied here.
    rv_probabilities = rv_probs

    repaired_df = group_similar_rows_together(repaired_versions)
    # Restore the original identifier column name before persisting.
    repaired_df.rename(columns={"uuid": "tid"}, inplace=True)
    repaired_df.to_sql('hospital_prob_100', engine, schema='prototype_fp', index=False, if_exists='append')

    # Serialise the probabilities as "key:value" pairs separated by ';'.
    entries = [f"{k}:{v}" for k, v in rv_probabilities.items()]
    joined = ';'.join(entries)
    with engine.begin() as connection:
        connection.execute(dict_append_stmt, {"entries": joined, "name": "hospital_dict_100"})

    elapsed = time.perf_counter() - start
    print(f"\nCurrent cluster {i} done")
    print(f"Time taken: {elapsed}")



Current cluster 1 done
Time taken: 0.0480354999890551

Current cluster 2 done
Time taken: 0.05003829998895526

Current cluster 3 done
Time taken: 0.044182000012369826

Current cluster 4 done
Time taken: 0.03829789999872446

Current cluster 5 done
Time taken: 0.04448850001790561

Current cluster 6 done
Time taken: 0.03111359998001717

Current cluster 7 done
Time taken: 0.041159700020216405

Current cluster 8 done
Time taken: 0.04225140000926331

Current cluster 9 done
Time taken: 0.041325500002130866

Current cluster 10 done
Time taken: 0.03237169998465106

Current cluster 11 done
Time taken: 0.042107699991902336

Current cluster 12 done
Time taken: 0.03334940000786446

Current cluster 13 done
Time taken: 0.032602100021904334

Current cluster 14 done
Time taken: 0.03351500001735985

Current cluster 15 done
Time taken: 0.03380229999311268

Current cluster 16 done
Time taken: 0.043244200001936406

Current cluster 17 done
Time taken: 0.030363300000317395

Current cluster 18 done
Time take