In [1]:
import sys
import os
# Add the parent of the current working directory to the import path;
# later cells import from the `src` package, presumably located there.
sys.path.append(os.path.abspath(".."))
import warnings
# Suppress every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

In [3]:
# Small toy relation over the attributes A-D. Rows sharing A == 1 carry three
# different B values, so the dependency A -> B declared later is violated.
columns = ['A', 'B', 'C', 'D']

data = [
    # A, B,  C,  D
    [1, 4, 10, 15],
    [1, 4, 11, 14],
    [1, 2, 11, 16],
    [1, 2, 12, 17],
    [1, 3, 14, 21],
    [2, 3, 13, 18],
    [2, 3, 15, 20],
    [3, 4, 20, 21],
]

df_data = pd.DataFrame(data=data, columns=columns)
print(df_data)

   A  B   C   D
0  1  4  10  15
1  1  4  11  14
2  1  2  11  16
3  1  2  12  17
4  1  3  14  21
5  2  3  13  18
6  2  3  15  20
7  3  4  20  21


In [4]:
from src.database.db_connection import *

In [5]:
# Obtain the database engine from the project helper. It is later passed to
# DataFrame.to_sql and used through engine.connect().
engine = create_engine_for_db()

In [6]:
# Write the raw toy table to prototype1.test_prototype_1, replacing any existing
# table of that name; the DataFrame index is not stored as a column.
df_data.to_sql(
    name='test_prototype_1',
    con=engine,
    schema='prototype1',
    if_exists='replace',
    index=False,
)

8

In [7]:
# One functional dependency, written as a (list, list) pair over the column names A and B.
# NOTE(review): presumably (LHS attributes, RHS attributes), i.e. A -> B — confirm against simplify_FDs.
fd_constraint1 = [(['A'],['B'])]

# Step 1: Simplify FDs so that each has a single attribute on the right-hand side (RHS)

In [8]:
from src.utilities.step1_simplify_fds import *

In [9]:
# Run the Step 1 helper over the FD list and rebind the name to its result.
fd_constraint1 = simplify_FDs(fd_constraint1)

In [10]:
# Show the FD list after simplification (for this input it is unchanged).
fd_constraint1

[(['A'], ['B'])]

# Step 2: Identify clusters and generate possible repairs per cluster

In [11]:
from src.utilities.step2_clustering_and_generating_repairs import *

### Step 2_0: Preprocess data by adding a new column with a unique identifier

In [12]:
# Run the project's preprocessing step; the displayed result gains a `uuid` column (1..8 here).
# NOTE(review): this rebinds df_data to its own transform, so re-running the cell feeds the
# already-preprocessed frame back in — confirm preprocess_data tolerates that.
df_data = preprocess_data(df_data)

In [13]:
# Show the preprocessed frame, which now carries the extra `uuid` column.
df_data

Unnamed: 0,A,B,C,D,uuid
0,1,4,10,15,1
1,1,4,11,14,2
2,1,2,11,16,3
3,1,2,12,17,4
4,1,3,14,21,5
5,2,3,13,18,6
6,2,3,15,20,7
7,3,4,20,21,8


### Step 2_1: Identify clusters - group by FD_LHS

In [14]:
# Implemented in src.utilities.step2_clustering_and_generating_repairs: identify_clusters

### Step 2_2: Find minimum actions needed within a cluster

In [15]:
# Implemented in src.utilities.step2_clustering_and_generating_repairs: find_minimum_actions

### Step 2_3: Define highest frequency sub-clusters

In [16]:
# Implemented in src.utilities.step2_clustering_and_generating_repairs: mark_high_frequence_subclusters

### Step 2_4: Create and assign random variables (RVs) for the delete, update, and delete/update cases

In [17]:
# Implemented in src.utilities.step2_clustering_and_generating_repairs: create_rv_definitions

In [18]:
# Generate the repair random variables for the FD list over the preprocessed frame.
# The helper returns three values, each displayed in the cells that follow.
# NOTE(review): consistent_rows comes back as [6, 7, 8], which matches the uuids of the
# rows with A == 2 or A == 3 — presumably the rows not involved in a violation; confirm.
rv_definitions,rv_probabilities,consistent_rows = create_rv_definitions(df_data,fd_constraint1)

In [19]:
# Inspect the RV definitions: keys of the form 'fd1B1=<k>' mapped to (int, set) tuples.
rv_definitions

{'fd1B1=1': (2, {3, 4}),
 'fd1B1=2': (2, {1, 3, 4}),
 'fd1B1=3': (2, {2, 3, 4}),
 'fd1B1=4': (2, {3, 4, 5}),
 'fd1B1=5': (2, {1, 2, 3, 4}),
 'fd1B1=6': (2, {1, 3, 4, 5}),
 'fd1B1=7': (2, {2, 3, 4, 5}),
 'fd1B1=8': (2, {1, 2, 3, 4, 5}),
 'fd1B1=9': (4, {1, 2}),
 'fd1B1=10': (4, {1, 2, 3}),
 'fd1B1=11': (4, {1, 2, 4}),
 'fd1B1=12': (4, {1, 2, 5}),
 'fd1B1=13': (4, {1, 2, 3, 4}),
 'fd1B1=14': (4, {1, 2, 3, 5}),
 'fd1B1=15': (4, {1, 2, 4, 5}),
 'fd1B1=16': (4, {1, 2, 3, 4, 5})}

In [20]:
# Inspect the probability assigned to each RV key.
rv_probabilities

{'fd1B1=1': 0.0625,
 'fd1B1=2': 0.0625,
 'fd1B1=3': 0.0625,
 'fd1B1=4': 0.0625,
 'fd1B1=5': 0.0625,
 'fd1B1=6': 0.0625,
 'fd1B1=7': 0.0625,
 'fd1B1=8': 0.0625,
 'fd1B1=9': 0.0625,
 'fd1B1=10': 0.0625,
 'fd1B1=11': 0.0625,
 'fd1B1=12': 0.0625,
 'fd1B1=13': 0.0625,
 'fd1B1=14': 0.0625,
 'fd1B1=15': 0.0625,
 'fd1B1=16': 0.0625}

In [21]:
# Inspect the list returned as consistent_rows.
consistent_rows

[6, 7, 8]

# Step 3: Convert back to repaired database table

In [22]:
from src.utilities.step3_convert_back_to_db import *

In [23]:
# Build the repaired rows from the RV definitions, the preprocessed frame and the consistent-row list.
# NOTE(review): each row is presumably the non-uuid column values followed by a sentence string,
# since that is how the next cell labels the columns — confirm against create_repaired_rows.
repaired_rows = create_repaired_rows(rv_definitions,df_data,consistent_rows)

In [24]:
# Column labels for the repaired table: every df_data column except 'uuid',
# followed by the '_sentences' column.
columns = [name for name in df_data.columns if name != 'uuid']
columns.append('_sentences')

repaired_df = pd.DataFrame(data=repaired_rows, columns=columns)

In [25]:
# Display the repaired frame built from repaired_rows.
repaired_df

Unnamed: 0,A,B,C,D,_sentences
0,1,2,11,16,fd1B1=1 | fd1B1=2 | fd1B1=3 | fd1B1=4 | fd1B1=...
1,1,2,12,17,fd1B1=1 | fd1B1=2 | fd1B1=3 | fd1B1=4 | fd1B1=...
2,1,2,10,15,fd1B1=2 | fd1B1=5 | fd1B1=6 | fd1B1=8
3,1,2,11,14,fd1B1=3 | fd1B1=5 | fd1B1=7 | fd1B1=8
4,1,2,14,21,fd1B1=4 | fd1B1=6 | fd1B1=7 | fd1B1=8
5,1,4,10,15,fd1B1=9 | fd1B1=10 | fd1B1=11 | fd1B1=12 | fd1...
6,1,4,11,14,fd1B1=9 | fd1B1=10 | fd1B1=11 | fd1B1=12 | fd1...
7,1,4,11,16,fd1B1=10 | fd1B1=13 | fd1B1=14 | fd1B1=16
8,1,4,12,17,fd1B1=11 | fd1B1=13 | fd1B1=15 | fd1B1=16
9,1,4,14,21,fd1B1=12 | fd1B1=14 | fd1B1=15 | fd1B1=16


# Step 4: Merge identical rows by combining their sentences

In [26]:
from src.utilities.step4_group_similar_sentences import *

In [27]:
# Run the Step 4 grouping helper over the repaired frame and rebind the name to its result.
# NOTE(review): because repaired_df is overwritten with its own transform, re-running this
# cell operates on the already-grouped frame — confirm the helper tolerates that.
repaired_df = group_similar_rows_together(repaired_df)

In [28]:
# Display the frame returned by the grouping step.
repaired_df

Unnamed: 0,A,B,C,D,_sentences
0,1,2,10,15,fd1B1=2 | fd1B1=5 | fd1B1=6 | fd1B1=8
1,1,2,11,14,fd1B1=3 | fd1B1=5 | fd1B1=7 | fd1B1=8
2,1,2,11,16,fd1B1=1 | fd1B1=2 | fd1B1=3 | fd1B1=4 | fd1B1=...
3,1,2,12,17,fd1B1=1 | fd1B1=2 | fd1B1=3 | fd1B1=4 | fd1B1=...
4,1,2,14,21,fd1B1=4 | fd1B1=6 | fd1B1=7 | fd1B1=8
5,1,4,10,15,fd1B1=9 | fd1B1=10 | fd1B1=11 | fd1B1=12 | fd1...
6,1,4,11,14,fd1B1=9 | fd1B1=10 | fd1B1=11 | fd1B1=12 | fd1...
7,1,4,11,16,fd1B1=10 | fd1B1=13 | fd1B1=14 | fd1B1=16
8,1,4,12,17,fd1B1=11 | fd1B1=13 | fd1B1=15 | fd1B1=16
9,1,4,14,21,fd1B1=12 | fd1B1=14 | fd1B1=15 | fd1B1=16


# Step 5: Insert into DuBio

In [29]:
# Write the repaired table to prototype1.test_prototype_1_prob, replacing any existing
# table of that name; the DataFrame index is not stored as a column.
repaired_df.to_sql(
    name='test_prototype_1_prob',
    con=engine,
    schema='prototype1',
    if_exists='replace',
    index=False,
)

13

In [30]:
# Serialize the RV probabilities as 'key:probability' pairs separated by ';'.
entries = [
    f"{rv_name}:{rv_prob}"
    for rv_name, rv_prob in rv_probabilities.items()
]
joined = ';'.join(entries)

# Assemble the three-line UPDATE that adds those pairs to the 'prototype_dict'
# row of prototype1._dict. The keys are generated by this notebook, so they
# are interpolated directly into the statement text.
update_stmt = (
    "UPDATE prototype1._dict\n"
    f"SET dict=add(dict, '{joined}')\n"
    "WHERE name='prototype_dict';"
)

print(update_stmt)

UPDATE prototype1._dict
SET dict=add(dict, 'fd1B1=1:0.0625;fd1B1=2:0.0625;fd1B1=3:0.0625;fd1B1=4:0.0625;fd1B1=5:0.0625;fd1B1=6:0.0625;fd1B1=7:0.0625;fd1B1=8:0.0625;fd1B1=9:0.0625;fd1B1=10:0.0625;fd1B1=11:0.0625;fd1B1=12:0.0625;fd1B1=13:0.0625;fd1B1=14:0.0625;fd1B1=15:0.0625;fd1B1=16:0.0625')
WHERE name='prototype_dict';


In [31]:
# Wrap the generated SQL string, run it on a fresh connection and commit explicitly;
# leaving the `with` block closes the connection.
statement = text(update_stmt)
with engine.connect() as conn:
    conn.execute(statement)
    conn.commit()