In [11]:
import json

def parse_nycdeaths_dc(dc_file_content):
    """Parse NYC deaths format denial constraints from file content.

    Every non-empty line is treated as one denial constraint. An optional
    ``not(`` ... ``)`` wrapper is removed, the remainder is split on
    ``' and '`` into predicates, and each predicate is split on its
    comparison operator.

    Args:
        dc_file_content: Full text of a denial-constraint file.

    Returns:
        A list with one dict per constraint that has at least one valid
        predicate. Each dict has the shape read by ``get_dc_set``::

            {"predicates": [
                {"column1": {"columnIdentifier": <str>},
                 "op": <operator name>,
                 "column2": {"columnIdentifier": <str>}},
                ...
            ]}

        Predicates without exactly one recognised operator are skipped, and
        lines that yield no predicates produce no entry.
    """
    # Order matters: two-character operators are tried before '<' and '>'
    # so that '<=' / '>=' / '<>' are never mistaken for the shorter ones.
    operators = (
        ('==', 'EQUAL'),
        ('<>', 'UNEQUAL'),
        ('<=', 'LESS_EQUAL'),
        ('>=', 'GREATER_EQUAL'),
        ('<', 'LESS'),
        ('>', 'GREATER'),
    )

    def column_identifier(operand):
        """Return the operand without a leading tuple alias such as 't1.'."""
        # NOTE(review): assumes operands look like 't1.column' -- TODO confirm
        # against the DC files. Anything else (e.g. '5.0') is kept unchanged.
        operand = operand.strip()
        alias, dot, rest = operand.partition('.')
        if dot and rest and alias[:1] == 't' and alias[1:].isdigit():
            return rest
        return operand

    dcs = []

    for raw_line in dc_file_content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Remove the "not(" prefix, and the closing parenthesis only when it
        # is really there, so a malformed line does not lose a character.
        if line.startswith('not('):
            line = line[4:]
            if line.endswith(')'):
                line = line[:-1]

        dc_predicates = []
        for pred in line.split(' and '):
            for symbol, op in operators:
                parts = pred.split(symbol)
                if len(parts) == 2:
                    break
            else:
                continue  # Skip invalid predicates

            left, right = parts
            dc_predicates.append({
                "column1": {"columnIdentifier": column_identifier(left)},
                "op": op,
                "column2": {"columnIdentifier": column_identifier(right)},
            })

        if dc_predicates:
            dcs.append({"predicates": dc_predicates})

    return dcs


In [12]:
def get_dc_set(dcs):
    """Turn a list of denial constraints into an order-independent set.

    Each predicate is rendered as the string ``"<column1>, <op>, <column2>"``
    using the ``columnIdentifier`` of both columns. The predicates of one
    constraint are collected into a frozenset, so neither predicate order
    inside a constraint nor constraint order in ``dcs`` affects the result.

    Args:
        dcs: Iterable of dicts, each with a ``"predicates"`` list whose items
            provide ``"column1"``, ``"op"`` and ``"column2"``.

    Returns:
        A set of frozensets of predicate strings.
    """
    return {
        frozenset(
            ", ".join(
                (
                    entry["column1"]["columnIdentifier"],
                    entry["op"],
                    entry["column2"]["columnIdentifier"],
                )
            )
            for entry in constraint["predicates"]
        )
        for constraint in dcs
    }

In [14]:
# Example usage: load the original and the shuffled constraint files,
# parse both, and reduce each to an order-independent set of constraints.
with open("../nycdeathsdc", "r") as file:
    content = file.read()
with open("../nycdeathsdc_shuff", "r") as file:
    content_shuffled = file.read()

# The right-hand sides run left to right, so the call order is unchanged.
dcs, dcs_shuffled = (
    parse_nycdeaths_dc(content),
    parse_nycdeaths_dc(content_shuffled),
)
dc_set, dc_set_shuffled = get_dc_set(dcs), get_dc_set(dcs_shuffled)

print(dc_set)

{frozenset({'sex, EQUAL, sex', 'deathrate, GREATER, deathrate', 'raceethnicity, EQUAL, raceethnicity', 'ageadjusteddeathrate, EQUAL, ageadjusteddeathrate', 'deaths, LESS, deaths', 'leadingcause, UNEQUAL, leadingcause'}), frozenset({'deathrate, GREATER, deathrate', 'ageadjusteddeathrate, LESS, ageadjusteddeathrate', 'leadingcause, EQUAL, leadingcause', 'deaths, EQUAL, deaths', 'sex, UNEQUAL, sex'}), frozenset({'deathrate, EQUAL, deathrate', 'raceethnicity, EQUAL, raceethnicity', 'leadingcause, EQUAL, leadingcause', 'ageadjusteddeathrate, GREATER, ageadjusteddeathrate', 'year, EQUAL, year'}), frozenset({'deathrate, EQUAL, deathrate', 'ageadjusteddeathrate, LESS, ageadjusteddeathrate', 'leadingcause, EQUAL, leadingcause', 'year, EQUAL, year', 'sex, UNEQUAL, sex', 'deaths, LESS, deaths'}), frozenset({'deathrate, EQUAL, deathrate', 'raceethnicity, EQUAL, raceethnicity', 'leadingcause, EQUAL, leadingcause', 'ageadjusteddeathrate, UNEQUAL, ageadjusteddeathrate', 'year, EQUAL, year'}), frozens