In [11]:
# validates randomly generated list-based ODs directly on the data
import pandas as pd
import random
from tqdm.notebook import tqdm
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta

In [12]:
def unique_counts(x: pd.Series):
    """Return how many distinct values ``x`` holds, counting missing values as a value."""
    # dropna=False keeps NaN in the tally, matching len(x.unique()).
    return x.nunique(dropna=False)

def colsToString(cols: tuple[str], directions: tuple[bool]):
    """Render columns with their sort direction as a comma-separated string.

    Each column name is followed by "↑" when its direction is truthy and by
    "↓" otherwise.  Pairs are formed with ``zip``, so surplus entries of the
    longer argument are ignored.
    """
    rendered = []
    for name, ascending in zip(cols, directions):
        arrow = "↑" if ascending else "↓"
        rendered.append(name + arrow)
    return ",".join(rendered)


class ListBasedDependency():
    """A list-based order dependency candidate ``lhs -> rhs`` validated on a DataFrame.

    Each side is a list of column names plus a parallel list of booleans
    (``True`` = ascending, ``False`` = descending).  Missing values are ordered
    as the smallest value of their column: they come first when the column is
    sorted ascending and last when it is sorted descending.
    """

    def __init__(self, df: pd.DataFrame, lhs: list[str], lhsDirection: list[bool], rhs: list[str], rhsDirection : list[bool]) -> None:
        """Store the candidate and extend ``df`` with null-indicator columns.

        Args:
            df: data to validate against; it is not modified.
            lhs: column names of the left-hand side, in list order.
            lhsDirection: sort direction per ``lhs`` column (True = ascending).
            rhs: column names of the right-hand side, in list order.
            rhsDirection: sort direction per ``rhs`` column (True = ascending).
        """
        # One boolean "<col>_isna" helper column per input column; these are the
        # sort keys that place missing values explicitly (see create_sort_args).
        self.df = pd.concat([df, df.isna().add_suffix("_isna")], axis=1)
        self.lhs = lhs
        self.lhsDirection = lhsDirection
        self.rhs = rhs
        self.rhsDirection = rhsDirection

    def create_sort_args(self, columns, direction):
        """Build ``sort_values`` arguments that order nulls as the smallest value.

        Every column is preceded by its ``_isna`` flag column.  The flag is
        sorted with the inverse direction of the column itself, so nulls come
        first for an ascending column and last for a descending one.

        Returns:
            ``(sort_columns, sort_directions)`` as two parallel lists.
        """
        sort_cols = []
        sort_directions = []
        for col, ascending in zip(columns, direction):
            sort_cols.append(col + "_isna")
            # True (= null) must come first when ascending, so the flag is
            # sorted descending then, and ascending for a descending column.
            sort_directions.append(not ascending)
            sort_cols.append(col)
            sort_directions.append(ascending)
        return sort_cols, sort_directions

    def isValid(self):
        """Return True if the data contains neither a split nor a swap.

        * split: two rows agree on all ``lhs`` columns but differ in some
          ``rhs`` column.
        * swap: sorting by ``lhs`` and then stably by ``rhs`` changes the row
          order.
        """
        # no splits: every lhs group must hold exactly one distinct value per
        # rhs column (NaN counts as a value, NaN keys form their own group).
        distinct_per_group = self.df.groupby(self.lhs, dropna=False).agg(
            {col: (lambda values: len(values.unique())) for col in self.rhs}
        )
        if not (distinct_per_group == 1).all(axis=None):
            return False

        # no swaps: compare row *positions*, not index labels, so that a frame
        # with a non-unique index cannot hide a reordering.
        frame = self.df.reset_index(drop=True)
        lhs_cols, lhs_ascending = self.create_sort_args(self.lhs, self.lhsDirection)
        sorted_by_lhs = frame.sort_values(lhs_cols, ascending=lhs_ascending)
        rhs_cols, rhs_ascending = self.create_sort_args(self.rhs, self.rhsDirection)
        sorted_by_rhs = sorted_by_lhs.sort_values(rhs_cols, ascending=rhs_ascending, kind="stable")
        return bool((sorted_by_lhs.index == sorted_by_rhs.index).all())

    def __str__(self):
        """Format the candidate as ``[lhs columns] -> [rhs columns]`` with direction arrows."""
        left = colsToString(self.lhs, self.lhsDirection)
        right = colsToString(self.rhs, self.rhsDirection)
        return "[" + left + "] -> [" + right + "]"


In [13]:

def isPrefixOf(listA, listB):
    """Return True if ``listA`` equals the first ``len(listA)`` elements of ``listB``."""
    # A longer sequence can never be a prefix of a shorter one.
    if len(listA) > len(listB):
        return False
    # zip stops after len(listA) pairs because listA is the shorter side here.
    return not any(a != b for a, b in zip(listA, listB))

def generateCnadidates(df, valid_amount = 100, max_side_length=6, max_tries = 10_000, timeout = timedelta(minutes=10)):
    """Randomly sample list-based OD candidates over ``df`` and validate each one.

    Args:
        df: data to validate the candidates on; it is not modified.
        valid_amount: stop once this many distinct valid candidates were found.
        max_side_length: maximum number of columns per side (capped at the
            number of columns in ``df``).
        max_tries: maximum number of sampled candidates, including skipped ones.
        timeout: wall-clock budget after which sampling is cancelled.

    Returns:
        ``(valids, invalids)``: two sets with the string form of the valid and
        invalid candidates.  At most 10_000 invalid candidates are collected.
    """
    # Missing values in float columns are replaced by -inf so they order as the
    # smallest value.  Other columns are left alone: a float sentinel inside a
    # string or datetime column makes it unsortable (TypeError).  fillna returns
    # a new frame, so the caller's DataFrame is no longer mutated.
    float_cols = [col for col, dtype in df.dtypes.items() if pd.api.types.is_float_dtype(dtype)]
    if float_cols:
        df = df.fillna({col: -np.inf for col in float_cols})

    valids = set()
    invalids = set()
    columns = sorted(df.columns)  # loop-invariant sampling population
    max_side_length = min(max_side_length, len(columns))
    if max_side_length < 1:
        # Nothing to sample from (no columns, or a non-positive side length).
        return valids, invalids

    starttime = datetime.now()
    i = 0
    while len(valids) < valid_amount and i < max_tries:
        if i % 1000 == 0:
            print(f"{i=}")
        i += 1
        if datetime.now() - starttime > timeout:
            print(f"Cancel after timeout with {len(valids)=} and {len(invalids)=}")
            return valids, invalids
        lhsSize = random.randint(1, max_side_length)
        rhsSize = random.randint(1, max_side_length)
        lhs = random.sample(columns, lhsSize)
        rhs = random.sample(columns, rhsSize)
        lhsDirection = [random.choice([True, False]) for _ in range(lhsSize)]
        rhsDirection = [random.choice([True, False]) for _ in range(rhsSize)]

        # skip trivial ODs: the rhs, *including* its directions, is a prefix of
        # the lhs.  Columns and directions must be compared as pairs; checking
        # them independently would also discard e.g. [A↑] -> [B↑].
        if isPrefixOf(list(zip(rhs, rhsDirection)), list(zip(lhs, lhsDirection))):
            continue

        od = ListBasedDependency(df, lhs, lhsDirection, rhs, rhsDirection)
        candidate = str(od)

        if od.isValid():
            if candidate not in valids:
                valids.add(candidate)
                # Report only when the set actually grew to a multiple of ten.
                if len(valids) % 10 == 0:
                    print(f"{len(valids)=}")
        elif len(invalids) < 10_000:
            invalids.add(candidate)
    return valids, invalids

In [14]:
# Root directory of the example data; the dataset scan reads from root_path / "datasets".
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
root_path = Path("/Users/paulsieben/HPI/WiSe 2023-2024 Advanced Data Profiling/Example Data and ODs")

In [15]:
from tqdm.notebook import tqdm
# Output root for the generated candidate lists.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
result_path = Path('/Users/paulsieben/Programming/OrderDependencyTester/testdata')

# Collect the CSV files up front: a sorted list gives tqdm a total and makes the
# processing order reproducible.
csv_files = sorted(
    path for path in (root_path / "datasets").glob("**/*")
    if not path.is_dir() and path.name.endswith('.csv')
)
for file in tqdm(csv_files):
    candidate_dir = result_path / "candidates" / str(file.parent.name)
    valid_path = candidate_dir / (str(file.name) + ".valids.txt")
    invalid_path = candidate_dir / (str(file.name) + ".invalids.txt")
    # Skip datasets whose two result files already exist.
    if valid_path.exists() and invalid_path.exists(): continue
    print(file)

    valids, invalids = generateCnadidates(pd.read_csv(file, sep='\t'))

    candidate_dir.mkdir(parents=True, exist_ok=True)
    # The candidates contain "↑"/"↓", so the encoding is fixed to UTF-8 instead
    # of the locale default; sorting makes the files deterministic.
    valid_path.write_text('\n'.join(sorted(valids)), encoding="utf-8")
    invalid_path.write_text('\n'.join(sorted(invalids)), encoding="utf-8")
    


0it [00:00, ?it/s]

/Users/paulsieben/HPI/WiSe 2023-2024 Advanced Data Profiling/Example Data and ODs/datasets/credit/category.csv
i=0
len(valids)=10
len(valids)=20
len(valids)=30
len(valids)=40
len(valids)=50
len(valids)=60
len(valids)=70
len(valids)=80
len(valids)=80
len(valids)=90
len(valids)=100


In [59]:
# Load the tab-separated provider.csv of the credit dataset for manual spot checks.
# `x` is a second name bound to the same DataFrame object.
df = x = pd.read_csv("/Users/paulsieben/HPI/WiSe 2023-2024 Advanced Data Profiling/Example Data and ODs/datasets/credit/provider.csv", sep="\t")


In [60]:
# Spot check of a single candidate: [mail_code↓] -> [phone_no↓,issue_dt↑]
ListBasedDependency(df,["mail_code"],[False],["phone_no","issue_dt"],[False,True]).isValid()

False

In [61]:
# Create a pandas time series of 10 daily timestamps ending at the current time.
# The values keep the current time of day; they are not normalised to midnight.

s = pd.date_range(end=datetime.now(), periods=10, freq='D').to_series()

In [62]:
# Experiment: replacing missing timestamps with -inf mixes floats into the
# datetime values, and sorting then fails with the TypeError shown below.
pd.to_datetime(pd.concat([s, pd.Series([np.nan, np.nan])])).fillna(-np.inf).sort_values()

TypeError: '<' not supported between instances of 'float' and 'Timestamp'

In [None]:
# String series with one missing value, used to repeat the sentinel experiment on text data.
s2 = pd.Series(["a","b","3",np.nan])

In [None]:
# Experiment: the -inf sentinel is a float among strings, so sorting fails with
# the TypeError shown below.
s2.fillna(-np.inf).sort_values()

TypeError: '<' not supported between instances of 'float' and 'str'