In [31]:
# Validates randomly generated list-based order dependencies (ODs) directly on the data.
import pandas as pd
import random
from tqdm.notebook import tqdm

In [32]:
# Read the semicolon-separated, headerless file and keep only its first 28 columns.
# With header=None the columns are labelled by integer position.
df = pd.read_csv(
    "horse-sub.csv",
    sep=";",
    header=None,
).iloc[:, :28]

In [33]:
def number_to_excel_column(n):
    """Convert a 1-based column number into a spreadsheet-style letter label.

    Examples: 1 -> "A", 26 -> "Z", 27 -> "AA", 703 -> "AAA".
    A value of ``n`` that is zero or negative yields the empty string.
    """
    letters = []
    remaining = n
    while remaining > 0:
        # Subtract one before dividing: the letter system has no "zero" digit,
        # so each position runs over 1..26 rather than 0..25.
        remaining, offset = divmod(remaining - 1, 26)
        letters.append(chr(ord("A") + offset))
    # Letters were produced from the least significant position upwards.
    return "".join(reversed(letters))

In [34]:
def unique_counts(x: pd.Series):
    """Return the number of distinct values in ``x``.

    Distinctness follows Python ``set`` semantics (hash and equality).
    NOTE(review): NaN does not compare equal to itself, so missing values may
    each be counted as a separate value -- confirm this is the intended
    treatment of missing data.
    """
    distinct_values = {value for value in x}
    return len(distinct_values)

def colsToString(cols: tuple[str], directions: tuple[bool]):
    """Render columns and their sort directions as a comma-separated string.

    Each entry of ``cols`` is passed to ``number_to_excel_column`` as
    ``col + 1`` and suffixed with "↑" when the matching entry of
    ``directions`` is truthy, otherwise "↓". The two sequences are paired
    with ``zip``, so surplus entries of the longer one are ignored.

    NOTE(review): the annotation says ``tuple[str]``, but ``col + 1`` only
    works for numeric entries -- the annotation looks wrong; confirm.
    """
    rendered = []
    for col, direction in zip(cols, directions):
        arrow = "↑" if direction else "↓"
        rendered.append(number_to_excel_column(col + 1) + arrow)
    return ",".join(rendered)


class ListBasedDependency():
    """A candidate list-based order dependency ``lhs -> rhs`` over a DataFrame.

    Attributes:
        df: the DataFrame the dependency is validated against.
        lhs / rhs: lists of column labels forming the two sides.
        lhsDirection / rhsDirection: one boolean per column of the matching
            side; True means ascending, False means descending.
    """

    def __init__(self, df, lhs, lhsDirection, rhs, rhsDirection) -> None:
        self.df = df
        self.lhs = lhs
        self.lhsDirection = lhsDirection
        self.rhs = rhs
        self.rhsDirection = rhsDirection

    def isValid(self):
        """Return True when the dependency holds on ``self.df``.

        Two conditions are checked:

        * no swaps -- after sorting by the LHS, a stable sort by the RHS must
          leave the row order unchanged;
        * no splits -- every group of rows with equal LHS values must have
          exactly one distinct value in each RHS column.

        NOTE(review): ``groupby`` is called with its defaults, which in pandas
        exclude groups whose key contains NaN -- confirm that is the intended
        treatment of missing LHS values.
        """
        # no swaps: validate against the instance's own frame, not a global.
        sorted_by_lhs = self.df.sort_values(self.lhs, ascending=self.lhsDirection)
        sorted_by_rhs = sorted_by_lhs.sort_values(
            self.rhs, ascending=self.rhsDirection, kind="stable"
        )
        if not (sorted_by_lhs.index == sorted_by_rhs.index).all():
            # A swap already invalidates the dependency; skip the groupby.
            return False

        # no splits
        df_fd_check = self.df.groupby(self.lhs).agg(
            {col: unique_counts for col in self.rhs}
        )
        return bool((df_fd_check == 1).all(axis=None))

    def __str__(self):
        """Render the dependency as ``[<lhs>] -> [<rhs>]`` via ``colsToString``."""
        # Each side is rendered with its own directions.
        lhs_text = colsToString(self.lhs, self.lhsDirection)
        rhs_text = colsToString(self.rhs, self.rhsDirection)
        return "[" + lhs_text + "] -> [" + rhs_text + "]"


In [35]:
# Seed the RNG so that Restart & Run All regenerates the same candidate ODs.
random.seed(42)

# The set of candidate columns never changes, so sort it once up front
# instead of twice per generated candidate.
candidate_columns = sorted(df.columns)

# For each LHS size, draw 10000 random candidates with a 2-column RHS,
# validate each on the data, and write the rendered ODs to text files.
for lhsSize in tqdm([5,10]):
    rhsSize = 2
    valids = []
    invalids = []

    for i in tqdm(range(10000),leave=False):
        # LHS and RHS are sampled independently, so they may share columns.
        lhs = random.sample(candidate_columns, lhsSize)
        rhs = random.sample(candidate_columns, rhsSize)
        lhsDirection = [random.choice([True, False]) for _ in range(lhsSize)]
        rhsDirection = [random.choice([True, False]) for _ in range(rhsSize)]
        od = ListBasedDependency(df, lhs, lhsDirection, rhs, rhsDirection)

        if od.isValid():
            valids.append(str(od))

        # Keep every valid OD, but cap the stored invalid ones at 1000.
        elif len(invalids) < 1000:
            invalids.append(str(od))

    # The rendered ODs contain "↑"/"↓"; write them as UTF-8 explicitly rather
    # than relying on the platform's default encoding.
    with open(f"valid_{lhsSize}_{rhsSize}.txt","w",encoding="utf-8") as f:
        f.write("\n".join(valids))

    with open(f"invalid_{lhsSize}_{rhsSize}.txt","w",encoding="utf-8") as f:
        f.write("\n".join(invalids))


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]