In [8]:
# Validates randomly generated list-based order dependencies (ODs) directly against the data.
import pandas as pd
import random
from tqdm.notebook import tqdm
import re

In [9]:
# Load the relation under test. The file is ';'-separated and is read without a
# header row (header=None), so the columns get integer labels instead of names.
df = pd.read_csv("minimal1.csv",sep=";",header=None)
# Bare expression: display the loaded frame as the cell output.
df

Unnamed: 0,0,1,2,3
0,0,-5,10,20
1,10,-6,20,10
2,20,-7,10,15
3,30,-8,20,5


In [10]:
def number_to_excel_column(n):
    """Convert a 1-based column number to its spreadsheet-style letter name.

    1 -> "A", 26 -> "Z", 27 -> "AA", and so on (bijective base 26).
    Numbers that are not positive yield the empty string.
    """
    letters = []  # collected least-significant letter first
    remaining = n
    while remaining > 0:
        # Shift by one so that the 26 letters map onto remainders 0..25.
        remaining, offset = divmod(remaining - 1, 26)
        letters.append(chr(ord("A") + offset))
    return "".join(reversed(letters))

def excel_column_to_number(column_str):
    """Convert a spreadsheet-style column name to its 1-based column number.

    "A" -> 1, "Z" -> 26, "AA" -> 27, and so on; the empty string gives 0.
    """
    base = ord("A") - 1  # 64, so that 'A' counts as digit 1 and 'Z' as digit 26
    total = 0
    # Horner scheme: walk the letters most-significant first.
    for letter in column_str:
        total = total * 26 + (ord(letter) - base)
    return total

In [11]:
# Matches a textual dependency of the form "[<lhs specs>] -> [<rhs specs>]".
# Group 1 captures the text inside the first pair of brackets, group 2 the text
# inside the second pair.
listbased_matcher = re.compile(r"\[(.+)\] -> \[(.+)\]")
def unique_counts(x: pd.Series):
    """Return how many distinct values occur in ``x``."""
    distinct_values = {value for value in x}
    return len(distinct_values)

def colsToString(cols: list[int], directions: list[bool]):
    """Render an attribute list as text, e.g. ``"A↑,C↓"``.

    Args:
        cols: Zero-based column indices; each is shifted by one and converted
            to a letter name with ``number_to_excel_column``.
        directions: One flag per column; a truthy flag renders as "↑",
            a falsy one as "↓".

    Returns:
        The comma-separated specs (no spaces). Columns and directions are
        paired positionally; surplus entries of the longer input are ignored.
    """
    return ",".join(
        number_to_excel_column(col + 1) + ("↑" if direction else "↓")
        for col, direction in zip(cols, directions)
    )


class ListBasedDependency():
    """A list-based order dependency ``[lhs] -> [rhs]`` checked against a DataFrame.

    Both sides are lists of column labels (zero-based integers when built via
    ``from_string``), each paired with a direction flag: True for ascending
    ("↑"), False for descending ("↓").
    """

    def __init__(self, df, lhs, lhsDirection, rhs, rhsDirection) -> None:
        """Store the frame and both attribute lists with their directions.

        Args:
            df: DataFrame the dependency is validated against.
            lhs: Column labels of the left-hand side, in list order.
            lhsDirection: One bool per lhs column; True means ascending.
            rhs: Column labels of the right-hand side, in list order.
            rhsDirection: One bool per rhs column; True means ascending.
        """
        self.df = df
        self.lhs = lhs
        self.lhsDirection = lhsDirection
        self.rhs = rhs
        self.rhsDirection = rhsDirection

    @staticmethod
    def parse_attrlist(alist):
        """Parse a comma-separated spec list such as ``"A↑,C↓"``.

        Each spec is a letter column name followed by one direction character.
        Whitespace around a spec is ignored, so ``"A↑, C↓"`` parses as well.

        Returns:
            ``(attributes, directions)``: zero-based column indices and, per
            column, True if the direction character is "↑" (anything else is
            treated as descending).
        """
        orderspecs = [spec.strip() for spec in alist.split(',')]
        attributes = [excel_column_to_number(spec[:-1]) - 1 for spec in orderspecs]
        directions = [spec[-1] == '↑' for spec in orderspecs]
        return attributes, directions

    @staticmethod
    def from_string(df, s):
        """Build a dependency from text of the form ``"[A↑,B↓] -> [C↑]"``.

        Only the start of ``s`` has to match, so a trailing newline is fine.

        Raises:
            ValueError: if ``s`` does not start with the expected pattern.
        """
        match = listbased_matcher.match(s)
        if match is None:
            # Fail with the offending text instead of an AttributeError on None.
            raise ValueError(f"not a list-based dependency: {s!r}")
        lhs, rhs = match.groups()
        lhs, lhsDirections = ListBasedDependency.parse_attrlist(lhs)
        rhs, rhsDirections = ListBasedDependency.parse_attrlist(rhs)
        return ListBasedDependency(df, lhs, lhsDirections, rhs, rhsDirections)

    def isValid(self):
        """Return True if the dependency holds on ``self.df``.

        Two conditions are checked:
          * no swaps: after ordering the rows by the lhs list, a stable
            re-sort by the rhs list leaves the row order unchanged;
          * no splits: within every group of equal lhs values, each rhs
            column has exactly one distinct value.
        """
        # no swaps
        sorted_by_lhs = self.df.sort_values(self.lhs, ascending=self.lhsDirection)
        sorted_by_rhs = sorted_by_lhs.sort_values(self.rhs, ascending=self.rhsDirection, kind="stable")
        if not (sorted_by_lhs.index == sorted_by_rhs.index).all():
            # Already refuted; skip the comparatively expensive groupby.
            return False

        # no splits
        df_fd_check = self.df.groupby(self.lhs).agg({col: unique_counts for col in self.rhs})
        return bool((df_fd_check == 1).all(axis=None))

    def __str__(self):
        """Render as ``"[<lhs specs>] -> [<rhs specs>]"``, the format ``from_string`` reads."""
        result = "["
        result += colsToString(self.lhs, self.lhsDirection)
        result += "] -> ["
        result += colsToString(self.rhs, self.rhsDirection)
        result += "]"
        return result


In [12]:
# Check the dependencies listed in to-test-invalid.txt (presumably ones that are
# expected NOT to hold on `df` — confirm) and print every one that validates.
with open('/Users/paulsieben/Programming/OrderDependencyTester/CliFrontend/to-test-invalid.txt') as f:
    # readlines() loads the whole file up front, so tqdm receives a list with a known length.
    for line in tqdm(f.readlines()):
        lb = ListBasedDependency.from_string(df,line)
        isValid = lb.isValid()
        if isValid:
            print(lb)

  0%|          | 0/318290 [00:00<?, ?it/s]

In [13]:
# Check the dependencies listed in to-test-valid.txt (presumably ones that are
# expected to hold on `df` — confirm) and print every one that fails validation.
with open('/Users/paulsieben/Programming/OrderDependencyTester/CliFrontend/to-test-valid.txt') as f:
    # readlines() loads the whole file up front, so tqdm receives a list with a known length.
    for line in tqdm(f.readlines()):
        lb = ListBasedDependency.from_string(df,line)
        isValid = lb.isValid()
        if not isValid:
            print(lb)


  0%|          | 0/81134 [00:00<?, ?it/s]