# Extended ra operations
> Relational algebra operations and IE function calculation operations

In [None]:
#| default_exp ra

In [None]:
#| hide
from nbdev.showdoc import show_doc
from IPython.display import display, HTML
%load_ext autoreload
%autoreload 2

In [None]:
#| export
#| export
import pytest
import pandas as pd
pd.set_option("mode.copy_on_write", True)
import numpy as np
from typing import no_type_check, Set, Sequence, Any,Optional,List,Callable,Dict,Union
import networkx as nx
import itertools

from spannerlib.utils import assert_df_equals
from spannerlib.span import Span
from spannerlib.data_types import _infer_relation_schema,pretty

import logging
logger = logging.getLogger(__name__)

In [None]:
#| export
def _col_names(length):
    """Return the placeholder column names ``col_0`` .. ``col_{length-1}``."""
    # These names cannot conflict with logical variables, since logical
    # variables must always start with an uppercase letter.
    return list(map('col_{}'.format, range(length)))



In [None]:
# Two-column fixture: the first three rows hold equal values in both columns,
# the last row does not.
s = pd.DataFrame([
    [1,1],
    [2,2],
    [3,3],
    [4,5]
])

# Three-column fixture: the two middle rows agree on their first two columns.
s2 = pd.DataFrame([
    [1,2,3],
    [2,3,4],
    [2,3,5],
    [4,5,6]
])

In [None]:
df = pd.DataFrame([
    [1,2],
    [1,3],
    [1,2],
    ])
df

Unnamed: 0,0,1
0,1,2
1,1,3
2,1,2


### RA

In [None]:
#| export

# some select theta functions

class equalConstTheta():
    """Selection predicate that compares columns against constants.

    Each condition is a ``(pos, val)`` pair meaning "the column at positional
    index ``pos`` equals ``val``". A row matches only if every condition holds.
    """
    def __init__(self,*pos_val_tuples):
        self.pos_val_tuples = pos_val_tuples

    def __call__(self,df):
        """Return a boolean Series, indexed like ``df``, marking the matching rows."""
        if not self.pos_val_tuples:
            # No conditions is vacuously true for every row
            # (pd.concat would raise on an empty list of masks).
            return pd.Series(True,index=df.index,dtype=bool)
        masks = [df.iloc[:,pos]==val for pos,val in self.pos_val_tuples]
        return pd.concat(masks,axis=1).all(axis=1)

    def __str__(self):
        return f'''Theta({', '.join([f'col_{pos}={val}' for pos,val in self.pos_val_tuples])})'''

    def __repr__(self):
        return str(self)

    def __eq__(self,other):
        if not isinstance(other,equalConstTheta):
            # Let Python fall back to its default comparison, which yields False.
            return NotImplemented
        return self.pos_val_tuples == other.pos_val_tuples

    def __hash__(self):
        # Defined alongside __eq__ so that equal predicates hash equally;
        # raises TypeError if one of the constants is itself unhashable.
        return hash((type(self).__name__,self.pos_val_tuples))

class equalColTheta():
    """Selection predicate that compares pairs of columns for equality.

    Each condition is a ``(pos1, pos2)`` pair meaning "the columns at positional
    indices ``pos1`` and ``pos2`` hold equal values". A row matches only if
    every condition holds.
    """
    def __init__(self,*col_pos_tuples):
        self.col_pos_tuples = col_pos_tuples

    def __call__(self,df):
        """Return a boolean Series, indexed like ``df``, marking the matching rows."""
        if not self.col_pos_tuples:
            # No conditions is vacuously true for every row
            # (pd.concat would raise on an empty list of masks).
            return pd.Series(True,index=df.index,dtype=bool)
        masks = [df.iloc[:,pos1]==df.iloc[:,pos2] for pos1,pos2 in self.col_pos_tuples]
        return pd.concat(masks,axis=1).all(axis=1)

    def __str__(self):
        return f'''Theta({', '.join([f'col_{pos1}=col_{pos2}' for pos1,pos2 in self.col_pos_tuples])})'''

    def __repr__(self):
        return str(self)

    def __eq__(self,other):
        if not isinstance(other,equalColTheta):
            # Let Python fall back to its default comparison, which yields False.
            return NotImplemented
        return self.col_pos_tuples == other.col_pos_tuples

    def __hash__(self):
        # Defined alongside __eq__ so that equal predicates hash equally.
        return hash((type(self).__name__,self.col_pos_tuples))

In [None]:
s4 = pd.DataFrame([
    [1,2,3,1],
    [1,4,4,1],
    [1,2,3,1],
    [1,4,4,0]
])
s4

Unnamed: 0,0,1,2,3
0,1,2,3,1
1,1,4,4,1
2,1,2,3,1
3,1,4,4,0


In [None]:
s4.iloc[:,0]==1

0    True
1    True
2    True
3    True
Name: 0, dtype: bool

In [None]:
# Constant predicate: column 0 must equal 1 and column 2 must equal 4.
assert list(equalConstTheta((0,1),(2,4))(s4)) == [False, True, False, True]
# Column predicate: column 0 must equal column 3 and column 1 must equal column 2.
assert list(equalColTheta((0,3),(1,2))(s4)) == [False, True, False, False]

 

In [None]:
#| export
def get_const(const_dict,**kwargs):
    """Build a one-row relation whose columns are the keys of ``const_dict``.

    Extra keyword arguments are accepted and ignored.
    """
    single_row = [const_dict]
    return pd.DataFrame(single_row)


def is_truthy(df):
    """Return True when ``df`` has exactly one row and no columns."""
    truthy_shape = (1,0)
    return df.shape == truthy_shape

def is_falsy(df):
    """Return True when ``df`` has neither rows nor columns."""
    falsy_shape = (0,0)
    return df.shape == falsy_shape

In [None]:
res = get_const({'_C1':1,'_C2':2,0:3})
assert_df_equals(res,pd.DataFrame([[1,2,3]],columns=['_C1','_C2',0]))

Unnamed: 0,_C1,_C2,0
0,1,2,3


In [None]:
#| export
def select(df,theta,schema,**kwargs):
    """Keep the rows of ``df`` for which the callable ``theta`` yields True.

    A missing or empty ``df`` gives an empty frame with columns ``schema``;
    this check runs before ``theta`` is looked at. Raises ``ValueError`` when
    ``theta`` is not callable.
    """
    nothing_to_filter = df is None or df.empty
    if nothing_to_filter:
        return pd.DataFrame(columns=schema)
    if not callable(theta):
        raise ValueError(f"theta must be callable, got {theta}")
    row_mask = theta(df)
    return df[row_mask]

def project(df,schema,**kwargs):
    """Return only the columns of ``df`` listed in ``schema``, in that order.

    A missing or empty ``df`` gives an empty frame with columns ``schema``.
    """
    has_data = df is not None and not df.empty
    return df[schema] if has_data else pd.DataFrame(columns=schema)
    
def rename(df,schema,**kwargs):
    """Return a copy of ``df`` whose columns are relabelled to ``schema``.

    The input frame is left untouched. A missing or empty ``df`` gives an
    empty frame with columns ``schema``.
    """
    if df is not None and not df.empty:
        relabelled = df.copy()
        relabelled.columns = schema
        return relabelled
    return pd.DataFrame(columns=schema)

def union(*dfs,schema,**kwargs):
    """Stack the rows of all non-empty inputs and drop duplicate rows.

    Inputs are combined by column position: their raw value arrays are
    concatenated, so the inputs' own column names are ignored and the result
    is labelled with ``schema``. With no non-empty input, an empty frame with
    columns ``schema`` is returned.
    """
    arrays = [frame.values for frame in dfs if frame is not None and not frame.empty]
    if not arrays:
        return pd.DataFrame(columns=schema)
    stacked = np.concatenate(arrays,axis=0)
    return pd.DataFrame(stacked,columns=schema).drop_duplicates()

def intersection(df1,df2,schema,**kwargs):
    """Return the rows present in both inputs.

    Implemented as an inner merge on every column of ``df1``. If either input
    is missing or empty, an empty frame with columns ``schema`` is returned.
    """
    if any(frame is None or frame.empty for frame in (df1,df2)):
        return pd.DataFrame(columns=schema)
    all_columns = list(df1.columns)
    return df1.merge(df2,how='inner',on=all_columns)

def difference(df1,df2,schema,**kwargs):
    """Return the rows that occur exactly once across ``df1`` and ``df2``.

    A missing or empty ``df1`` gives an empty frame with columns ``schema``.
    A missing or empty ``df2`` is treated as the empty relation, so the rows
    of ``df1`` are kept rather than discarded.

    NOTE(review): because ``drop_duplicates(keep=False)`` runs over the
    concatenation of both inputs, rows found only in ``df2`` are returned as
    well (a symmetric difference), and a row repeated inside ``df1`` is
    dropped entirely. Confirm whether a plain set difference was intended.
    """
    if df1 is None or df1.empty:
        return pd.DataFrame(columns=schema)
    if df2 is None or df2.empty:
        # Nothing to subtract: run the same pipeline over df1 alone.
        frames = [df1]
    else:
        frames = [df1,df2]
    return pd.concat(frames).drop_duplicates(keep=False)

def join(df1,df2,schema,**kwargs):
    """Join two relations on the logical variables they share.

    Join keys are the column names present in both inputs that are strings
    starting with an uppercase letter. When there are no such columns the
    result is the cross product of the inputs.

    A missing or falsy input gives an empty frame with columns ``schema``.
    A truthy input acts as the identity: the other input is returned as is.
    """
    for frame in (df1,df2):
        if frame is None or is_falsy(frame):
            return pd.DataFrame(columns=schema)

    # Joining with a truthy relation returns the other side unchanged;
    # this is what makes joining with a constant work.
    if is_truthy(df1):
        return df2
    if is_truthy(df2):
        return df1

    shared = set(df1.columns) & set(df2.columns)
    # only logical variables take part in the join condition
    keys = [name for name in shared if isinstance(name,str) and name[0].isupper()]
    if keys:
        return pd.merge(df1,df2,how='inner',on=keys)
    return pd.merge(df1,df2,how='cross')

def product(df1,df2,schema,**kwargs):
    """Return the Cartesian product of the two inputs (a cross merge).

    If either input is missing or empty, an empty frame with columns
    ``schema`` is returned.
    """
    if any(frame is None or frame.empty for frame in (df1,df2)):
        return pd.DataFrame(columns=schema)
    return df1.merge(df2,how='cross')

#### Tests

In [None]:
truthy = project(pd.DataFrame([[1,2,3]],columns=['A','B','C']),schema=[])
display(truthy)
truthy.shape

0


(1, 0)

In [None]:
falsey = project(pd.DataFrame([],columns=['A','B','C']),schema=[])
display(falsey)
falsey.shape


(0, 0)

In [None]:
assert_df_equals(
    join(s,truthy,schema=[0,1]),
    s
    )

Unnamed: 0,0,1
0,1,1
1,2,2
2,3,3
3,4,5


In [None]:
assert_df_equals(
    join(s,falsey,schema=[0,1]),
    pd.DataFrame(columns=[0,1])
)

Unnamed: 0,0,1


In [None]:
s3 = pd.DataFrame([
    [4,5,6],
    [5,6,7],
    [1,2,3],
    [7,8,9]
])
s3

Unnamed: 0,0,1,2
0,4,5,6
1,5,6,7
2,1,2,3
3,7,8,9


In [None]:
s

Unnamed: 0,0,1
0,1,1
1,2,2
2,3,3
3,4,5


In [None]:
s2_copy = s2.copy()
s2

Unnamed: 0,0,1,2
0,1,2,3
1,2,3,4
2,2,3,5
3,4,5,6


In [None]:
empty = pd.DataFrame()

truth = pd.DataFrame([()])

In [None]:
empty.empty

True

In [None]:
pd.DataFrame(columns=['a','b']).empty

True

In [None]:

assert_df_equals(select(empty,None,['X','Y']),pd.DataFrame(columns=['X','Y']))
assert_df_equals(rename(empty,['X','Y']),pd.DataFrame(columns=['X','Y']))
assert_df_equals(project(empty,['X','Y']),pd.DataFrame(columns=['X','Y']))


Unnamed: 0,X,Y


In [None]:
s

Unnamed: 0,0,1
0,1,1
1,2,2
2,3,3
3,4,5


In [None]:
# rows of s whose column 0 equals the constant 1
res = select(s,equalConstTheta((0,1)),[0,1])
assert_df_equals(res,pd.DataFrame([[1,1],],columns=[0,1]))

# rows of s2 whose column 0 equals 2 and whose column 1 equals 3
res = select(s2,equalConstTheta((0,2),(1,3)),[0,1,2])
assert_df_equals(res,pd.DataFrame([[2,3,4],[2,3,5]],columns=[0,1,2]))

# rows of s whose two columns hold the same value
res = select(s,equalColTheta((0,1)),[0,1])
assert_df_equals(res,pd.DataFrame([[1,1], [2,2], [3,3]],columns=[0,1]))


Unnamed: 0,0,1
0,1,1
1,2,2
2,3,3


In [None]:
s2

Unnamed: 0,0,1,2
0,1,2,3
1,2,3,4
2,2,3,5
3,4,5,6


In [None]:
res = project(s2,schema=[2,1])
assert_df_equals(res,pd.DataFrame([[3,2],[4,3],[5,3],[6,5]],columns=[2,1]))

Unnamed: 0,2,1
0,3,2
1,4,3
2,5,3
3,6,5


In [None]:
assert list(rename(s2,['X',1,2]).columns) == ['X',1,2]
assert list(rename(s2,['X',1,'Z']).columns) == ['X',1,'Z']

In [None]:
non_uniq_cols_df = pd.DataFrame([
    [1,1,1,1],
    [2,2,2,2],
    [3,3,3,3],
],columns=['X','Y','X','Y'])
non_uniq_cols_df

Unnamed: 0,X,Y,X.1,Y.1
0,1,1,1,1
1,2,2,2,2
2,3,3,3,3


In [None]:
res = rename(non_uniq_cols_df,schema=['X','Y','_F2','_F3'])
assert res.columns.tolist() == ['X', 'Y', '_F2', '_F3']
# make sure we didnt change input
assert non_uniq_cols_df.columns.tolist() == ['X', 'Y', 'X', 'Y']
res

Unnamed: 0,X,Y,_F2,_F3
0,1,1,1,1
1,2,2,2,2
2,3,3,3,3


In [None]:
res = union(s2,s3,schema=[0,1,2])
assert_df_equals(res,pd.DataFrame([
    [1,2,3],
    [2,3,4],
    [2,3,5],
    [4,5,6],
    [5,6,7],
    [7,8,9]
],columns=[0,1,2]))

Unnamed: 0,0,1,2
0,1,2,3
1,2,3,4
2,2,3,5
3,4,5,6
5,5,6,7
7,7,8,9


In [None]:
res = intersection(s2,s3,schema=[0,1,2])
assert_df_equals(res,pd.DataFrame([
    [1,2,3],
    [4,5,6]
],columns=[0,1,2]))

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6


In [None]:
# The expected rows include [5,6,7] and [7,8,9], which appear only in s3:
# the result pinned here is the symmetric difference of s2 and s3.
res = difference(s2,s3,schema=[0,1,2])
assert_df_equals(res,pd.DataFrame([
    [2,3,4],
    [2,3,5],
    [5,6,7],
    [7,8,9]
],columns=[0,1,2]))

Unnamed: 0,0,1,2
1,2,3,4
2,2,3,5
1,5,6,7
3,7,8,9


In [None]:
left = rename(s,[0,'Y'])
left

Unnamed: 0,0,Y
0,1,1
1,2,2
2,3,3
3,4,5


In [None]:
right = rename(s2,['Y','X',2])
right

Unnamed: 0,Y,X,2
0,1,2,3
1,2,3,4
2,2,3,5
3,4,5,6


In [None]:
res = join(left,right,schema=[0,'Y','X',2])
assert_df_equals(res,pd.DataFrame([
    [1,1,2,3],
    [2,2,3,4],
    [2,2,3,5]
],columns=[0,'Y','X',2]))


Unnamed: 0,0,Y,X,2
0,1,1,2,3
1,2,2,3,4
2,2,2,3,5


In [None]:
res = join(
    rename(s,['a','b']),s,
    schema=['a','b',0,1]
    )
assert len(res)==16
assert list(res.columns) == ['a', 'b', 0, 1]
res.head()

Unnamed: 0,a,b,0,1
0,1,1,1,1
1,1,1,2,2
2,1,1,3,3
3,1,1,4,5
4,2,2,1,1


### Calc ie operators

To calculate IE functions we need:
* a map-like operator that runs the function on each input tuple and generates `[input+output]` tuples


In [None]:
#| export
def coerce_tuple_like(name,func,input,output):
    """Normalise one output value of an IE function into a row.

    Tuples and lists are returned unchanged. A single ``int``, ``str`` or
    ``Span`` is wrapped into a one-element tuple. Anything else raises
    ``ValueError``; ``name``, ``func`` and ``input`` are used only to build
    that error message.
    """
    is_row_already = isinstance(output,(tuple,list))
    if is_row_already:
        return output

    if not isinstance(output,(int,str,Span)):
        raise ValueError(f"IEFunction {name} with underlying function {func}\n"
                         f"returned a value that is not a tuple/list or a primitive\n"
                         f"for input output pair ({input},{output})")

    # a lone primitive stands for a unary row
    return (output,)

def assert_ie_schema(name,func,value,expected_schema,arity,input_or_output='input'):
    """Check that the element types of ``value`` match ``expected_schema`` exactly.

    ``expected_schema`` is either a list of types or a callable that builds
    such a list from ``arity``. Types are compared with ``type(v)``, so
    subclasses do not match. Raises ``ValueError`` on a mismatch;
    ``input_or_output`` only affects the wording of the message.
    """
    expected_schema = expected_schema(arity) if callable(expected_schema) else expected_schema
    observed_schema = list(map(type,value))
    if observed_schema == expected_schema:
        return
    raise ValueError(
        f"IEFunction {name} with underlying function {func}\n"
        f"received an {input_or_output} value {value}(schema={pretty(_infer_relation_schema(value))})\n"
        f"but expected {pretty(expected_schema)}")

def assert_iterable(name,func,input,output):
    """Raise ``ValueError`` unless ``output`` can be iterated over.

    ``name``, ``func`` and ``input`` are used only to build the error message.
    """
    try:
        iter(output)  # probe only; the iterator itself is discarded
    except TypeError:
        raise ValueError(
            f"IEFunction {name} with underlying function {func}\n"
            f"returned a value that is not an iterable\n"
            f"for input {input} -> {output}"
        )

def map_iter(df,name,func,in_schema,out_schema,in_arity,out_arity,**kwargs):
    """Lazily apply ``func`` to every row of ``df`` and yield combined rows.

    For each input row the function is called with the row's values as
    positional arguments and must return an iterable of output rows. Each
    output row may be a tuple/list or a single primitive, which is wrapped
    into a unary row. Every yielded item is the list ``input values + output
    values``.

    Raises ``ValueError`` (from the validation helpers) when an input or
    output row does not match its schema, or when ``func`` returns something
    that is not iterable.
    """
    if df.shape[1] == 0:
        # itertuples yields nothing for a frame without columns;
        # still visit every row once, with an empty argument list.
        rows = (() for _ in range(len(df)))
    else:
        # itertuples keeps each column's own dtype. iterrows would first
        # collapse the row into a single-dtype Series, e.g. turning ints into
        # floats in a mixed int/float frame and failing the schema check below.
        rows = df.itertuples(index=False,name=None)
    for row in rows:
        in_row = list(row)
        assert_ie_schema(name,func,in_row,in_schema,in_arity,input_or_output='input')
        output = func(*in_row)
        assert_iterable(name,func,in_row,output)
        for out_row in output:
            out_row = coerce_tuple_like(name,func,in_row,out_row)
            out_row = list(out_row)
            assert_ie_schema(name,func,out_row,out_schema,out_arity,input_or_output='output')
            yield in_row + out_row

def ie_map(df,name,func,in_schema,out_schema,in_arity,out_arity,**kwargs):
    """Apply an IE function to every row of ``df`` and collect the results.

    Each result row is the input row followed by one output row of ``func``.
    The returned frame has ``in_arity + out_arity`` columns with the
    placeholder names from ``_col_names``. A missing or empty ``df`` gives an
    empty frame with those same columns.
    """
    no_input = df is None or df.empty
    if no_input:
        return pd.DataFrame(columns=_col_names(in_arity+out_arity))
    result_columns = _col_names(in_arity + out_arity)
    result_rows = map_iter(df,name,func,in_schema,out_schema,in_arity,out_arity)
    return pd.DataFrame(result_rows,columns=result_columns)





#### Tests

In [None]:
s2

Unnamed: 0,0,1,2
0,1,2,3
1,2,3,4
2,2,3,5
3,4,5,6


In [None]:
def func(x,y): return [(x+y,x-y)]
def func2(x,y,z): return [(x,y)]
res = ie_map(s,'F',func,[int,int],[int,int],in_arity=2,out_arity=2)
assert_df_equals(res,pd.DataFrame([
    [1,1,2,0],
    [2,2,4,0],
    [3,3,6,0],
    [4,5,9,-1]
],columns=['col_0','col_1','col_2','col_3']))


Unnamed: 0,col_0,col_1,col_2,col_3
0,1,1,2,0
1,2,2,4,0
2,3,3,6,0
3,4,5,9,-1


In [None]:
res =ie_map(s2,'F',func2,[int,int,int],[int,int],in_arity=3,out_arity=2) 
assert_df_equals(res,pd.DataFrame([
    [1,2,3,1,2],
    [2,3,4,2,3],
    [2,3,5,2,3],
    [4,5,6,4,5]
],columns=['col_0','col_1','col_2','col_3','col_4']))

Unnamed: 0,col_0,col_1,col_2,col_3,col_4
0,1,2,3,1,2
1,2,3,4,2,3
2,2,3,5,2,3
3,4,5,6,4,5


In [None]:
res =ie_map(None,'F',func,[int,int],[int,int],in_arity=2,out_arity=2)
assert_df_equals(res,pd.DataFrame(columns=['col_0','col_1','col_2','col_3']))

Unnamed: 0,col_0,col_1,col_2,col_3


In [None]:
# Schema checking: func2 returns pairs of ints, so declaring an output schema
# of three ints must be rejected.
with pytest.raises(ValueError) as exc_info:
    ie_map(s2,'F',func2,[int,int,int],[int,int,int],3,3)
assert 'but expected' in str(exc_info.value)
print(exc_info.value)

# ... and so must an output schema whose second element is a str.
with pytest.raises(ValueError) as exc_info:
    ie_map(s2,'F',func2,[int,int,int],[int,str],3,2)
assert 'but expected' in str(exc_info.value)
print(exc_info.value)

# An IE function that forgot to return an iterable of rows:
# if the result is not iterable we do not know what to do with it.
not_iter_func = lambda x,y:x+y

# An IE function whose rows are bare primitives rather than tuples:
# each primitive is cast to a unary tuple.
not_tuple_iter_func = lambda x,y:[x+y]

with pytest.raises(ValueError) as exc_info:
    ie_map(s,'F',not_iter_func,[int,int],[int],2,1)
assert 'that is not an iterable' in str(exc_info.value)
print(exc_info.value)

res = ie_map(s,'F',not_tuple_iter_func,[int,int],[int],2,1)
assert_df_equals(res,pd.DataFrame([
    [1,1,2],
    [2,2,4],
    [3,3,6],
    [4,5,9]
],columns=['col_0','col_1','col_2']))

IEFunction F with underlying function <function func2>
received an output value [1, 2](schema=[<class 'int'>, <class 'int'>])
but expected [<class 'int'>, <class 'int'>, <class 'int'>]
IEFunction F with underlying function <function func2>
received an output value [1, 2](schema=[<class 'int'>, <class 'int'>])
but expected [<class 'int'>, <class 'str'>]
IEFunction F with underlying function <function <lambda>>
returned a value that is not an iterable
for input [1, 1] -> 2


Unnamed: 0,col_0,col_1,col_2
0,1,1,2
1,2,2,4
2,3,3,6
3,4,5,9


In [None]:
#|hide
import nbdev; nbdev.nbdev_export()
     