## Tensorflow implementation of Apriori

#### Matrix representation of apriori entities
- Output: n-dimensional cube where nth dimension represents nth member of group. Zeros along diagonals.

- Input: N by m dimensional matrix where N is the number of Cpc's and m is the number of samples (families).
- Size-1 candidate sets: the 4-letter cpc codes
- Size-1 frequency sets: adding input columns and filtering by whether greater than support threshold.



- Size-n candidate sets: E.g. From the size-1 frequent sets, get all the possible pairings sets (removing reversed order duplicates)

Taking 'mask' from n-1's > min_support filter - set zeros on all below threshold
                       
-------------------------------------Set main diagonal to zeros-------------

---A---B---C---D-------------------------A---B---C---D-------------------------A---B---C---D
   
A--1---1---0---1----------------------A--0---1---0---1----------------------A--0---0---0---0

B--1---1---0---1----------------------B--1---0---0---1----------------------B--1---0---0---0

C--0---0---0---0----------------------C--0---0---0---0----------------------C--0---0---0---0

D--1---1---0---1----------------------D--1---1---0---0----------------------D--1---1---0---0


- Make mask for each input collection, extending into the N'th dimension, then stack all input collections together before count op.

#### Stages of completion
1. Get something to count single occurances of elements in input collections
2. Create and run against test data -> optimise, e.g. use sparse tensor representations 
3. Return richer output e.g. support, confidence, lift measures for each group (wrapped in some container)
4. Store the result in memory for further requests e.g. distance measures between groups etc

In [173]:
import numpy as np
from numpy import array
import tensorflow as tf
from itertools import permutations, combinations
import collections

from typing import Dict, Any, List, Set, Iterable, Tuple

GroupCountIO = collections.namedtuple('GroupCountIO', 
                                      """
                                      current_N,
                                      original_els, num_original_els, 
                                      input_rows, num_input_rows,
                                      curr_bin_mask, curr_group_count_totals,
                                      min_support
                                      """)
GroupSupport = collections.namedtuple('GroupSupport', 'groups, support, group_size')

In [174]:
def get_input_collections_as_binary_arrays(input_collections: List[Iterable]) -> Tuple[List, List[array]]:
    """
    TODO: Get binarised_input_collections with tensorflow?
    """
    # First get complete set of input elements
    all_input_elements = list(sorted(set([el for row in input_collections for el in row])))
    # For each input collection, create array with 1 for element exists or 0 for not exists 
    # (extension: count the number of elements - for use with multiple element counting version)
    binarised_input_collections = [tf.Variable([1 if el in row else 0 for el in all_input_elements], dtype=tf.uint8)
                                   for row in input_collections]
    return (all_input_elements, binarised_input_collections,)
    

In [200]:
def get_groups(original_input_els, inputs_size, group_summations,  group_summations_dim) -> Dict[frozenset, int]:
    """
    TODO: Write down an expected format for the group_summations shape
    TODO: If this takes lots of memory, just pass the group_summations out of session and recursively search
    
    
    
    inputs_size: The number of distinct input elements
    original_input_els: The original distinct input elements which have the same order as the axes in group_summations
                        and which hence can be mapped back by index.
    group_summations: The N dimensional tensor containing the groups which met the min support threshold.
                      Each dimension is of length len(original_input_els)
    inputs_size: The length of unique elements in the input
    group_summations_dim: The number of dimensions in group_summations
    returns: The dictionary of group elements to group support
    """
    
    # TODO: group_summations_dim can probably be replaced with -1 or tf.newaxis
    
    if group_summations_dim <= 1:
        raise ValueError("group_summations_dim should have greater than one dimension")
        
    dim_to_append = original_input_els
    for dim in range(1,group_summations_dim):
        dim_rows = []
        for row in range(inputs_size):
            dim_rows.append(dim_to_append)
        group_dims_stack = tf.stack(dim_rows)
        dim_to_append = group_dims_stack

    total_possible_groups_list = []
    dims_list = list(range(0, group_summations_dim))
    for dim in dims_list:
        perm = [(i + dim) % len(dims_list) for i in dims_list]
        print(f"get_groups perm: {perm}")
        group_dims_stack_perm = tf.transpose(group_dims_stack, perm=perm)
        dim_possible_groups_perm = tf.expand_dims(group_dims_stack_perm, group_summations_dim)
        total_possible_groups_list.append(dim_possible_groups_perm)
    group_layout_tensor = tf.concat(total_possible_groups_list, group_summations_dim)

    successfull_groups = tf.gather_nd(group_layout_tensor, tf.where(group_summations >= 1))
    successfull_group_counts = tf.gather_nd(group_summations, tf.where(group_summations >= 1))
    return successfull_groups, successfull_group_counts, tf.constant(group_summations_dim)


test_original_input_els = ['A','B','C','D']

test_group_summations_2 = tf.Variable([[0,0,0,0], 
                                       [1,0,0,0], 
                                       [0,0,0,0], 
                                       [1,1,1,0]], tf.uint8)
expected_output_2 = tuple([[[b'A',b'B'],[b'A',b'D'],[b'B',b'D'],[b'C',b'D']], [1,1,1,1], 2])

test_group_summations_3 = tf.Variable([
    [[0,0,0,0], 
     [0,0,0,0], 
     [0,0,0,0], 
     [0,0,0,0]], 
    
    [[0,0,0,0], 
     [0,0,0,0], 
     [0,0,0,0], 
     [0,0,0,0]],
    
    [[0,0,0,0], 
     [1,0,0,0], 
     [0,0,0,0], 
     [0,0,0,0]],
    
    [[0,0,0,0], 
     [1,0,0,0], 
     [0,5,0,0], 
     [0,0,0,0]]
], tf.uint8)
expected_output_3 = tuple([np.array([[b'A',b'B',b'C'],[b'A',b'B',b'D'],[b'B',b'C',b'D']]),np.array([1,1,5]), 3])

def test_get_groups(test_original_input_els, len_input_els, test_group_summations, 
                    len_group_summations_dims, expected):
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        result = session.run(get_groups(test_original_input_els, len_input_els,
                                        test_group_summations, len_group_summations_dims))
        print(result)
    for i_group in range(len(result[0])):
        assert len(result[0][i_group]) == len(expected[0][i_group])
        for i_group_el in range(len(result[0][i_group])):
            assert result[0][i_group][i_group_el] == expected[0][i_group][i_group_el]

test_get_groups(test_original_input_els, 4, test_group_summations_2, 2, expected_output_2)
test_get_groups(test_original_input_els, 4, test_group_summations_3, 3, expected_output_3)

get_groups perm: [0, 1]
get_groups perm: [1, 0]
(array([[b'A', b'B'],
       [b'A', b'D'],
       [b'B', b'D'],
       [b'C', b'D']], dtype=object), array([1, 1, 1, 1], dtype=int32), 2)
get_groups perm: [0, 1, 2]
get_groups perm: [1, 2, 0]
get_groups perm: [2, 0, 1]
(array([[b'A', b'B', b'C'],
       [b'A', b'B', b'D'],
       [b'B', b'C', b'D']], dtype=object), array([1, 1, 5], dtype=int32), 3)


In [201]:
# def map_with_mask(is_frequent_mask, input_row):
#     return [1 if is_frequent_mask[i] else 0 for i in range(0, tf.size(input_row))]
        

In [202]:
from resources.test_resources import expected_occluded_output_3d, input_occluded_output_3d

def get_inputs_filtered_by_possible_combinations(group_dims_stack,
                                                 current_els_mask, 
                                                 group_size):
    """
    group_dims_stack: the 
    current_els_mask: python binary list with allowed elements as 1.
    """
    # possible_els_idxs -> [0,3,5,10] = oth, 3rd etc els allowed.
    possible_combinations_idxs = tf.py_func(lambda mask: 
                                   np.array(list(combinations([i for i, v in enumerate(mask) if v == 1], group_size)), 
                                            dtype=np.int32),
                                   [current_els_mask], tf.int32)
    
    number_of_unique_input_els = tf.size(current_els_mask)
    
    mask_shape = tf.map_fn(lambda x: number_of_unique_input_els, tf.Variable(list(range(group_size))))
        
    values = tf.py_func(lambda perms: np.ones(len(perms), dtype=np.int32),
                                                [possible_combinations_idxs], tf.int32) 

    allowed_combs = tf.sparse_to_dense(sparse_indices=possible_combinations_idxs, 
                                       output_shape=mask_shape,
                                       sparse_values=values,)

    return tf.multiply(allowed_combs, group_dims_stack)
 
    
def _test_get_diagonal_and_upper_zeroed_tensor(expected, group_dims_stack, current_els_mask, group_size):
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        tensor = get_inputs_filtered_by_possible_combinations(group_dims_stack, current_els_mask, group_size)
        result = session.run(tensor)
#     print('Expected:')
#     print(str(np.array(expected)))
#     print('Output:')
#     print(str(result))
    assert np.array_equal(np.array(expected), result)

current_N = None
_test_get_diagonal_and_upper_zeroed_tensor(expected_occluded_output_3d, input_occluded_output_3d, [1,1,1,1,1], 3)

In [203]:
def get_inputs_tensor_multiplied_by_transpose_permutations(group_dims_stack, group_size):
    # We have the masked vectorised input rows as a N+1 dimensional tensor
    # Now multiply by N - 1 tensors whose axes have been permuted (on all but 0th axis) 
    cross_multiplied_group_dims_stack = group_dims_stack
    dims_list = list(range(1, group_size + 1))
    perm_list = list(range(0, group_size))
    L = len(dims_list)
    for dim in dims_list[:-1]: #permutations(range(1,current_N + 1)):
        perm = [0] + [((i + dim + L - 1) % L) + 1 for i in dims_list]
#         perm = [0 if i == -1 else perm[i] for i in range(-1, len(perm))]
        print("perm: " + str(perm))
        group_dims_stack_perm = tf.transpose(group_dims_stack, perm=perm)
        #TODO: Test broadcasting always works when using same number of input rows as unique elements
        cross_multiplied_group_dims_stack = tf.multiply(cross_multiplied_group_dims_stack, group_dims_stack_perm)
    return cross_multiplied_group_dims_stack
    

In [204]:
def get_group_count_tensor_reduced_to_1D(group_count_t, number_of_unique_input_els, group_size):
    """
    TODO: remove this and create get_next_mask_and_groups.input_collections_reduced with previous 
    next_frequent_bin_filter i.e. would be better to create next possible from combinations of current groups
    e.g. if A,B,C and B,C,D are groups, but no size 3 group contains A and D, A,B,C,D should not be a possiblity
    """
    ONE = tf.constant(1, dtype=tf.uint8)
    ZERO = tf.constant(0, dtype=tf.uint8)
    output_mask = tf.zeros(number_of_unique_input_els, dtype=tf.uint8)
    dims_list = list(range(0, group_size))
    dim_skips_list = [[d for d in dims_list if d != skip_d] for skip_d in dims_list]
    print(f"dim_skips_list: {dim_skips_list}")
    for dims in dim_skips_list:
        print(dims)
        output_mask = tf.add(output_mask, tf.reduce_sum(group_count_t, dims))
        
    return tf.unstack(tf.map_fn(lambda x: tf.cond(x >= ONE, true_fn=lambda: ONE, false_fn=lambda: ZERO), 
                     output_mask, dtype=tf.uint8))

In [223]:
def get_next_mask_and_groups(gc: GroupCountIO) -> GroupSupport:
    """
    Produces the next 
    
    frequent_bin_mask: 
    The 1 dimensional tensor procuded by filtering elements in the previous iteration by min_support and reducing
    back to single dimension.
    It has 1s and 0s at each element denoting the presence of each group.
    
    current_N:
    The dimension that is currently being created i.e. the new mask will be this large and the full count 
    representation tensor will be order N+1
    
    current_N,
    original_els, num_original_els, 
    input_rows, num_input_rows,
    curr_bin_mask, curr_group_count_totals,
    min_support
    """
    # TODO: return if number of frequent els < current_N

    # TODO: find a better way to combine the N-dimensional hypercube mask with the #input-rows by N input 
    # vectorisation to get the group count representation
    # E.g. tf.matmul or tf.tensordot
    # Downside of the current (tile, reshape) form is that we throw away the count rep tensor
    
    print(f"Get_next_mask_and_groups for group size {gc.current_N}")
    
    input_collections_reduced = tf.multiply(gc.input_rows, gc.curr_bin_mask)
    print(gc.num_original_els)
    dim_to_append = input_collections_reduced
    for d in range(2, gc.current_N+1):
        group_dims_stack = tf.stack([dim_to_append for _ in range(0, gc.num_original_els)], axis=-1)
        traspose_multiplied_stack = get_inputs_tensor_multiplied_by_transpose_permutations(group_dims_stack, d)
        dim_to_append = traspose_multiplied_stack
        
    filtered_counts_tensor = get_inputs_filtered_by_possible_combinations(traspose_multiplied_stack,
                                                                          gc.curr_bin_mask,
                                                                          gc.current_N
                                                                         )
    
    next_el_occurances = tf.reduce_sum(filtered_counts_tensor, axis=0)
    
    group_counts = get_groups(gc.original_els, gc.num_original_els, next_el_occurances, gc.current_N)
    group_support = GroupSupport(*group_counts)

    next_frequent_bin_filter = tf.cast(next_el_occurances >= gc.min_support, tf.uint8)
    next_frequent_bin_mask = get_group_count_tensor_reduced_to_1D(next_frequent_bin_filter, gc.num_original_els,
                                                                  gc.current_N)

    return next_frequent_bin_mask, group_support
    

def test_get_next_mask_and_groups(gc: GroupCountIO):
    with tf.Session() as session:
        init = tf.global_variables_initializer()
        session.run(init)
        result = session.run(get_next_mask_and_groups(gc))
    return result
    
#     current_N
#     original_els, num_original_els, 
#     input_rows, num_input_rows,
#     curr_bin_mask, curr_group_count_totals,
#     min_support
    
original_input_els = ['A','B','C','D','E']
vectorised_inputs_stack_2 = tf.Variable([[1,1,1,1,0],
                                         [1,1,1,1,1], 
                                         [0,1,1,1,0]], tf.uint8)
frequent_bin_mask_2 = tf.Variable([1,1,1,1,1], tf.uint8)

input_3_5 = GroupCountIO(3, original_input_els, 5, vectorised_inputs_stack_2, 3, frequent_bin_mask_2, [], 1)
results = test_get_next_mask_and_groups(input_3_5)

results

Get_next_mask_and_groups for group size 3
5
perm: [0, 2, 1]
perm: [0, 2, 3, 1]
perm: [0, 3, 1, 2]
get_groups perm: [0, 1, 2]
get_groups perm: [1, 2, 0]
get_groups perm: [2, 0, 1]
dim_skips_list: [[1, 2], [0, 2], [0, 1]]
[1, 2]
[0, 2]
[0, 1]


([1, 1, 1, 1, 1],
 GroupSupport(groups=array([[b'C', b'B', b'A'],
       [b'D', b'B', b'A'],
       [b'E', b'B', b'A'],
       [b'D', b'C', b'A'],
       [b'E', b'C', b'A'],
       [b'E', b'D', b'A'],
       [b'D', b'C', b'B'],
       [b'E', b'C', b'B'],
       [b'E', b'D', b'B'],
       [b'E', b'D', b'C']], dtype=object), support=array([2, 2, 1, 2, 1, 1, 3, 1, 1, 1], dtype=int32), group_size=3))

In [208]:
def get_next_groups(gc: GroupCountIO):
    """
    GroupCountIO
      current_N
      original_els, num_original_els, 
      input_rows, num_input_rows,
      curr_bin_mask, curr_group_count_totals,
      min_support    
    """
    next_mask, group_counts = get_next_mask_and_groups(gc)
    gc.curr_group_count_totals.append(group_counts)
    current_N += 1
    return gc


In [220]:
def get_apriori_groups(input_id_to_collections: Dict, min_support: int) -> Set[frozenset]:
    input_collections_size = len(input_id_to_collections.items())
    
    all_input_elements, vectorised_input_collections = get_input_collections_as_binary_arrays(
        input_id_to_collections.values()
    )

    number_of_unique_input_els = len(all_input_elements)
    
    vectorised_inputs_stack = tf.stack(vectorised_input_collections)

    single_el_occurances = tf.reduce_sum(vectorised_inputs_stack, axis=0)
    
    # Get single set groups as itemsets (in original formats)
    single_member_groups = tf.gather_nd(all_input_elements, tf.where(single_el_occurances >= min_support))
    single_member_group_counts = tf.gather_nd(single_el_occurances, tf.where(single_el_occurances >= min_support))
    
    frequent_single_bin_mask = tf.cast(single_el_occurances >= min_support, tf.uint8)
        
#     current_N
#     original_els, num_original_els, 
#     input_rows, num_input_rows,
#     curr_bin_mask, curr_group_count_totals,
#     min_support

    input_1 = (GroupCountIO(2, 
                            all_input_elements, number_of_unique_input_els,
                            vectorised_input_collections, input_collections_size,
                            frequent_single_bin_mask, 
                            [GroupSupport(single_member_groups, single_member_group_counts, 1)],
                            min_support
                           ),)
    
    condition = lambda group_count_io: tf.greater(tf.reduce_sum(group_count_io.curr_bin_mask), 
                                                  tf.constant(0, dtype=tf.uint8))
    
    group_count_output = tf.while_loop(condition, get_next_groups, input_1,
                                       parallel_iterations=1,
                                       back_prop=False,
                                       maximum_iterations=len(original_input_els))
    
    return group_count_output.curr_group_count_totals

In [221]:
current_N = None
number_of_unique_input_els = None
def test_apriori_groups(input_groups: Dict[Any,str], real_groups, min_support=2):
    result_groups = get_apriori_groups(input_groups, min_support)
    
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        result = session.run(result_groups)
        
    print(result)
#     assert len([g for g in real_groups if g in result_groups]) == len(real_groups)
    
groups = {'0':['A','B','C','D','E'], '1':['B','C','D','E','F'],'2':['C','D','E','F','G']}

real_groups_min_sup_2 = set([frozenset(['B']),frozenset(['C']),frozenset(['D']),frozenset(['E']),frozenset(['F']),
                             frozenset(['B','C']),frozenset(['C','D']),frozenset(['D','E']),frozenset(['B','D']),
                             frozenset(['B','E']),frozenset(['C','E']),
                             frozenset(['B','C','D']),frozenset(['B','C','E']),frozenset(['C','D','E']),frozenset(['C','D','F']),
                             frozenset(['C','E','F']),frozenset(['D','E','F']),
                             frozenset(['B','C','D','E'])])

test_apriori_groups(groups, real_groups_min_sup_2, min_support=2)

Get_next_mask_and_groups for group size Tensor("while_29/Identity_1:0", shape=(), dtype=int32)
Tensor("while_29/Identity_9:0", shape=(), dtype=int32)


TypeError: 'Tensor' object cannot be interpreted as an integer

In [232]:
[1, 3, 4] + 1

TypeError: can only concatenate list (not "int") to list

In [9]:
smt = tf.Variable([[1,0,0],
                   [1,0,1],
                   [1,0,1]])
smt_2 = tf.Variable(
    [
        [[1,1,1],
         [1,1,1],
         [1,1,1]],
        [[1,1,1],
         [1,1,1],
         [1,1,1]],
        [[1,1,1],
         [1,1,1],
         [1,1,1]]
    ]
)

sparse = tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])

sparse_2 = tf.SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])
    
init = tf.global_variables_initializer()
with tf.Session() as session:
    session.run(init)
    result = session.run(tf.multiply(sparse_2,sparse))
#     result = session.run(tf.shape(smt))
result

TypeError: Failed to convert object of type <class 'tensorflow.python.framework.sparse_tensor.SparseTensor'> to Tensor. Contents: SparseTensor(indices=Tensor("SparseTensor_1/indices:0", shape=(2, 2), dtype=int64), values=Tensor("SparseTensor_1/values:0", shape=(2,), dtype=int32), dense_shape=Tensor("SparseTensor_1/dense_shape:0", shape=(2,), dtype=int64)). Consider casting elements to a supported type.

In [18]:
import numpy as np

vs = np.matrix('1 2 1 2 1; 1 2 0 2 1')
M = np.matrix(
    '1 0 0 0 0;'   # col 1 becomes col 1
    '0 0 0 0 0;'   # drop col 2
    '0 0 1 0 0;'   # col 3 becomes col 2
    '0 0 0 0 0;'   # drop col 4
    '0 0 0 0 1',   # col 5 becomes col 3
)
print(vs)
print('*')
print(M)
print('=')
print(vs*M)

print()
M2 = np.matrix(
    '0.5 0   ;'  # col 1 becomes 0.5 of col 1, and 0.5 of col 3
    '0   0   ;'  # col 2 dropped
    '0.5 0.5 ;'  # col 3 becomes 0.5 of col 1, and 0.5 of col 2
    '0   0   ;'  # col 4 droppes
    '0.5 0.5 ')  # col 5 becomes 0.5 of col 2

print(vs)
print('*')
print(M2)
print('=')
print(vs*M2)

[[1 2 1 2 1]
 [1 2 0 2 1]]
*
[[1 0 0 0 0]
 [0 0 0 0 0]
 [0 0 1 0 0]
 [0 0 0 0 0]
 [0 0 0 0 1]]
=
[[1 0 1 0 1]
 [1 0 0 0 1]]

[[1 2 1 2 1]
 [1 2 0 2 1]]
*
[[0.5 0. ]
 [0.  0. ]
 [0.5 0.5]
 [0.  0. ]
 [0.5 0.5]]
=
[[1.5 1. ]
 [1.  0.5]]


In [331]:
vs = np.array([['1','2','1','2','1'], '1','2','0','2','1'])
M = np.array(
    [
        [['1','0','0'],
         ['0','0','0'],
         ['0','1','0'],
         ['0','0','0'],
         ['0','0','1']],
        
        [['1','0','0'],
         ['0','0','0'],
         ['0','1','0'],
         ['0','0','0'],
         ['0','0','1']],
        
        [['1','0','0'],
         ['0','0','0'],
         ['0','1','0'],
         ['0','0','0'],
         ['0','0','1']],
        
        [['1','0','0'],
         ['0','0','0'],
         ['0','1','0'],
         ['0','0','0'],
         ['0','0','1']],
        
        [['1','0','0'],
         ['0','0','0'],
         ['0','1','0'],
         ['0','0','0'],
         ['0','0','1']]
    ]
)
type(M)
M_t = np.transpose(M,[1,2,0])
np.matmul(M_t, M)

TypeError: invalid data type for einsum

In [74]:
import collections
Pair = collections.namedtuple('Pair', 'j, k')
ijk_0 = (tf.constant(0), Pair(tf.constant(1), tf.constant(2)))
c = lambda i, p: i < 10
b = lambda i, p: (i + 1, Pair((p.j + p.k), (p.j - p.k)))
ijk_final = tf.while_loop(c, b, ijk_0)

init = tf.global_variables_initializer()
with tf.Session() as session:
    session.run(init)
    result = session.run(ijk_final)
#     result = session.run(tf.shape(smt))
result

(10, Pair(j=32, k=64))