In [413]:
import numpy as np
import math
import itertools
from functools import cache

In [414]:
class SequenceTreeNode:
    def __init__(self):
        self.children = np.array([None, None])
        self.char = -1
        self.height = 0
        self.size = 0
        self.contains_0_vector = False
        self.contains_1_vector = False

    def has_sequence(self, x):
        node = self
        for i,c in enumerate(x):
            if node.children[c] is None:
                return False
            node = node.children[c]
        return True

    def insert_sequence(self, x):
        if self.has_sequence(x):
            return

        self.__insert_sequence_internal(x)

    def __insert_sequence_internal(self, x):
        self.size += 1
        if x.size == 0:
            return

        self.height = x.size

        if np.all(x==0):
            self.contains_0_vector = True   
        elif np.all(x==1):
            self.contains_1_vector = True 

        letter = x[0]
        if self.children[letter] is None:
            self.children[letter] = SequenceTreeNode()

        self.children[letter].char = letter
        self.children[letter].__insert_sequence_internal(x[1:])

    def print_tree(self):
        self.__print_tree(0)

    def __print_tree(self, depth):
        print(f'({depth}) - char {self.char} has {self.size} sequences.\n\t\
            contains the 0 vector: {self.contains_0_vector}\n\t\
            contains the 1 vector: {self.contains_1_vector}')
        for child in self.children:
            if child is not None:
               child.__print_tree(depth+1)

    def print_all_sequences(self):
        self.__print_all_sequences(np.array([], dtype=np.ubyte))

    
    def __print_all_sequences(self, path):
        if np.all(self.children == None):
            print(path)
        for child in self.children:
            if child is None:
                continue
            child.__print_all_sequences(np.append(path, child.char))

    def decrease_size_by(self, k):
        if k == 0:
            return

        self.size -= k

        if self.children[0] is not None and self.children[0].size <= k:
            self.children[1].decrease_size_by(k - self.children[0].size)
            self.children[0] = None
            return
        elif self.children[1] is not None and self.children[1].size <= k:
            self.children[0].decrease_size_by(k - self.children[1].size)
            self.children[1] = None
            return

        if self.children[0] is not None:
            self.children[0].decrease_size_by(k)
        else: 
            self.children[1].decrease_size_by(k)
        

    def decrease_size_to(self, N):
        self.decrease_size_by(self.size - N)


In [415]:
@cache
def get_maximal_deletion_ball_size(n, t):
    if n < t or t < 0:
        return 0

    size = 0
    for i in range(t+1):
        size += math.comb(n-t, i)
    return size

@cache
def get_maximal_number_of_common_subsequences(n, t):
    if n <= t or t <= 0:
        return 0
    return get_maximal_deletion_ball_size(n,t) - get_maximal_deletion_ball_size(n-1,t) + get_maximal_deletion_ball_size(n-2,t-1)

In [416]:
def get_ordering(U):
    k = np.zeros_like(U.children)
    for i, child in enumerate(U.children):
        if child is None:
            k[i] = 0
        else:
            k[i] = child.size
    t = np.argsort(-k, kind='stable') # -k so it will be in descending order
    c = k[t]
    return t, c

In [417]:
def get_deletion_ball_tree(x, t, N=-1):
    result = SequenceTreeNode()
    for c in itertools.combinations(x, x.size-t):
        result.insert_sequence(np.array(c))
        if result.size == N:
            break

    return result

In [418]:
n = 10
t = 2
q = 2
x=np.random.randint(0,q,(n),dtype=np.ubyte)
D_x = get_deletion_ball_tree(x, t)
print(f'x = {x}')
print()
D_x.print_all_sequences()

x = [1 0 1 0 1 1 0 1 0 1]

[0 0 1 1 0 1 0 1]
[0 1 0 1 0 1 0 1]
[0 1 0 1 1 0 0 1]
[0 1 0 1 1 0 1 0]
[0 1 0 1 1 0 1 1]
[0 1 0 1 1 1 0 1]
[0 1 1 1 0 1 0 1]
[1 0 0 1 0 1 0 1]
[1 0 0 1 1 0 0 1]
[1 0 0 1 1 0 1 0]
[1 0 0 1 1 0 1 1]
[1 0 0 1 1 1 0 1]
[1 0 1 0 0 1 0 1]
[1 0 1 0 1 0 0 1]
[1 0 1 0 1 0 1 0]
[1 0 1 0 1 0 1 1]
[1 0 1 0 1 1 0 0]
[1 0 1 0 1 1 0 1]
[1 0 1 0 1 1 1 0]
[1 0 1 0 1 1 1 1]
[1 0 1 1 0 1 0 1]
[1 0 1 1 1 0 0 1]
[1 0 1 1 1 0 1 0]
[1 0 1 1 1 0 1 1]
[1 0 1 1 1 1 0 1]
[1 1 0 1 0 1 0 1]
[1 1 0 1 1 0 0 1]
[1 1 0 1 1 0 1 0]
[1 1 0 1 1 0 1 1]
[1 1 0 1 1 1 0 1]
[1 1 1 1 0 1 0 1]


In [419]:
def get_u_a_i(U, a, i):
    root = SequenceTreeNode()
    node = U
    for j in range(i-1):
        node = node.children[1-a]        
        if node is None:
            return root
    
    node = node.children[a]
    if node is None:
        return root

    root.size = node.size
    root.contains_0_vector = node.contains_0_vector
    root.contains_1_vector = node.contains_1_vector

    for i in range(root.children.size):
        root.children[i] = node.children[i]
    
    return root

In [420]:
n = 10
t = 1
x=np.random.randint(0,q,(n),dtype=np.ubyte)
D_x = get_deletion_ball_tree(x, t)
print(D_x.size)
D_x.print_all_sequences()
print()
D_x.decrease_size_by(3)
print(D_x.size)
D_x.print_all_sequences()
print()

6
[0 0 0 0 0 1 0 0 1]
[0 1 0 0 0 0 0 0 1]
[0 1 0 0 0 0 1 0 0]
[0 1 0 0 0 0 1 0 1]
[0 1 0 0 0 1 0 0 1]
[1 0 0 0 0 1 0 0 1]

3
[0 1 0 0 0 0 0 0 1]
[0 1 0 0 0 0 1 0 0]
[0 1 0 0 0 0 1 0 1]



In [421]:
def get_subsequence_reconstruction_threshold(n, t, order_comp):
    for i, w_i in enumerate(order_comp):
        tau_i = get_maximal_number_of_common_subsequences(n-i-1,t-i)
        if w_i > tau_i:
            return i

In [422]:
def reconstruct_x_from_subsequences(n, U, verbose=False):
    t = n-U.height
    if verbose:
        print(n, t, U.size)
    reconstruction = np.array([], dtype=int)
    while t >= 1:
        order_perm, order_comp = get_ordering(U)
        j = get_subsequence_reconstruction_threshold(n, t, order_comp)
        reconstruction = np.concatenate((reconstruction, order_perm[:j+1]))
        if verbose:
            print(j, reconstruction)

        n = n-j-1
        t = t-j    
        N = get_maximal_number_of_common_subsequences(n,t)+1
        subsequences = get_u_a_i(U, order_perm[j], 1)
        subsequences.decrease_size_to(N)
        if verbose:
            print(N, U.size)
        
    return np.concatenate((reconstruction, subsequences.T[0]))

In [423]:
n = 10
t = 2
q = 2

x=np.random.randint(0,q,(n),dtype=np.ubyte)
N = get_maximal_number_of_common_subsequences(n,t)+1
print(N)
D_x = get_deletion_ball_tree(x, t, N)

while D_x.size < N:
    x=np.random.randint(0,q,(n),dtype=np.ubyte)
    D_x = get_deletion_ball_tree(x, t, N)

print(D_x.size)

17
17


In [424]:
# print(subsequences.shape)
# print(get_next_subsequnces(subsequences,0).shape)
# print(get_next_subsequnces(subsequences,0))
print(x)
print(n, D_x.size)
reconstructed_x = reconstruct_x_from_subsequences(n, D_x, True)
print(f'Reconstructed {reconstructed_x}')
print(f'From {x}')
print(np.array_equal(x, reconstructed_x))

[1 0 0 1 1 0 1 1 1 0]
10 17
10 2 17
1 [1 0]
3 17
0 [1 0 1]
3 17
0 [1 0 1 1]


AttributeError: 'NoneType' object has no attribute 'decrease_size_by'