In [1]:
import numpy as np
import matplotlib.pyplot as plt
import time
import pandas as pd
from numpy.testing import assert_array_equal
from tqdm.notebook import tqdm

In [3]:
from microagg1d.main import _simple_dynamic_program,_simple_dynamic_program2,  compute_cluster_cost_sorted
from microagg1d.wilber import wilber, _galil_park
#from microagg1d.wilber2 import wilber2, _galil_park2

In [4]:
from functools import partial

In [5]:
from microagg1d.smawk_iter2 import __smawk_iter

In [6]:
from numba import njit, int64, float64
from microagg1d.wilber import StableMicroaggWilberCalculator, relabel_clusters_plus_one, calc_cumsum, MicroaggWilberCalculator, MicroaggWilberCalculator_edu

In [7]:
#@njit([(float64[:], int64)], cache=False)
def _diag2(v, k):
    stable=0
    n = len(v)
    if stable==1:
        wil_calculator = StableMicroaggWilberCalculator(v, k, -np.ones(n+1, dtype=np.float64), 3*k)
        return relabel_clusters_plus_one(__diag2(n, wil_calculator, k))
    elif stable==0:
        cumsum = calc_cumsum(v)
        cumsum2 = calc_cumsum(np.square(v))
        wil_calculator = MicroaggWilberCalculator_edu(cumsum, cumsum2, k, -np.ones(n+1, dtype=np.float64))
        wil_calculator.LARGE_VAL=100
        wil_calculator.SMALL_VAL=1
        out = __diag2(n, wil_calculator, k)
        with np.printoptions(linewidth=300, precision=3, suppress=True):
            M1 = np.vstack((np.arange(wil_calculator.G.shape[1]),wil_calculator.G))
            print(M1.shape)
            x = np.arange(-1, wil_calculator.G.shape[0]).reshape(wil_calculator.G.shape[0]+1, 1)
            print(x.shape)
            print(np.concatenate((x, M1),axis=1))
        return out #relabel_clusters_plus_one(out)
    else:
        raise NotImplementedError("Only stable in (0,1) supported")
# 3 7 0 4

In [22]:
@njit()
def __staggered2(n, wil_calculator, k):
    """ Solves the dynamic problem in O(n)
    """
    F = np.empty(n, dtype=np.int32)
    F_vals = wil_calculator.F_vals
    F_vals[0]=0
    col_buffer = np.empty(2*n+2, dtype=np.int64)
    col_starts = np.empty(2*n+2, dtype=np.int64)

    # initial values
    for i in range(k-1, min(2*k-1,n)):
        F[i] = 0
        F_vals[i+1] = wil_calculator.calc(i, 0)
    if n<=2*k-1:
        return F

    def one_round(l, r, u, b):
        print(F_vals)
        print(F_vals[u:b], u, b)
        f_min = np.min(F_vals[u:b])
        for j in range(u, b):
            F_vals[j]-=f_min

        __smawk_iter(l, r, u, b, wil_calculator, F, col_starts, col_buffer)
        for j in range(l, r):
            F_vals[j+1] = wil_calculator.calc(j, F[j])

    # first block
    max_col = min(3*k-1,n)
    one_round(2*k-1, max_col, k, 2*k)
    if n<=max_col:
        return F

    # remaining blocks
    f, R = divmod(n-3*k+1,k)
    j=3
    for i in range(3,3+f): # do the main blocks
        one_round(i*k-1, (i+1)*k-1, max(F[i*k-2], (i-2)*k), i*k)
        j=i+1

    if R > 0: # deal with the remainder
        one_round(j*k-1, (j)*k-1+R, max(F[j*k-2], (j-2)*k), (j-1)*k+R)

    return F

In [23]:
#@njit([(float64[:], int64)], cache=False)
def _staggered2(v, k):
    stable=0
    n = len(v)
    if stable==1:
        wil_calculator = StableMicroaggWilberCalculator(v, k, -np.ones(n+1, dtype=np.float64), 3*k)
        return relabel_clusters_plus_one(__staggered2(n, wil_calculator, k))
    elif stable==0:
        cumsum = calc_cumsum(v)
        cumsum2 = calc_cumsum(np.square(v))
        wil_calculator = MicroaggWilberCalculator_edu(cumsum, cumsum2, k, -np.ones(n+1, dtype=np.float64))
        wil_calculator.G[:]=-1
        wil_calculator.LARGE_VAL=100
        wil_calculator.SMALL_VAL=1
        out = __staggered2(n, wil_calculator, k)
        with np.printoptions(linewidth=300, precision=3, suppress=True):
            M1 = np.vstack((np.arange(wil_calculator.G.shape[1]),wil_calculator.G))
            print(M1.shape)
            x = np.arange(-1, wil_calculator.G.shape[0]).reshape(wil_calculator.G.shape[0]+1, 1)
            print(x.shape)
            print(np.concatenate((x, M1),axis=1))
        return out #relabel_clusters_plus_one(out)
    else:
        raise NotImplementedError("Only stable in (0,1) supported")

In [26]:

np.random.seed(1)
arr = np.arange(12, dtype=np.float64)#np.random.rand(12)
#arr = np.array([1.14374817e-04, 2.73875932e-02, 9.23385948e-02, 1.46755891e-01,
#       1.86260211e-01, 2.04452250e-01, 3.02332573e-01, 3.45560727e-01,
#       3.96767474e-01, 4.17022005e-01, 4.19194514e-01, 5.38816734e-01,
#       6.85219500e-01, 7.20324493e-01, 8.78117436e-01])
arr.sort()

result1 = _staggered2(arr, 2)


[ 0.  -1.   0.5  2.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1.  -1. ]
[0.5 2. ] 2 4
[ 0.  -1.   0.   1.5  0.5  2.  -1.  -1.  -1.  -1.  -1.  -1.  -1. ]
[0.  1.5 0.5 2. ] 2 6
[ 0.  -1.   0.   1.5  0.5  2.   1.   2.5 -1.  -1.  -1.  -1.  -1. ]
[0.5 2.  1.  2.5] 4 8
[ 0.  -1.   0.   1.5  0.   1.5  0.5  2.   1.   2.5 -1.  -1.  -1. ]
[0.5 2.  1.  2.5] 6 10
[ 0.  -1.   0.   1.5  0.   1.5  0.   1.5  0.5  2.   1.   2.5 -1. ]
[0.5 2.  1. ] 8 11
(13, 12)
(13, 1)
[[ -1.    0.    1.    2.    3.    4.    5.    6.    7.    8.    9.   10.   11. ]
 [  0.   -1.    0.5   2.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1. ]
 [  1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1. ]
 [  2.   -1.   -1.   -1.    0.5   2.   98.   98.   -1.   -1.   -1.   -1.   -1. ]
 [  3.   -1.   -1.   -1.   -1.    2.    3.5  97.   -1.   -1.   -1.   -1.   -1. ]
 [  4.   -1.   -1.   -1.   -1.   -1.    1.    2.5  96.   96.   -1.   -1.   -1. ]
 [  5.   -1.   -1.   -1.   -1.   -1.  105.    2.5   3.5  95.  

In [11]:
result1

array([-875836469, -875836469,          0,          0,          0,
                3,          4,          4,          6,          7,
                8,          9])

In [12]:
#[0 0 0 0 1 1 1 2 2 2]
from numba.experimental import jitclass

In [13]:
@jitclass([('cumsum', float64[:,:]), ('cumsum2', float64[:,:]), ('k', int64), ("F_vals", float64[:]), ("SMALL_VAL", float64), ("LARGE_VAL", float64), ("cell_size", int64), ("count1", int64), ("count2", int64)])
class CountStableMicroaggWilberCalculator:
    """A stable variant of the microagg calculator for wilbers method"""
    def __init__(self, x, k, F_vals, cell_size):
        self.cumsum = calc_cumsum_cell(x, cell_size)
        x_square = np.square(x)
        self.cumsum2 = calc_cumsum_cell(x_square, cell_size)
        self.k = k
        self.F_vals = F_vals
        n = len(x)
        self.SMALL_VAL = _calc_objective(np.sum(x), np.sum(x_square), n)
        self.LARGE_VAL = self.SMALL_VAL * (1 + n)
        self.cell_size = cell_size
        self.count1=0
        self.count2=0

    def calc(self, j, i): # i <-> j interchanged is not a bug!
        if j < i:
            return np.inf

        if not (j+1 - i >= self.k):
            self.count2+=1
            return self.LARGE_VAL + self.SMALL_VAL*i
        if not (j+1 - i <= 2 * self.k - 1):
            self.count2+=1
            return self.LARGE_VAL - self.SMALL_VAL*i
        self.count1+=1
        return calc_objective_cell(self.cumsum, self.cumsum2, self.cell_size, i, j) + self.F_vals[i]

In [14]:
@jitclass([('cumsum', float64[:]), ('cumsum2', float64[:]), ('k', int64), ("F_vals", float64[:]), ("SMALL_VAL", float64), ("LARGE_VAL", float64), ("count1", int64), ("count2", int64)])
class CountMicroaggWilberCalculator:
    """The standard microagg calculator for wilbers method"""
    def __init__(self, cumsum, cumsum2, k, F_vals):
        self.cumsum = cumsum
        self.cumsum2 = cumsum2
        self.k = k
        self.F_vals = F_vals
        n = len(cumsum) - 1
        self.SMALL_VAL = calc_objective_upper_inclusive(cumsum, cumsum2, 0, n-1)
        self.LARGE_VAL = self.SMALL_VAL * (1 + n)
        self.count1=0
        self.count2=0

    def calc(self, j, i): # i <-> j interchanged is not a bug!
        if j < i:
            return np.inf

        if not (j+1 - i >= self.k):
            self.count2+=1
            return self.LARGE_VAL + self.SMALL_VAL*i
        if not (j+1 - i <= 2 * self.k - 1):
            self.count2+=1
            return self.LARGE_VAL - self.SMALL_VAL*i
        self.count1+=1
        return calc_objective_upper_inclusive(self.cumsum, self.cumsum2, i, j) + self.F_vals[i]

In [15]:
from microagg1d.wilber2 import __galil_park2
from microagg1d.common import calc_cumsum, calc_objective_upper_exclusive, calc_objective_upper_inclusive, calc_objective_cell, calc_cumsum_cell, _calc_objective

In [16]:
@njit([(float64[:], int64, int64)], cache=False)
def _galil_park2(v, k, stable=1):
    n = len(v)
    if stable==1:
        wil_calculator = CountStableMicroaggWilberCalculator(v, k, -np.ones(n+1, dtype=np.float64), 3*k)
        out = __galil_park2(n, wil_calculator)
        print(k, wil_calculator.count1/n, wil_calculator.count2/n)
        return relabel_clusters_plus_one(out)
    elif stable==0:
        cumsum = calc_cumsum(v)
        cumsum2 = calc_cumsum(np.square(v))
        wil_calculator = CountMicroaggWilberCalculator(cumsum, cumsum2, k, -np.ones(n+1, dtype=np.float64))
        out = __galil_park2(n, wil_calculator)
        return relabel_clusters_plus_one(out)
    else:
        raise NotImplementedError("Only stable in (0,1) supported")

In [17]:
@njit([(float64[:], int64, int64)], cache=False)
def _staggered3(v, k, stable=1):
    n = len(v)
    if stable==1:
        wil_calculator = CountStableMicroaggWilberCalculator(v, k, -np.ones(n+1, dtype=np.float64), 3*k)
        out = __staggered2(n, wil_calculator, k)
        print(k, wil_calculator.count1/n, wil_calculator.count2/n)
        return relabel_clusters_plus_one(out)
    elif stable==0:
        cumsum = calc_cumsum(v)
        cumsum2 = calc_cumsum(np.square(v))
        wil_calculator = CountMicroaggWilberCalculator(cumsum, cumsum2, k, -np.ones(n+1, dtype=np.float64))
        out = __staggered2(n, wil_calculator, k)
        print(k, wil_calculator.count1/n, wil_calculator.count2/n)
        return relabel_clusters_plus_one(out)
    else:
        raise NotImplementedError("Only stable in (0,1) supported")

In [18]:

big_arr = np.arange(1_000_000, dtype=np.float64)

np.random.seed(1)
big_arr = np.random.rand(1_000_000)
big_arr.sort()

In [19]:
@jitclass([('cumsum', float64[:,:]), ('cumsum2', float64[:,:]), ('cell_size', int64), ('count', int64)])
class CountStableCumsumCalculator:
    def __init__(self, v, cell_size):
        self.cumsum = calc_cumsum_cell(v, cell_size)
        self.cumsum2 = calc_cumsum_cell(np.square(v), cell_size)
        self.cell_size = cell_size
        self.count = 0

    def calc(self, i, j):
        self.count+=1
        if j==i:
            return 0
        return calc_objective_cell(self.cumsum, self.cumsum2, self.cell_size, i, j)
    
    
@jitclass([('cumsum', float64[:]), ('cumsum2', float64[:]), ('count', int64)])
class CountCumsumCalculator:
    def __init__(self, v):
        self.cumsum = calc_cumsum(v)
        self.cumsum2 = calc_cumsum(np.square(v))
        self.count = 0

    def calc(self, i, j):
        self.count+=1
        return calc_objective_upper_inclusive(self.cumsum, self.cumsum2, i, j)

In [20]:
from microagg1d.main import __simple_dynamic_program2, CumsumCalculator

In [21]:
@njit([(float64[:], int64, int64)], cache=False)
def _simple_dynamic_program2(x, k, stable=1):
    n = len(x)
    assert k > 0
    if n//2 < k: # there can only be one cluster
        return np.zeros(n, dtype=np.int64)
    if k==1: # each node has its own cluster
        return np.arange(n)

    if stable==1:
        calculator = CountStableCumsumCalculator(x, k)
        out = __simple_dynamic_program2(n, k, calculator)
        print("#", calculator.count/n)
        return out
    elif stable==0:
        calculator = CountCumsumCalculator(x)
        out = __simple_dynamic_program2(n, k, calculator)
        print("#", calculator.count/n)
        return out
    else:
        assert False

In [22]:
18/0.5

36.0

In [23]:
11.8/0.425

27.764705882352942

In [24]:

for k in [10, 100, 1000, 10000]:
    start = time.time()
    _galil_park2(big_arr, k, stable=True)
    end = time.time()
    print("\t", end-start)
#10 8149951 4987435 1.634096684969328
#100 8997198 9554434 0.9416777592476959
#1000 10018823 11202448 0.8943422901851452
#10000 10906281 7632456 1.4289346705699975

10 7.467243 5.166735
	 0.517047643661499
100 8.369993 9.272056
	 0.538238525390625
1000 8.470119 11.976444
	 0.5570006370544434
10000 9.464793 8.635394
	 0.5660004615783691


In [25]:
for k in [10, 100, 1000, 10000]:
    start = time.time()
    _staggered3(big_arr, k, stable=True)
    end = time.time()
    print("\t", end-start)
    
# stable
#10 7.599915 1.699925
#100 8.774565 2.753608
#1000 9.555528 2.350685
#10000 10.762317 1.040191


10 7.288854 2.017855
	 0.39138054847717285
100 8.462782 3.062973
	 0.41899919509887695
1000 8.472159 3.433215
	 0.4180002212524414
10000 9.501233 2.304138
	 0.45102858543395996


In [29]:
for k in [10, 100, 1000, 10000]:
    start = time.time()
    _simple_dynamic_program2(big_arr, k, stable=True)
    end = time.time()
    print("\t", end-start)
    time.sleep(0.2)
# stable=False   
# 3.976547
#	 0.07240653038024902
# 5.648772
#	 0.08900070190429688
# 10.991427
#	 0.16099786758422852
# 214.374837
#	 2.7751386165618896



# stable=True

# 4.240748
#	 3.4537150859832764
# 26.091592
#	 1.2420427799224854
# 295.721681
#	 13.74938154220581
# 2879.835166
#	 136.4666247367859



# 4.240748
	 0.22540736198425293
# 26.091592
	 1.2300479412078857
# 295.721681
	 13.562055826187134
# 2879.835166
	 132.80891585350037


In [27]:
print("Efficiency in Million QF calculations per second")
214.374837/2.737

Efficiency in Million QF calculations per second


78.32474862988674

In [28]:
10.99/0.160

68.6875