# Heaps
A special, complete binary tree that satisfies the heap property - the key at each node is at least as great as the keys stored at its children (max-heap)   
AKA a priority queue

When implemented as array:   
- left child index = $2i + 1$
- right child index = $2i + 2$   

Time Complexity:   
- look up max/min: $O(1)$
- insertion/extraction: $O(\log n)$

## Tips:
- Use a heap when **all you care about** is the **largest** or **smallest** elements, and you **do not need** to support fast lookup, delete, or search operations for arbitrary elements
- A heap is a good choice when you need to compute the $k$ **largest** or $k$ **smallest** elements in a collection. For the former, use a min-heap, for the latter, use a max-heap

In [2]:
import heapq
import itertools
import math
import random
from typing import Iterator, List, Tuple

## Libraries
- heapq provides a min-heap
- for a max-heap, take negative of values to get same effect

In [3]:
# Demo of the core heapq operations on a list of 10 random ints in [0, 20].
# Every call below mutates or reads the same list L, so statement order matters.
L = [random.randint(0, 20) for i in range(10)]
print('L:                          ', L)

# transform the collection L into a min-heap, in place
heapq.heapify(L)
print('heapq.heapify(L):           ', L)

# k largest (here k = 3); returns a new list and leaves L unchanged
print('heapq.nlargest(k, L):       ', heapq.nlargest(3, L))

# k smallest (here k = 3); returns a new list and leaves L unchanged
print('heapq.nsmallest(k, L):      ', heapq.nsmallest(3, L))

# push new element on heap
heapq.heappush(L, 10)
print('heapq.heappush(L, value):   ', L)

# extract (remove and return) the smallest element of the heap
print('heapq.heappop(L):           ', heapq.heappop(L))

# push value on to heap then extract smallest element
# (print's arguments are evaluated left to right, so L is shown after the call)
print('heapq.heappushpop(L, value):', heapq.heappushpop(L, 15), ' --> ', L)

# look up the min value without extracting: it is always at index 0
print('L[0]:                       ', L[0], ' --> ', L)

L:                           [12, 11, 3, 17, 12, 20, 15, 17, 6, 7]
heapq.heapify(L):            [3, 6, 12, 11, 7, 20, 15, 17, 17, 12]
heapq.nlargest(k, L):        [20, 17, 17]
heapq.nsmallest(k, L):       [3, 6, 7]
heapq.heappush(L, value):    [3, 6, 12, 11, 7, 20, 15, 17, 17, 12, 10]
heapq.heappop(L):            3
heapq.heappushpop(L, value): 6  -->  [7, 10, 12, 11, 12, 20, 15, 17, 17, 15]
L[0]:                        7  -->  [7, 10, 12, 11, 12, 20, 15, 17, 17, 15]


### Process Stream of Strings and keep K longest

In [4]:
def top_k(k: int, stream: Iterator[str]) -> List[str]:
    '''
    Return the k longest strings read from stream, ordered shortest to longest.

    A min-heap of at most k (length, string) entries is maintained, so the
    shortest retained string is always the next one to be evicted. Ties on
    length are broken by ordinary string comparison.

    stream may be a one-shot iterator or any re-iterable container such as a
    list. If it yields fewer than k strings, all of them are returned.
    '''
    # Read everything through ONE iterator. Calling islice() and then looping
    # over a list directly would restart from the first item and feed the
    # first k strings into the heap a second time.
    strings = iter(stream)

    # entries are compared by their length first
    min_heap = [(len(s), s) for s in itertools.islice(strings, k)]
    heapq.heapify(min_heap)

    for next_string in strings:
        # push next string and pop the shortest string in one step
        heapq.heappushpop(min_heap, (len(next_string), next_string))

    # elements in heap are tuples; keep only the string part
    return [s for _, s in sorted(min_heap)]

In [5]:
# sample input reused by the next cell
k = 3
stream = ['cat', 'banana', 'george', 'aaaaaaaaaaaaa', 'I', 'couat', 'afgeareas']

# preview the (length, string) entries built from the first k strings
s = [(len(word), word) for word in itertools.islice(stream, k)]
print(s)

[(3, 'cat'), (6, 'banana'), (6, 'george')]


In [6]:
# top_k is annotated to take an Iterator, so hand it a one-shot iterator
# over the list rather than the list itself
top_k(k, iter(stream))

['george', 'afgeareas', 'aaaaaaaaaaaaa']

Time complexity is $O(n\log k)$ because it takes $O(\log k)$ time to process each string and there are $n$ strings

### 10.1: Merge Sorted Files
Write a program that takes as input a set of sorted sequences and computes the union of those sequences as a sorted sequence   
e.g.: $[3, 5, 7], [0, 6], [0, 6, 28]$ --> $[0, 0, 3, 5, 6, 6, 7, 28]$


In [7]:
def merge_sorted_arrays(sorted_arrays: List[List[int]]) -> List[int]:
    '''
    Merge already-sorted arrays into one sorted list (duplicates are kept).

    The heap holds at most one pending (value, array index) entry per input
    array; popping the minimum and refilling from the same array yields the
    values in ascending order.

    Empty input arrays are skipped. None is used as the "iterator exhausted"
    marker, so the arrays must not contain None.
    '''
    min_heap: List[Tuple[int, int]] = []  # value, array index

    # build a list of iterators for each array in sorted_arrays
    sorted_array_iters = [iter(x) for x in sorted_arrays]

    # put first element from each iterator in min_heap
    for i, it in enumerate(sorted_array_iters):
        first_element = next(it, None)   # default value of None if iterator is exhausted
        if first_element is not None:
            heapq.heappush(min_heap, (first_element, i))

    result = []
    while min_heap:
        min_value, array_index = heapq.heappop(min_heap)           # get smallest value
        result.append(min_value)
        next_element = next(sorted_array_iters[array_index], None) # get next element from array with smallest element
        if next_element is not None:
            heapq.heappush(min_heap, (next_element, array_index))

    return result

def merge_sorted_arrays_pythonic(sorted_arrays: List[List[int]]) -> List[int]:
    '''
    Merge already-sorted arrays into one sorted list using heapq.merge.

    heapq.merge returns a lazy iterator over the merged values; list()
    materializes it so the return type matches merge_sorted_arrays.
    '''
    return list(heapq.merge(*sorted_arrays))

Time complexity is $O(n\log k)$ and space complexity is $O(k)$ where $k < n$

In [8]:
# worked example from the problem statement and its expected merge
sorted_array_list = [[3, 5, 7], [0, 6], [0, 6, 28]]
ans = [0, 0, 3, 5, 6, 6, 7, 28]
print(*sorted_array_list)

# both implementations must produce the same merged result
assert ans == merge_sorted_arrays(sorted_array_list)
assert ans == merge_sorted_arrays_pythonic(sorted_array_list)

[3, 5, 7] [0, 6] [0, 6, 28]


### 10.2: Sort an Increasing-Decreasing Array
alternates between increasing and decreasing k times       
e.g.: $[57, 131, 493, 294, 221, 339, 418, 452, 442, 190]$ where $k=4$


In [24]:
def sort_k_increasing_decreasing_array(A: List[int]) -> List[int]:
    '''
    Sort an array made of alternating increasing and decreasing runs.

    A is cut into runs (the first run is treated as increasing), each
    decreasing run is reversed so that every piece is sorted, and the
    pieces are then merged with merge_sorted_arrays.
    '''
    runs = []
    ascending = True
    run_start = 0
    size = len(A)

    for boundary in range(1, size + 1):
        if boundary < size:
            rising = A[boundary - 1] < A[boundary]
            if rising == ascending:
                continue  # the pair still follows the current run's direction

        # the current run ends just before `boundary`
        if ascending:
            runs.append(A[run_start:boundary])
        else:
            # walk backwards over the run so the stored piece is sorted
            runs.append(A[boundary - 1:run_start - 1:-1])
        run_start = boundary
        ascending = not ascending

    # merge the sorted pieces
    return merge_sorted_arrays(runs)

# check against the example array from the problem statement (k = 4)
assert sort_k_increasing_decreasing_array([57, 131, 493, 294, 221, 339, 418, 452, 442, 190]) == [57, 131, 190, 221, 294, 339, 418, 442, 452, 493]

In [25]:
def sort_k_increasing_decreasing_array(A: List[int]) -> List[int]:
    '''
    Sort an array made of alternating increasing and decreasing runs.

    A is scanned once with two indices. Each run is sliced out (the first run
    is treated as increasing), decreasing runs are reversed so every piece is
    sorted, and the pieces are combined with a k-way merge.

    Works for any list, including empty, single-element, and lists whose
    last run is increasing.
    '''
    # decompose A into list of sorted subarrays
    sorted_arrays = []
    increasing = True
    start_idx = 0
    n = len(A)

    while start_idx < n:
        end_idx = start_idx + 1
        # find end of increasing/decreasing sub array; the `end_idx < n`
        # guard stops the scan at the end of A instead of indexing past it
        if increasing:
            while end_idx < n and A[end_idx - 1] < A[end_idx]:
                end_idx += 1
            sorted_arrays.append(A[start_idx:end_idx])
        else:
            while end_idx < n and A[end_idx - 1] >= A[end_idx]:
                end_idx += 1
            # reverse the decreasing run so it is sorted ascending
            sorted_arrays.append(A[start_idx:end_idx][::-1])

        start_idx = end_idx
        increasing = not increasing   # runs alternate direction

    # k-way merge of the sorted pieces (same result as merge_sorted_arrays)
    return list(heapq.merge(*sorted_arrays))

# check the redefined function against the example array from the problem statement
assert sort_k_increasing_decreasing_array([57, 131, 493, 294, 221, 339, 418, 452, 442, 190]) == [57, 131, 190, 221, 294, 339, 418, 442, 452, 493]

Time complexity is $O(n\log k)$ and space complexity is $O(k)$ where $k < n$

### 10.3: Sort Almost Sorted Array
Sort an array where each element is at most $k$ away from its correct position     
e.g.: $[3, -1, 2, 6, 4, 5, 8]$, each element is no more than 2 spots away from correct position

In [26]:
def sort_almost_sorted_array(sequence: Iterator[int], k: int) -> List[int]:
    '''
    Sort a sequence in which every element is at most k positions away from
    its sorted position.

    guaranteed to find next smallest value by checking k+1 values: the heap
    holds k values and each new value is compared against them before the
    minimum is emitted.

    sequence may be a one-shot iterator or any re-iterable container such as
    a list.
    '''
    # Read everything through ONE iterator so a list argument is not read
    # again from its first element after the heap has been seeded.
    values = iter(sequence)

    # seed the heap with the first k elements
    min_heap: List[int] = list(itertools.islice(values, k))
    heapq.heapify(min_heap)

    result: List[int] = []
    for next_element in values:
        # push the new value, then emit the smallest of the k+1 candidates
        result.append(heapq.heappushpop(min_heap, next_element))

    # sequence exhausted: drain what is left in ascending order
    while min_heap:
        result.append(heapq.heappop(min_heap))

    return result

Time complexity is $O(n\log k)$ and space complexity is $O(k)$ where $k < n$

In [27]:
# check against the example from the problem statement (each element at most 2 spots away)
assert sort_almost_sorted_array(iter([3, -1, 2, 6, 4, 5, 8]), k=2) == [-1, 2, 3, 4, 5, 6, 8]

### 10.4: Compute K Closest Stars

In [None]:
class Star:
    '''
    A star at Cartesian coordinates (x, y, z), with Earth at the origin.

    Stars are ordered by their distance from Earth, so `a < b` means that
    a is closer than b.
    '''
    def __init__(self, x: float, y: float, z: float) -> None:
        self.x, self.y, self.z = x, y, z

    @property
    def distance(self) -> float:
        # Euclidean distance from the origin
        return math.sqrt(self.x**2 + self.y**2 + self.z**2)

    # The property was originally published under the misspelled name
    # `disance`; keep it as an alias so existing callers continue to work.
    disance = distance

    def __lt__(self, rhs: 'Star') -> bool:
        return self.distance < rhs.distance

    def __str__(self) -> str:
        return f'Star at position ({self.x}, {self.y}, {self.z}) is {self.distance} from Earth'


def closest_k_stars(stars: Iterator[Star], k: int) -> List[Star]:
    '''
    Return the k stars closest to Earth, ordered from closest to furthest.

    heapq only provides a min-heap, so distances are stored negated: the
    heap's root is then the furthest star kept so far, which is the one to
    drop whenever a (k+1)-th star has been added.
    '''
    furthest_on_top: List[Tuple[float, Star]] = []  # (-distance, star)

    for candidate in stars:
        entry = (-candidate.disance, candidate)
        heapq.heappush(furthest_on_top, entry)
        # one star too many: discard the furthest one
        if len(furthest_on_top) == k + 1:
            heapq.heappop(furthest_on_top)

    # largest negated distance first, i.e. closest star first
    ranked = heapq.nlargest(k, furthest_on_top)
    return [star for _, star in ranked]



Time complexity is $O(n\log k)$ and space complexity is $O(k)$ where $k < n$

In [None]:
# two fixed stars plus ten stars with each coordinate drawn at random from [0, 20)
stars = [Star(1, 1, 1), Star(2, 10, 5)]
for i in range(10):
    stars.append(Star(*(20 * random.random() for _ in range(3))))

# show the three stars closest to Earth
k_closest = closest_k_stars(iter(stars), k=3)
for star in k_closest:
    print(star)

Star at position (1, 1, 1) is 1.7320508075688772 from Earth
Star at position (2, 10, 5) is 11.357816691600547 from Earth
Star at position (13.346258568203258, 5.598600386328984, 0.6712975026057189) is 14.488532858510782 from Earth


#### Variant: 
Design an $O(n\log k)$ algorithm that reads a sequence of $n$ elements and, for each element starting with the $k$th, prints the $k$th largest element read up to that point.   
e.g. = [5, 2, 10, 3, 4, 8, 1, 5] k = 3   
5   
5, 2    
10, 5, **2**   
10, 5, **3**, 2   
10, 5, **4**, 3, 2   
10, 8, **5**, 4, 3, 2   
10, 8, **5**, 4, 3, 2, 1   
10, 8, **5**, 5, 4, 3, 2, 1     

In [31]:
def kth_largest(sequence: Iterator[int], k: int) -> List[int]:
    '''
    Return the running k-th largest value of sequence.

    The i-th entry of the result is the k-th largest of the first k + i
    elements, so the result has max(0, n - k + 1) entries for n elements.
    An empty list is returned when k <= 0 or when the sequence has fewer
    than k elements, since no k-th largest value exists in those cases.

    sequence may be a one-shot iterator or any re-iterable container such as
    a list.
    '''
    if k <= 0:
        return []

    # Read everything through ONE iterator so a list argument is not read
    # again from its first element after the heap has been seeded.
    values = iter(sequence)

    # min-heap of the k largest values seen so far; its root is the k-th largest
    min_heap = list(itertools.islice(values, k))
    if len(min_heap) < k:
        return []
    heapq.heapify(min_heap)

    result = [min_heap[0]]

    # iterate through rest of sequence
    for x in values:
        # heappushpop leaves the heap untouched when x is not larger than the
        # root, otherwise it replaces the root with x
        heapq.heappushpop(min_heap, x)
        result.append(min_heap[0])

    return result

# run on the example sequence from the problem statement with k = 3
kth_largest(iter([5, 2, 10, 3, 4, 8, 1, 5]), k=3)

[2, 3, 4, 5, 5, 5]

### 10.5: Compute the Median of Online Data

In [36]:
def online_median(sequence: Iterator[int]) -> List[float]:
    '''
    Return the running median after each element of sequence is read.

    The values seen so far are split into two heaps: `upper` is a min-heap of
    the larger half and `lower` is a max-heap (values stored negated) of the
    smaller half. `upper` holds either the same number of values as `lower`
    or exactly one more.
    '''
    upper: List[int] = []   # min-heap, larger half
    lower: List[int] = []   # max-heap via negation, smaller half
    medians = []

    for value in sequence:
        # route the new value through `upper`, moving its smallest into `lower`
        demoted = heapq.heappushpop(upper, value)
        heapq.heappush(lower, -demoted)

        # rebalance so that `lower` is never the bigger heap
        if len(lower) > len(upper):
            heapq.heappush(upper, -heapq.heappop(lower))

        if len(upper) == len(lower):
            # even count: average the two middle values
            medians.append((upper[0] - lower[0]) / 2)
        else:
            # odd count: the middle value sits on top of `upper`
            medians.append(upper[0])

    return medians

# the running median is an int after an odd number of values and a float after an even number
assert online_median(iter([1, 0, 3, 5, 2, 0, 1])) == [1, 0.5, 1, 2, 2, 1.5, 1]

Time complexity is $O(\log n)$ per element read, corresponding to insertion into and extraction from a heap

### 10.6: Compute K Largest Elements in a Max-Heap
do not modify the heap   
Brute force algorithm is to do k max-extracts but that has time complexity $O(k\log n)$ and modifies array  



In [None]:
def largest_k_elements(heap: List[int], k: int) -> List[int]:
    '''
    Return the k largest values of an array-backed max-heap, largest first,
    without modifying the heap.

    A separate candidate max-heap of (-value, index) pairs is kept. It starts
    with the root; each time a candidate is extracted, its children in `heap`
    (indices 2i + 1 and 2i + 2) become the new candidates.

    Returns [] when k <= 0 or heap is empty. When k exceeds len(heap), every
    element is returned.
    '''
    if k <= 0 or not heap:
        return []

    # -value, index; the largest element of a max-heap is its first element
    candidate_max_heap: List[Tuple[int, int]] = [(-heap[0], 0)]

    result: List[int] = []
    # Never extract more values than the heap holds; otherwise the candidate
    # heap would run empty and popping from it would fail.
    for _ in range(min(k, len(heap))):
        negated_value, candidate_idx = heapq.heappop(candidate_max_heap)
        result.append(-negated_value)

        left_child_idx = candidate_idx * 2 + 1
        for child_idx in (left_child_idx, left_child_idx + 1):
            if child_idx < len(heap):
                heapq.heappush(candidate_max_heap, (-heap[child_idx], child_idx))

    return result




Time complexity is $O(k \log k)$  because k iterations and each push to candidate max heap take $\log k$ time. Space complexity is $O(k)$ where $k < n$

In [None]:
# the input list is laid out as a max-heap; expect its 4 largest values in descending order
assert largest_k_elements([561, 314, 401, 28, 156, 359, 271, 11, 3], 4) == [561, 401, 359, 314]