# Introduction

In this notebook, we experiment with various compression methods for postings lists. 


In [8]:
from math import log

# Trace switch: set to False to silence the '.. ' messages printed by p().
DEBUG = True


def p(msg):
    """Print `msg` prefixed with '.. ' when DEBUG is truthy; otherwise do nothing."""
    if not DEBUG:
        return
    print('.. {}'.format(msg))

def ilog2(x):
    """Return the integer part of log2(x).

    For integer input the result is floor(log2(x)), computed exactly from the
    bit length, so it stays correct for arbitrarily large values. The previous
    float-based computation rounds large ints before taking the log
    (e.g. 2**60 - 1 becomes 2**60).

    Non-integer input falls back to int(log(x, 2.0)), which truncates toward
    zero and is subject to floating-point rounding.

    Raises:
        ValueError: if x is an integer <= 0 (math.log raises the same
            exception type for non-positive floats).
    """
    if isinstance(x, int):
        if x <= 0:
            raise ValueError('ilog2() requires a positive value, got {}'.format(x))
        # A positive int with bit length b satisfies 2**(b-1) <= x < 2**b.
        return x.bit_length() - 1
    return int(log(x, 2.0))

def encode_unary(vals):
    """Return the unary code for `vals` as a string of '0'/'1' characters.

    Each value v is written as v ones followed by a terminating zero.
    `vals` may be a single int or an iterable of ints; the codes of all
    values are concatenated in order.
    """
    # A bare int is treated as a one-element sequence.
    numbers = (vals,) if type(vals) == int else vals
    return "".join("1" * n + "0" for n in numbers)

def encode_binary(vals, width):
    """Return the fixed-width binary code for `vals` as a '0'/'1' string.

    Each value is written most-significant bit first using exactly `width`
    bits; only the low `width` bits of a value are emitted. `vals` may be a
    single int or an iterable of ints; the codes are concatenated in order.
    """
    # A bare int is treated as a one-element sequence.
    numbers = (vals,) if type(vals) == int else vals

    digits = []
    for n in numbers:
        # Walk the bit positions from width-1 down to 0.
        for shift in reversed(range(width)):
            digits.append("1" if (n >> shift) & 0x0001 else "0")

    return "".join(digits)


def encode_gamma(vals):
    """Gamma-encode every value in `vals`.

    Each value v >= 1 is split as v = 2**k_d + k_r with k_d = ilog2(v).
    Its code is the unary code of k_d followed by k_r written in k_d bits.

    Args:
        vals: iterable of integers, each >= 1.

    Returns:
        A list with one [unary_part, binary_part] pair of strings per value.

    Raises:
        ValueError: if a value is below 1, which is outside the coding range.
    """
    out_code = []
    for v in vals:
        # Reject out-of-range values up front with a clear message instead of
        # letting the logarithm fail with "math domain error".
        if v < 1:
            raise ValueError(
                'gamma code is only defined for values >= 1, got {}'.format(v))
        k_d = ilog2(v)
        k_r = v - (1 << k_d)
        p('val = {}, k_d = {}, k_r = {}'.format(v, k_d, k_r))
        out1 = encode_unary(k_d)
        out2 = encode_binary(k_r, k_d)
        out_code.append([out1, out2])
    return out_code

# out_list is a list of encodings, one per value; each encoding is a list of
# component strings.
def pp_binary(out_list, width=8, as_string=True):
    """Pretty-print a list of encodings.

    Args:
        out_list: list of encodings; each encoding is a list of strings.
        width: number of characters per printed group (as_string=True only).
        as_string: if True, concatenate every component into one string and
            print it in space-separated groups of `width` characters on a
            single line; if False, print each encoding on its own line.

    Raises:
        ValueError: if as_string is True and width < 1 (a non-positive width
            previously made the chunking loop run forever).
    """
    if not as_string:
        for component in out_list:
            print(component)
        return

    if width < 1:
        raise ValueError('width must be >= 1, got {}'.format(width))

    # Flatten: join the components of each encoding, then all encodings.
    bits = "".join("".join(parts) for parts in out_list)
    for start in range(0, len(bits), width):
        # Slicing clamps at the end, so the last group may be shorter.
        print("{} ".format(bits[start:start + width]), end="")
    print()




In [10]:
def test_gamma(values):
    """Gamma-encode `values` and print the result in both display formats."""
    codes = encode_gamma(values)
    # First one encoding per line, then the flattened bit string in groups.
    for flat in (False, True):
        pp_binary(codes, as_string=flat)


In [11]:
print(list(range(1, 16)))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [12]:
# Gamma-encode the integers 1..15 and print both views of the result.
test_gamma(range(1,16))

.. val = 1, k_d = 0, k_r = 0
.. val = 2, k_d = 1, k_r = 0
.. val = 3, k_d = 1, k_r = 1
.. val = 4, k_d = 2, k_r = 0
.. val = 5, k_d = 2, k_r = 1
.. val = 6, k_d = 2, k_r = 2
.. val = 7, k_d = 2, k_r = 3
.. val = 8, k_d = 3, k_r = 0
.. val = 9, k_d = 3, k_r = 1
.. val = 10, k_d = 3, k_r = 2
.. val = 11, k_d = 3, k_r = 3
.. val = 12, k_d = 3, k_r = 4
.. val = 13, k_d = 3, k_r = 5
.. val = 14, k_d = 3, k_r = 6
.. val = 15, k_d = 3, k_r = 7
['0', '']
['10', '0']
['10', '1']
['110', '00']
['110', '01']
['110', '10']
['110', '11']
['1110', '000']
['1110', '001']
['1110', '010']
['1110', '011']
['1110', '100']
['1110', '101']
['1110', '110']
['1110', '111']
01001011 10001100 11101011 01111100 00111000 11110010 11100111 11010011 10101111 01101110 111 


## Exercise

1. Implement a method `decode_gamma` that takes a gamma-encoded binary string and decodes it into a list of integers.
2. Implement the encoding and decoding methods for the delta, Rice, and variable-byte codes.