In [10]:
import struct
from math import sqrt
from utils import round_up

## Encoding/decoding, varbyte8 (variable-byte integers)

In [1]:
def encode_int(a):
    """Encode a non-negative integer as a variable-byte sequence.

    The value is split into 7-bit groups that are emitted least-significant
    group first.  Every byte except the last carries a bare 7-bit group
    (high bit clear); the last byte has its high bit set and thereby marks
    the end of the number.

    Args:
        a: Non-negative integer to encode.

    Returns:
        bytes: The encoded value, always at least one byte long.

    Raises:
        ValueError: If ``a`` is negative.  Without this check a value in
            -128..-1 would be packed into a single byte with the terminator
            bit missing, silently corrupting the stream.
    """
    if a < 0:
        raise ValueError(
            'encode_int expects a non-negative integer, got {!r}'.format(a))

    base = 2 ** 7
    chunks = []
    while a >= base:
        # Peel off the lowest 7 bits; the remainder keeps the higher groups.
        a, low = divmod(a, base)
        chunks.append(struct.pack('B', low))
    # Final (most significant) group, flagged with the high bit.
    chunks.append(struct.pack('B', base + a))

    return b''.join(chunks)

def decode_int(b):
    """Decode one variable-byte integer from the start of ``b``.

    Bytes are read as base-128 digits, least significant first.  A byte
    whose value is at least 128 is the last digit of the number: its low
    seven bits are added and decoding stops.

    Args:
        b: Iterable of byte values (e.g. a ``bytes`` object).

    Returns:
        tuple: ``(value, consumed)`` where ``consumed`` is the number of
        bytes read.  If the input runs out before a terminating byte is
        seen, the value accumulated so far is returned together with the
        total number of bytes in the input.
    """
    radix = 2 ** 7
    total = 0
    consumed = 0
    weight = 1  # place value of the digit currently being read

    for consumed, digit in enumerate(b, start=1):
        is_last = digit >= radix
        total += (digit - radix if is_last else digit) * weight
        if is_last:
            break
        weight *= radix

    return total, consumed

## Output
Serialize integers, strings, lists, entrances, jumps and blocks to bytes.

In [54]:
# Struct format and byte width of a fixed-size ("raw") unsigned integer.
# The width is derived from the format so the two cannot drift apart.
int_format = 'I'
int_size = struct.calcsize(int_format)
# Bytes occupied by one jump record: a single raw integer (see write_jump).
jump_size = 1 * int_size

# Switches for the serialised format.
ENABLE_VARBITE = True   # write_int uses encode_int instead of a raw integer
ENABLE_LIST_CMP = True  # write_list stores differences between neighbours
ENABLE_JUMPS = True     # write_block interleaves jump records with entrances

def write_int_raw(a):
    """Pack ``a`` as a single fixed-width integer using ``int_format``.

    Returns:
        bytes: The packed value.
    """
    packer = struct.Struct(int_format)
    return packer.pack(a)

def write_int(a):
    """Serialise an integer with the currently selected integer encoding.

    Uses ``encode_int`` when ``ENABLE_VARBITE`` is set and the fixed-width
    ``write_int_raw`` otherwise.

    Returns:
        bytes: The encoded integer.
    """
    encoder = encode_int if ENABLE_VARBITE else write_int_raw
    return encoder(a)

def write_str(s):
    """Serialise a string as a length-prefixed UTF-8 byte sequence.

    The prefix (written with ``write_int``) is the number of encoded bytes
    that follow, so it stays correct for non-ASCII text where one character
    occupies several bytes.

    Args:
        s: The string to serialise.

    Returns:
        bytes: Length prefix followed by the UTF-8 encoding of ``s``.
    """
    # NOTE(review): the prefix counts bytes, not characters; confirm that the
    # matching reader interprets it as a byte count.
    b = bytes(s, encoding='utf8')
    return write_int(len(b)) + b
    
def write_list(l):
    """Serialise a list of integers with a byte-length prefix.

    When ``ENABLE_LIST_CMP`` is set, the first element is stored as is and
    every later element is stored as the difference to its predecessor.
    Each resulting number is written with ``write_int``; the prefix (also
    written with ``write_int``) is the total byte length of those encoded
    numbers, not the element count.

    Args:
        l: Sequence of integers.  May be empty, in which case only a zero
            length prefix is written.

    Returns:
        bytes: Length prefix followed by the encoded elements.
    """
    # The length guard keeps the empty list from failing on ``l[0]``.
    if ENABLE_LIST_CMP and len(l) > 0:
        l = [l[0]] + [l[i] - l[i - 1] for i in range(1, len(l))]

    b = b''.join(write_int(i) for i in l)
    return write_int(len(b)) + b

def write_entrance(doc_id, coords):
    """Serialise one entrance: a document id followed by its coordinate list.

    Args:
        doc_id: Document id, written with ``write_int``.
        coords: Coordinates, written with ``write_list``.

    Returns:
        bytes: The encoded id immediately followed by the encoded list.
    """
    id_bytes = write_int(doc_id)
    coord_bytes = write_list(coords)
    return id_bytes + coord_bytes

def write_jump(offset):
    """Serialise a jump record as one fixed-width integer.

    Args:
        offset: The value to store, written with ``write_int_raw``.

    Returns:
        bytes: The packed offset.
    """
    jump_bytes = write_int_raw(offset)
    return jump_bytes

def write_block(block):
    """Serialise a block of entrances, optionally interleaved with jumps.

    Layout of the result::

        raw int    number of bytes that follow this header
        int        number of entrances (write_int)
        entrance   one per doc id, in ascending doc-id order
        [jump]     after every ``o``-th entrance, when jumps are in use

    Jumps are written only when ``ENABLE_JUMPS`` is set and the spacing
    ``o`` is greater than 2.  A jump placed after entrance ``i`` stores the
    combined byte length of entrances ``i+1 .. min(l-1, i+o-1)``.

    Args:
        block: Mapping of doc id to the coordinate list passed on to
            ``write_entrance``.  May be empty.

    Returns:
        bytes: The size header followed by the block payload.
    """
    l = len(block)

    # Jump spacing.  A block with fewer than two entrances cannot hold a
    # jump, and the sqrt/division below would fail for it, so it gets none.
    # NOTE(review): round_up is assumed to be a ceiling function - confirm.
    o = 0
    if ENABLE_JUMPS and l > 1:
        p = round_up(sqrt(l - 1))  # jump number
        o = round_up((l - 1) / p)  # jump len

    doc_ids = sorted(block.keys())
    entrances = [write_entrance(doc_id, block[doc_id]) for doc_id in doc_ids]

    # ends[i] is the total byte length of entrances[0..i].
    ends = []
    running = 0
    for entrance in entrances:
        running += len(entrance)
        ends.append(running)

    parts = [write_int(l)]
    for i, entrance in enumerate(entrances):  # add jumps and join all bytes
        parts.append(entrance)
        if o > 2 and i % o == 0 and i < l - 1:
            to = min(l - 1, i + o - 1)
            parts.append(write_jump(ends[to] - ends[i]))

    # The header is taken from the bytes actually produced: small blocks and
    # blocks written with jumps disabled contain no jump records at all, so
    # the jump count must not be assumed.
    payload = b''.join(parts)
    return write_int_raw(len(payload)) + payload

In [47]:
# Smoke test: serialise a ten-document block where every document has
# the coordinates [1, 2, 8].
b = write_block({doc_id: [1, 2, 8] for doc_id in range(10)})

In [49]:
# Round-trip 123 through the fixed-width writer and unpack it again.
b = write_int_raw(123)
struct.Struct('I').unpack(b)

(123,)

In [56]:
# First byte of an encoded string is its length prefix; with ENABLE_VARBITE
# on, a length of 3 is written as the single byte 128 + 3.
write_str('asd')[0]

131

In [58]:
# Show the individual byte values produced for 33493, one per line.
print(*write_int(33493), sep='\n')

85
5
130
