## Preprocessing for stream sampler

In [1]:
# Size in bytes of one fixed-length chunk in the chunked output files (512 MiB).
CHUNK_SIZE_BYTE = 512 * 1024 * 1024

# Single place to relocate the dataset: every path below is derived from these.
DATA_DIR = "/mnt/nvme2/data/yahoo"
PREPROCESSED_DIR = f"{DATA_DIR}/preprocessed"

# input files (both are read as flat uint32 arrays by the loader functions)
EDGE_FILE = f"{DATA_DIR}/edges.bin"
DEGREE_FILE = f"{DATA_DIR}/degree.bin"

# files to write
STREAMING_EDGE_FILE = f"{PREPROCESSED_DIR}/streaming_edges.bin"
CHUNK_INFO_FILE = f"{PREPROCESSED_DIR}/chunks.txt"
TRAIN_NODE_FILE = f"{PREPROCESSED_DIR}/train.bin"
RANDOM_READ_EDGE_FILE = f"{PREPROCESSED_DIR}/random_read_edges.bin"
OFFSET_FILE = f"{PREPROCESSED_DIR}/offset.bin"

SEQUENTIAL_READ_EDGE_FILE = f"{PREPROCESSED_DIR}/sequential_read.bin"

In [2]:
import struct
from tqdm.auto import tqdm
import os
import numpy as np
import shutil
# also need jupyter and ipywidgets.

The original file must be preprocessed by yahoo.cpp before using this Jupyter notebook.

### Load from raw text data

In [3]:
# When True, the load cell reads the degree and edge files and computes the offsets.
LOAD_FROM_RAW = True

In [4]:
def read_in_all_edges(filepath):
    """Load the whole file at ``filepath`` as a flat array of native uint32 values.

    Prints a one-line status message before reading.
    """
    print("Reading edges from file")
    edge_values = np.fromfile(filepath, dtype=np.uint32)
    return edge_values

def read_in_degrees(filepath):
    """Load the whole file at ``filepath`` as a flat array of native uint32 values.

    Prints a one-line status message before reading.
    """
    print("Reading degrees from file")
    degree_values = np.fromfile(filepath, dtype=np.uint32)
    return degree_values

def compute_offsets(degrees):
    """Return the exclusive prefix sum of ``degrees`` as a uint64 array.

    ``offsets[0]`` is 0 and ``offsets[i]`` is ``degrees[0] + ... + degrees[i-1]``,
    so the result has the same length as ``degrees``.

    Args:
        degrees: 1-D sequence of non-negative integers (the notebook passes a
            uint32 numpy array).

    Returns:
        numpy.ndarray of dtype uint64; empty when ``degrees`` is empty.
    """
    offsets = np.zeros(len(degrees), dtype=np.uint64)
    if len(degrees) > 1:
        # Accumulate in uint64 directly into the result so the running total
        # cannot wrap at 32 bits and no second full-size array is allocated.
        np.cumsum(degrees[:-1], dtype=np.uint64, out=offsets[1:])
    return offsets

In [5]:
if LOAD_FROM_RAW:
    # Load the per-node degree array and the flat edge array, then derive the
    # per-node start offsets from the degrees.
    # NOTE: later cells use degrees / edges / offsets without checking this
    # flag, so they are undefined when LOAD_FROM_RAW is False.
    degrees = read_in_degrees(DEGREE_FILE)
    edges = read_in_all_edges(EDGE_FILE)
    offsets = compute_offsets(degrees)

Reading degrees from file
Reading edges from file


Compute offsets:   0%|          | 0/1413511394 [00:00<?, ?it/s]

### Create padded file for streaming sampler

In [6]:
# When True, the streaming cell writes the streaming edge file, chunk list and train-node file.
CREATE_STREAMING_SAMPLER_FILE = True

In [6]:
# find where to split the edges into chunks
def analysis_chunks(degrees, chunk_size=CHUNK_SIZE_BYTE):
    """Greedily group consecutive nodes into chunks of at most ``chunk_size`` bytes.

    Sizes are counted in 4-byte integers. Every chunk is charged 3 integers of
    fixed overhead, and every node is charged ``degree + 1`` integers.

    Args:
        degrees: per-node degree sequence, indexed by node id.
        chunk_size: chunk budget in bytes.

    Returns:
        List of ``(first_node, last_node)`` tuples (both inclusive) covering all
        nodes in order; an empty list when ``degrees`` is empty.

    Note:
        A single node whose own cost exceeds the budget is placed in a chunk by
        itself, and that chunk is then larger than ``chunk_size``.
    """
    max_ints = chunk_size // 4
    header_ints = 3
    progress_step = 1000000
    n_nodes = len(degrees)

    chunks = []
    if n_nodes == 0:
        return chunks

    start = 0
    current_chunk_size = header_ints
    pbar = tqdm(total=n_nodes)
    for i in range(n_nodes):
        # int() keeps the running size in Python integers, so it cannot wrap
        # at the width of the degree dtype.
        node_ints = int(degrees[i]) + 1
        # Only close a chunk that already holds at least one node; otherwise an
        # oversized node would produce the invalid range (start, start - 1).
        if i > start and current_chunk_size + node_ints > max_ints:
            chunks.append((start, i - 1))
            start = i
            current_chunk_size = header_ints
        current_chunk_size += node_ints
        # Advance the bar only for nodes that have actually been processed.
        if (i + 1) % progress_step == 0:
            pbar.update(progress_step)
    chunks.append((start, n_nodes - 1))
    pbar.update(n_nodes % progress_step)
    pbar.close()
    return chunks

def print_chunk_details(chunks, degrees, chunk_size=None):
    """Print a per-chunk summary table followed by the chunk count and list.

    Args:
        chunks: iterable of ``(first_node, last_node)`` pairs, both inclusive.
        degrees: per-node degree array, indexed by node id.
        chunk_size: chunk budget in bytes used for the fill-up percentage;
            defaults to the module-level ``CHUNK_SIZE_BYTE``.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE_BYTE
    print("Chunk Details")
    print("="*12*5)
    # The third column is the sum of degrees in the chunk, i.e. an edge count.
    print(f"{'start':>12}{'end':>12}{'n_edges':>12}{'size(bytes)':>12}{'fill up':>12}")
    for chunk in chunks:
        start = chunk[0]
        end = chunk[1]
        # Vectorised sum with a 64-bit accumulator instead of the Python
        # builtin sum(), which iterates element by element.
        n_edges = int(np.sum(degrees[start:end+1], dtype=np.uint64))
        # 2 header ints + (number of nodes + 1) ints + one int per edge.
        size = (1 + 1 + (end - start + 2) + n_edges) * 4
        print(f"{start:12d}{end:12d}{n_edges:12d}{size:12d}{size/chunk_size*100:9.2f}%")

    print("len(chunks)", len(chunks))
    print("chunks", chunks)

def create_streaming_edge_file(filepath, chunks, degrees, edges):
    """Write the chunked, zero-padded edge file used by the streaming sampler.

    Each chunk is a run of little-endian uint32 values laid out as::

        [total_nodes, first_node_id, offsets (total_nodes + 1 values), edges...]

    Offsets are counted in uint32 units from the start of the chunk; the first
    one equals the header length (``total_nodes + 3``), i.e. where the chunk's
    edges begin. Every chunk except the last is zero-padded to exactly
    ``CHUNK_SIZE_BYTE`` bytes; the last is padded up to a multiple of 512 bytes.

    Args:
        filepath: output file path (overwritten).
        chunks: ``(first_node, last_node)`` pairs, inclusive. Edges are consumed
            from the start of ``edges`` in chunk order, so the chunks are
            assumed to be contiguous and to begin at node 0.
        degrees: per-node degree array, indexed by node id.
        edges: flat array holding all nodes' edges back to back.

    Raises:
        ValueError: if a chunk other than the last is larger than
            ``CHUNK_SIZE_BYTE``, or a chunk is too large for 32-bit offsets.
    """
    edge_pos = 0  # index in `edges` of the first edge of the current chunk
    with open(filepath, "wb") as file, \
            tqdm(total=len(chunks), desc="Creating padded file") as pbar:
        for t, chunk in enumerate(chunks):
            first_node, last_node = chunk[0], chunk[1]
            total_nodes = last_node - first_node + 1
            chunk_degrees = np.asarray(degrees[first_node:last_node + 1])
            n_edges = int(chunk_degrees.sum(dtype=np.uint64))

            # 2 header ints plus (total_nodes + 1) offsets.
            header_ints = 2 + total_nodes + 1
            total_ints = header_ints + n_edges
            if total_ints > 0xFFFFFFFF:
                raise ValueError(
                    f"Chunk {t} holds {total_ints} integers; offsets do not fit in uint32"
                )

            # Build the whole chunk as one little-endian uint32 array instead of
            # packing every integer separately.
            payload = np.empty(total_ints, dtype="<u4")
            payload[0] = total_nodes
            payload[1] = first_node
            payload[2] = header_ints
            payload[3:header_ints] = (
                np.cumsum(chunk_degrees, dtype=np.uint64) + np.uint64(header_ints)
            )
            payload[header_ints:] = edges[edge_pos:edge_pos + n_edges]
            edge_pos += n_edges

            n_bytes = payload.nbytes
            if t != len(chunks) - 1:
                # A negative pad would silently write nothing and shift every
                # following chunk off its CHUNK_SIZE_BYTE boundary.
                if n_bytes > CHUNK_SIZE_BYTE:
                    raise ValueError(
                        f"Chunk {t} needs {n_bytes} bytes, more than CHUNK_SIZE_BYTE={CHUNK_SIZE_BYTE}"
                    )
                bytes_to_add = CHUNK_SIZE_BYTE - n_bytes
            else:
                # For the last chunk, pad to a multiple of 512 (0 if already aligned).
                bytes_to_add = (-n_bytes) % 512
            file.write(payload.tobytes())
            file.write(b'\x00' * bytes_to_add)
            pbar.update(1)

def create_chunk_file(filepath, chunks):
    """Write the chunk index as text: one line per chunk holding its last node id.

    Only ``chunk[1]`` is written; neither the chunk count nor the first node id
    of each chunk is stored.
    """
    with open(filepath, "w") as fh:
        fh.writelines(f"{chunk[1]}\n" for chunk in chunks)
        
def create_train_nodes(filepath, num_nodes=1400000):
    """Write the node ids ``0 .. num_nodes - 1`` as little-endian uint32 values.

    Args:
        filepath: output file path (overwritten).
        num_nodes: how many consecutive ids to write, starting at 0.
    """
    node_ids = np.arange(num_nodes, dtype="<u4")
    with open(filepath, "wb") as fh:
        fh.write(node_ids.tobytes())

In [24]:
def create_seq_read_edge_file(filepath, chunks, degrees, edges):
    """Write the chunked, zero-padded edge file used for sequential reads.

    Each chunk is a run of little-endian uint32 values laid out as::

        [total_nodes, first_node_id, deg(n0), edges(n0)..., deg(n1), edges(n1)..., ...]

    Every chunk except the last is zero-padded to exactly ``CHUNK_SIZE_BYTE``
    bytes; the last is padded up to a multiple of 512 bytes. One status line
    per chunk is printed: chunk index, payload bytes, padding bytes, total.

    Args:
        filepath: output file path (overwritten).
        chunks: ``(first_node, last_node)`` pairs, inclusive. Edges are consumed
            from the start of ``edges`` in chunk order, so the chunks are
            assumed to be contiguous and to begin at node 0.
        degrees: per-node degree array, indexed by node id.
        edges: flat array holding all nodes' edges back to back.

    Raises:
        ValueError: if a chunk other than the last is larger than
            ``CHUNK_SIZE_BYTE``.
    """
    edge_pos = 0  # index in `edges` of the first edge of the current chunk
    with open(filepath, "wb") as file, \
            tqdm(total=len(chunks), desc="Creating padded file") as pbar:
        for t, chunk in enumerate(chunks):
            first_node, last_node = chunk[0], chunk[1]
            total_nodes = last_node - first_node + 1
            chunk_degrees = np.asarray(degrees[first_node:last_node + 1])
            n_records = len(chunk_degrees)
            n_edges = int(chunk_degrees.sum(dtype=np.uint64))

            payload = np.empty(2 + n_records + n_edges, dtype="<u4")
            payload[0] = total_nodes
            payload[1] = first_node

            # Interleave degrees and edges without a Python-level loop: node k's
            # degree slot sits after k earlier degree slots and all earlier edges.
            body = payload[2:]
            edges_before = np.cumsum(chunk_degrees, dtype=np.int64) - chunk_degrees
            degree_slots = np.arange(n_records, dtype=np.int64) + edges_before
            is_edge_slot = np.ones(body.size, dtype=bool)
            is_edge_slot[degree_slots] = False
            body[degree_slots] = chunk_degrees
            # Boolean-mask assignment fills the remaining slots in order.
            body[is_edge_slot] = edges[edge_pos:edge_pos + n_edges]
            edge_pos += n_edges

            n_bytes = payload.nbytes
            file.write(payload.tobytes())
            if t != len(chunks) - 1:
                # A negative pad would silently write nothing and shift every
                # following chunk off its CHUNK_SIZE_BYTE boundary.
                if n_bytes > CHUNK_SIZE_BYTE:
                    raise ValueError(
                        f"Chunk {t} needs {n_bytes} bytes, more than CHUNK_SIZE_BYTE={CHUNK_SIZE_BYTE}"
                    )
                bytes_to_add = CHUNK_SIZE_BYTE - n_bytes
            else:
                # For the last chunk, pad to a multiple of 512 (0 if already aligned).
                bytes_to_add = (-n_bytes) % 512
            file.write(b'\x00' * bytes_to_add)
            pbar.update(1)
            print(t, n_bytes, bytes_to_add, n_bytes + bytes_to_add)

In [None]:
# Switch for (re)building the sequential-read edge file.
CREATE_SEQUENTIAL_READ_SAMPLER_FILE = True
if CREATE_SEQUENTIAL_READ_SAMPLER_FILE:
    # Split the nodes into chunks, print the per-chunk summary, then write the file.
    chunks = analysis_chunks(degrees)
    print_chunk_details(chunks, degrees)
    create_seq_read_edge_file(SEQUENTIAL_READ_EDGE_FILE, chunks, degrees, edges)

In [25]:
# NOTE(review): same call as in the flag-guarded cell above, but not gated by
# CREATE_SEQUENTIAL_READ_SAMPLER_FILE; running both cells writes the file twice.
create_seq_read_edge_file(SEQUENTIAL_READ_EDGE_FILE, chunks, degrees, edges)

Creating padded file:   0%|          | 0/60 [00:00<?, ?it/s]

0 536870492 420 536870912
1 536870248 664 536870912
2 536865952 4960 536870912
3 536869676 1236 536870912
4 536870904 8 536870912
5 536870892 20 536870912
6 536870840 72 536870912
7 536870788 124 536870912
8 536870856 56 536870912
9 536870844 68 536870912
10 536870624 288 536870912
11 536870884 28 536870912
12 536870896 16 536870912
13 536870660 252 536870912
14 536870720 192 536870912
15 536870904 8 536870912
16 536870744 168 536870912
17 536870876 36 536870912
18 536870840 72 536870912
19 536870860 52 536870912
20 536870892 20 536870912
21 536870884 28 536870912
22 536870388 524 536870912
23 536870836 76 536870912
24 536870888 24 536870912
25 536870712 200 536870912
26 536870900 12 536870912
27 536870868 44 536870912
28 536870816 96 536870912
29 536870792 120 536870912
30 536870724 188 536870912
31 536870792 120 536870912
32 536870776 136 536870912
33 536870712 200 536870912
34 536870848 64 536870912
35 536870908 4 536870912
36 536870908 4 536870912
37 536870908 4 536870912
38 536870

In [8]:
if CREATE_STREAMING_SAMPLER_FILE:
    # Compute the chunk boundaries and print the per-chunk summary.
    chunks = analysis_chunks(degrees)
    print_chunk_details(chunks, degrees)
    # Write the padded chunk file, the per-chunk last-node list, and the train-node ids.
    create_streaming_edge_file(STREAMING_EDGE_FILE, chunks, degrees, edges)
    create_chunk_file(CHUNK_INFO_FILE, chunks)
    create_train_nodes(TRAIN_NODE_FILE)

  0%|          | 0/1413511394 [00:00<?, ?it/s]

Chunk Details
       start         end     n_nodes size(bytes)     fill up
           0      617287   133600333   536870496   100.00%
      617288     1312180   133522667   536870252   100.00%
     1312181     1965516   133563150   536865956   100.00%
     1965517     2615657   133567276   536869680   100.00%
     2615658     3947779   132885602   536870908   100.00%
     3947780     8192152   129973348   536870896   100.00%
     8192153    12083098   130326762   536870844   100.00%
    12083099    16228722   130072071   536870792   100.00%
    16228723    20284312   130162122   536870860   100.00%
    20284313    24484493   130017528   536870848   100.00%
    24484494    28386289   130315858   536870628   100.00%
    28386290    32006505   130597503   536870888   100.00%
    32006506    36186961   130037266   536870900   100.00%
    36186962    40202824   130201800   536870664   100.00%
    40202825    44258051   130162451   536870724   100.00%
    44258052    48138725   130337050   5

Creating padded file:   0%|          | 0/60 [00:00<?, ?it/s]

## Preprocessing for random read sampler

In [9]:
# When True, the random-read cell writes the padded flat edge file and the offsets file.
CREATE_RANDOM_READ_SAMPLER_FILE = True

In [10]:
def create_random_read_edge_file(edges, output_file_path):
    """Dump ``edges`` to ``output_file_path`` and zero-pad the file to a multiple of 512 bytes.

    No padding is appended when the data already ends on a 512-byte boundary.
    A one-line summary with the number of padding bytes is printed.

    Args:
        edges: numpy array written in its in-memory byte order; must not be
            explicitly big-endian.
        output_file_path: output file path (overwritten).

    Raises:
        AssertionError: if ``edges`` has an explicitly big-endian dtype.
    """
    assert edges.dtype.byteorder != '>', "Numpy array shoule not be big endian"
    edges.tofile(output_file_path)
    # (-n) % 512 is 0 for aligned data; `512 - n % 512` would add a full extra
    # block in that case and made the check below always true.
    padding_needed = (-edges.nbytes) % 512

    if padding_needed > 0:
        with open(output_file_path, 'ab') as file:
            file.write(b'\x00' * padding_needed)

    print(f"File {output_file_path} has been padded with {padding_needed} bytes.")


def create_offsets_file(offsets, output_file_path):
    """Write ``offsets`` to ``output_file_path`` via its ``tofile`` method (no padding is added)."""
    offsets.tofile(output_file_path)

In [11]:
if CREATE_RANDOM_READ_SAMPLER_FILE:
    # Flat edge array (padded to a 512-byte multiple) plus the per-node start offsets.
    create_random_read_edge_file(edges, RANDOM_READ_EDGE_FILE)
    create_offsets_file(offsets, OFFSET_FILE)
    

File /mnt/nvme2/data/yahoo/preprocessed/random_read_edges.bin has been padded with 212 bytes.


### Verification

In [12]:
# Read the whole streaming file back into memory as uint32 for the checks in the next cell.
streaming_edges = np.fromfile(STREAMING_EDGE_FILE, dtype=np.uint32)

In [13]:
# Check every chunk of the streaming file: read its header and confirm that the
# offset table is non-decreasing. The chunk count is derived from the file
# length (the last chunk may be shorter than CHUNK_SIZE_BYTE) rather than
# hard-coded.
ints_per_chunk = CHUNK_SIZE_BYTE // 4
n_chunks = -(-len(streaming_edges) // ints_per_chunk)  # ceiling division
pbar = tqdm(total=n_chunks)
for t in range(n_chunks):
    start_pos = t * ints_per_chunk
    # int() keeps the index arithmetic below in Python integers; start_pos can
    # exceed the uint32 range, which must not be mixed with uint32 scalars.
    cur_total = int(streaming_edges[start_pos + 0])
    start_node = int(streaming_edges[start_pos + 1])
    cur_offsets = streaming_edges[start_pos + 2: start_pos + 2 + cur_total + 1]
    is_assending = bool(np.all(cur_offsets[1:] >= cur_offsets[:-1]))
    print(t, is_assending, start_node, cur_total)
    # Results are only reported; enable the assert to stop at the first bad chunk.
    # assert is_assending, f"Chunk {t} is not assending"
    pbar.update(1)
pbar.close()

  0%|          | 0/60 [00:00<?, ?it/s]

0 True 0 617287
1 True 617288 694892
2 True 1312181 653335
3 True 1965517 650140
4 True 2615658 1332121
5 True 3947780 4244372
6 True 8192153 3890945
7 True 12083099 4145623
8 True 16228723 4055589
9 True 20284313 4200180
10 True 24484494 3901795
11 True 28386290 3620215
12 True 32006506 4180455
13 True 36186962 4015862
14 True 40202825 4055226
15 True 44258052 3880673
16 True 48138726 3573372
17 True 51712099 4238927
18 True 55951027 4239932
19 True 60190960 4124705
20 True 64315666 4016804
21 True 68332471 3803423
22 True 72135895 4163408
23 True 76299304 4361693
24 True 80660998 3681286
25 True 84342285 3857128
26 True 88199414 4332213
27 True 92531628 4401749
28 True 96933378 3975039
29 True 100908418 3936447
30 True 104844866 3894199
31 True 108739066 4230979
32 True 112970046 4094319
33 True 117064366 3908989
34 True 120973356 4108386
35 True 125081743 18652043
36 True 143733787 49025741
37 True 192759529 51517542
38 True 244277072 48126824
39 True 292403897 46150918
40 True 3385

In [11]:
# Read the whole sequential-read file back into memory as uint32 for inspection.
sequential_edges = np.fromfile(SEQUENTIAL_READ_EDGE_FILE, dtype=np.uint32)



In [14]:
# Peek at the first 40 values of the sequential-read file.
# NOTE(review): the saved output (617288, 0, 617291 x5, 617585 x5, 617602, ...)
# matches the header + cumulative-offset layout of create_streaming_edge_file for
# degrees[:10], not the degree-interleaved layout create_seq_read_edge_file
# writes; presumably stale output from an earlier run; re-run to confirm.
sequential_edges[:40]

array([617288,      0, 617291, 617291, 617291, 617291, 617291, 617585,
       617585, 617585, 617585, 617585, 617602, 617606, 617626, 617626,
       617626, 617851, 618075, 618075, 618303, 618527, 618752, 618977,
       618977, 618993, 619218, 619218, 619448, 619679, 619679, 619911,
       620145, 620297, 620297, 620331, 620523, 620747, 620747, 620757],
      dtype=uint32)

In [19]:
# First ten node degrees, for comparison against the file preview above.
degrees[:10]


array([  0,   0,   0,   0, 294,   0,   0,   0,   0,  17], dtype=uint32)