In [12]:
# PyArrow RecordBatch serialization and deserialization example

import pyarrow as pa
import pyarrow.ipc as ipc
import numpy as np

def test(num_rows=100_000_000, rows_per_batch=1, path='record_batch.arrow'):
    """Round-trip a two-column int64 RecordBatch through an Arrow IPC stream file.

    Builds an in-memory RecordBatch, writes it to ``path`` as a sequence of
    slices, drops the in-memory batch, then memory-maps the file and reads
    the first batch of the stream back.

    Called with no arguments, this reproduces the original hard-coded
    experiment: 100,000,000 rows written one row per IPC message to
    ``record_batch.arrow``.

    Args:
        num_rows: Number of rows in the generated batch.
        rows_per_batch: Number of rows written per IPC message; the last
            message may be shorter.
        path: File the IPC stream is written to and then read back from.

    Raises:
        ValueError: If ``num_rows`` or ``rows_per_batch`` is less than 1
            (an empty stream would leave nothing for the read-back step).
    """
    if num_rows < 1:
        raise ValueError(f"num_rows must be >= 1, got {num_rows}")
    if rows_per_batch < 1:
        raise ValueError(f"rows_per_batch must be >= 1, got {rows_per_batch}")

    # Column 1: the integers 1..num_rows, handed to Arrow from a NumPy array.
    numbers = pa.array(np.arange(1, num_rows + 1, dtype=np.int64))
    # Column 2: the constant 2, built from a Python list. The list form is
    # kept so the default run allocates the same way as before; something like
    # np.full(num_rows, 2, dtype=np.int64) would avoid the temporary list.
    constants = pa.array([2] * num_rows, type=pa.int64())

    # NOTE(review): the second column is named 'strings' although it holds
    # int64 values; the name is kept so the written schema is unchanged.
    batch = pa.RecordBatch.from_arrays([numbers, constants], ['numbers', 'strings'])
    total_rows = batch.num_rows

    # Serialize the batch to a file, one slice per IPC message.
    with pa.OSFile(path, 'wb') as sink, \
            ipc.new_stream(sink, batch.schema) as writer:
        for start in range(0, total_rows, rows_per_batch):
            # Clamp the final slice explicitly so it never runs past the end.
            length = min(rows_per_batch, total_rows - start)
            writer.write_batch(batch.slice(start, length))

    # Drop the local references to the source data before reading back.
    del batch, numbers, constants

    # Deserialize from the memory-mapped file. Only the FIRST batch of the
    # stream is read (rows_per_batch rows at most); the result is discarded.
    with pa.memory_map(path, 'rb') as source, \
            ipc.open_stream(source) as reader:
        reader.read_next_batch()

from memory_profiler import memory_usage

# Run test() under memory_profiler and keep the memory samples it returns.
# NOTE(review): per memory_profiler's documentation the samples are presumably
# in MiB, taken at its default polling interval — confirm for the installed
# version.
mem_usage = memory_usage(test)

# Print the sampled values.
print(mem_usage)

[34.390625, 37.171875, 784.046875, 714.125, 711.921875, 694.34375, 683.015625, 679.703125, 685.109375, 748.84375, 832.859375, 878.203125, 946.15625, 1012.359375, 1059.90625, 1108.640625, 1082.21875, 689.90625, 319.734375, 53.328125, 53.328125, 54.796875, 54.984375, 55.140625, 55.296875, 55.484375, 55.609375, 56.71875, 56.875, 57.03125, 57.1875, 57.34375, 57.53125, 57.6875, 57.84375, 58.0, 58.359375, 58.515625, 58.671875, 58.828125, 58.984375, 59.171875, 59.328125, 59.484375, 59.640625, 59.828125, 59.984375, 60.140625, 60.296875, 60.484375, 60.640625, 60.796875, 60.953125, 61.140625, 61.296875, 61.453125, 61.578125, 61.734375, 61.890625, 62.078125, 62.25, 62.40625, 62.5625, 62.71875, 62.8125, 62.9375, 63.09375, 63.25, 63.4375, 63.59375, 63.75, 63.90625, 64.0625, 64.21875, 64.375, 64.53125, 64.6875, 64.84375, 65.03125, 65.15625, 65.3125, 65.46875, 65.65625, 65.8125, 65.96875, 66.125, 66.265625, 66.421875, 66.546875, 66.703125, 66.859375, 67.015625, 67.171875, 67.359375, 67.515625, 67.640