In [1]:
import pyarrow.parquet as parquet
import pyarrow.feather as feather
import pyarrow.orc as orc
import pyarrow.csv as csv
import gc
import time
import os
import psutil
import json

In [2]:
def get_process_memory():
    """Return the current process's RSS as reported by psutil.

    Looks up this interpreter's own PID and reads the ``rss`` field of
    ``psutil.Process(...).memory_info()``.

    Returns:
        The ``rss`` value for the running process (bytes, per psutil).
    """
    return psutil.Process(os.getpid()).memory_info().rss

def memory_func(func, *args, **kwargs):
    """Run ``func`` and report how much the process RSS changed across the call.

    The reading is the difference between two ``get_process_memory()`` samples
    taken immediately before and after ``func(*args, **kwargs)``, converted
    from bytes to MB (1024 ** 2 bytes). The figure is printed and returned.

    Args:
        func: Callable to measure.
        *args: Positional arguments forwarded to ``func`` unchanged.
        **kwargs: Keyword arguments forwarded to ``func`` unchanged.

    Returns:
        A ``(result, consumed_mem_MB)`` tuple: whatever ``func`` returned, and
        the RSS delta in MB as a float (negative if RSS shrank during the call).

    Raises:
        Whatever ``func`` raises; no measurement is printed in that case.
    """
    # Run the collector, then pause before sampling the baseline.
    # NOTE(review): the 5 s pause is presumably there to let memory usage
    # settle after collection — confirm the intended duration.
    gc.collect()
    time.sleep(5)

    pre_mem = get_process_memory()
    result = func(*args, **kwargs)
    post_mem = get_process_memory()

    consumed_mem_MB = (post_mem - pre_mem) / (1024 ** 2)

    # functools.partial objects and callable instances have no __name__;
    # fall back to repr() so a finished measurement is not lost to an
    # AttributeError raised while formatting the report line.
    try:
        label = func.__name__
    except AttributeError:
        label = repr(func)
    print(f'{label}: Consumed: {consumed_mem_MB} MB.')

    return result, consumed_mem_MB


# Input files for the benchmark: one file per storage format, all sharing the
# same directory and base name. Change DATA_DIR to point every path at a
# different dataset directory in one edit.
DATA_DIR = 'data/small/day1'
reddit_place = '2022_place_canvas_history'
parquet_file = f'{DATA_DIR}/{reddit_place}.parquet'
feather_file = f'{DATA_DIR}/{reddit_place}.feather'
orc_file = f'{DATA_DIR}/{reddit_place}.orc'
csv_file = f'{DATA_DIR}/{reddit_place}.csv'

In [4]:

# Benchmark driver: runs 100 rounds; each round measures the memory consumed
# by calling read_table on every file format, and all readings are written to
# memtest.json at the end.
# NOTE(review): `read_table` is not defined in any cell visible here (the cell
# numbering jumps from In [2] to In [4]) — confirm it exists on a fresh kernel.
all_tests = []
for i in range(100):
    # Parallel lists: each path is paired with its format label by zip() below.
    filenames = [parquet_file, orc_file, feather_file, csv_file]
    filetypes = ['parquet', 'orc', 'feather', 'csv']
    results = []
    for filename, filetype in zip(filenames, filetypes):
        # memory_func returns (read_table's return value, consumed MB).
        # `result` stays bound until the next iteration rebinds it, so the
        # previously loaded object is still alive while the next baseline
        # reading is taken. NOTE(review): confirm this is intended, since it
        # may influence the measured deltas.
        result = memory_func(
            read_table,
            filename,
            filetype
        )
        # Keep only the MB figure; the loaded object itself is not recorded.
        results.append({
            'filename': filename,
            'filetype': filetype,
            'memory_mb': result[1]
        })
    all_tests.append(results)

# all_tests is a list of rounds; each round is a list of per-format dicts.
with open('memtest.json', 'w', encoding='utf8') as jfile:
    json.dump(all_tests, jfile, indent=4)

read_table: Consumed: 4020.203125 MB.
read_table: Consumed: 2030.265625 MB.
read_table: Consumed: 1888.1171875 MB.
read_table: Consumed: 4271.74609375 MB.
read_table: Consumed: 3899.22265625 MB.
read_table: Consumed: 1985.12109375 MB.
read_table: Consumed: 1886.921875 MB.
read_table: Consumed: 4271.0078125 MB.
read_table: Consumed: 3440.7578125 MB.
read_table: Consumed: 1940.53515625 MB.
read_table: Consumed: 1887.05859375 MB.
read_table: Consumed: 4268.62890625 MB.
read_table: Consumed: 3849.2734375 MB.
read_table: Consumed: 1953.89453125 MB.
read_table: Consumed: 1887.4296875 MB.
read_table: Consumed: 4264.09375 MB.
read_table: Consumed: 3441.01953125 MB.
read_table: Consumed: 1940.30078125 MB.
read_table: Consumed: 1886.91015625 MB.
read_table: Consumed: 4268.81640625 MB.
read_table: Consumed: 3442.91796875 MB.
read_table: Consumed: 1984.578125 MB.
read_table: Consumed: 1887.26953125 MB.
read_table: Consumed: 4267.70703125 MB.
read_table: Consumed: 3896.89453125 MB.
read_table: Cons