In [14]:
from pathlib import Path
import pyarrow.parquet as parquet
import pyarrow.feather as feather
import pyarrow.orc as orc
import pyarrow.csv as csv
from timeit import default_timer as timer
from datetime import timedelta
import pandas as pd
import os

# Sample file whose name is split into its "<name>.<filetype>.<compression>" parts.
my_path = Path(
    '/home/mimslade/dev/projects/file-bench/data/small/day1/compressed/2022_place_canvas_history.feather.lz4'
)
filename = my_path.name
name, filetype, compression = filename.split('.')

# Directory (relative to the working directory) that the benchmark loop scans.
compressed_path = Path('data/small/day1/compressed')

def time_func(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` once and report how long it took.

    The elapsed time is measured with ``timeit.default_timer`` and printed
    as a ``timedelta`` on a ``Total time: ...`` line.

    Returns:
        A ``(result, elapsed_seconds)`` tuple: whatever ``func`` returned,
        followed by the elapsed time in seconds as a float.
    """
    began = timer()
    outcome = func(*args, **kwargs)
    elapsed = timer() - began
    print(f'Total time: {timedelta(seconds=elapsed)}')

    return outcome, elapsed

def load_data(filepath, filetype, compression='infer'):
    """Read an entire file into a pandas DataFrame and return its shape.

    Args:
        filepath: Path of the file to read.
        filetype: One of ``'parquet'``, ``'orc'``, ``'feather'`` or ``'csv'``;
            selects which reader is used.
        compression: Forwarded to ``pandas.read_csv`` for CSV files only; the
            other readers are called without it.

    Returns:
        The ``shape`` of the loaded DataFrame, i.e. ``(rows, columns)``.

    Raises:
        ValueError: If ``filetype`` is not one of the supported formats.
    """
    if filetype == 'parquet':
        df = parquet.read_table(filepath).to_pandas()
    elif filetype == 'orc':
        df = orc.read_table(filepath).to_pandas()
    elif filetype == 'feather':
        df = feather.read_feather(filepath)
    elif filetype == 'csv':
        df = pd.read_csv(filepath, compression=compression)
    else:
        # Without this branch an unknown type fell through to `df.shape`
        # and failed with an UnboundLocalError that hid the real cause.
        raise ValueError(f'Unsupported filetype: {filetype!r}')

    return df.shape

# Load every file in the compressed-data directory once, timing each full
# read and collecting one record per file.
results = []
for filepath in compressed_path.iterdir():
    # File names are split as "<name>.<filetype>.<compression>".
    filename = filepath.name
    _, filetype, compression = filename.split('.')

    result, time_in_seconds = time_func(load_data, filepath, filetype, compression)

    results.append(dict(
        func=load_data.__name__,
        func_return=result,
        filename=str(filepath),
        filetype=filetype,
        compression=compression,
        time_in_seconds=time_in_seconds,
    ))
    print(filepath)


Total time: 0:00:10.893156
data/small/day1/compressed/2022_place_canvas_history.orc.zstd
Total time: 0:00:10.185488
data/small/day1/compressed/2022_place_canvas_history.orc.snappy
Total time: 0:00:11.278144
data/small/day1/compressed/2022_place_canvas_history.parquet.snappy
Total time: 0:00:13.189784
data/small/day1/compressed/2022_place_canvas_history.parquet.brotli
Total time: 0:00:28.476084
data/small/day1/compressed/2022_place_canvas_history.csv.zip
Total time: 0:00:09.619341
data/small/day1/compressed/2022_place_canvas_history.orc.uncompressed
Total time: 0:00:24.490429
data/small/day1/compressed/2022_place_canvas_history.csv.zstd
Total time: 0:00:10.377279
data/small/day1/compressed/2022_place_canvas_history.parquet.none
Total time: 0:00:11.740333
data/small/day1/compressed/2022_place_canvas_history.parquet.lz4
Total time: 0:00:11.745824
data/small/day1/compressed/2022_place_canvas_history.parquet.zstd
Total time: 0:00:08.226005
data/small/day1/compressed/2022_place_canvas_histor