In [20]:
import pandas as pd
from timeit import default_timer as timer
from datetime import timedelta
import pyarrow.parquet as parquet

def time_func(func, *args, **kwargs):
    start = timer()
    print(f'Starting {func.__name__}: [{start}]')
    result = func(*args, **kwargs)
    end = timer()
    print(f'Done {func.__name__}: [{end}]')
    total_in_seconds = end - start
    print(f'Total time: {timedelta(seconds=total_in_seconds)}')

    return result, total_in_seconds

In [37]:
from pathlib import Path
import pyarrow.orc as orc
import pyarrow.feather as feather


def load_data(path, file_type):
    for path_ in path.iterdir():
        print(f'{path_.name}: {round(path_.stat().st_size / ((1024 ** 2) * 1000), 4)} GB.')
        if file_type == 'parquet':
            result = time_func(
                parquet.read_table,
                path_
            )
        elif file_type == 'orc':
            result = time_func(
                orc.read_table,
                path_
            )
        elif file_type == 'feather':
            result = time_func(
                feather.read_table,
                path_
            )
        
        df = result[0].to_pandas()
        print(df['timestamp'].iloc[1])
        print('-' * 20)
        print('')

In [38]:
parquet_path = Path('data/compressed/parquet')
orc_path = Path('data/compressed/orc')
feather_path = Path('data/compressed/feather')
variants = [(parquet_path, 'parquet'), (orc_path, 'orc'), (feather_path, 'feather')]

for path, filetype in variants[1:]:
    print(path, filetype)
    load_data(path, filetype)

data/compressed/orc orc
2022_place_canvas_history.orc.zstd: 0.8653 GB.
Starting read_table: [191589.199898697]
Done read_table: [191591.529716015]
Total time: 0:00:02.329817
2022-04-01 15:38:01.124 UTC
--------------------

2022_place_canvas_history.orc.snappy: 1.1921 GB.
Starting read_table: [191599.122264576]
Done read_table: [191601.682755539]
Total time: 0:00:02.560491
2022-04-01 15:38:01.124 UTC
--------------------

2022_place_canvas_history.orc.uncompressed: 1.543 GB.
Starting read_table: [191609.944827194]
Done read_table: [191611.426779953]
Total time: 0:00:01.481953
2022-04-01 15:38:01.124 UTC
--------------------

2022_place_canvas_history.orc.zlib: 0.8939 GB.
Starting read_table: [191619.522831619]
Done read_table: [191625.448336506]
Total time: 0:00:05.925505
2022-04-01 15:38:01.124 UTC
--------------------

2022_place_canvas_history.orc.lz4: 1.5432 GB.
Starting read_table: [191633.494437256]
Done read_table: [191635.035566907]
Total time: 0:00:01.541130
2022-04-01 15:38:0