In [1]:

import pyarrow.parquet as parquet
import pyarrow.feather as feather
import pyarrow.orc as orc
import pyarrow.csv as csv
import pyarrow as pa
import pandas as pd
from timeit import default_timer as timer
from datetime import timedelta

# Base file name (without extension) shared by all four on-disk formats.
reddit_place = '2022_place_canvas_history'
# Directory holding the sample files; kept in one place so that pointing the
# benchmark at a different sample only needs a single edit.
data_dir = 'data/small/day1'
# One path per file format under comparison.
parquet_file = f'{data_dir}/{reddit_place}.parquet'
feather_file = f'{data_dir}/{reddit_place}.feather'
orc_file = f'{data_dir}/{reddit_place}.orc'
csv_file = f'{data_dir}/{reddit_place}.csv'


def time_func(func, *args, **kwargs):
    """Call ``func`` once and measure how long the call takes.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, seconds)`` tuple: whatever ``func`` returned, and the
        difference between the ``timer()`` readings taken right before and
        right after the call.
    """
    started_at = timer()
    outcome = func(*args, **kwargs)
    elapsed_seconds = timer() - started_at

    return outcome, elapsed_seconds


def read_pyarrow_to_pandas(filename, filetype):
    """Read a file with pyarrow, convert it to a pandas DataFrame, return its shape.

    Args:
        filename: Location of the file, handed unchanged to the pyarrow reader.
        filetype: Which reader to use: 'parquet', 'orc', 'feather' or 'csv'.

    Returns:
        The ``shape`` attribute of the loaded DataFrame. The frame itself is
        discarded, so only the read + conversion cost is exercised.

    Raises:
        ValueError: If ``filetype`` is not one of the supported values.
    """
    if filetype == 'parquet':
        df = parquet.read_table(filename).to_pandas()
    elif filetype == 'orc':
        df = orc.read_table(filename).to_pandas()
    elif filetype == 'feather':
        # read_feather already yields a DataFrame, hence no to_pandas() here.
        df = feather.read_feather(filename)
    elif filetype == 'csv':
        df = csv.read_csv(filename).to_pandas()
    else:
        # Without this branch an unknown type fell through to `df.shape` and
        # failed with an UnboundLocalError that hid the real cause.
        raise ValueError(
            f"Unsupported filetype {filetype!r}; "
            "expected 'parquet', 'orc', 'feather' or 'csv'"
        )

    return df.shape


def read_pandas(filename, filetype):
    """Read a file straight into a pandas DataFrame and return its shape.

    Args:
        filename: Location of the file, handed unchanged to the pandas reader.
        filetype: Which reader to use: 'parquet', 'orc', 'feather' or 'csv'.

    Returns:
        The ``shape`` attribute of the loaded DataFrame. The frame itself is
        discarded, so only the read cost is exercised.

    Raises:
        ValueError: If ``filetype`` is not one of the supported values.
    """
    if filetype == 'parquet':
        df = pd.read_parquet(filename)
    elif filetype == 'orc':
        df = pd.read_orc(filename)
    elif filetype == 'feather':
        df = pd.read_feather(filename)
    elif filetype == 'csv':
        df = pd.read_csv(filename)
    else:
        # Without this branch an unknown type fell through to `df.shape` and
        # failed with an UnboundLocalError that hid the real cause.
        raise ValueError(
            f"Unsupported filetype {filetype!r}; "
            "expected 'parquet', 'orc', 'feather' or 'csv'"
        )

    return df.shape


def time_files_with_func(func, filenames, filetypes):
    """Time ``func`` once for each (filename, filetype) pair.

    ``filenames`` and ``filetypes`` are walked in lockstep with ``zip``; each
    pair is passed to ``func`` through ``time_func``.

    Args:
        func: Reader called as ``func(filename, filetype)``.
        filenames: Iterable of file locations.
        filetypes: Iterable of file type labels, paired by position.

    Returns:
        A list with one dict per pair, holding the function's name, the
        filename, the filetype, the value ``func`` returned and the elapsed
        time in seconds.
    """
    def _timed_record(path, kind):
        # time_func hands back (return value, elapsed seconds).
        returned, seconds = time_func(func, path, kind)
        return {
            'function_name': func.__name__,
            'filename': path,
            'filetype': kind,
            'function_return': returned,
            'time_in_seconds': seconds
        }

    return [_timed_record(path, kind) for path, kind in zip(filenames, filetypes)]

In [None]:
# Bare reference to the function (no call): evaluating this cell only displays
# the function object; it does not read any file.
feather.read_table

In [5]:
# Build a FeatherReader for the Feather sample file, with memory mapping and
# threading both switched on through keyword arguments.
# NOTE(review): `_feather` is underscore-prefixed, so presumably a private
# pyarrow module whose signature may change between releases — confirm, and
# consider the public `pyarrow.feather` functions instead.
from pyarrow import _feather
reader = _feather.FeatherReader(feather_file, use_memory_map=True, use_threads=True)

In [10]:

# Bare reference to the function (no call).
# NOTE(review): the recorded output for this cell is a pyarrow.Table, which a
# bare function reference would not produce — presumably the cell was edited
# after it last ran; confirm and re-run.
orc.read_table

pyarrow.Table
timestamp: string
user_id: string
pixel_color: string
coordinate: string
conv_timestamp: timestamp[ns, tz=UTC]
__index_level_0__: int64
----
timestamp: [["2022-04-01 15:38:01.116 UTC","2022-04-01 15:38:01.124 UTC","2022-04-01 15:38:01.13 UTC","2022-04-01 15:38:01.131 UTC","2022-04-01 15:38:01.134 UTC","2022-04-01 15:38:01.135 UTC","2022-04-01 15:38:01.136 UTC","2022-04-01 15:38:01.139 UTC","2022-04-01 15:38:01.14 UTC","2022-04-01 15:38:01.14 UTC",...,"2022-04-01 15:41:56.974 UTC","2022-04-01 15:41:56.982 UTC","2022-04-01 15:41:56.989 UTC","2022-04-01 15:41:56.989 UTC","2022-04-01 15:41:56.993 UTC","2022-04-01 15:41:57.005 UTC","2022-04-01 15:41:57.008 UTC","2022-04-01 15:41:57.009 UTC","2022-04-01 15:41:57.011 UTC","2022-04-01 15:41:57.031 UTC"],["2022-04-01 15:41:57.034 UTC","2022-04-01 15:41:57.035 UTC","2022-04-01 15:41:57.039 UTC","2022-04-01 15:41:57.077 UTC","2022-04-01 15:41:57.078 UTC","2022-04-01 15:41:57.08 UTC","2022-04-01 15:41:57.08 UTC","2022-04-01 15:41:57.