In [5]:
import json
from datetime import timedelta
from timeit import default_timer as timer

import pandas as pd
import pyarrow as pa
import pyarrow.csv as csv
import pyarrow.feather as feather
import pyarrow.orc as orc
import pyarrow.parquet as parquet

# Extension-less base name that every benchmark input file shares.
reddit_place = '2022_place_canvas_history'

# One path per file format, all built from the same base name.
parquet_file = 'data/small/day1/{}.parquet'.format(reddit_place)
feather_file = 'data/small/day1/{}.feather'.format(reddit_place)
orc_file = 'data/small/day1/{}.orc'.format(reddit_place)
csv_file = 'data/small/day1/{}.csv'.format(reddit_place)


def time_func(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` once and measure how long it took.

    Returns a ``(result, seconds)`` tuple: whatever ``func`` returned, and the
    difference between two ``timer()`` readings taken immediately before and
    immediately after the call. Exceptions raised by ``func`` propagate.
    """
    started_at = timer()
    outcome = func(*args, **kwargs)
    elapsed = timer() - started_at
    return outcome, elapsed


def read_pyarrow_to_pandas(filename, filetype):
    """Read ``filename`` with pyarrow, convert to pandas and return the shape.

    Args:
        filename: Path of the file to read.
        filetype: One of ``'parquet'``, ``'orc'``, ``'feather'`` or ``'csv'``;
            selects which pyarrow reader is used.

    Returns:
        The ``shape`` of the resulting pandas DataFrame.

    Raises:
        ValueError: If ``filetype`` is not one of the supported values.
    """
    if filetype == 'parquet':
        df = parquet.read_table(filename).to_pandas()
    elif filetype == 'orc':
        df = orc.read_table(filename).to_pandas()
    elif filetype == 'feather':
        # read_feather hands back a pandas DataFrame directly, so there is no
        # separate to_pandas() step on this branch.
        df = feather.read_feather(filename)
    elif filetype == 'csv':
        df = csv.read_csv(filename).to_pandas()
    else:
        # An unknown filetype used to fall through and fail on `df.shape`
        # with an UnboundLocalError; report the real problem instead.
        raise ValueError(
            f"Unsupported filetype {filetype!r}; "
            "expected one of 'parquet', 'orc', 'feather', 'csv'"
        )

    return df.shape


def read_pandas(filename, filetype):
    """Read ``filename`` with the matching pandas reader and return the shape.

    Args:
        filename: Path of the file to read.
        filetype: One of ``'parquet'``, ``'orc'``, ``'feather'`` or ``'csv'``;
            selects which ``pd.read_*`` function is used.

    Returns:
        The ``shape`` of the loaded pandas DataFrame.

    Raises:
        ValueError: If ``filetype`` is not one of the supported values.
    """
    if filetype == 'parquet':
        df = pd.read_parquet(filename)
    elif filetype == 'orc':
        df = pd.read_orc(filename)
    elif filetype == 'feather':
        df = pd.read_feather(filename)
    elif filetype == 'csv':
        df = pd.read_csv(filename)
    else:
        # An unknown filetype used to fall through and fail on `df.shape`
        # with an UnboundLocalError; report the real problem instead.
        raise ValueError(
            f"Unsupported filetype {filetype!r}; "
            "expected one of 'parquet', 'orc', 'feather', 'csv'"
        )

    return df.shape


def time_files_with_func(func, filenames, filetypes):
    """Time ``func`` once for each ``(filename, filetype)`` pair.

    ``filenames`` and ``filetypes`` are walked in lockstep with ``zip``, so
    the n-th filename is read using the n-th filetype.

    Returns a list with one dict per pair, holding the keys
    ``function_name``, ``filename``, ``filetype``, ``function_return`` and
    ``time_in_seconds``.
    """
    def _measure(path, kind):
        # time_func returns (value returned by func, elapsed seconds).
        returned, seconds = time_func(func, path, kind)
        return {
            'function_name': func.__name__,
            'filename': path,
            'filetype': kind,
            'function_return': returned,
            'time_in_seconds': seconds
        }

    return [_measure(path, kind) for path, kind in zip(filenames, filetypes)]

In [6]:
# filenames and filetypes are paired by position when they are zipped
# together, so the two lists must stay in the same order.
filetypes = ['parquet', 'orc', 'feather', 'csv']
filenames = [parquet_file, orc_file, feather_file, csv_file]

# Reader implementations to benchmark against each other.
functions_to_test = [read_pandas, read_pyarrow_to_pandas]

In [7]:
# Repeat the whole benchmark 10 times; each repetition contributes one list
# holding a timing record for every (function, file) combination.
all_results = []
for i in range(10):
    results = [
        record
        for func in functions_to_test
        for record in time_files_with_func(func, filenames, filetypes)
    ]
    all_results.append(results)

In [9]:
# Flatten the per-repetition lists into a single list of records, tagging
# each record with the index of the repetition it came from.
flat_list = [
    {'test_number': test_number, **result}
    for test_number, results in enumerate(all_results)
    for result in results
]

In [10]:
# Persist every timing record so the results can be analysed later without
# re-running the benchmark.
with open('pandas_vs_pyarrow.json', 'w', encoding='utf8') as jfile:
    json.dump(flat_list, jfile, indent=4)