In [4]:
import pyarrow.parquet as parquet
import pyarrow.feather as feather
import pyarrow.orc as orc
import pyarrow.csv as csv
import pyarrow as pa
from pathlib import Path
import gc
import time
import os
import psutil
import json
from timeit import default_timer as timer
from datetime import timedelta
import pandas as pd

# Base file name shared by the source CSV and every converted copy.
reddit_place = '2022_place_canvas_history'

# The untouched input sits under original/; the benchmark outputs are written
# one directory above it, one file per format.
original_csv_file = f'data/small/day1/original/{reddit_place}.csv'
parquet_file, feather_file, orc_file, csv_file = (
    f'data/small/day1/{reddit_place}.{extension}'
    for extension in ('parquet', 'feather', 'orc', 'csv')
)


def time_func(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` once and measure how long it takes.

    Progress is printed before and after the call. The bracketed numbers are
    raw readings of ``timeit.default_timer``; only their difference is
    meaningful, they are not wall-clock timestamps.

    Args:
        func: Any callable. Objects without a ``__name__`` attribute (for
            example ``functools.partial`` objects or callable instances) are
            accepted and labelled with their ``repr``.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, total_in_seconds)`` tuple: whatever ``func`` returned and
        the elapsed time as a float number of seconds.

    Raises:
        Whatever ``func`` raises; no timing summary is printed in that case.
    """
    # Not every callable carries __name__; resolve the label before starting
    # the clock so the lookup can neither fail mid-run nor count as run time.
    label = getattr(func, '__name__', repr(func))

    start = timer()
    print(f'Starting {label}: [{start}]')
    result = func(*args, **kwargs)
    end = timer()
    print(f'Done {label}: [{end}]')
    total_in_seconds = end - start
    print(f'Total time: {timedelta(seconds=total_in_seconds)}')

    return result, total_in_seconds

def write_file(df, filename, filetype):
    """Write a pandas DataFrame to ``filename`` in the requested format.

    Args:
        df: The pandas DataFrame to serialise.
        filename: Destination path of the output file.
        filetype: One of ``'csv'``, ``'parquet'``, ``'orc'`` or ``'feather'``.

    Raises:
        ValueError: If ``filetype`` is not one of the supported formats.
            Previously an unknown value was silently ignored and no file was
            written.
    """
    supported_filetypes = ('csv', 'parquet', 'orc', 'feather')
    if filetype not in supported_filetypes:
        raise ValueError(
            f'Unsupported filetype {filetype!r}; '
            f'expected one of {supported_filetypes}'
        )

    # Feather is handed the DataFrame as-is; the other writers are given an
    # Arrow table built from it.
    if filetype == 'feather':
        feather.write_feather(df, filename)
        return

    table = pa.Table.from_pandas(df)
    if filetype == 'csv':
        csv.write_csv(table, filename)
    elif filetype == 'parquet':
        parquet.write_table(table, filename)
    else:  # 'orc' is the only remaining supported value
        orc.write_table(table, filename)

In [5]:
# Load the original CSV into a pandas DataFrame, then write it back out once
# per format, recording the elapsed time and the resulting file size.
df = csv.read_csv(original_csv_file).to_pandas()
filenames = [parquet_file, orc_file, feather_file, csv_file]
filetypes = ['parquet', 'orc', 'feather', 'csv']
results = []

for filename, filetype in zip(filenames, filetypes):
    # Remove any output left by a previous run before writing this one.
    filepath = Path(filename)
    if filepath.is_file():
        filepath.unlink()

    _, time_in_seconds = time_func(write_file, df, filename, filetype)

    # Size on disk of the file that was just written.
    filesize_in_bytes = filepath.stat().st_size

    results.append(dict(
        func=write_file.__name__,
        filename=filename,
        filetype=filetype,
        time_in_seconds=time_in_seconds,
        filesize_in_bytes=filesize_in_bytes,
    ))


Starting write_file: [365820.358387021]
Done write_file: [365826.639719372]
Total time: 0:00:06.281332
Starting write_file: [365826.712039057]
Done write_file: [365833.318500318]
Total time: 0:00:06.606461
Starting write_file: [365833.38929911]
Done write_file: [365836.978290953]
Total time: 0:00:03.588992
Starting write_file: [365836.978650096]
Done write_file: [365855.562975101]
Total time: 0:00:18.584325


In [6]:
# Display the per-format write measurements collected by the benchmark loop:
# one dict per file with its write time in seconds and its size in bytes.
results

[{'func': 'write_file',
  'filename': 'data/small/day1/2022_place_canvas_history.parquet',
  'filetype': 'parquet',
  'time_in_seconds': 6.2813323509762995,
  'filesize_in_bytes': 1346219026},
 {'func': 'write_file',
  'filename': 'data/small/day1/2022_place_canvas_history.orc',
  'filetype': 'orc',
  'time_in_seconds': 6.606461260991637,
  'filesize_in_bytes': 1618000279},
 {'func': 'write_file',
  'filename': 'data/small/day1/2022_place_canvas_history.feather',
  'filetype': 'feather',
  'time_in_seconds': 3.588991842989344,
  'filesize_in_bytes': 1551257762},
 {'func': 'write_file',
  'filename': 'data/small/day1/2022_place_canvas_history.csv',
  'filetype': 'csv',
  'time_in_seconds': 18.584325005009305,
  'filesize_in_bytes': 2218122514}]