In [33]:
import pyarrow.parquet as parquet
import pyarrow.feather as feather
import pyarrow.orc as orc
import pyarrow.csv as csv
import pyarrow as pa
from pathlib import Path
import gc
import time
import os
import psutil
import json
from timeit import default_timer as timer
from datetime import timedelta
import pandas as pd

# Dataset name and the locations the benchmark reads from / writes to.
# Per-format output files are not listed here: the benchmark loop derives them
# as `<compressed_folder>/<reddit_place>.<filetype>.<compression>`.
reddit_place = '2022_place_canvas_history'
original_csv_file = f'data/small/day1/original/{reddit_place}.csv'
compressed_folder = 'data/small/day1/compressed'

# Single source of truth: each filetype mapped to the codecs tried for it.
# Insertion order is the order in which the benchmark runs.
_compressions_by_filetype = {
    'parquet': ['zstd', 'lz4', 'brotli', 'gzip', 'snappy', 'none'],
    'orc': ['zlib', 'snappy', 'lz4', 'zstd', 'uncompressed'],
    'feather': ['lz4', 'zstd', 'uncompressed'],
    'csv': ["infer", "gzip", "bz2", "zip", "xz", "zstd"],
}

# Named views onto the mapping (the very same list objects).
feather_compressions = _compressions_by_filetype['feather']
orc_compressions = _compressions_by_filetype['orc']
parquet_compressions = _compressions_by_filetype['parquet']
csv_compressions = _compressions_by_filetype['csv']

# Parallel sequences: compression_lists[k] holds the codecs for filetypes[k].
filetypes = list(_compressions_by_filetype)
compression_lists = list(_compressions_by_filetype.values())


def time_func(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and measure how long the call takes.

    The raw ``timer`` readings taken before and after the call, and the
    elapsed time formatted as a ``timedelta``, are printed.

    Args:
        func: Callable to run. It does not need a ``__name__`` attribute
            (``functools.partial`` objects and callable instances have none);
            ``repr(func)`` is used as the printed label in that case.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        tuple: ``(result, total_in_seconds)`` where ``result`` is whatever
        ``func`` returned and ``total_in_seconds`` is the elapsed time as a
        float.
    """
    # Resolve the label before starting the clock so a missing __name__ cannot
    # abort the measurement with an AttributeError.
    label = getattr(func, '__name__', repr(func))

    start = timer()
    print(f'Starting {label}: [{start}]')
    result = func(*args, **kwargs)
    end = timer()
    print(f'Done {label}: [{end}]')
    total_in_seconds = end - start
    print(f'Total time: {timedelta(seconds=total_in_seconds)}')

    return result, total_in_seconds


def write_and_compress_file(df, filepath, filetype, compression):
    """Write ``df`` to ``filepath`` in the requested format and compression.

    Args:
        df: pandas DataFrame to serialize.
        filepath: Destination path of the output file.
        filetype: One of ``'csv'``, ``'parquet'``, ``'orc'`` or ``'feather'``.
        compression: Codec name, passed unchanged as the ``compression``
            argument of the writer for the chosen format.

    Raises:
        ValueError: If ``filetype`` is not one of the supported formats.
            Without this, a typo would silently write nothing.
    """
    if filetype == 'csv':
        # NOTE(review): to_csv writes the DataFrame index as an extra column by
        # default, so the CSV may carry one more column than `df` -- confirm
        # this is intended for the size/time comparison.
        df.to_csv(filepath, compression=compression)
    elif filetype == 'parquet':
        table = pa.Table.from_pandas(df)
        parquet.write_table(table, filepath, compression=compression)
    elif filetype == 'orc':
        table = pa.Table.from_pandas(df)
        orc.write_table(table, filepath, compression=compression)
    elif filetype == 'feather':
        feather.write_feather(df, filepath, compression=compression)
    else:
        raise ValueError(f'Unsupported filetype: {filetype!r}')
    

def load_data(filepath, filetype, compression='infer'):
    """Read the entire file into a pandas DataFrame and return its shape.

    Args:
        filepath: Path of the file to read.
        filetype: One of ``'parquet'``, ``'orc'``, ``'feather'`` or ``'csv'``.
        compression: Only used for ``'csv'``, where it is passed to
            ``pd.read_csv``; the other readers are called without it.

    Returns:
        tuple: ``df.shape`` of the loaded DataFrame, i.e. ``(rows, columns)``.

    Raises:
        ValueError: If ``filetype`` is not one of the supported formats
            (previously this surfaced as an ``UnboundLocalError`` on ``df``).
    """
    if filetype == 'parquet':
        df = parquet.read_table(filepath).to_pandas()
    elif filetype == 'orc':
        df = orc.read_table(filepath).to_pandas()
    elif filetype == 'feather':
        df = feather.read_feather(filepath)
    elif filetype == 'csv':
        df = pd.read_csv(filepath, compression=compression)
    else:
        raise ValueError(f'Unsupported filetype: {filetype!r}')

    return df.shape

In [35]:
# Parse the original CSV with pyarrow's CSV reader (`csv` is pyarrow.csv here)
# and convert the table to the pandas DataFrame used by the benchmark cell.
df = csv.read_csv(original_csv_file).to_pandas()
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,timestamp,user_id,pixel_color,coordinate,conv_timestamp
0,2022-04-01 15:38:01.116 UTC,WYFuP/nwVCIBrw5XOVYKsHyo/fJpOJcIXCm75iKLqEz92B...,#000000,191662,2022-04-01 15:38:01.116000+00:00
1,2022-04-01 15:38:01.124 UTC,1Fie0j8msAiBmD5+NfV4SdY6ilMDHV9XJc6zWdhgAyrc8H...,#000000,23918,2022-04-01 15:38:01.124000+00:00
2,2022-04-01 15:38:01.13 UTC,qvqMCBzdQyIL1ET+iZvXilEjCrt7cjq3oPG3uUad8tWi1X...,#FFFFFF,722727,2022-04-01 15:38:01.130000+00:00
3,2022-04-01 15:38:01.131 UTC,TGLWFhne7tzE8iBhUWm7K3m6SrH+2xBl599XMhdtvV6RqV...,#FF4500,64546,2022-04-01 15:38:01.131000+00:00
4,2022-04-01 15:38:01.134 UTC,C1E2rtkIcBP+omujs+YcSHfdWfCBpcxq6uIQpSlekwfyIq...,#FFFFFF,90483,2022-04-01 15:38:01.134000+00:00


[('parquet', 'zstd'), ('parquet', 'lz4'), ('parquet', 'brotli'), ('parquet', 'gzip'), ('parquet', 'snappy'), ('parquet', 'none'), ('orc', 'zlib'), ('orc', 'snappy'), ('orc', 'lz4'), ('orc', 'zstd'), ('orc', 'uncompressed'), ('feather', 'lz4'), ('feather', 'zstd'), ('feather', 'uncompressed'), ('csv', 'infer'), ('csv', 'gzip'), ('csv', 'bz2'), ('csv', 'zip'), ('csv', 'xz'), ('csv', 'zstd')]


In [36]:
import itertools

# Number of complete passes over every (filetype, compression) pair; each pass
# contributes one list of result dicts to `tests`.
N_RUNS = 10

# Pair every filetype with each of its codecs, keeping the order given by
# `filetypes` / `compression_lists`.
combinations = []
for filetype, compression_list in zip(filetypes, compression_lists):
    filetype_with_compressions = itertools.product([filetype], compression_list)
    combinations.extend(filetype_with_compressions)

# Create the output folder up front so the first write does not fail on a
# fresh checkout where the folder does not exist yet.
Path(compressed_folder).mkdir(parents=True, exist_ok=True)

tests = []
for _run in range(N_RUNS):
    results = []
    for filetype, compression in combinations:
        filepath = Path(
            compressed_folder,
            f'{reddit_place}.{filetype}.{compression}'
        )
        # Remove any file left by a previous pass so the size measured below
        # belongs to the write performed in this iteration.
        if filepath.is_file():
            filepath.unlink()

        print(filepath)

        _, write_time_in_seconds = time_func(
            write_and_compress_file,
            df=df,
            filepath=filepath,
            filetype=filetype,
            compression=compression
        )
        print(filetype, compression)

        filesize_in_bytes = filepath.stat().st_size

        # Only the read time is recorded; the shape returned by load_data is
        # not part of the results.
        _, read_time_in_seconds = time_func(
            load_data,
            filepath=filepath,
            filetype=filetype,
            compression=compression
        )

        results.append({
            'filename': str(filepath),
            'filetype': filetype,
            'compression': compression,
            'read_time_in_seconds': read_time_in_seconds,
            'write_time_in_seconds': write_time_in_seconds,
            'filesize_in_bytes': filesize_in_bytes
        })

    tests.append(results)


# Persist all passes: a list (one entry per pass) of lists of result dicts.
with open('compression_test.json', 'w', encoding='utf8') as jfile:
    json.dump(tests, jfile, indent=4)


data/small/day1/compressed/2022_place_canvas_history.parquet.zstd
Starting write_and_compress_file: [385373.634733369]
Done write_and_compress_file: [385381.681716145]
Total time: 0:00:08.046983
parquet zstd
Starting load_data: [385381.6819281]
Done load_data: [385392.186927236]
Total time: 0:00:10.504999
data/small/day1/compressed/2022_place_canvas_history.parquet.lz4
Starting write_and_compress_file: [385392.203377546]
Done write_and_compress_file: [385398.173112286]
Total time: 0:00:05.969735
parquet lz4
Starting load_data: [385398.173208044]
Done load_data: [385408.233982462]
Total time: 0:00:10.060774
data/small/day1/compressed/2022_place_canvas_history.parquet.brotli
Starting write_and_compress_file: [385408.245223362]
Done write_and_compress_file: [385525.087129229]
Total time: 0:01:56.841906
parquet brotli
Starting load_data: [385525.087338853]
Done load_data: [385538.551521821]
Total time: 0:00:13.464183
data/small/day1/compressed/2022_place_canvas_history.parquet.gzip
Startin