In [15]:
import pandas as pd
from timeit import default_timer as timer
from datetime import timedelta

feather_compressions = ['lz4', 'zstd', 'uncompressed']
orc_compressions = ['zlib', 'snappy', 'lz4', 'zstd', 'uncompressed']
parquet_compressions = ['zstd', 'lz4', 'brotli', 'gzip', 'snappy', 'none']
not_supported_parquet = ['lzo']
csv_compressions = ["infer", "gzip", "bz2", "zip", "xz", "zstd"]

reddit_place = '2022_place_canvas_history'

def time_func(func, *args, **kwargs):
    start = timer()
    print(f'Starting {func.__name__}: [{start}]')
    result = func(*args, **kwargs)
    end = timer()
    print(f'Done {func.__name__}: [{end}]')
    total_in_seconds = end - start
    print(f'Total time: {timedelta(seconds=total_in_seconds)}')

    return result, total_in_seconds

In [None]:
#csv compression
df = pd.read_csv(f'data/small/day1/{reddit_place}.csv')
for compression in csv_compressions:
    print(f'{compression}: is running')
    time_func(
        df.to_csv,
        f'data/compressed/csv/{reddit_place}.csv.{compression}',
        compression=compression
    )
    print('=' * 10)

infer: is running
Starting to_csv: [178240.203605503]
Done to_csv: [178276.681595687]
Total time: 0:00:36.477990
gzip: is running
Starting to_csv: [178276.681685619]
Done to_csv: [178398.638667101]
Total time: 0:02:01.956981
bz2: is running
Starting to_csv: [178398.63887105]
Done to_csv: [178551.237841496]
Total time: 0:02:32.598970
zip: is running
Starting to_csv: [178551.237934163]
Done to_csv: [178645.17216472]
Total time: 0:01:33.934231
xz: is running
Starting to_csv: [178645.172416969]
Done to_csv: [179435.09174904]
Total time: 0:13:09.919332
zstd: is running
Starting to_csv: [179435.09182658]


ImportError: Missing optional dependency 'zstandard'.  Use pip or conda to install zstandard.

In [None]:
import lz4.frame
def compress_csv_lz4(i__file, o__file):
    with open(i__file, 'rb') as infile:
        with open(o__file, 'wb') as outfile:
            outfile.write(lz4.frame.compress(infile.read()))

infile = f'data/small/day1/{reddit_place}.csv'
outfile = f'data/compressed/csv/{reddit_place}.csv.lz4'
time_func(
    compress_csv_lz4,
    infile,
    outfile
)

Starting compress_csv_lz4: [177749.424577288]
Done compress_csv_lz4: [177756.614378102]
Total time: 0:00:07.189801


(None, 7.189800813997863)

In [None]:
time_func(
        df.to_csv,
        f'data/compressed/csv/{reddit_place}.csv.zstd',
        compression='zstd'
    )

Starting to_csv: [185218.0175044]
Done to_csv: [185269.613355241]
Total time: 0:00:51.595851


(None, 51.59585084099672)

In [8]:
# Compression testing with feather
import pyarrow.feather as feather
a_table = feather.read_table(f'data/small/day1/{reddit_place}.feather')
for compression in feather_compressions:
    print(f'{compression}: is running')
    time_func(
        feather.write_feather,
        df=a_table,
        dest=f'data/compressed/feather/{reddit_place}.feather.{compression}',
        compression=compression
    )
    print('=' * 10)

lz4: is running
Starting write_feather: [186150.613422681]
Done write_feather: [186151.594831775]
Total time: 0:00:00.981409
zstd: is running
Starting write_feather: [186151.594929562]
Done write_feather: [186153.285618917]
Total time: 0:00:01.690689
uncompressed: is running
Starting write_feather: [186153.285709172]
Done write_feather: [186154.59066106]
Total time: 0:00:01.304952


In [14]:
time_func(
    func=parquet.write_table,
    table=a_table,
    where=f'data/compressed/parquet/{reddit_place}.parquet.none',
    compression='none'
)

Starting write_table: [187721.465816396]
Done write_table: [187726.729342636]
Total time: 0:00:05.263526


(None, 5.263526240014471)

In [16]:
import pyarrow.parquet as parquet
a_table = parquet.read_table(f'data/small/day1/{reddit_place}.parquet')
for compression in parquet_compressions:
    print(f'{compression}: is running')
    time_func(
        func=parquet.write_table,
        table=a_table,
        where=f'data/compressed/parquet/{reddit_place}.parquet.{compression}',
        compression=compression
    )
    print('=' * 10)

zstd: is running
Starting write_table: [187824.634069246]
Done write_table: [187831.74502341]
Total time: 0:00:07.110954
lz4: is running
Starting write_table: [187831.745113997]
Done write_table: [187837.883311435]
Total time: 0:00:06.138197
brotli: is running
Starting write_table: [187837.883515929]
Done write_table: [187959.555333221]
Total time: 0:02:01.671817
gzip: is running
Starting write_table: [187959.555432089]
Done write_table: [188014.497124237]
Total time: 0:00:54.941692
snappy: is running
Starting write_table: [188014.497222626]
Done write_table: [188020.138380986]
Total time: 0:00:05.641158
none: is running
Starting write_table: [188020.138639426]
Done write_table: [188025.436030188]
Total time: 0:00:05.297391


In [17]:
import pyarrow.orc as orc
a_table = orc.read_table(f'data/small/day1/{reddit_place}.orc')
for compression in orc_compressions:
    print(f'{compression}: is running')
    time_func(
        orc.write_table,
        table=a_table,
        where=f'data/compressed/orc/{reddit_place}.orc.{compression}',
        compression=compression
    )
    print('=' * 10)


zlib: is running
Starting write_table: [188079.066585959]
Done write_table: [188109.002437225]
Total time: 0:00:29.935851
snappy: is running
Starting write_table: [188109.002525224]
Done write_table: [188112.797070755]
Total time: 0:00:03.794546
lz4: is running
Starting write_table: [188112.79717046]
Done write_table: [188116.239223794]
Total time: 0:00:03.442053
zstd: is running
Starting write_table: [188116.23931164]
Done write_table: [188121.235151992]
Total time: 0:00:04.995840
uncompressed: is running
Starting write_table: [188121.235246894]
Done write_table: [188124.640642752]
Total time: 0:00:03.405396
