# Why you should always use compression

## Load movielens datasets

In [1]:
import os
import numpy as np
import pandas as pd
import tables

In [2]:
# Remove files in the working directory that match this pattern; -f keeps rm quiet when nothing matches
!rm -f movielens-*norm-*.h5

In [3]:
# Locate the MovieLens files on disk
dset = 'movielens-1m'
fdata = os.path.join(dset, 'ratings.dat.gz')
fitem = os.path.join(dset, 'movies.dat')

# Ratings: gzip-compressed, ';'-separated, column names supplied explicitly
r_cols = 'user_id movie_id rating unix_timestamp'.split()
ratings = pd.read_csv(fdata, names=r_cols, sep=';', compression='gzip')

# Movies: ';'-separated; title and genres are kept as Python objects
m_cols = 'movie_id title genres'.split()
movies = pd.read_csv(
    fitem,
    names=m_cols,
    sep=';',
    dtype=dict(title=object, genres=object),
)

In [4]:
movies.ftypes

movie_id     int64:dense
title       object:dense
genres      object:dense
dtype: object

In [5]:
ratings.ftypes

user_id           int64:dense
movie_id          int64:dense
rating            int64:dense
unix_timestamp    int64:dense
dtype: object

## Storing in HDF5/PyTables in compressed form

In [6]:
import os
import shutil

# Start from an empty output directory: drop whatever a previous run left
# behind, then create it again.
outdir = "compression"
if os.path.exists(outdir):
    shutil.rmtree(outdir)
os.mkdir(outdir)

In [7]:
def to_hdf5(ratings, movies, filename, filters):
    """Write the ratings and movies DataFrames to a new PyTables/HDF5 file.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Its columns are appended positionally to the ``ratings`` table, so
        they must be ordered as user_id, movie_id, rating, unix_timestamp.
    movies : pandas.DataFrame
        Its columns are appended positionally to the ``movies`` table, so
        they must be ordered as movie_id, title, genres.
    filename : str
        Path of the HDF5 file, opened in "w" mode.
    filters : tables.Filters
        Compression settings passed to both ``create_table`` calls.

    Returns
    -------
    str
        ``filename``, unchanged, so the call can be used as a cell result.
    """

    # On-disk row layout; `pos` fixes the column order inside each record.
    class Ratings(tables.IsDescription):
        user_id = tables.Int32Col(pos=0)
        movie_id = tables.Int32Col(pos=1)
        rating = tables.Int8Col(pos=2)
        unix_timestamp = tables.Int64Col(pos=3)

    class Movies(tables.IsDescription):
        movie_id = tables.Int32Col(pos=0)
        title = tables.StringCol(100, pos=1)
        genres = tables.StringCol(50, pos=2)

    # PYTABLES_SYS_ATTRS=False asks PyTables not to create its system attributes.
    with tables.open_file(filename, "w", PYTABLES_SYS_ATTRS=False) as f:
        table_ratings = f.create_table(f.root, "ratings", Ratings, filters=filters, expectedrows=len(ratings))
        # Iterate `.columns` rather than `.ftypes.keys()`: DataFrame.ftypes was
        # removed in pandas 1.0, while `.columns` yields the same names in the
        # same order on every pandas version.
        table_ratings.append([ratings[col].values for col in ratings.columns])
        table_movies = f.create_table(f.root, "movies", Movies, filters=filters, expectedrows=len(movies))
        table_movies.append([movies[col].values for col in movies.columns])
    return filename

In [40]:
filters = tables.Filters(complevel=0, complib="zlib", shuffle=True)
print(complib)
if filters.complevel != 0:
    complib = filters.complib if ":" not in filters.complib else filters.complib.replace(":", "-")
    shuffle = "shuffle" if filters.shuffle else "noshuffle"
    h5file = "compression/%s-%d-%s.h5" % (complib, filters.complevel, shuffle)
else:
    h5file = "compression/no-compressed.h5"
%time to_hdf5(ratings, movies, h5file, filters)

zlib
CPU times: user 21 ms, sys: 25.8 ms, total: 46.8 ms
Wall time: 59.7 ms


'compression/no-compressed.h5'

In [41]:
# Verbose listing of the datasets stored in the file written by the previous cell
!h5ls -v {h5file}

Opened "compression/no-compressed.h5" with sec2 driver.
movies                   Dataset {3883/Inf}
    Location:  1:14949748
    Links:     1
    Chunks:    {425} 65450 bytes
    Storage:   597982 logical bytes, 654500 allocated bytes, 91.36% utilization
    Type:      struct {
                   "movie_id"         +0    native int
                   "title"            +4    100-byte null-terminated ASCII string
                   "genres"           +104  50-byte null-terminated ASCII string
               } 154 bytes
ratings                  Dataset {1000209/Inf}
    Location:  1:800
    Links:     1
    Chunks:    {7710} 131070 bytes
    Storage:   17003553 logical bytes, 17039100 allocated bytes, 99.79% utilization
    Type:      struct {
                   "user_id"          +0    native int
                   "movie_id"         +4    native int
                   "rating"           +8    native signed char
                   "unix_timestamp"   +9    native lon

## Exercise

PyTables comes with out-of-the-box support for a series of codecs.  Do a quick comparison between "zlib", "bzip2", and "blosc" at compression levels 1 (fastest), 5 and 9 (slowest).  Which one compresses best?  Which one compresses fastest?

Also, since Blosc is a meta-compressor, it supports several internal codecs that can be selected from PyTables using the "blosc:`codec`" form.  Do another comparison between the internal Blosc codecs, namely "blosc:blosclz" (the default), "blosc:lz4", "blosc:lz4hc", "blosc:snappy", "blosc:zlib" and "blosc:zstd".

Finally, disable compression entirely (`complevel=0`).  How fast is it compared with the codecs above?

## Reading compressed datasets

In [42]:
# Collect the result files written to "compression". os.listdir returns names
# in arbitrary order, so sort them to get a reproducible benchmark order, and
# keep regular files only. A missing directory raises FileNotFoundError.
files = sorted(
    name for name in os.listdir("compression")
    if os.path.isfile(os.path.join("compression", name))
)

In [43]:
# Display the collected file names
files

['blosc-blosclz-5-shuffle.h5',
 'blosc-lz4-5-shuffle.h5',
 'blosc-lz4hc-5-shuffle.h5',
 'blosc-snappy-5-shuffle.h5',
 'blosc-zlib-5-shuffle.h5',
 'blosc-zstd-5-shuffle.h5',
 'bzip2-5-shuffle.h5',
 'no-compressed.h5',
 'zlib-5-shuffle.h5']

In [55]:
for f in files:
    print("Reading file:", f)
    with tables.open_file(os.path.join("compression", f)) as h5f:
        %time h5f.root.ratings[:]

Reading file: blosc-blosclz-5-shuffle.h5
CPU times: user 36.7 ms, sys: 10.8 ms, total: 47.4 ms
Wall time: 32.2 ms
Reading file: blosc-lz4-5-shuffle.h5
CPU times: user 29 ms, sys: 20 ms, total: 49.1 ms
Wall time: 41.1 ms
Reading file: blosc-lz4hc-5-shuffle.h5
CPU times: user 23.6 ms, sys: 18.3 ms, total: 41.9 ms
Wall time: 29.7 ms
Reading file: blosc-snappy-5-shuffle.h5
CPU times: user 6.87 ms, sys: 10.7 ms, total: 17.6 ms
Wall time: 17.8 ms
Reading file: blosc-zlib-5-shuffle.h5
CPU times: user 92.4 ms, sys: 16.8 ms, total: 109 ms
Wall time: 65.5 ms
Reading file: blosc-zstd-5-shuffle.h5
CPU times: user 57.7 ms, sys: 26.3 ms, total: 84 ms
Wall time: 58.2 ms
Reading file: bzip2-5-shuffle.h5
CPU times: user 677 ms, sys: 10.5 ms, total: 687 ms
Wall time: 692 ms
Reading file: no-compressed.h5
CPU times: user 5.96 ms, sys: 9.06 ms, total: 15 ms
Wall time: 14.8 ms
Reading file: zlib-5-shuffle.h5
CPU times: user 65.3 ms, sys: 5.56 ms, total: 70.9 ms
Wall time: 70.7 ms


## Exercise

Which codec and compression level reads the fastest?  How does it compare with reading an uncompressed dataset?

## Exercise

Blosc can use multithreading for compressing/decompressing, although it is disabled by default.  You can enable a multithreaded Blosc in a series of ways, but perhaps the easiest is to set the "BLOSC_NTHREADS" environment variable to the desired number of threads (typically the available number of cores in your computer).

Execute the cell below, then re-run the reading benchmarks and look at how the reading speed varies.  Pay special attention to the difference between the CPU times and wall times.

In [54]:
os.environ.update(BLOSC_NTHREADS="4")  # set to any other number you prefer