# Compression Report

In [None]:
import os
import json 
import subprocess

import xbitinfo as xb
import xarray as xr

---

In [None]:
# Load the input dataset that the rest of the notebook analyses and compresses.
path_to_data = 'data/data.nc'  # change this to the path of your own file
# `ds` is read as a module-level name by the cells and save functions below.
ds = xr.open_dataset(path_to_data)

In [None]:
# Get the information content per bit of `ds`, analysed along "longitude",
# using xbitinfo's implementation="python" backend.
info_per_bit = xb.get_bitinformation(ds, dim="longitude", implementation="python")
# TODO: open question from the author — confirm which `dim` should be used;
#       "longitude" is hard-coded above and may not exist in other datasets.
# NOTE(review): the author noted ipywidgets warnings at this step — confirm they are harmless.

In [None]:
# Derive the number of bits to keep from the per-bit information.
# NOTE(review): 0.99 is presumably the fraction of information to preserve —
# confirm against the xbitinfo documentation.
keepbits = xb.get_keepbits(info_per_bit, 0.99)

In [None]:
# Apply bitrounding to `ds` with the keepbits computed above; the result is
# kept separately in `ds_bitrounded` (the name `ds` is not rebound).
ds_bitrounded = xb.xr_bitround(ds, keepbits)

---

In [None]:
def save_ncdf(
    dataset: "xr.Dataset | None" = None,
    dataset_bitrounded: "xr.Dataset | None" = None,
    prefix: str = "",
) -> None:
    """
    Save dataset in NetCDF format

    Writes three files, in this order:
      * ``<prefix>bitrounded_compressed.nc`` - bitrounded data, compressed
      * ``<prefix>compressed.nc``            - original data, compressed
      * ``<prefix>original.nc``              - original data, plain ``to_netcdf``

    dataset:            dataset to write; defaults to the module-level ``ds``.
    dataset_bitrounded: bitrounded dataset to write; defaults to the
                        module-level ``ds_bitrounded``.
    prefix:             string prepended to every output file name. It may
                        contain a directory part. Note that get_size() only
                        shortens the un-prefixed file names.

    Called without arguments this behaves exactly like the original
    parameterless version.
    """
    # Fall back to the notebook globals so existing `save_ncdf()` calls keep working.
    if dataset is None:
        dataset = ds
    if dataset_bitrounded is None:
        dataset_bitrounded = ds_bitrounded

    # NOTE(review): `to_compressed_netcdf` is not a core xarray method; it is
    # presumably provided by the xbitinfo import — confirm.
    dataset_bitrounded.to_compressed_netcdf(f"{prefix}bitrounded_compressed.nc")
    dataset.to_compressed_netcdf(f"{prefix}compressed.nc")
    dataset.to_netcdf(f"{prefix}original.nc")

In [None]:
def save_zarr(
    dataset: "xr.Dataset | None" = None,
    dataset_bitrounded: "xr.Dataset | None" = None,
    prefix: str = "",
) -> None:
    """
    Save dataset in Zarr format

    Writes three stores, in this order, each opened with ``mode="w"``:
      * ``<prefix>bitrounded_compressed.zarr`` - bitrounded data, compressed
      * ``<prefix>compressed.zarr``            - original data, compressed
      * ``<prefix>original.zarr``              - original data with the
        compressor set to ``None`` for every data variable

    dataset:            dataset to write; defaults to the module-level ``ds``.
    dataset_bitrounded: bitrounded dataset to write; defaults to the
                        module-level ``ds_bitrounded``.
    prefix:             string prepended to every output store name. It may
                        contain a directory part. Note that get_size() only
                        shortens the un-prefixed store names.

    Called without arguments this behaves exactly like the original
    parameterless version.
    """
    # Fall back to the notebook globals so existing `save_zarr()` calls keep working.
    if dataset is None:
        dataset = ds
    if dataset_bitrounded is None:
        dataset_bitrounded = ds_bitrounded

    # NOTE(review): `to_compressed_zarr` is not a core xarray method; it is
    # presumably provided by the xbitinfo import — confirm.
    dataset_bitrounded.to_compressed_zarr(f"{prefix}bitrounded_compressed.zarr", mode="w")
    dataset.to_compressed_zarr(f"{prefix}compressed.zarr", mode="w")
    # Explicitly switch the compressor off per variable so this store serves
    # as the uncompressed baseline for the size comparison.
    dataset.to_zarr(
        f"{prefix}original.zarr",
        mode="w",
        encoding={v: {"compressor": None} for v in dataset.data_vars},
    )

In [None]:
def hsize(size: int, decimal_places: int=2) -> str:
    """
    Format a byte count as a short human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit ('P') is reached, then rendered with `decimal_places` decimals
    followed by a space and the unit letter, e.g. ``hsize(1536, 1) == "1.5 K"``.
    """
    units = ('B', 'K', 'M', 'G', 'T', 'P')
    last = len(units) - 1

    value = size
    idx = 0
    # `not value < 1024.0` (rather than `value >= 1024.0`) keeps the scaling
    # going for values that do not compare below the threshold at all.
    while idx < last and not value < 1024.0:
        value = value / 1024.0
        idx += 1

    suffix = units[idx]
    return f"{value:.{decimal_places}f} {suffix}"

---

In [None]:
# Write the NetCDF files and Zarr stores (bitrounded+compressed, compressed,
# original) using the relative paths hard-coded in the two save functions.
save_ncdf()
save_zarr()

---

In [None]:
def get_size(ext: str) -> dict:
    """
    Measure every ``*.<ext>`` entry in the working directory with ``du -s``.

    ext: extension without the leading dot, e.g. ``'nc'`` or ``'zarr'``.
         It is interpolated into a shell command, so pass trusted values only.

    Returns a dict mapping a short label to the integer size printed by
    ``du -s``. The known output names are shortened to ``'bitr_comp'``,
    ``'comp'`` and ``'ori'``; any other matching name is kept under its full
    name instead of raising ``KeyError``.
    NOTE(review): ``du -s`` reports disk blocks, not bytes, and the block size
    is platform dependent — confirm before comparing numbers across machines.

    Raises ``subprocess.CalledProcessError`` when ``du`` exits non-zero
    (for instance when nothing matches the glob) and ``ValueError`` when an
    output line does not have the ``<size><TAB><name>`` shape.
    """
    # The shell is needed here to expand the `*.<ext>` glob.
    listing = subprocess.check_output(f'du -s *.{ext}', shell=True).decode("utf-8")

    shortnm = {
        'bitrounded_compressed.zarr': 'bitr_comp', 'compressed.zarr': 'comp', 'original.zarr': 'ori',
        'bitrounded_compressed.nc': 'bitr_comp', 'compressed.nc': 'comp', 'original.nc': 'ori',
    }

    output = {}
    for line in listing.split('\n'):
        if not line:
            continue
        # Split on the first tab only, so names containing tabs or ';' survive.
        size, sep, name = line.partition('\t')
        if not sep:
            raise ValueError(f"unexpected du output line: {line!r}")
        # Build the result directly under its final key; the previous version
        # renamed keys while iterating over the same dict, which is unsafe.
        output[shortnm.get(name, name)] = int(size)

    return output

In [None]:
# Display the measured sizes of the *.zarr outputs.
get_size('zarr')

In [None]:
# Display the measured sizes of the *.nc outputs.
get_size('nc')

In [None]:
import numpy as np

def get_ratio(data: dict) -> dict:
    """
    Compression ratio of every entry relative to the original size.

    data: mapping of label -> size that must contain the key ``'ori'``.

    Returns a dict with the same keys, in the same order, whose values are
    ``data['ori'] / data[label]`` rounded with ``np.round`` (no decimals).
    The ``'ori'`` entry itself therefore maps to 1.0.
    """
    baseline = data['ori']
    return {label: np.round(baseline / size) for label, size in data.items()}

---

In [None]:
# Collect the measured sizes of both output formats, keyed by format name.
sdata = {
    'ncdf': get_size('nc'),
    'zarr': get_size('zarr')
}

In [None]:
# Display the collected sizes.
sdata

In [None]:
# Display the NetCDF size ratios relative to the 'ori' entry.
get_ratio(sdata['ncdf'])

In [None]:
# Same ratios as a bare sequence of values in dict order — the form the
# plotting cells zip against the bars.
get_ratio(sdata['ncdf']).values()

In [None]:
import pandas as pd

# Tabulate the sizes: one column per format ('ncdf', 'zarr'), one row per size label.
df = pd.DataFrame(sdata)
df

---

In [None]:
# Bar chart of the NetCDF output sizes; each bar is annotated with its size
# ratio relative to the original.
# TODO: factor this cell and the near-identical Zarr cell into one plotting function.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('dark')

# TODO: undecided — consider a log-scale y axis, i.e. plt.yscale('log').
plt.figure(figsize=(6,6))
bar = plt.bar(df.index, df['ncdf'].values, alpha=.8)
# TODO: switch to horizontal bars (barh).
plt.xlabel('Compression Method')
plt.ylabel('Size')
plt.title('NetCDF Compression Comparison')

# Write "X <ratio>" centred just above the top of each bar.
# NOTE(review): bars follow df.index order while the labels follow the key
# order of sdata['ncdf']; this assumes the two orders match — confirm.
for rect, lbl in zip(bar, get_ratio(sdata['ncdf']).values()):
    height = rect.get_height()
    plt.text(rect.get_x() + rect.get_width() / 2.0, height, f'X {int(lbl)}',
             ha='center', va='bottom', fontsize='medium', fontweight='heavy',
             c = 'darkblue'
    )

plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the Zarr output sizes (red bars); each bar is annotated with
# its size ratio relative to the original.
plt.figure(figsize=(6,6))
bar = plt.bar(df.index, df['zarr'].values, alpha=.8, color='r')
# TODO: switch to horizontal bars (barh).
plt.xlabel('Compression Method')
plt.ylabel('Size')
plt.title('Zarr Compression Comparison')

# Write "X <ratio>" centred just above the top of each bar.
# NOTE(review): bars follow df.index order while the labels follow the key
# order of sdata['zarr']; this assumes the two orders match — confirm.
for rect, lbl in zip(bar, get_ratio(sdata['zarr']).values()):
    height = rect.get_height()
    plt.text(rect.get_x() + rect.get_width() / 2.0, height, f'X {int(lbl)}',
             ha='center', va='bottom', fontsize='medium', fontweight='heavy',
             c = 'darkred'
    )

plt.tight_layout()
plt.show()

In [None]:
# Clean up: remove the files and stores written by save_ncdf() / save_zarr().
# WARNING: the globs match every *.nc file and every *.zarr directory in the
# working directory, not only the ones this notebook created.
!rm *.nc
!rm -r *.zarr

In [None]:
# TODO multiple rounds of compression