# **Load and cache tables**

**Objectif de performance** : Python 3.11 + Parquet + Dask + caching + parallélisation des calculs

# Chargement, dimensions et conversion en parquet du jeu de données

**Note importante** :

`HomeCredit_columns_description` avait un caractère corrompu en position 59 qui empêchait sa lecture selon l'encodage UTF-8.

Ce caractère a été corrigé manuellement (suppression et réécriture de ce caractère).

## Lecture des CSV

In [1]:
from pepper.persist import _get_filenames_glob
from pepper.utils import pretty_timedelta_str, bold
import pandas as pd
import time
# Load every CSV of the dataset directory into `datadict` and record how long
# each individual read took (seconds) in `read_times`, in the same order.
csv_dir = "../../dataset/csv/"
filenames = _get_filenames_glob(csv_dir, "csv")
datadict = {}
read_times = []
for filename in filenames:
    # Drop the last 4 characters to get the table key.
    # NOTE(review): assumes every name ends with a 4-character ".csv" suffix — confirm
    # against `_get_filenames_glob`.
    datakey = filename[:-4]
    # `perf_counter` is the monotonic, high-resolution clock intended for
    # measuring durations; `time.time()` is wall-clock time and may jump or
    # have a coarse resolution.
    start = time.perf_counter()
    data = pd.read_csv(csv_dir + filename, encoding='utf-8')
    t = time.perf_counter() - start
    read_times.append(t)
    datadict[datakey] = data
    print(f"{bold(datakey)}: {data.shape} - {pretty_timedelta_str(t, 2)}")
print(f">> {bold('total read time')}: {pretty_timedelta_str(sum(read_times), 2)}")

[1mapplication_test[0m: (48744, 121) - 456 ms, 974 mus
[1mapplication_train[0m: (307511, 122) - 2 s, 822 ms
[1mbureau[0m: (1716428, 17) - 2 s, 733 ms
[1mbureau_balance[0m: (27299925, 3) - 4 s, 592 ms
[1mcredit_card_balance[0m: (3840312, 23) - 7 s, 44 ms
[1mHomeCredit_columns_description[0m: (219, 5) - 7 ms, 1 mus
[1minstallments_payments[0m: (13605401, 8) - 9 s, 735 ms
[1mPOS_CASH_balance[0m: (10001358, 8) - 5 s, 275 ms
[1mprevious_application[0m: (1670214, 37) - 6 s, 700 ms
[1msample_submission[0m: (48744, 2) - 13 ms, 244 mus
>> [1mtotal read time[0m: 39 s, 380 ms


## Dimensions

In [2]:
from pepper.utils import bold
from pepper.utils import get_file_size   # mv to pepper_persist
import pandas as pd
# Summary table: one row per loaded table with its dimensions, its CSV size
# on disk and the time it took to read it.
metadata = pd.DataFrame(
    [(name, table.shape[0], table.shape[1]) for name, table in datadict.items()],
    columns=["table_name", "n_samples", "n_features"],
)
metadata["n_cells"] = metadata["n_samples"] * metadata["n_features"]
metadata["csv_size"] = metadata["table_name"].apply(
    lambda name: get_file_size(f"{csv_dir}{name}.csv")
)
# `read_times` is positionally aligned with the rows at this point, so it
# must be attached before the table is reordered.
metadata["csv_read_time"] = read_times
metadata = metadata.sort_values(by="n_cells", ascending=False)
print(f"{bold('n_cells')}: {metadata['n_cells'].sum()}")
display(metadata)

[1mn_cells[0m: 493571166


Unnamed: 0,table_name,n_samples,n_features,n_cells,csv_size,csv_read_time
6,installments_payments,13605401,8,108843208,723118349,9.735015
4,credit_card_balance,3840312,23,88327176,424582605,7.044512
3,bureau_balance,27299925,3,81899775,375592889,4.592757
7,POS_CASH_balance,10001358,8,80010864,392703158,5.2751
8,previous_application,1670214,37,61797918,404973293,6.700456
1,application_train,307511,122,37516342,166133370,2.822481
2,bureau,1716428,17,29179276,170016717,2.73304
0,application_test,48744,121,5898024,26567651,0.456974
9,sample_submission,48744,2,97488,536202,0.013244
5,HomeCredit_columns_description,219,5,1095,37391,0.007001


## Sauvegarde au format `parquet`

Sauvegarde au format `parquet` (par défaut, moteur `pyarrow` et compression `snappy`).

Comparons les performances en termes d'empreinte mémoire sur le disque et de temps de chargement pour les 8 configurations possibles (dont 7 effectivement testées, la combinaison `fastparquet` + `brotli` étant écartée) suivant que :
* le moteur soit `pyarrow` ou `fastparquet`,
* la compression soit `snappy`, `gzip`, `brotli` ou `None`.

**Attention**, si vous souhaitez réexécuter ce benchmark, comptez 8 minutes.

In [3]:
from pepper.persist import all_to_parquet
import itertools
import time

csv_dir = "../../dataset/csv/"
pqt_dir = "../../dataset/pqt/"

# Write the whole dataset once per (engine, compression) pair, each into its
# own sub-directory named "<engine>_<compression>", and report the save time.
# The notebook states that `pyarrow` + `snappy` is the default configuration;
# `fastparquet` + `brotli` is skipped because it was noted as buggy.
engines = ["pyarrow", "fastparquet"]
compressions = ["snappy", "gzip", "brotli", None]
for engine in engines:
    for compression in compressions:
        if (engine, compression) == ("fastparquet", "brotli"):
            continue
        # `str(None).lower()` yields "none" for the uncompressed case.
        config_name = f"{engine}_{str(compression).lower()}"
        pqt_subdir = f"{pqt_dir}{config_name}/"
        start = time.time()
        print(f"save dataset to {pqt_subdir}", end="")
        all_to_parquet(datadict, pqt_subdir, engine, compression)
        t = time.time() - start
        print(f" in {pretty_timedelta_str(t, 2)}")

save dataset to ../../dataset/pqt/pyarrow_snappy/.......... in 22 s, 556 ms
save dataset to ../../dataset/pqt/pyarrow_gzip/.......... in 1 m, 40 s
save dataset to ../../dataset/pqt/pyarrow_brotli/.......... in 2 m, 3 s
save dataset to ../../dataset/pqt/pyarrow_none/.......... in 21 s, 441 ms
save dataset to ../../dataset/pqt/fastparquet_snappy/.......... in 24 s, 271 ms
save dataset to ../../dataset/pqt/fastparquet_gzip/.......... in 2 m, 40 s
save dataset to ../../dataset/pqt/fastparquet_none/.......... in 21 s, 295 ms


## Comparaison des performances Parquet | CSV

In [4]:
from pepper.utils import get_file_size

def pqt_read_time(pqt_dir, table_name, engine="auto"):
    """Return the time, in seconds, needed to load one parquet table.

    The file read is ``pqt_dir + table_name + ".pqt"``; the loaded frame is
    discarded, only the elapsed time is returned.

    Parameters
    ----------
    pqt_dir : str
        Directory prefix, concatenated as-is (so it must end with a path
        separator).
    table_name : str
        File name without the ``.pqt`` extension.
    engine : str, default "auto"
        Forwarded to ``pandas.read_parquet``. With the default, pandas picks
        the engine itself, whatever engine wrote the file; pass "pyarrow" or
        "fastparquet" to time a specific reader.

    Returns
    -------
    float
        Elapsed time in seconds.
    """
    # `perf_counter` is monotonic and high-resolution, which `time.time()`
    # (wall clock) does not guarantee.
    start = time.perf_counter()
    pd.read_parquet(pqt_dir + table_name + ".pqt", engine=engine)
    return time.perf_counter() - start

def pqt_file_size(pqt_dir, table_name):
    """Return what `get_file_size` reports for the table's parquet file.

    The file path is ``pqt_dir + table_name + ".pqt"`` (plain concatenation,
    so `pqt_dir` must end with a path separator).
    """
    file_path = pqt_dir + table_name + ".pqt"
    return get_file_size(file_path)


# For each benchmarked (engine, compression) pair, add two columns to
# `metadata`: the parquet file size and the parquet read time of every table.
engines = ["pyarrow", "fastparquet"]
compressions = ["snappy", "gzip", "brotli", None]
for engine, compression in itertools.product(engines, compressions):
    if (engine, compression) == ("fastparquet", "brotli"):
        # This pair is excluded from the benchmark.
        continue
    config_name = "_".join([engine, str(compression).lower()])
    pqt_subdir = pqt_dir + config_name + "/"
    size_col = f"pqt_{config_name}_size"
    time_col = f"pqt_{config_name}_read_time"
    # All sizes are collected first, then all read times, table by table.
    metadata[size_col] = metadata["table_name"].apply(
        lambda name: pqt_file_size(pqt_subdir, name)
    )
    metadata[time_col] = metadata["table_name"].apply(
        lambda name: pqt_read_time(pqt_subdir, name)
    )
display(metadata)

Unnamed: 0,table_name,n_samples,n_features,n_cells,csv_size,csv_read_time,pqt_pyarrow_snappy_size,pqt_pyarrow_snappy_read_time,pqt_pyarrow_gzip_size,pqt_pyarrow_gzip_read_time,pqt_pyarrow_brotli_size,pqt_pyarrow_brotli_read_time,pqt_pyarrow_none_size,pqt_pyarrow_none_read_time,pqt_fastparquet_snappy_size,pqt_fastparquet_snappy_read_time,pqt_fastparquet_gzip_size,pqt_fastparquet_gzip_read_time,pqt_fastparquet_none_size,pqt_fastparquet_none_read_time
6,installments_payments,13605401,8,108843208,723118349,9.735015,330470104,2.385714,246648550,1.63108,234206927,1.477867,478259694,1.025302,417551342,1.953967,273744883,1.779752,874103290,1.399879
4,credit_card_balance,3840312,23,88327176,424582605,7.044512,111274155,1.092881,87382864,1.349397,84062930,1.067713,231893301,1.143015,158525573,1.055081,99998309,1.179522,671997342,1.281753
3,bureau_balance,27299925,3,81899775,375592889,4.592757,21426895,2.504869,7220080,2.408154,6528751,2.533015,212427359,2.510618,39104894,1.964648,8773070,2.208701,573299674,2.249693
7,POS_CASH_balance,10001358,8,80010864,392703158,5.2751,124435906,1.206251,89478858,1.207468,84330013,1.419164,192425645,1.26764,166379319,1.131607,93648196,1.516641,664506876,1.470279
8,previous_application,1670214,37,61797918,404973293,6.700456,62912447,1.689787,49908242,1.739681,48304590,1.857638,80714342,1.778238,115293797,1.615352,62131222,1.842093,514893753,1.952668
1,application_train,307511,122,37516342,166133370,2.822481,22225869,0.460437,18770994,0.457052,18486431,0.558161,24879919,0.460123,49802974,0.454306,25306399,0.492319,253550609,0.530317
2,bureau,1716428,17,29179276,170016717,2.73304,35241265,0.472466,25883443,0.493529,24365040,0.555431,61232824,0.475354,52235506,0.42066,29284991,0.506664,234062490,0.523403
0,application_test,48744,121,5898024,26567651,0.456974,4255523,0.0796,3596498,0.086807,3505436,0.114524,4861820,0.084054,8361289,0.077595,4258899,0.0926,40157544,0.095491
9,sample_submission,48744,2,97488,536202,0.013244,296358,0.011006,170995,0.0095,156444,0.014007,489947,0.017202,215625,0.009495,77364,0.011968,780849,0.012833
5,HomeCredit_columns_description,219,5,1095,37391,0.007001,13372,0.00883,10505,0.008965,9931,0.011995,23605,0.01,10992,0.008477,7179,0.011065,41639,0.009964


Cf. synthèse ci-dessous, le meilleur choix nous semble être `pyarrow` + `gzip`.

C'est la configuration par défaut sur laquelle nous nous fixons.

On y gagne environ un facteur 4 en temps de lecture (39 s → 9 s) et un facteur 5 en empreinte sur le disque (2,5 Gio → 505 Mio).

In [5]:
# Column-wise totals over all tables. The string column `table_name` is
# summed as well, which simply concatenates the table names.
display(metadata.sum(axis="index"))

table_name                          installments_paymentscredit_card_balancebureau...
n_samples                                                                    58538856
n_features                                                                        346
n_cells                                                                     493571166
csv_size                                                                   2684261625
csv_read_time                                                               39.380582
pqt_pyarrow_snappy_size                                                     712551894
pqt_pyarrow_snappy_read_time                                                  9.91184
pqt_pyarrow_gzip_size                                                       529071029
pqt_pyarrow_gzip_read_time                                                   9.391634
pqt_pyarrow_brotli_size                                                     503956493
pqt_pyarrow_brotli_read_time                          

Un tableau *pretty* pour présenter ce résultat dans les transparents (le tableau markdown peut passer par le presse-papiers) :

In [44]:
from pepper.utils import display_dataframe_in_markdown, format_iB, pretty_timedelta_str

def format_size(x):
    """Format `x` as "<value> <unit>" with one decimal.

    `format_iB(x)` is unpacked into a (value, unit) pair.
    NOTE(review): presumably `x` is a size in bytes and the unit a binary
    prefix such as MiB — confirm against `pepper.utils.format_iB`.
    """
    value, unit = format_iB(x)
    return f"{value:.1f} {unit}"

def format_time(x):
    """Render the duration `x` through `pretty_timedelta_str(x, 1)`.

    NOTE(review): the second argument presumably limits the output to a
    single time unit — confirm against `pepper.utils.pretty_timedelta_str`.
    """
    rendered = pretty_timedelta_str(x, 1)
    return rendered

def format_bigint(x):
    """Format the number `x` with the locale-aware ``"n"`` format type.

    Digit grouping follows the active `LC_NUMERIC` locale; under the default
    "C" locale no separator is inserted.
    """
    return format(x, "n")

# Build a presentation table restricted to the first six columns of
# `metadata` plus the `pyarrow_gzip` benchmark columns, append a TOTAL row,
# format every figure as text and render the result as markdown.
cols = metadata.columns
res_cols = list(cols[:6]) + list(cols[cols.str.contains("pyarrow_gzip")])
res = metadata[res_cols]

res_2 = res.copy()
total = res_2.sum(axis=0)
# The first entry is the sum of the name column, which is meaningless:
# replace it with the row label. `.iloc` makes the positional access
# explicit (integer keys on a label-indexed Series are deprecated), and the
# label is left unbolded here because the whole TOTAL row is wrapped in
# `**` below — bolding it twice produced `****TOTAL****`.
total.iloc[0] = "TOTAL"
res_2.loc["TOTAL"] = total

# Convert raw figures into human-readable strings.
for size_col in ("csv_size", "pqt_pyarrow_gzip_size"):
    res_2[size_col] = res_2[size_col].apply(format_size)
for time_col in ("csv_read_time", "pqt_pyarrow_gzip_read_time"):
    res_2[time_col] = res_2[time_col].apply(format_time)
for count_col in ("n_samples", "n_cells"):
    res_2[count_col] = res_2[count_col].apply(format_bigint)

# Friendlier column headers; `regex=False` makes the literal replacement
# explicit, regardless of the pandas version's default.
res_2.columns = (
    res_2.columns
    .str.replace("n_", "#", regex=False)
    .str.replace("pqt_pyarrow_gzip", "parquet", regex=False)
    .str.replace("read_time", "readtime", regex=False)
    .str.replace("_", " ", regex=False)
)
# Markdown-bold every cell of the TOTAL row.
res_2.loc["TOTAL"] = res_2.loc["TOTAL"].apply(lambda x: f"**{x}**")

display_dataframe_in_markdown(res_2)

table name|#samples|#features|#cells|csv size|csv readtime|parquet size|parquet readtime|
---|---|---|---|---|---|---|---|
installments_payments|13 605 401|8|108 843 208|689.6 MiB|9 s|235.2 MiB|1 s|
credit_card_balance|3 840 312|23|88 327 176|404.9 MiB|7 s|83.3 MiB|1 s|
bureau_balance|27 299 925|3|81 899 775|358.2 MiB|4 s|6.9 MiB|2 s|
POS_CASH_balance|10 001 358|8|80 010 864|374.5 MiB|5 s|85.3 MiB|1 s|
previous_application|1 670 214|37|61 797 918|386.2 MiB|6 s|47.6 MiB|1 s|
application_train|307 511|122|37 516 342|158.4 MiB|2 s|17.9 MiB|457 ms|
bureau|1 716 428|17|29 179 276|162.1 MiB|2 s|24.7 MiB|493 ms|
application_test|48 744|121|5 898 024|25.3 MiB|456 ms|3.4 MiB|86 ms|
sample_submission|48 744|2|97 488|523.6 KiB|13 ms|167.0 KiB|9 ms|
HomeCredit_columns_description|219|5|1 095|36.5 KiB|7 ms|10.3 KiB|8 ms|
****TOTAL****|**58 538 856**|**346**|**493 571 166**|**2.5 GiB**|**39 s**|**504.6 MiB**|**9 s**|

## Nettoyage

Récupérons les fichiers générés dans la configuration `pyarrow` + `gzip` :

In [41]:
from pathlib import Path
def move_files_to_parent(dir_path):
    """Move every entry of `dir_path` one level up, into its parent directory.

    Each entry (file or sub-directory) keeps its name; the name is printed
    before the move. `dir_path` itself is left in place, empty.

    Parameters
    ----------
    dir_path : str or os.PathLike
        Directory whose content is moved.

    Notes
    -----
    The move relies on `Path.rename`, so what happens when the destination
    already exists is platform-dependent.
    """
    source_dir = Path(dir_path)
    parent_dir_path = source_dir.parent
    # Snapshot the listing before moving anything: entries are removed from
    # the directory during the loop, and `Path.iterdir` leaves the result
    # unspecified when the directory changes after the iterator is created.
    for file in list(source_dir.iterdir()):
        print(file.name)
        file.rename(parent_dir_path / file.name)

# Promote the files of the retained `pyarrow` + `gzip` configuration to the
# parent parquet directory.
pqt_subdir = "../../dataset/pqt/pyarrow_gzip/"
move_files_to_parent(dir_path=pqt_subdir)

application_test.pqt
application_train.pqt
bureau.pqt
bureau_balance.pqt
credit_card_balance.pqt
HomeCredit_columns_description.pqt
installments_payments.pqt
POS_CASH_balance.pqt
previous_application.pqt
sample_submission.pqt


Et libérons de la place sur le disque (*que serait la vie sans prendre de risque ?*).

In [42]:
import os, shutil
def _dangerous_rmtree_all_subdirs(dir_path):
    for childname in os.listdir(dir_path):
        child_path = os.path.join(dir_path, childname)
        if os.path.isdir(child_path):
            shutil.rmtree(child_path)

# Remove every per-configuration sub-directory of the parquet directory.
# `pqt_dir` keeps its trailing slash, as everywhere else in this notebook:
# other cells build paths with `pqt_dir + config_name`, which breaks if the
# variable is rebound without the separator and those cells are re-run.
pqt_dir = "../../dataset/pqt/"
_dangerous_rmtree_all_subdirs(pqt_dir)