# This notebook examines the read/write and partitioning structure of Parquet datasets using pandas, Dask and PyArrow

In [1]:
import pandas as pd
import dask.dataframe as dd
import random
import math
from pathlib import Path
import dask
from distributed import Client
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Create a distributed Client with default arguments; later Dask work in this
# notebook is presumably scheduled through it (NOTE(review): confirm no explicit
# scheduler address is intended here).
c = Client()

distributed.diskutils - INFO - Found stale lock file and directory '/home/balast/CodingProjects/dask_tutorials/parquet-deep-dive/dask-worker-space/worker-r5spyf4k', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/balast/CodingProjects/dask_tutorials/parquet-deep-dive/dask-worker-space/worker-cagx6nlk', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/balast/CodingProjects/dask_tutorials/parquet-deep-dive/dask-worker-space/worker-2jt1dykl', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/balast/CodingProjects/dask_tutorials/parquet-deep-dive/dask-worker-space/worker-9g77irot', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/balast/CodingProjects/dask_tutorials/parquet-deep-dive/dask-worker-space/worker-akoe7ckp', purging
distributed.diskutils - INFO - Found stale lock file and directory '/home/balast/CodingProjects/dask_tutorials/parquet-deep-dive/dask-w

# Generate Trade Files

In [3]:
# Sizing inputs for the synthetic trade data.
n_paths = 50_000
n_dates = 120
max_paths_per_partition = 50_000 # 10 # 200
max_trade_files_per_ns = 200_000

# Number of partitions needed so that no partition holds more than
# max_paths_per_partition paths.
n_partitions = math.ceil(n_paths/max_paths_per_partition)

# Per-row memory estimate in GB: 138/1024 GB spread over 6,000,000 rows.
# NOTE(review): presumably a measured ~138 MB footprint of a 6M-row frame - confirm.
memory_needed_per_row_gb = 138/1024/6_000_000

# Memory for one partition of a single trade file, then scaled by the maximum
# number of trade files.
partition_memory_gb = memory_needed_per_row_gb * n_paths * n_dates / n_partitions
min_node_memory_needed_gb = partition_memory_gb * max_trade_files_per_ns

In [4]:
# Display the estimated minimum node memory (in GB) computed in the sizing cell.
min_node_memory_needed_gb

26953.125

In [5]:
# Row buffer for the generated trade data ([date, path, mtm] records).
all_rows = []
# How many synthetic trade datasets to write.
n_files_to_create = 1
# Output folder (relative to the notebook's working directory) for the datasets.
trade_folder = './datasets'

In [6]:
# One valuation date every three months ("3M"), shared by every generated file.
dates = pd.date_range("2021-07-01", freq="3M", periods=n_dates)
for file_no in range(n_files_to_create):
    print(f'Creating File {file_no:03d}')

    # Build the rows for THIS file only. The list is rebound on every iteration
    # so rows from earlier files (or from a previous run of this cell) cannot
    # accumulate into later files.
    all_rows = [
        [date, path, random.random()*200-100]  # mtm is uniform in [-100, 100)
        for path in range(n_paths)
        for date in dates
    ]

    # One row per (path, date); 'path' becomes the index that Dask partitions on.
    df = (
        pd.DataFrame(all_rows, columns=['date', 'path', 'mtm'])
        .astype({'path': 'int32', 'mtm': 'float64'})
        .set_index('path')
    )
    df['date'] = pd.to_datetime(df['date'])

    ddf = dd.from_pandas(df, npartitions=n_partitions)
    trade_filepath = f'{trade_folder}/row_partitioned_trade_file_{file_no:03d}.parquet'
    # Note: the resulting path is a directory containing part files, not a
    # single Parquet file.
    ddf.to_parquet(trade_filepath, row_group_size=100_000)

Creating File 000


In [7]:
# Gather every '*.parquet' entry in the trade folder, in sorted order so that
# files[0] is stable between runs.
files = sorted(Path(trade_folder).glob('*.parquet'))
files

[PosixPath('datasets/row_partitioned_trade_file_000.parquet')]

In [37]:
# IPython help lookup for pq.read_table (interactive aid only; it prints the
# function signature and is not needed for the analysis itself).
pq.read_table?

[0;31mSignature:[0m
[0mpq[0m[0;34m.[0m[0mread_table[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0msource[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_threads[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmetadata[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_pandas_metadata[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmemory_map[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mread_dictionary[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfilesystem[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfilters[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbuffer_size[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpartitioning[0m[0;34m=[0m[0;34m'hive'[0m[0;34m,[0m[0;34m[0m
[0;34m[0

In [35]:
%%timeit

# Baseline timing: read the first dataset into pandas with all columns.
pd.read_parquet(files[0])
None

108 ms ± 8.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [33]:
%%timeit

# Same read restricted to the 'mtm' column, for comparison with the
# all-columns timing.
pd.read_parquet(files[0], columns=['mtm'])
None

64 ms ± 3.54 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# The dataset path written by Dask is a directory of part files, so opening it
# as a single Parquet file fails. The error is the point of this cell: catch it
# and show the message so the notebook still runs from top to bottom.
try:
    table = pq.ParquetFile(files[0])
except OSError as err:
    print(f'{type(err).__name__}: {err}')

OSError: Expected file path, but datasets/row_partitioned_trade_file_000.parquet is a directory

In [10]:
# Open a single part file inside the dataset directory instead of the directory
# itself. Despite its name, `table` holds a pq.ParquetFile handle.
first_part_path = files[0] / 'part.0.parquet'
table = pq.ParquetFile(first_part_path)

In [11]:
# Show the Parquet file-level metadata of the opened part file (row count,
# number of row groups, column count, writer version).
table.metadata

<pyarrow._parquet.FileMetaData object at 0x7f4e300eaae0>
  created_by: parquet-cpp-arrow version 5.0.0
  num_columns: 3
  num_rows: 6000000
  num_row_groups: 60
  format_version: 1.0
  serialized_size: 22364