In [1]:
# build a small column-oriented example table as a plain python dictionary
import json

# dict(...) keeps keyword order, so the columns stay name -> age -> occupation
example_tabular = dict(
    name=["Alice", "Bob"],
    age=[30, 25],
    occupation=["Software Engineer", "Data Scientist"],
)
print(json.dumps(example_tabular, indent=4))

{
    "name": [
        "Alice",
        "Bob"
    ],
    "age": [
        30,
        25
    ],
    "occupation": [
        "Software Engineer",
        "Data Scientist"
    ]
}


In [2]:
# render the same dictionary as a pandas dataframe (one column per key)
import pandas as pd

example_pd_dataframe = pd.DataFrame(data=example_tabular)

# bare last expression so the notebook renders the frame
example_pd_dataframe

Unnamed: 0,name,age,occupation
0,Alice,30,Software Engineer
1,Bob,25,Data Scientist


In [3]:
# build a pyarrow table from the same column dictionary
import pyarrow as pa

example_pa_table = pa.Table.from_pydict(mapping=example_tabular)

# bare last expression so the notebook shows the table's schema and columns
example_pa_table

pyarrow.Table
name: string
age: int64
occupation: string
----
name: [["Alice","Bob"]]
age: [[30,25]]
occupation: [["Software Engineer","Data Scientist"]]

In [4]:
# show how the schema is a distinct component of a pyarrow table:
# `.schema` exposes the column names and types without the column data
example_pa_table.schema

name: string
age: int64
occupation: string

In [5]:
# attach key/value metadata to the schema; the result is rebound to the same
# name because later cells write this table (metadata included) to parquet
example_pa_table = example_pa_table.replace_schema_metadata(
    {
        "data-producer": "a-lab",
        "data-version": "v0.0.1",
    }
)
example_pa_table.schema

name: string
age: int64
occupation: string
-- schema metadata --
data-producer: 'a-lab'
data-version: 'v0.0.1'

In [6]:
# persist the table as a parquet file in the working directory
import pathlib
from pyarrow import parquet

parquet.write_table(example_pa_table, "example.parquet")

# print the absolute location of the file that was just written
written_path = pathlib.Path("./example.parquet").resolve()
print(written_path)

/content/example.parquet


In [7]:
# read back only the schema from the parquet file; the column names, types
# and the schema metadata all come back intact
parquet.read_schema("example.parquet")

name: string
age: int64
occupation: string
-- schema metadata --
data-producer: 'a-lab'
data-version: 'v0.0.1'

In [8]:
# read a subset of the columns from the parquet file
# (only "name" is requested here)
parquet.read_table("example.parquet", columns=["name"])

pyarrow.Table
name: string
----
name: [["Alice","Bob"]]

In [9]:
# read every column of the parquet file back into a pyarrow table
parquet.read_table("example.parquet")

pyarrow.Table
name: string
age: int64
occupation: string
----
name: [["Alice","Bob"]]
age: [[30,25]]
occupation: [["Software Engineer","Data Scientist"]]

In [10]:
# split the table row-wise and write each single-row slice to its own file
dataset_dir = pathlib.Path("./example_dataset")
dataset_dir.mkdir(exist_ok=True)

for row_index in (0, 1):
    parquet.write_table(
        table=example_pa_table.take([row_index]),
        where=f"./example_dataset/example-{row_index}.parquet",
    )

# read one of the pieces back to confirm it holds a single row
parquet.read_table(source="./example_dataset/example-0.parquet")

pyarrow.Table
name: string
age: int64
occupation: string
----
name: [["Alice"]]
age: [[30]]
occupation: [["Software Engineer"]]

In [11]:
# read a whole parquet dataset (a directory composed of one or many files)
example_dataset = parquet.read_table("./example_dataset/")

# schema first, a blank line, then the table itself
print(example_dataset.schema, example_dataset, sep="\n\n")
# why is this a list of lists? pyarrow table columns are chunked arrays, and
# each column prints one inner list per chunk; here that looks like one chunk
# per file read from the dataset directory (compare with the single inner
# list shown when one file is read on its own)

name: string
age: int64
occupation: string
-- schema metadata --
data-producer: 'a-lab'
data-version: 'v0.0.1'

pyarrow.Table
name: string
age: int64
occupation: string
----
name: [["Alice"],["Bob"]]
age: [[30],[25]]
occupation: [["Software Engineer"],["Data Scientist"]]


In [12]:
# what if the nodes are unequal (not every node carries the same attributes)?
nodes = {
    "nodes": [
        dict(name="Alice", age=30, occupation="Software Engineer"),
        dict(
            name="Bob",
            age=25,
            occupation="Data Scientist",
            favorite_color="pink",  # only this node has the extra attribute
        ),
    ],
}

# a single "nodes" column whose cells are the raw per-node dictionaries
nodes_df = pd.DataFrame(data=nodes)
nodes_df

Unnamed: 0,nodes
0,"{'name': 'Alice', 'age': 30, 'occupation': 'So..."
1,"{'name': 'Bob', 'age': 25, 'occupation': 'Data..."


In [13]:
# pull the "age" value out of every node dictionary, one row at a time
nodes_df["nodes"].apply(lambda node: node["age"])

0    30
1    25
Name: nodes, dtype: int64

In [14]:
# what if the attributes are misaligned (some have them, some don't)?
try:
    nodes_df.nodes.apply(lambda x: x["favorite_color"])
except KeyError:
    # expected: plain dict indexing raises KeyError for nodes that lack the
    # attribute; print the traceback as the demonstration instead of letting
    # the cell fail. Anything other than KeyError is unexpected and is
    # deliberately left to propagate rather than being caught here.
    import traceback

    traceback.print_exc()

Traceback (most recent call last):
  File "<ipython-input-14-5e1fec795b00>", line 3, in <cell line: 2>
    nodes_df.nodes.apply(lambda x: x["favorite_color"])
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/series.py", line 4771, in apply
    return SeriesApply(self, func, convert_dtype, args, kwargs).apply()
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/apply.py", line 1123, in apply
    return self.apply_standard()
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/apply.py", line 1174, in apply_standard
    mapped = lib.map_infer(
  File "pandas/_libs/lib.pyx", line 2924, in pandas._libs.lib.map_infer
  File "<ipython-input-14-5e1fec795b00>", line 3, in <lambda>
    nodes_df.nodes.apply(lambda x: x["favorite_color"])
KeyError: 'favorite_color'


In [15]:
!pip install awkward

Collecting awkward
  Downloading awkward-2.5.2-py3-none-any.whl (742 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m742.2/742.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting awkward-cpp==28 (from awkward)
  Downloading awkward_cpp-28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (706 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m706.4/706.4 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: awkward-cpp, awkward
Successfully installed awkward-2.5.2 awkward-cpp-28


In [16]:
# show how awkward arrays can be used to understand the data differently:
# the same nested `nodes` dictionary is wrapped in an awkward Array
import awkward as ak

nodes_as_array = ak.Array(nodes)
nodes_as_array

In [17]:
# show how fields may be accessed (top level: only the "nodes" field)
nodes_as_array.fields

['nodes']

In [18]:
# show how fields may be accessed (nested): the fields under "nodes" include
# favorite_color even though only one of the nodes defines it
nodes_as_array.nodes.fields

['name', 'age', 'occupation', 'favorite_color']

In [19]:
# show how masking works (select the nodes whose favorite_color is "pink")
# NOTE(review): `.mask[...]` presumably keeps the array length and leaves
# non-matching entries as None instead of dropping them — confirm against the
# rendered output
nodes_as_array.mask[nodes_as_array.nodes.favorite_color == "pink"]

In [20]:
# show how data may be exported to and imported from a parquet file:
# write the awkward array out, then read the same file straight back
ak.to_parquet(nodes_as_array, "awkward_example.parquet")
ak.from_parquet("awkward_example.parquet")

In [21]:
# show how we can abstract jagged data queries into DuckDB SQL
import duckdb

# nodes.<field> reaches into the nested "nodes" column of the parquet file;
# rows without a favorite_color are dropped with the standard IS NOT NULL
# predicate, and the result is fetched as a pyarrow table
with duckdb.connect() as ddb:
    result = ddb.execute("""
      SELECT
        nodes.name,
        nodes.favorite_color
      FROM read_parquet('awkward_example.parquet')
      WHERE nodes.favorite_color IS NOT NULL
    """).arrow()

result

pyarrow.Table
name: string
favorite_color: string
----
name: [["Bob"]]
favorite_color: [["pink"]]