In [None]:
! pip install pyspark[sql]

Collecting pyspark[sql]
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 61.8 MB/s 
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=43938c244288bd101962320e345c24cecb2c0bd083dce03b957e758c9e42cf21
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


# `events.py`

In [None]:
import logging
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
from pyspark.sql import DataFrame, functions as F, types as T

# from dl_utils import attributes_v4, serialization
# from dl_utils.types import CompactorAction

log = logging.getLogger(__name__)

ID_COLUMN = "id"

# TODO: Need to figure out how to let people pass this into the compact function :/
NO_SERIALIZE = {"c_create_dt", "c_update_dt", "c_version", "id"}

In [None]:
from pyspark.sql import Row

from datetime import datetime

# NOTE(review): the bound method `isoformat` is looked up but never called, and the
# value is discarded, so this line has no visible effect.
datetime.now().isoformat

def prepare_events(events: DataFrame) -> DataFrame:
    """Normalise a raw events DataFrame for compaction.

    Steps, in order:
      * alias the frame as ``"events"``;
      * rename ``_version`` to ``c_version``;
      * copy ``_dt`` into a new ``c_update_dt`` column;
      * keep an existing ``c_create_dt`` column, or add it as a null string;
      * drop every column starting with ``_`` except ``_dt``;
      * add ``_compacted`` set to ``False``.

    Args:
        events: Input events; must contain a ``_dt`` column.

    Returns:
        A new DataFrame with the columns described above.
    """
    source_columns = events.columns

    # Reuse the caller's creation timestamp when there is one, otherwise a typed null.
    if "c_create_dt" in source_columns:
        create_dt = events["c_create_dt"]
    else:
        create_dt = F.lit(None).cast(T.StringType())

    # Underscore-prefixed metadata is discarded, but `_dt` is kept.
    metadata_columns = [
        name for name in source_columns if name.startswith("_") and name != "_dt"
    ]

    prepared = events.alias("events").withColumnRenamed("_version", "c_version")
    prepared = prepared.withColumn("c_update_dt", events["_dt"])
    prepared = prepared.withColumn("c_create_dt", create_dt)
    prepared = prepared.drop(*metadata_columns)
    # Events always start out flagged as not compacted.
    return prepared.withColumn("_compacted", F.lit(False))


from pyspark.sql import SparkSession

# Nothing in this notebook creates `spark`, so obtain (or reuse) a session here;
# otherwise this cell fails with NameError on a fresh kernel.
spark = SparkSession.builder.getOrCreate()

# One raw event carrying the `_`-prefixed metadata that prepare_events() strips.
events = spark.createDataFrame(
    [
        Row(
            _dt="2021-01-01T00:00:00Z",
            _uuid="00000000-0000-4000-8000-000000000000",
            _version="1.0.0",
            _source="test",
            _type="retailer_product",
            id="3-0652933171234",
            date_added="2021-01-01T00:00:00Z",
            date_updated="2021-01-01T00:00:00Z",
            live='{"core": {"entity_a": {"entity_b": {}}}}',
        ),
    ]
)

events_new = prepare_events(events)
events_new.show()

+--------------------+---------+---------------+--------------------+--------------------+--------------------+--------------------+-----------+----------+
|                 _dt|c_version|             id|          date_added|        date_updated|                live|         c_update_dt|c_create_dt|_compacted|
+--------------------+---------+---------------+--------------------+--------------------+--------------------+--------------------+-----------+----------+
|2021-01-01T00:00:00Z|    1.0.0|3-0652933171234|2021-01-01T00:00:00Z|2021-01-01T00:00:00Z|{"core": {"entity...|2021-01-01T00:00:00Z|       null|     false|
+--------------------+---------+---------------+--------------------+--------------------+--------------------+--------------------+-----------+----------+



In [None]:
# describe() only builds the summary DataFrame; without an action such as .show()
# the cell output is just that DataFrame's schema repr.
events_new.describe()

DataFrame[summary: string, _dt: string, c_version: string, id: string, date_added: string, date_updated: string, live: string, c_update_dt: string, c_create_dt: string]

In [None]:
def prepare_compacted(compacted: DataFrame) -> DataFrame:
    """Tag already-compacted entities so they can be combined with events.

    The frame is aliased as ``"compacted"``, its ``_dt`` column is set to the
    literal string ``"0"`` and ``_compacted`` is set to ``True``.

    Args:
        compacted: Previously compacted entities.

    Returns:
        A new DataFrame with the ``_dt`` and ``_compacted`` columns set as above.
    """
    aliased = compacted.alias("compacted")
    # Compacted entities are not events and carry no timestamp of their own, so
    # they get a "0" datetime meant to sort before any event and avoid surprises.
    with_sentinel_dt = aliased.withColumn("_dt", F.lit("0"))
    return with_sentinel_dt.withColumn("_compacted", F.lit(True))

# Single-row sample of an already-compacted entity.
# NOTE(review): assumes a `spark` session already exists in the kernel.
compacted = spark.createDataFrame(
    [Row(_dt="2021-01-01T00:00:00Z",
        _uuid="00000000-0000-4000-8000-000000000000",
        _version="1.0.0",
        _source="test",
        _type="retailer_product",
        id="3-0652933171234",
        date_added="2021-01-01T00:00:00Z",
        date_updated="2021-01-01T00:00:00Z",
        live= '{"core": {"entity_a": {"entity_b": {}}}}'
        ),])

# prepare_compacted() replaces the `_dt` given above with "0" and adds
# `_compacted` = true; the other columns pass through unchanged.
prepare_compacted(compacted).show()

+---+--------------------+--------+-------+----------------+---------------+--------------------+--------------------+--------------------+----------+
|_dt|               _uuid|_version|_source|           _type|             id|          date_added|        date_updated|                live|_compacted|
+---+--------------------+--------+-------+----------------+---------------+--------------------+--------------------+--------------------+----------+
|  0|00000000-0000-400...|   1.0.0|   test|retailer_product|3-0652933171234|2021-01-01T00:00:00Z|2021-01-01T00:00:00Z|{"core": {"entity...|      true|
+---+--------------------+--------+-------+----------------+---------------+--------------------+--------------------+--------------------+----------+



In [None]:
def split_compacted_by_events(
    events: DataFrame,
    compacted: DataFrame,
) -> Tuple[DataFrame, DataFrame]:
    """Split compacted rows by whether their id also occurs in ``events``.

    The distinct event ids are broadcast-hinted and cached, then joined against
    ``compacted`` twice: an inner join keeps the rows whose id has at least one
    event, and a left-anti join keeps the rows whose id has none.

    Args:
        events: DataFrame containing the ``ID_COLUMN`` column.
        compacted: DataFrame containing the ``ID_COLUMN`` column.

    Returns:
        ``(relevant, others)``: compacted rows with a matching event id, and
        compacted rows without one. Both contain only the compacted columns.
    """
    distinct_event_ids = events.select(ID_COLUMN).distinct()
    ids = F.broadcast(distinct_event_ids).alias("ids").cache()

    aliased = compacted.alias("compacted")

    # Project back onto the compacted side only, so no column from `ids` leaks through.
    matched = aliased.join(ids, on=ID_COLUMN, how="inner").selectExpr("compacted.*")
    unmatched = aliased.join(ids, on=ID_COLUMN, how="left_anti").selectExpr("compacted.*")

    return matched, unmatched

# Demo data for split_compacted_by_events(): two compacted entities, one of which
# shares its id with the events below and one of which does not.
# NOTE(review): assumes a `spark` session already exists in the kernel.
compacted = spark.createDataFrame(
    [Row(_dt="2021-01-01T00:00:00Z",
         _uuid="00000000-0000-4000-8000-000000000000",
         _version="1.0.0",
         _source="test",
         _type="retailer_product",
         id="3-0652933171234",
         date_added="2021-01-01T00:00:00Z",
         date_updated="2021-01-01T00:00:00Z",
         live= '{"core": {"entity_a": {"entity_b": {}}}}'
         ),
      Row(_dt="2021-01-01T00:00:00Z",  # no event has this id, so the row lands in `others` (left-anti side)
         _uuid="00000000-0000-4000-8000-000000000012",
         _version="1.1.0",
         _source="test1",
         _type="retailer_product",
         id="compacted_event",
         date_added="2021-01-01T00:00:00Z",
         date_updated="2021-01-01T00:00:00Z",
         live= '{"core": {"entity_a": {"entity_b": {}}}}'
         )
    ])

compacted_new = prepare_compacted(compacted)

# Three events: two share the id of the first compacted entity, and the third has
# an id that appears in no compacted row.
events = spark.createDataFrame(
    [Row(_dt="2021-01-01T00:00:00Z",
         _uuid="00000000-0000-4000-8000-000000000000",
         _version="1.0.0",
         _source="test11086",
         _type="retailer_product",
         id="3-0652933171234",
         date_added="2021-01-01T00:00:00Z",
         date_updated="2021-01-01T00:00:00Z",
         live= '{"core": {"entity_a": {"entity_b": {}}}}'
         ),
     Row(_dt="2021-01-01T00:00:00Z",
         _uuid="003",
         _version="1.3.0",
         _source="test003",
         _type="retailer_product",
         id="3-0652933171234",
         date_added="2021-01-01T00:00:00Z",
         date_updated="2021-01-01T00:00:00Z",
         live= '{"core": {"entity_a": {"entity_b": {}}}}'
         ),
     Row(_dt="2021-01-01T00:00:00Z",
         _uuid="00000000-0000-4000-8000-000000000001",
         _version="2.0.0",
         _source="test1",
         _type="retailer_product",
         id="asdfasdfasdf",
         date_added="2021-01-01T00:00:00Z",
         date_updated="2021-01-01T00:00:00Z",
         live= '{"core": {"entity_a": {"entity_b": {}}}}'
         )
    ])

events_new = prepare_events(events)


# `relevant` holds compacted rows whose id has events; `others` holds the rest.
relevant, others = split_compacted_by_events(events_new, compacted_new)

In [None]:
relevant.show()

+---------------+---+--------------------+--------+-------+----------------+--------------------+--------------------+--------------------+----------+
|             id|_dt|               _uuid|_version|_source|           _type|          date_added|        date_updated|                live|_compacted|
+---------------+---+--------------------+--------+-------+----------------+--------------------+--------------------+--------------------+----------+
|3-0652933171234|  0|00000000-0000-400...|   1.0.0|   test|retailer_product|2021-01-01T00:00:00Z|2021-01-01T00:00:00Z|{"core": {"entity...|      true|
+---------------+---+--------------------+--------+-------+----------------+--------------------+--------------------+--------------------+----------+



In [None]:
others.show()

+---------------+---+--------------------+--------+-------+----------------+--------------------+--------------------+--------------------+----------+
|             id|_dt|               _uuid|_version|_source|           _type|          date_added|        date_updated|                live|_compacted|
+---------------+---+--------------------+--------+-------+----------------+--------------------+--------------------+--------------------+----------+
|compacted_event|  0|00000000-0000-400...|   1.1.0|  test1|retailer_product|2021-01-01T00:00:00Z|2021-01-01T00:00:00Z|{"core": {"entity...|      true|
+---------------+---+--------------------+--------+-------+----------------+--------------------+--------------------+--------------------+----------+



In [None]:
def create_entity_union(events: DataFrame, compacted: Optional[DataFrame]) -> DataFrame:
    """Combine prepared events with prepared compacted entities.

    Args:
        events: Raw events; always passed through ``prepare_events``.
        compacted: Previously compacted entities, or ``None`` when there are none.

    Returns:
        The prepared events alone when ``compacted`` is ``None``; otherwise the
        by-name union of the prepared events and the prepared compacted rows.
    """
    events = prepare_events(events)

    # Test identity rather than truthiness: "no compacted data" is signalled by
    # None, and DataFrame-like objects are not guaranteed to support bool().
    if compacted is None:
        return events

    compacted = prepare_compacted(compacted)

    # unionByName lines columns up by name; with allowMissingColumns=True a column
    # present on only one side is filled with nulls on the other.
    return events.unionByName(compacted, allowMissingColumns=True)

# Demo data for create_entity_union(): two compacted entities and two events, all
# with distinct ids.
# NOTE(review): assumes a `spark` session already exists in the kernel.
compacted = spark.createDataFrame(
    [Row(_dt="2021-01-01T00:00:00Z",
         _uuid="00000000-0000-4000-8000-000000000000",
         _version="1.0.0",
         _source="test",
         _type="retailer_product",
         id="compacted_event1",
         date_added="2021-01-01T00:00:00Z",
         date_updated="2021-01-01T00:00:00Z",
         live= '{"core": {"entity_a": {"entity_b": {}}}}'
         ),
      Row(_dt="2021-01-01T00:00:00Z",  # NOTE(review): was "this is anti", apparently carried over from the split demo; no anti-join is performed in this cell
         _uuid="00000000-0000-4000-8000-000000000012",
         _version="1.1.0",
         _source="test1",
         _type="retailer_product",
         id="compacted_event2",
         date_added="2021-01-01T00:00:00Z",
         date_updated="2021-01-01T00:00:00Z",
         live= '{"core": {"entity_a": {"entity_b": {}}}}'
         )
    ])

events = spark.createDataFrame(
    [Row(_dt="2021-01-01T00:00:00Z",
         _uuid="00000000-0000-4000-8000-000000000000",
         _version="1.0.0",
         _source="test11086",
         _type="retailer_product",
         id="events1",
         date_added="2021-01-01T00:00:00Z",
         date_updated="2021-01-01T00:00:00Z",
         live= '{"core": {"entity_a": {"entity_b": {}}}}'
         ),
     Row(_dt="2021-01-01T00:00:00Z",
         _uuid="003",
         _version="1.3.0",
         _source="test003",
         _type="retailer_product",
         id="events2",
         date_added="2021-01-01T00:00:00Z",
         date_updated="2021-01-01T00:00:00Z",
         live= '{"core": {"entity_a": {"entity_b": {}}}}'
         )
    ])

# Both inputs are passed raw; create_entity_union() prepares them itself.
events_union = create_entity_union(events, compacted)

In [None]:
events_union.show()

+--------------------+---------+----------------+--------------------+--------------------+--------------------+--------------------+-----------+----------+--------------------+--------+-------+----------------+
|                 _dt|c_version|              id|          date_added|        date_updated|                live|         c_update_dt|c_create_dt|_compacted|               _uuid|_version|_source|           _type|
+--------------------+---------+----------------+--------------------+--------------------+--------------------+--------------------+-----------+----------+--------------------+--------+-------+----------------+
|2021-01-01T00:00:00Z|    1.0.0|         events1|2021-01-01T00:00:00Z|2021-01-01T00:00:00Z|{"core": {"entity...|2021-01-01T00:00:00Z|       null|     false|                null|    null|   null|            null|
|2021-01-01T00:00:00Z|    1.3.0|         events2|2021-01-01T00:00:00Z|2021-01-01T00:00:00Z|{"core": {"entity...|2021-01-01T00:00:00Z|       null|     fa

In [None]:
def _do_compaction(group: pd.DataFrame) -> pd.DataFrame:
    group.sort_values("_dt", inplace=True)
    events: List[Dict[str, str]] = group.to_dict("records")

    base = events[0]  # 1st record in events. Is it sorted or not???
    ret: Dict[str, Any] = {}

    create_date = None
    if base["_compacted"]:
        # Ensure no events can overwrite date_created
        create_date = base["c_create_dt"]
    else:
        # Invent a date_created for the new entity, though in such a way that
        # it can be easily overwritten by these events (but not future ones)
        ret["c_create_dt"] = base["_dt"]

    should_flatten = False

    for update in events:
        if update.get("c_flattened"):  # only serialized events will have have this key
            should_flatten = True
            update = serialization.deserialize(update, skip=NO_SERIALIZE)
        ret = attributes_v4.recursively_merge_attributes(ret, update)
        if ret.get("c_action") == CompactorAction.DELETE:  # delete the top level entity
            ret = {}

    if not ret:
        # Returning an empty dataframe tells pyspark to discard this group
        return pd.DataFrame([], columns=group.columns)

    if create_date:
        ret["c_create_dt"] = create_date

    if should_flatten:
        ret = serialization.serialize(ret, skip=NO_SERIALIZE)

    return pd.DataFrame([ret], columns=group.columns)

In [None]:
import pandas as pd
# A row is built by concatenating tuples; `1 + ("2",)` (int + tuple) raises
# TypeError, so the first operand has to be a one-element tuple as well.
pd.DataFrame([(1,) + ("2",)])

TypeError: ignored

In [None]:
# Not displayed: only the last expression of a cell is rendered.
pd.DataFrame([1,2,3])

# Define `pdf` here so the cell does not depend on a later cell having been run first.
pdf = pd.DataFrame.from_dict({"v": [1,2,3]})

# One output row: a key tuple concatenated with the mean of column `v`.
pd.DataFrame([(1,) + (pdf.v.mean(),)])

Unnamed: 0,0,1
0,1,2.0


In [None]:
pdf = pd.DataFrame.from_dict({"v": [1,2,3]})
pdf

Unnamed: 0,v
0,1
1,2
2,3


In [None]:
pdf.v.mean()

2.0

In [None]:
import pandas as pd


# NOTE(review): this rebinds `events`, which earlier cells use for a Spark DataFrame,
# to a plain list of dicts.
events = [
            {"h":2, "a":3},
          {'asd':4, 'asdf': 6}
        ]

# NOTE(review): `df` is read on the right-hand side before this cell assigns it, and
# no earlier cell in the visible notebook defines it, so this only runs when a
# previous `df` is still alive in the kernel (NameError on a fresh kernel).
# Requested columns that are not keys of the records come out empty, as in the
# displayed output.
df=pd.DataFrame(events, columns=[i for i in df.columns if i != "4"])
df

Unnamed: 0,1,2,3
0,,,
1,,,


In [None]:
# list.remove() mutates the temporary list and returns None, so this cell shows no
# output; it raises ValueError when "2" is not among the columns.
[i for i in df.columns if i != "4"].remove("2")

In [None]:
if not None:
    print(1)

1


In [None]:
# Two independently constructed but identical one-row frames, used to try out
# DataFrame comparison. Column order follows the order of the keys.
a = pd.DataFrame.from_dict(
    [dict(_dt="asdf", _compacted="False", date_created="_dt(0)", foo="asdf", bar="asdf")]
)

b = pd.DataFrame.from_dict(
    [dict(_dt="asdf", _compacted="False", date_created="_dt(0)", foo="asdf", bar="asdf")]
)

In [None]:
a==b

NameError: ignored

# Paths.py

In [None]:
from datetime import datetime, timezone
from pathlib import PurePosixPath
from typing import Dict, Optional, Tuple, cast
from urllib.parse import urlunsplit

# Partition name -> format template. Each template is rendered with
# `str.format(dt=...)`, so the part after the colon is a strftime-style spec.
DATE_PARTITIONS = dict(
    year="{dt:%Y}",
    month="{dt:%m}",
    day="{dt:%d}",
    run="{dt:%H}:{dt:%M}:{dt:%S}",
)


def date_to_partitions(dt: datetime) -> Dict[str, str]:
    """Render every ``DATE_PARTITIONS`` template for the given datetime.

    The datetime's own fields are formatted as they are; no timezone conversion
    is applied.

    Args:
        dt: The datetime to render.

    Returns:
        Mapping of partition name to rendered value, in ``DATE_PARTITIONS`` order.
    """
    rendered: Dict[str, str] = {}
    for partition_name, template in DATE_PARTITIONS.items():
        rendered[partition_name] = template.format(dt=dt)
    return rendered


In [None]:
# NOTE(review): called without the required `dt` argument, so this cell raises
# instead of returning partitions.
date_to_partitions()

AttributeError: ignored

In [None]:
datetime.now().isoformat()

'2021-12-07T17:24:27.671860'

In [None]:
datetime.fromisoformat("2021-12-31T00:00:00.000000+00:00")

datetime.datetime(2021, 12, 31, 0, 0, tzinfo=datetime.timezone.utc)

In [None]:
date_to_partitions(datetime.fromisoformat("2021-12-31T00:00:00.000000-23:00"))

{'day': '31', 'month': '12', 'run': '00:00:00', 'year': '2021'}

In [None]:
datetime.now(timezone.utc).isoformat()

'2021-12-07T17:29:17.374752+00:00'

In [None]:
def build_path(
    bucket_name: str,
    bucket_path: str,
    dt: datetime,
    partitions: Optional[Dict[str, str]] = None,
) -> str:
    """Build an ``s3://`` directory URL with ``name=value`` partition segments.

    Caller-supplied partitions come first, in their own order, followed by the
    date partitions derived from ``dt``. When a name occurs in both, the
    date-derived value wins and the key keeps its original position.

    Args:
        bucket_name: Bucket name, used as the URL authority.
        bucket_path: Key prefix inside the bucket.
        dt: Datetime rendered into partitions by ``date_to_partitions``.
        partitions: Optional extra partitions. The dict is not modified.

    Returns:
        The URL as a string, always ending in ``/``.
    """
    # Work on a copy: calling update() on the argument itself would silently
    # overwrite entries in the caller's dict.
    merged: Dict[str, str] = dict(partitions) if partitions is not None else {}
    merged.update(date_to_partitions(dt))

    s3_path = PurePosixPath(bucket_path)
    for name, value in merged.items():
        s3_path /= f"{name}={value}"

    # The trailing slash marks the result as a directory-style prefix.
    return urlunsplit(("s3", bucket_name, str(s3_path) + "/", "", ""))


In [None]:
build_path("lyst-bucket", "lyst-path", dt=datetime.fromisoformat("2021-12-31T00:00:00.000000+00:00"))

's3://lyst-bucket/lyst-path/year=2021/month=12/day=31/run=00:00:00/'

In [None]:
# NOTE(review): `partitions` is only assigned in the following cell, so this cell
# relies on out-of-order execution. In the displayed result the date-derived values
# (2021 / 12 / 31) replace the year/month/day supplied in `partitions`.
build_path(
            "lyst-bucket",
            "lyst-path/lyst-subpath",
            datetime.fromisoformat("2021-12-31T00:00:00.000000+00:00"),
            partitions
        )

's3://lyst-bucket/lyst-path/lyst-subpath/day=31/month=12/run=00:00:00/year=2021/'

In [None]:
partitions={
        "day": "01",
        "month": "12",
        "run": "00:00:00",
        "year": "2020",
    }

In [None]:
def extract_partitions_from_path(path_root: str, path: str) -> Dict[str, str]:
    """Extract partition names and values from an Amazon S3 path.

    Only directory segments below ``path_root`` are considered, and only those
    containing exactly one ``=`` count as partitions. The final path component
    (the file name) is ignored.

    Args:
        path_root: Root prefix; a trailing ``/`` is added when missing.
        path: Full object path, which must contain ``path_root``.

    Returns:
        Mapping of partition name to value, in path order; empty when there are
        no partition segments. A repeated name keeps its last value.

    Raises:
        ValueError: If ``path_root`` does not occur in ``path``.
    """
    path_root = path_root if path_root.endswith("/") else f"{path_root}/"
    if path_root not in path:
        raise ValueError(f"Object {path} is not under the root path ({path_root}).")

    # Drop the file name, then keep only what follows the root. The root already
    # ends in "/", so appending another "/" before stripping never matches and
    # would let `name=value` segments that belong to the root leak into the result.
    path_wo_filename: str = path.rpartition("/")[0] + "/"
    path_wo_prefix: str = path_wo_filename.partition(path_root)[2]

    # Exactly one "=" guarantees a non-empty segment that splits into two parts.
    return dict(
        segment.split("=") for segment in path_wo_prefix.split("/") if segment.count("=") == 1
    )

In [None]:
extract_partitions_from_path(path_root="lyst_project", path="lyst_project/year=2020/run=1024/day=32/file.gz")

{'day': '32', 'run': '1024', 'year': '2020'}

In [None]:
dt = datetime(2020, 1, 2, 3, 4, 5)

In [None]:
date_to_partitions(dt)

{'day': '02', 'month': '01', 'run': '03:04:05', 'year': '2020'}