In [None]:
from eos import proj_root

In [None]:
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import os

# Work from <proj_root>/data so the relative file names used below resolve there.
os.chdir(proj_root / "data")
os.getcwd()

In [None]:
from fastavro import writer, reader, parse_schema

# Minimal fastavro round trip: a flat record schema with three primitive fields.
schema = {
    "doc": "A weather reading.",
    "name": "Weather",
    "namespace": "test",
    "type": "record",
    "fields": [
        {"name": "station", "type": "string"},
        {"name": "time", "type": "long"},
        {"name": "temp", "type": "int"},
    ],
}
parsed_schema = parse_schema(schema)

# 'records' can be an iterable (including generator)
records = [
    {"station": "011990-99999", "temp": 0, "time": 1433269388},
    {"station": "011990-99999", "temp": 22, "time": 1433270389},
    {"station": "011990-99999", "temp": -11, "time": 1433273379},
    {"station": "012650-99999", "temp": 111, "time": 1433275478},
]

# Writing (file is created in the current working directory)
with open("weather.avro", "wb") as out:
    writer(out, parsed_schema, records)

# Reading: print each record decoded from the file just written
with open("weather.avro", "rb") as fo:
    for record in reader(fo):
        print(record)

In [None]:
from dataclasses import dataclass, field
from functools import reduce

from typing import Optional
import pandas as pd
import numpy as np
from ordered_set import OrderedSet
from eos.data_io.config import drivers, trucks_by_id, drivers_by_id
from datetime import datetime

# Reference wall-clock time; every synthetic timestamp in this cell is an offset from it.
ts = pd.to_datetime(datetime.now())

# State part of the observation: four samples 20 ms apart, with velocity, thrust
# and brake filled from consecutive slices of 0..11.
ts_ind = ts + pd.to_timedelta(np.arange(0, 4 * 20, 20), "ms")
ss = np.arange(12)
a1 = ss[:4]
a2 = ss[4:8]
a3 = ss[8:]
df_ss = pd.DataFrame(
    {"timestep": ts_ind, "velocity": a1, "thrust": a2, "brake": a3}
)  # .set_index('timestep')
df_ss.columns.name = "qtuple"

# Flatten to a Series indexed by (column name, sample position), sorted by that index.
state = df_ss.stack().swaplevel(0, 1).sort_index()
state.name = "state"
state.index.names = ["rows", "idx"]

# Action part of the observation: 15 consecutive values starting at len(ss),
# laid out as a 5-row table with columns r0..r2, plus speed, throttle and
# timestep series.
a = len(ss) + np.arange(15)
speed_ser = pd.Series(np.linspace(40, 60, 3), name="speed")
row_array = a.reshape(3, 5).transpose()
rows_df = pd.DataFrame(row_array)
rows_df.columns = [f"r{i}" for i in np.arange(3)]

# Three action timesteps at +100, +120 and +140 ms from `ts`.
ts_ind = ts + pd.to_timedelta(np.arange(5 * 20, 8 * 20, 20), "ms")
ts_ser = pd.Series(ts_ind, name="timestep")
throttle_ser = pd.Series(np.linspace(0, 1.0, 5), name="throttle")
# throttle_ser
dfs = [rows_df, ts_ser, speed_ser, throttle_ser]
# Outer-join all parts on their positional index (the series have 3 or 5
# entries), then flatten to a Series indexed by (column name, position).
action = (
    reduce(
        lambda left, right: pd.merge(
            left, right, how="outer", left_index=True, right_index=True
        ),
        dfs,
    )
    .stack()
    .swaplevel(0, 1)
    .sort_index()
)

action.name = "action"
action.index.names = ["rows", "idx"]

# Reward part of the observation: a single row holding a "work" value
# (len(ss) + len(a)) and the first action timestep, flattened like the other parts.
reward = (
    pd.DataFrame({"work": len(ss) + len(a), "timestep": ts_ind[0]}, index=[0])
    .stack()
    .swaplevel(0, 1)
    .sort_index()
)
reward.index.names = ["rows", "idx"]
reward.name = "reward"
# Next-state part: same layout as `state`, with timesteps starting 5 s after `ts`
# and values continuing the running counter used for state, action and reward.
ts_ind = ts + pd.to_timedelta(5, "s") + pd.to_timedelta(np.arange(0, 4 * 20, 20), "ms")
ss = (
    np.arange(12) + len(ss) + len(a) + len(reward) - 1
)  # exclude the timestamp in reward
a1 = ss[:4]
a2 = ss[4:8]
a3 = ss[8:]

nstate = (
    pd.DataFrame({"timestep": ts_ind, "velocity": a1, "thrust": a2, "brake": a3})
    .stack()
    .swaplevel(0, 1)
    .sort_index()
)
nstate.name = "nstate"
nstate.index.names = ["rows", "idx"]

# One-element series carrying the observation timestamp, re-indexed to the same
# two-level (rows, idx) layout as the other parts.
timestamp = pd.Series([ts], name="timestamp")
timestamp.index = pd.MultiIndex.from_product(
    [timestamp.index, [0]], names=["rows", "idx"]
)

# Three-level index: each (rows, idx) pair is prefixed with the name of the part
# it belongs to; the timestamp entry is hard-coded as ("timestamp", "", 0).
timestamp_index = (timestamp.name, "", 0)
state_index = [(state.name, *i) for i in state.index]
reward_index = [(reward.name, *i) for i in reward.index]
action_index = [(action.name, *i) for i in action.index]
nstate_index = [(nstate.name, *i) for i in nstate.index]
multiindex = pd.MultiIndex.from_tuples(
    [timestamp_index, *state_index, *action_index, *reward_index, *nstate_index]
)
# The tuple order above must match the concat order below, because the
# concatenated index is replaced wholesale rather than aligned.
observation_list = [timestamp, state, action, reward, nstate]
observation = pd.concat(observation_list)
observation.index = multiindex

# Five copies of the template observation, stamped 1..5 hours after `ts`.
observation_list = []
for hours_ahead in range(1, 6):
    shifted = observation.copy()
    shifted.loc["timestamp", "", 0] = ts + pd.Timedelta(hours_ahead, "h")
    observation_list.append(shifted)

# Keep the individually numbered names available for later cells.
(
    observation0,
    observation1,
    observation2,
    observation3,
    observation4,
) = observation_list

# One row per observation; the three column levels are (qtuple, rows, idx).
dfs_epi = pd.concat(observation_list, axis=1).T
dfs_epi.columns.names = ["qtuple", "rows", "idx"]

# Promote the timestamp column to the row index and sort the columns; `dfs_epi`
# itself is left untouched.
dfs_episode = dfs_epi.set_index(("timestamp", "", 0)).sort_index(axis=1)
dfs_episode.index.name = "timestamp"

# (qtuple, rows) pairs whose values are cast to float; the timestep columns are
# not in these lists and are left as they are.
state_cols_float = [("state", col) for col in ["brake", "thrust", "velocity"]]
action_cols_float = [("action", col) for col in ["r0", "r1", "r2", "speed", "throttle"]]
reward_cols_float = [("reward", "work")]
nstate_cols_float = [("nstate", col) for col in ["brake", "thrust", "velocity"]]
for col in action_cols_float + state_cols_float + reward_cols_float + nstate_cols_float:
    dfs_episode[col[0], col[1]] = dfs_episode[col[0], col[1]].astype(
        "float"
    )  # float16 not allowed in parquet
# Wrap the frame in extra index levels, innermost first: driver, then vehicle,
# then episodestart.
dfs_episode = pd.concat(
    [dfs_episode], keys=[drivers_by_id["wang-cheng"].pid], names=["driver"]
)
dfs_episode = pd.concat(
    [dfs_episode], keys=[trucks_by_id["VB7"].vid], names=["vehicle"]
)
# NOTE(review): `episodestart` is assigned here, but the next line keys the frame
# with `ts` instead; confirm which value is intended.
episodestart = ts - pd.Timedelta(1, "h")
dfs_episode = pd.concat([dfs_episode], keys=[ts], names=["episodestart"])
# Reorder the row levels from (episodestart, vehicle, driver, timestamp)
# to (vehicle, driver, episodestart, timestamp).
dfs_episode = dfs_episode.swaplevel(1, 0, axis=0)
dfs_episode = dfs_episode.swaplevel(1, 2, axis=0)
dfs_episode.sort_index(inplace=True)
dfs_episode

In [None]:
def _derive_episode(base_episode, start, vehicle_id, driver_id):
    """Return a copy of ``base_episode`` relabelled as another episode, with some rows removed.

    The ``episodestart``, ``vehicle`` and ``driver`` index levels are replaced by
    the given values, then a random number of distinct ``timestamp`` values
    (at least 1, at most ``n - 2`` of the ``n`` present) is dropped.

    Randomness comes from the global NumPy generator, so results differ between
    runs unless the caller seeds it. Raises ``ValueError`` (from
    ``np.random.randint``) when the episode has fewer than three timestamps.
    """
    episode = base_episode.copy()
    episode.index = episode.index.set_levels([start], level="episodestart")
    episode.index = episode.index.set_levels(
        [[vehicle_id], [driver_id]],
        level=["vehicle", "driver"],
        verify_integrity=False,
    )
    timestamps = episode.index.unique(level="timestamp")
    drop_num = np.random.randint(low=1, high=len(timestamps) - 1)
    timestamps_to_drop = np.random.choice(timestamps, drop_num, replace=False)
    return episode.drop(index=timestamps_to_drop, level="timestamp")


ts_new = pd.to_datetime((datetime.now()))
episodestart = ts_new - pd.Timedelta(2, "d")

# (days before `episodestart`, truck key, driver key) for each derived episode.
episode_variants = [
    (0, "VB7", "zheng-longfei"),
    (3, "MP73", "wang-cheng"),
    (4, "VB7", "wang-cheng"),
    (5, "MP73", "zheng-longfei"),
]
dfs_episode0, dfs_episode1, dfs_episode2, dfs_episode3 = [
    _derive_episode(
        dfs_episode,
        episodestart - pd.Timedelta(days_back, "d"),
        trucks_by_id[truck_key].vid,
        drivers_by_id[driver_key].pid,
    )
    for days_back, truck_key, driver_key in episode_variants
]

episodes = [dfs_episode, dfs_episode0, dfs_episode1, dfs_episode2, dfs_episode3]
# Concatenate in one call and let any failure propagate: swallowing it would only
# defer the error to the next line (or silently reuse a stale `dfs_episode_all`
# left in the kernel by an earlier run).
dfs_episode_all = pd.concat(episodes, axis=0).sort_index()
dfs_episode_all = dfs_episode_all[["state", "action", "reward", "nstate"]]
display("dfs_episode_all")
dfs_episode_all

In [None]:
# Show the base episode frame for comparison with the combined frame.
dfs_episode
# episodes[0]

In [None]:
from dataclasses import asdict

indices = dfs_episode.index
# ep_start = indices.get_level_values(level='episodestart')[0]
# One {level name: value} dict per row of the base episode's index.
indices_dict = [dict(zip(indices.names, levels)) for levels in indices]

# Episode-level metadata is taken from the first row: episodestart becomes
# epoch microseconds and the per-row "timestamp" entry is discarded.
# NOTE(review): `.timestamp() * 1e6` yields a float; the Avro schemas declare
# episodestart as "long" - confirm whether an int is required when writing.
episode_meta = indices_dict[0].copy()
episode_meta["episodestart"] = episode_meta["episodestart"].timestamp() * 1e6
del episode_meta["timestamp"]
# episode_meta

# episodes_indices
# Same per-row dicts for every episode, labelled with each frame's own level names.
episodes_indices_dict = [
    [dict(zip(df.index.names, levels)) for levels in df.index] for df in episodes
]

# The first row of each episode doubles as that episode's metadata. These are
# the same dict objects as episodes_indices_dict[k][0], so the edits below are
# visible through both names.
episodes_meta_dict = [ep_rows[0] for ep_rows in episodes_indices_dict]
for ep_mt in episodes_meta_dict:
    ep_mt.pop("timestamp")
    ep_mt["episodestart"] = ep_mt["episodestart"].timestamp() * 1e6
episodes_meta_dict
# for i in episodes_indices_dict:
#     i

# episodes_meta = [
#     [
#         [
#             {df.index.names[i]: level } for i, level in enumerate(levels)
#         ] for levels in df.index
#     ] for df in episodes]
# episodes_meta

In [None]:
# `ind` is the same dict object as indices_dict[0] (no copy), so the assignment
# below also changes indices_dict[0].
# NOTE(review): re-running this cell will presumably fail, because the stored
# value is then the product of the first run rather than a timestamp object;
# consider `indices_dict[0].copy()` - confirm nothing relies on the mutation.
ind = indices_dict[0]
# ind.pop('timestamp')
ind["episodestart"] = ind["episodestart"].timestamp() * 1e6
ind

In [None]:
from eos.data_io.eos_struct import ObservationMeta, StateSpecs, StateUnitCodes, ActionSpecs
from eos.data_io.utils.eos_pandas import ep_nest, df_to_ep_nested_dict, avro_ep_encoding
from pydantic import schema_json_of, schema_of

# All specs below are read from the "VB7" truck entry.
vb7 = trucks_by_id["VB7"]

state_unit_codes = StateUnitCodes(
    velocity_unit_code="kph",
    thrust_unit_code="pct",
    brake_unit_code="pct",
)
state_specs = StateSpecs(
    state_unit_codes=state_unit_codes,
    unit_number=vb7.cloud_unit_number,  # 4
    unit_duration=vb7.cloud_unit_duration,  # 1s
    frequency=vb7.cloud_signal_frequency,  # 50 hz
)
action_specs = ActionSpecs(
    action_unit_code="nm",
    action_row_number=vb7.torque_table_row_num_flash,
    action_column_number=len(vb7.pedal_scale),
)
observation_meta = ObservationMeta(
    state_specs=state_specs,
    action_specs=action_specs,
    reward_specs={
        "reward_unit": "wh",
    },
    site=vb7.site,
)


# Encode the base episode frame with the project helper.
dict_nested = avro_ep_encoding(dfs_episode)
dict_nested

In [None]:
# Assemble one episode record with the three top-level keys used by
# `schema_episode`: episodestart, meta and sequence.
dict_ep = {
    "episodestart": episode_meta["episodestart"],
    "meta": {"episode_meta": episode_meta, "observation_meta": observation_meta.dict()},
    "sequence": dict_nested,
}
dict_ep

In [None]:
# Encode every episode frame (base episode first) with the same project helper.
episodes_dict_nested = [avro_ep_encoding(ep) for ep in episodes]
episodes_dict_nested[0]

In [None]:
def _per_step(key):
    """Collect ``step[key]`` for every step of every encoded episode (one inner list per episode)."""
    return [[step[key] for step in ep] for ep in episodes_dict_nested]


dict_nested_states = _per_step("state")
dict_nested_nstates = _per_step("nstate")
dict_nested_rewards = _per_step("reward")
dict_nested_actions = _per_step("action")
dict_nested_timestamps = _per_step("timestamp")

# Pair each step's timestamp with its state, episode by episode.
arr_states = [
    [{"ts": stamp, "state": step_state} for stamp, step_state in zip(ts_arr, state_arr)]
    for ts_arr, state_arr in zip(dict_nested_timestamps, dict_nested_states)
]
arr_states[0]
# dict_nested_timestamps[0]

In [None]:
import fastavro
import json

def _named_fields(names, avro_type):
    """Return one fresh ``{"name": ..., "type": ...}`` field dict per name, all of the same type."""
    return [{"name": name, "type": avro_type} for name in names]


# Unit code of each state signal.
state_unit_fields_schema = _named_fields(
    ("velocity_unit_code", "thrust_unit_code", "brake_unit_code"), "string"
)
# State specs: the nested unit-code record followed by three integer fields.
state_specs_fields_schema = [
    {
        "name": "state_unit_codes",
        "type": {
            "type": "record",
            "name": "state_unit_codes_",
            "fields": state_unit_fields_schema,
        },
    },
    *_named_fields(("unit_number", "unit_duration", "frequency"), "int"),
]
# Action specs: unit code plus the table's row and column counts.
action_specs_fields_schema = [
    *_named_fields(("action_unit_code",), "string"),
    *_named_fields(("action_row_number", "action_column_number"), "int"),
]
reward_specs_fields_schema = _named_fields(("reward_unit",), "string")

# Index-level metadata stored with each episode.
# NOTE(review): `logicalType` sits beside `type` on the episodestart field
# rather than inside a type object (compare the "timestep" items in the state
# schema); the Avro spec attaches logical types to the type schema, so this
# field is probably handled as a plain long - confirm before relying on it.
episode_meta_fields_schema = [
    {"name": "vehicle", "type": "string"},
    {"name": "driver", "type": "string"},
    {"name": "episodestart", "type": "long", "logicalType": "timestamp-micros"},
]
# Observation metadata: one nested record per spec group plus the site name.
# This is the only definition; an earlier copy that differed only in the nested
# record names was overwritten before any use and has been removed.
observation_meta_fields_schema = [
    {
        "name": "state_specs",
        "type": {
            "type": "record",
            "name": "state_specs_",
            "fields": state_specs_fields_schema,
        },
    },
    {
        "name": "action_specs",
        "type": {
            "type": "record",
            "name": "action_specs_",
            "fields": action_specs_fields_schema,
        },
    },
    {
        "name": "reward_specs",
        "type": {
            "type": "record",
            "name": "reward_specs_",
            "fields": reward_specs_fields_schema,
        },
    },
    {"name": "site", "type": "string"},
]

def _float_array_field(name):
    """Return a fresh field dict whose type is an Avro array of floats."""
    return {"name": name, "type": {"type": "array", "items": "float"}}


def _timestep_field():
    """Return a fresh "timestep" field dict: an array of timestamp-micros longs."""
    return {
        "name": "timestep",
        "type": {
            "type": "array",
            "items": {"type": "long", "logicalType": "timestamp-micros"},
        },
    }


# State signals followed by their timesteps.
state_fields_schema = [
    *(_float_array_field(signal) for signal in ("velocity", "thrust", "brake")),
    _timestep_field(),
]

# Action: one float array per torque-table row, then speed, throttle and timesteps.
torque_table_row_names = ["r0", "r1", "r2"]
action_fields_schema = [
    *(
        _float_array_field(column)
        for column in [*torque_table_row_names, "speed", "throttle"]
    ),
    _timestep_field(),
]


reward_fields_schema = [_float_array_field("work"), _timestep_field()]


# Fields of one step in an episode's sequence: a timestamp plus the state,
# action, reward and nstate records. `nstate` reuses the state field list under
# a different record name.
# NOTE(review): on the "timestamp" field `logicalType` is a sibling of `type`,
# not part of a type object; the Avro spec attaches logical types to the type
# schema, so this is probably handled as a plain long - confirm.
episode_array_fields_schema = [
    {
        "type": "long",
        "name": "timestamp",
        "logicalType": "timestamp-micros",
    },
    {
        "name": "state",
        "type": {
            "type": "record",
            "name": "state_",
            "fields": state_fields_schema,
        },
    },
    {
        "name": "action",
        "type": {
            "type": "record",
            "name": "action_",
            "fields": action_fields_schema,
        },
    },
    {
        "name": "reward",
        "type": {
            "type": "record",
            "name": "reward_",
            "fields": reward_fields_schema,
        },
    },
    {
        "name": "nstate",
        "type": {
            "type": "record",
            "name": "nstate_",
            "fields": state_fields_schema,
        },
    },
]

# Top-level episode record: start time, a `meta` record (episode metadata and
# observation metadata) and `sequence`, an array of step records.
# NOTE(review): as with the step "timestamp" field, `logicalType` on
# "episodestart" is a sibling of `type` rather than part of a type object - confirm
# it takes effect.
schema_episode = {
    "type": "record",
    "name": "episode",
    "doc": "episode data with a timestamp, meta description and an array of episode steps",
    "fields": [
        {"type": "long", "name": "episodestart", "logicalType": "timestamp-micros"},
        {
            "name": "meta",
            "type": {
                "type": "record",
                "name": "meta_",
                "fields": [
                    {
                        "name": "episode_meta",
                        "type": {
                            "type": "record",
                            "name": "episode_meta_",
                            "fields": episode_meta_fields_schema,
                        },
                    },
                    {
                        "name": "observation_meta",
                        "type": {
                            "type": "record",
                            "name": "observation_meta_",
                            "fields": observation_meta_fields_schema,
                        },
                    },
                ],
            },
        },
        {
            "name": "sequence",
            "type": {
                "type": "array",
                "items": {
                    "name": "step",  # not used in constructing the episode observation array data
                    "type": "record",
                    "fields": episode_array_fields_schema,
                },
            },
        },
    ],
}

# Parse the schema with fastavro, then pretty-print the raw (unparsed) dict.
parsed_schema_episode = fastavro.schema.parse_schema(schema_episode)
print(json.dumps(schema_episode, indent=2))

In [None]:
import os
from eos import proj_root

# Same working-directory change as at the top of the notebook, repeated here.
os.chdir(proj_root / "data")
os.getcwd()

In [None]:
# import dask
# import dask.bag as db
# b_ep = db.from_sequence([dict_ep])  # turn one single record to a list, bag create only from sequence
# rec = b_ep.take(1)
# type(rec[0])
# # b_ep.to_textfiles('*.json.gz')
# b_ep.to_avro('bag_ep.*.avro', schema=schema_episode)

In [None]:
# os.getcwd()
# from fastavro import writer, reader, parse_schema
#
# with open('bag_ep3.avro', 'wb') as out:
# 	writer(out, schema_episode, rec)
#
# with open('bag_ep3.avro', 'rb') as fo:
# 	for record in reader(fo):
# 		print(record)

In [None]:
# Demo schema with array-valued fields; rebinds `schema` and `parsed_schema`
# from the weather example.
schema = {
    "doc": "A episode reading.",
    "name": "episode",
    "namespace": "test",
    "type": "record",
    "fields": [
        {"name": "station", "type": "string"},
        {"name": "time", "type": "long"},
        {"name": "velocity", "type": {"type": "array", "items": "float"}},
        {"name": "brake", "type": {"type": "array", "items": "float"}},
    ],
}

parsed_schema = parse_schema(schema)


# 'records' can be an iterable (including generator)
# Four sample readings; only the station, the first velocity value and the time
# differ between them.
records = [
    {
        "station": station,
        "velocity": [first_velocity, 1],
        "brake": [0, 1],
        "time": reading_time,
    }
    for station, first_velocity, reading_time in (
        ("011990-99999", 0, 1433269388),
        ("011990-99999", 22, 1433270389),
        ("011990-99999", -11, 1433273379),
        ("012650-99999", 111, 1433275478),
    )
]

# Writing
with open("episode1.avro", "wb") as out:
    # Serialise the demo records with the array-field schema.
    writer(out, parsed_schema, records)

# Reading
with open("episode1.avro", "rb") as fo:
    # Print each record decoded from the file just written.
    for record in reader(fo):
        print(record)

In [None]:
# State dict of each of the first five steps of the base episode.
# Steps are read as flat dicts keyed directly by "state" / "timestamp", which is
# how the other cells index the encoded sequence; there is no intermediate
# "observation" level.
state_dict1 = [dict_ep["sequence"][i]["state"] for i in range(5)]
state_dict1
#

In [None]:
# 'records' can be an iterable (including generator)
# records0 = [
#     {u'station': u'011990-99999', u'state': state_dict1[0], u'time': 1433269388},
#     {u'station': u'011990-99999', u'state': state_dict1[1],  u'time': 1433270389},
#     {u'station': u'011990-99999', u'state': state_dict1[2], u'time': 1433273379},
#     {u'station': u'012650-99999', u'state': state_dict1[3], u'time': 1433275478},
# ]
# records0
#
# # 'records' can be an iterable (including generator)
# records1 = [
#     {u'station': u'011990-99999', u'state': {'velocity': state_dict1 [0]['velocity'],'thrust': state_dict1 [0]['thrust'],'brake': state_dict1 [0]['brake'],'timestep': state_dict1 [0]['timestep']}, u'time': 1433269388},
#     {u'station': u'011990-99999', u'state': {'velocity': state_dict1 [1]['velocity'],'thrust': state_dict1 [1]['thrust'],'brake': state_dict1 [1]['brake'],'timestep': state_dict1 [1]['timestep'] },  u'time': 1433270389},
#     {u'station': u'011990-99999', u'state': {'velocity': state_dict1 [2]['velocity'],'thrust': state_dict1 [2]['thrust'],'brake': state_dict1 [2]['brake'],'timestep': state_dict1 [2]['timestep'] },  u'time': 1433273379},
#     {u'station': u'012650-99999', u'state': {'velocity': state_dict1 [3]['velocity'],'thrust': state_dict1 [3]['thrust'],'brake': state_dict1 [3]['brake'],'timestep': state_dict1 [3]['timestep'] },  u'time': 1433275478},
# ]
# # records1
#
# records2 = [
#     {u'timestamp': dict_ep['episode'][0]['episodestep']['timestamp'], u'observation': {'state': state_dict1[0]}, u'time': 1433269388},
#     {u'timestamp': dict_ep['episode'][1]['episodestep']['timestamp'], u'observation': {'state': state_dict1[1]},  u'time': 1433270389},
#     {u'timestamp': dict_ep['episode'][2]['episodestep']['timestamp'], u'observation': {'state': state_dict1[2]},  u'time': 1433273379},
#     {u'timestamp': dict_ep['episode'][3]['episodestep']['timestamp'], u'observation': {'state': state_dict1[3]},  u'time': 1433275478},
# ]
# # records2
# records3: one record per step of the base episode, each carrying only that
# step's state plus a fixed demo "time" value.
records3 = [
    {
        "episodestart": dict_ep["sequence"][i]["timestamp"],
        "meta": {
            "episode_meta": episode_meta,
            "observation_meta": observation_meta.dict(),
        },
        "sequence": {"state": state_dict1[i]},
        "time": demo_time,
    }
    for i, demo_time in enumerate([1433269388, 1433270389, 1433273379, 1433275478])
]
records3
# dict_nested_state = [dict_ep['episode'][i]['episodestep']['observation']['state'] for i in range(4)]

# records4: one record per episode, with the episode's start time and its
# (timestamp, state) steps. The start time is read from `episodes_meta_dict`
# (one dict per episode); `episodes_indices_dict[k]` is a list of per-row dicts
# and cannot be indexed by "episodestart".
# NOTE(review): every record reuses the base episode's `episode_meta`; confirm
# whether the per-episode dict should be stored instead.
records4 = [
    {
        "episodestart": ep_meta["episodestart"],
        "meta": {
            "episode_meta": episode_meta,
            "observation_meta": observation_meta.dict(),
        },
        "sequence": ep_states,
    }
    for ep_meta, ep_states in zip(episodes_meta_dict, arr_states)
]
# records4[2]

# records5: one record per episode with only the start time and step timestamps.
records5 = [
    {"episodestart": ep_meta["episodestart"], "sequence": ep_timestamps}
    for ep_meta, ep_timestamps in zip(episodes_meta_dict, dict_nested_timestamps)
]
# records5[2]

In [None]:
# One record for each of the first four steps of the base episode; the sequence
# entry pairs the step's timestamp with its state.
records7 = [
    {
        "episodestart": dict_ep["sequence"][step_no]["timestamp"],
        "meta": {
            "episode_meta": episode_meta,
            "observation_meta": observation_meta.dict(),
        },
        "sequence": {
            "ts": dict_nested[step_no]["timestamp"],
            "state": state_dict1[step_no],
        },
    }
    for step_no in range(4)
]
records7

In [None]:
# Five records: the k-th pairs the timestamp of the base episode's k-th step
# with the (timestamp, state) steps of the k-th episode.
records8 = [
    {"episodestart": dict_ep["sequence"][k]["timestamp"], "sequence": arr_states[k]}
    for k in range(5)
]
records8

In [None]:
# Demo schema: flat station/time fields plus a nested "state" record whose
# fields are spelled out inline (float arrays and a timestamp-micros array).
schema1 = {
    "doc": "An episode reading.",
    "name": "episode",
    "namespace": "test",
    "type": "record",
    "fields": [
        {"name": "station", "type": "string"},
        {"name": "time", "type": "long"},
        {
            "name": "state",
            "type": {
                "type": "record",
                "name": "state_",
                "fields": [
                    {"name": "velocity", "type": {"type": "array", "items": "float"}},
                    {"name": "thrust", "type": {"type": "array", "items": "float"}},
                    {"name": "brake", "type": {"type": "array", "items": "float"}},
                    {
                        "name": "timestep",
                        "type": {
                            "type": "array",
                            "items": {
                                "type": "long",
                                "logicalType": "timestamp-micros",
                            },
                        },
                    },
                ],
            },
        },
    ],
}
# Rebinds `parsed_schema`, replacing the value set by the earlier demo cells.
parsed_schema = parse_schema(schema1)
parsed_schema

In [None]:
# # Writing
# with open('episode5.avro', 'wb') as out:
#     writer(out, parsed_schema, records1)
#
# # Reading
# with open('episode5.avro', 'rb') as fo:
#     for record in reader(fo):
#         print(record)

In [None]:
# schema2 = {
#     'doc': 'An episode reading.',
#     'name': 'episode',
#     'namespace': 'test',
#     'type': 'record',
#     'fields': [
#         {'name': 'timestamp', 'type': 'long', 'logicalType': 'timestamp-micros'},
#         {'name': 'observation', 'type':
#             {
#                 'type': 'record',
#                 'name': 'observation_',
#                 'fields': [
#                     {
#                         'name': 'state',
#                         'type': {
#                             'type': 'record',
#                             'name': 'state_',
#                             'fields': state_fields_schema
#                         }
#                     },
#                 ]
#             }
#          },
#     ]
# }

# schema3 = {
#     'doc': 'An episode reading.',
#     'name': 'episode',
#     'namespace': 'test',
#     'type': 'record',
#     'fields': [
#         {'name': 'timestamp', 'type': 'long', 'logicalType': 'timestamp-micros'},
#         {'name': 'observation', 'type':
#             {
#                 'type': 'record',
#                 'name': 'observation_',
#                 'fields': [
#                     {
#                         'name': 'state',
#                         'type': {
#                             'type': 'record',
#                             'name': 'state_',
#                             'fields': state_fields_schema
#                         }
#                     },
#                 ]
#             }
#          },
#     ]
# }


# Variant schema in which "sequence" is a single record holding one state
# record, not an array of steps. It reuses the shared meta and state field lists.
# NOTE(review): `logicalType` on "episodestart" is a sibling of `type` rather
# than part of a type object; the Avro spec attaches logical types to the type
# schema, so it is probably handled as a plain long - confirm.
schema3 = {
    "doc": "An episode reading.",
    "name": "sequence",
    "namespace": "test",
    "type": "record",
    "fields": [
        {"name": "episodestart", "type": "long", "logicalType": "timestamp-micros"},
        {
            "name": "meta",
            "type": {
                "type": "record",
                "name": "meta_",
                "fields": [
                    {
                        "name": "episode_meta",
                        "type": {
                            "type": "record",
                            "name": "episode_meta_",
                            "fields": episode_meta_fields_schema,
                        },
                    },
                    {
                        "name": "observation_meta",
                        "type": {
                            "type": "record",
                            "name": "observation_meta_",
                            "fields": observation_meta_fields_schema,
                        },
                    },
                ],
            },
        },
        {
            "name": "sequence",
            "type": {
                "type": "record",
                "name": "sequence_",
                "fields": [
                    {
                        "name": "state",
                        "type": {
                            "type": "record",
                            "name": "state_",
                            "fields": state_fields_schema,
                        },
                    },
                ],
            },
        },
    ],
}

# parsed_schema = parse_schema(schema3)
# # parsed_schema
# schema3

# Episode schema draft: start timestamp, `meta` record, and a `sequence` that
# is an ARRAY of per-step records.
# The array's `items` is the step record schema itself. A field-style
# {"name": ..., "type": ...} wrapper placed inside a list (i.e. a union) is not
# a valid Avro type, so the record is given directly -- the same shape schema8
# uses for its array of steps.
# NOTE(review): `logicalType` on `episodestart` sits on the field rather than
# inside the type schema, so it is probably handled as a plain long -- confirm.
schema4 = {
    "doc": "An episode reading.",
    "name": "episode",
    "namespace": "test",
    "type": "record",
    "fields": [
        {"name": "episodestart", "type": "long", "logicalType": "timestamp-micros"},
        {
            "name": "meta",
            "type": {
                "type": "record",
                "name": "meta_",
                "fields": [
                    {
                        "name": "episode_meta",
                        "type": {
                            "type": "record",
                            "name": "episode_meta_",
                            "fields": episode_meta_fields_schema,
                        },
                    },
                    {
                        "name": "observation_meta",
                        "type": {
                            "type": "record",
                            "name": "observation_meta_",
                            "fields": observation_meta_fields_schema,
                        },
                    },
                ],
            },
        },
        {
            "name": "sequence",
            "type": {
                "type": "array",
                "items": {
                    "type": "record",
                    "name": "step_",
                    "fields": [
                        {
                            "name": "ts",
                            "type": {
                                "type": "long",
                                "logicalType": "timestamp-micros",
                            },
                        },
                        # Drafted `observation` field, kept disabled for reference:
                        # {'name': 'observation',
                        #     'type': {
                        #         'type': 'record',
                        #         'name': 'observation_',
                        #         'fields': [
                        #             {
                        #                 'name': 'state',
                        #                 'type': {
                        #                     'type': 'record',
                        #                     'name': 'state_',
                        #                     'fields': state_fields_schema
                        #                 },
                        #             },
                        #         ]
                        #     }
                        # },
                    ],
                },
            },
        },
    ],
}
# parsed_schema = parse_schema(schema4)
# parsed_schema
# schema4
# Episode schema draft: `sequence` -> `step` record whose only field `ts` is an
# array of microsecond timestamps (one array for the whole episode).
# NOTE(review): `logicalType` on `episodestart` sits on the field rather than
# inside the type schema, so it is probably handled as a plain long -- confirm.
schema5 = dict(
    doc="An episode reading.",
    name="episode",
    namespace="test",
    type="record",
    fields=[
        dict(name="episodestart", type="long", logicalType="timestamp-micros"),
        dict(
            name="sequence",
            type=dict(
                type="record",
                name="sequence_",
                fields=[
                    dict(
                        name="step",
                        type=dict(
                            type="record",
                            name="step_",
                            # Disabled alternative: `ts` as one scalar
                            # long/timestamp-micros instead of an array.
                            fields=[
                                dict(
                                    name="ts",
                                    type=dict(
                                        type="array",
                                        items=dict(
                                            type="long",
                                            logicalType="timestamp-micros",
                                        ),
                                    ),
                                ),
                            ],
                        ),
                    ),
                ],
            ),
        ),
    ],
)

# parsed_schema

# Episode schema draft: `sequence` record holding `ts` directly as an array of
# microsecond timestamps (no intermediate `step` record).
# The array definition lives inside the field's `type`; with "items" as a
# sibling of "type": "array" on the field, the field type would be a bare
# "array" with no item schema, which is not a valid Avro type.
# NOTE(review): `logicalType` on `episodestart` sits on the field rather than
# inside the type schema, so it is probably handled as a plain long -- confirm.
schema6 = {
    "doc": "An episode reading.",
    "name": "episode",
    "namespace": "test",
    "type": "record",
    "fields": [
        {"name": "episodestart", "type": "long", "logicalType": "timestamp-micros"},
        {
            "name": "sequence",
            "type": {
                "type": "record",
                "name": "sequence_",
                "fields": [
                    {
                        "name": "ts",
                        "type": {
                            "type": "array",
                            "items": {
                                "type": "long",
                                "logicalType": "timestamp-micros",
                            },
                        },
                    }
                ],
            },
        },
    ],
}

In [None]:
# Episode schema with a single (non-array) `sequence` record: one timestamp
# plus one nested `state` record.
# NOTE(review): on `episodestart` and `ts` the `logicalType` key sits on the
# field rather than inside the type schema, so both are probably handled as
# plain longs -- confirm that is intended.
schema7 = dict(
    doc="An episode reading.",
    name="episode",
    namespace="test",
    type="record",
    fields=[
        dict(name="episodestart", type="long", logicalType="timestamp-micros"),
        dict(
            name="sequence",
            type=dict(
                type="record",
                name="sequence_",
                fields=[
                    dict(name="ts", type="long", logicalType="timestamp-micros"),
                    dict(
                        name="state",
                        type=dict(
                            type="record",
                            name="state_",
                            fields=state_fields_schema,
                        ),
                    ),
                ],
            ),
        ),
    ],
)
# Validate the draft, then display both the parsed and the raw form.
parsed_schema = parse_schema(schema7)
parsed_schema
schema7

In [None]:
# Episode schema where `sequence` is an array of step records, each carrying a
# timestamp and a nested `state` record.
# NOTE(review): on `episodestart` and `ts` the `logicalType` key sits on the
# field rather than inside the type schema, so both are probably handled as
# plain longs -- confirm that is intended.
schema8 = dict(
    doc="An episode reading.",
    name="episode",
    namespace="test",
    type="record",
    fields=[
        dict(name="episodestart", type="long", logicalType="timestamp-micros"),
        dict(
            name="sequence",
            type=dict(
                type="array",
                items=dict(
                    # "step" only names the item record within the schema
                    # namespace; it plays no part when building the array data.
                    name="step",
                    type="record",
                    fields=[
                        dict(
                            name="ts",
                            type="long",
                            logicalType="timestamp-micros",
                        ),
                        dict(
                            name="state",
                            type=dict(
                                type="record",
                                name="state_",
                                fields=state_fields_schema,
                            ),
                        ),
                    ],
                ),
            ),
        ),
    ],
)
# Validate the draft and display the raw form.
parsed_schema = parse_schema(schema8)
# parsed_schema
schema8

In [None]:
# Episode schema where `sequence` is an array of step records (timestamp plus
# nested `state` record). As written, the content matches schema8 exactly.
# NOTE(review): on `episodestart` and `ts` the `logicalType` key sits on the
# field rather than inside the type schema, so both are probably handled as
# plain longs -- confirm that is intended.
schema9 = dict(
    doc="An episode reading.",
    name="episode",
    namespace="test",
    type="record",
    fields=[
        dict(name="episodestart", type="long", logicalType="timestamp-micros"),
        dict(
            name="sequence",
            type=dict(
                type="array",
                items=dict(
                    # "step" only names the item record within the schema
                    # namespace; it plays no part when building the array data.
                    name="step",
                    type="record",
                    fields=[
                        dict(
                            name="ts",
                            type="long",
                            logicalType="timestamp-micros",
                        ),
                        dict(
                            name="state",
                            type=dict(
                                type="record",
                                name="state_",
                                fields=state_fields_schema,
                            ),
                        ),
                    ],
                ),
            ),
        ),
    ],
)
# Validate the draft and display the raw form.
parsed_schema = parse_schema(schema9)
# parsed_schema
schema9

In [None]:
# records_episodes = [
#     {u'episodestart': episodes_indices_dict[0]['episodestart'],
#      u'meta': {'episode_meta': episode_meta, 'observation_meta': observation_meta.dict()},
#      u'sequence': episodes_dict_nested[0]},
#     {u'episodestart': episodes_indices_dict[1]['episodestart'],
#      u'meta': {'episode_meta': episode_meta, 'observation_meta': observation_meta.dict()},
#      u'sequence': episodes_dict_nested[1]},
#     {u'episodestart': episodes_indices_dict[2]['episodestart'],
#      u'meta': {'episode_meta': episode_meta, 'observation_meta': observation_meta.dict()},
#      u'sequence': episodes_dict_nested[2]},
#     {u'episodestart': episodes_indices_dict[3]['episodestart'],
#      u'meta': {'episode_meta': episode_meta, 'observation_meta': observation_meta.dict()},
#      u'sequence': episodes_dict_nested[3]},
#     {u'episodestart': episodes_indices_dict[4]['episodestart'],
#      u'meta': {'episode_meta': episode_meta, 'observation_meta': observation_meta.dict()},
#      u'sequence': episodes_dict_nested[4]},
# ]


# Assemble the records for episodes 0-3: each one carries its start time, the
# episode/observation metadata and the nested step sequence.
records_episodes = [
    dict(
        episodestart=ep_meta["episodestart"],
        meta=dict(
            episode_meta=ep_meta,
            observation_meta=observation_meta.dict(),
        ),
        sequence=ep_steps,
    )
    for ep_meta, ep_steps in (
        (episodes_meta_dict[k], episodes_dict_nested[k]) for k in range(4)
    )
]

# Eyeball the second record.
records_episodes[1]

In [None]:
# Show the current working directory; the next cell writes "./episodes2.avro"
# relative to it.
os.getcwd()

In [None]:
# Writing
# Serialise `records_episodes` into a single Avro file in the current working
# directory, using `parsed_schema_episode` (defined outside this cell).
# NOTE(review): this is not the `parsed_schema` produced by the schema7-9
# cells above -- confirm `parsed_schema_episode` is the intended schema.
with open("./episodes2.avro", "wb") as out:
    writer(out, parsed_schema_episode, records_episodes)

In [None]:
# Reading
# Stream the records back from the file written above and print a short
# summary per episode: vehicle, driver, start time, first step time and the
# number of steps. Timestamps are converted with unit="us".
with open("./episodes2.avro", "rb") as fo:
    for record in reader(fo):
        # print(record)
        print(record["meta"]["episode_meta"]["vehicle"])
        print(record["meta"]["episode_meta"]["driver"])
        print(pd.to_datetime(record["episodestart"], unit="us"))
        # Each sequence entry is expected to expose a "timestamp" key.
        print(pd.to_datetime(record["sequence"][0]["timestamp"], unit="us"))
        print(len(record["sequence"]))

        # Blank separator between episodes.
        print("\n")

In [None]:
# Work inside the "bags" sub-directory for the dask.bag Avro files below.
# The guard keeps this cell re-runnable: an unconditional relative chdir would
# look for "bags/bags" on a second run, and would fail outright if the folder
# had not been created yet.
if os.path.basename(os.getcwd()) != "bags":
    os.makedirs("bags", exist_ok=True)
    os.chdir("bags")
os.getcwd()

In [None]:
import dask
import dask.bag as db
from dask import delayed

# Smoke test: a two-record bag written to Avro, one file per partition
# ("*" in the file name is replaced per partition).
b = db.from_sequence([{"name": "Alice", "value": 100}, {"name": "Bob", "value": 200}])
b.take(2, npartitions=2)
# NOTE(review): this rebinds the notebook-level name `schema`, which the
# weather example at the top of the notebook also uses.
schema = {
    "name": "People",
    "doc": "Set of people's scores",
    "type": "record",
    "fields": [{"name": "name", "type": "string"}, {"name": "value", "type": "int"}],
}
b.to_avro("my-data.*.avro", schema)

# Same round trip for the episode records: build a bag, inspect partition
# count and contents, then export with the episode schema (defined elsewhere).
b_episodes = db.from_sequence(records_episodes)
b_episodes.npartitions
data = b_episodes.take(4, npartitions=4)
len(data)
b_episodes.to_avro("bag_episodes.*.avro", schema=parsed_schema_episode)
# dd_episodes = [delayed(db.from_sequence(ep))  for ep in records_episodes]
# # b_episodes_data = b_episodes.compute()
# b_episodes = db.from_delayed(dd_episodes)
# b_episodes.take(2)
# b_episodes.to_avro('bag_episodes.*.avro', schema=parsed_schema_episode)
#
# # rec = b_episodes.take(1)
#
# b_episodes.to_avro('bag_episodes.*.avro', schema=parsed_schema_episode)
#

# rec

In [None]:
# Read the exported episode files back into a bag.
b_episodes_read = db.read_avro("bag_episodes.*.avro")
# Pull 4 records, allowing take() to draw from 4 partitions.
rec = b_episodes_read.take(4, npartitions=4)
len(rec)
# Spot-check the metadata of the first record.
rec[0]["meta"]["episode_meta"]["vehicle"]
rec[0]["meta"]["episode_meta"]["driver"]

In [None]:
# Record for episode 4, appended to the episodes already stored on disk.
records_episode_to_add = [
    {
        "episodestart": episodes_meta_dict[4]["episodestart"],
        "meta": {
            "episode_meta": episodes_meta_dict[4],
            "observation_meta": observation_meta.dict(),
        },
        "sequence": episodes_dict_nested[4],
    }
]
# records_episode_to_add

# `b_episodes_read` reads lazily from "bag_episodes.*.avro" -- the very files
# to_avro() below overwrites. Load the stored records into memory first so no
# write task can clobber a file that another task still has to read (the
# partition-to-file mapping stops lining up once the glob order differs from
# the numeric order, e.g. "10" sorting before "2").
# partition_size=1 keeps one episode per output file.
b_episodes_new = db.concat(
    [
        db.from_sequence(b_episodes_read.compute(), partition_size=1),
        db.from_sequence(records_episode_to_add),
    ]
)
b_episodes_new.npartitions
b_episodes_new.to_avro("bag_episodes.*.avro", schema=parsed_schema_episode)

In [None]:
# Re-read the episode files after the append and check the result:
# partition count, then up to 5 records drawn from 5 partitions.
b_episodes_read = db.read_avro("bag_episodes.*.avro")
b_episodes_read.npartitions
b_episodes_read.take(5, npartitions=5)