## Import Python Module to Create/Upload New Data Instances
### (Persona: Data Engineer)

Use metaflow-based data model

In [1]:
import s3fs
import os
import json
import weave
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np

In [2]:
s3 = s3fs.S3FileSystem(client_kwargs = {'endpoint_url': os.environ['S3_ENDPOINT']})

### Refresh provenance demo bucket

In [3]:
if s3.exists('provenance-demo'):
    s3.rm('s3://provenance-demo', recursive = True)

weave.upload.UploadBasket(
    upload_items=[{'path': '/home/jovyan/opal/resources/NASA_MILSTD1553_DTS.yaml', 'stub': False}],
    basket_type='NASA_MILSTD1553_DTS',
    pantry_path='provenance-demo',
    file_system=s3,
)
weave.upload.UploadBasket(
    upload_items=[{'path': '/home/jovyan/opal/resources/NASA_ARINC429_DTS.yaml', 'stub': False}],
    basket_type='NASA_ARINC429_DTS',
    pantry_path='provenance-demo',
    file_system=s3,
)
weave.upload.UploadBasket(
    upload_items=[{'path': '/home/jovyan/opal/resources/652200104150842.ch10', 'stub': False}],
    basket_type='ch10',
    pantry_path='provenance-demo',
    file_system=s3,
    metadata = {'ch10name': '652200104150842'},
).get_upload_path()

'provenance-demo/ch10/6270a53c5cc211eeaa8d0242ac120005'

# What data do I have?

### Create an index of my data store using Weave.

<img src="resources/weave to minio.drawio.png">

In [11]:
# Create a pantry
pantry = weave.Pantry(
    weave.IndexPandas,
    pantry_path="basket-data",
    file_system=s3,
)

# Ensure the index is up to date, then get the pandas df representation.
index = pantry.index
index.generate_index()
index_df = index.to_pandas_df(max_rows=5000)
index_df.head()

Unnamed: 0,uuid,upload_time,parent_uuids,basket_type,label,weave_version,address,storage_type
0,58a7843efc1a11ed8cb60242ac120009,2023-05-26 23:09:20,[],HUD_video,,<0.13.0,basket-data/HUD_video/58a7843efc1a11ed8cb60242...,S3FileSystem
1,cb33dc3afbfe11edb4a40242ac120005,2023-05-26 19:52:06,[],NASA_ARINC429_DTS,,<0.13.0,basket-data/NASA_ARINC429_DTS/cb33dc3afbfe11ed...,S3FileSystem
2,d611a2b8fbfe11edb4a40242ac120005,2023-05-26 19:52:25,[],NASA_MILSTD1553_DTS,,<0.13.0,basket-data/NASA_MILSTD1553_DTS/d611a2b8fbfe11...,S3FileSystem
3,00762bacfbf311ed9ee60242ac120005,2023-05-26 18:27:42,[],ch10,652200101201352.0,<0.13.0,basket-data/ch10/00762bacfbf311ed9ee60242ac120005,S3FileSystem
4,018e74aefbf311ed9ee60242ac120005,2023-05-26 18:27:45,[],ch10,652200101201529.0,<0.13.0,basket-data/ch10/018e74aefbf311ed9ee60242ac120005,S3FileSystem


In [14]:
print(f"Index length: {len(index)}")
index_df.basket_type.value_counts()

Index length: 2803


ch10                          560
ch10_parsed                   560
ch10_summary                  560
ch10_translated_ARINC429      560
ch10_translated_MILSTD1553    560
HUD_video                       1
NASA_ARINC429_DTS               1
NASA_MILSTD1553_DTS             1
Name: basket_type, dtype: int64

# Where did my data come from?

### Provenance tracking example using Metaflow and TIP.

In [4]:
small_index = weave.index.create_index.create_index_from_fs(root_dir='provenance-demo', file_system=s3)
small_index

Unnamed: 0,uuid,upload_time,parent_uuids,basket_type,label,weave_version,address,storage_type
0,6265de2c5cc211eeaa8d0242ac120005,2023-09-26 23:14:03.974346+00:00,[],NASA_ARINC429_DTS,,0.14.1,provenance-demo/NASA_ARINC429_DTS/6265de2c5cc2...,S3FileSystem
1,625385605cc211eeaa8d0242ac120005,2023-09-26 23:14:03.902339+00:00,[],NASA_MILSTD1553_DTS,,0.14.1,provenance-demo/NASA_MILSTD1553_DTS/625385605c...,S3FileSystem
2,6270a53c5cc211eeaa8d0242ac120005,2023-09-26 23:14:04.076037+00:00,[],ch10,,0.14.1,provenance-demo/ch10/6270a53c5cc211eeaa8d0242a...,S3FileSystem


In [6]:
!python /home/jovyan/opal/data-engineering-resources/NASA_ch10_flows/parse_nasa_ch10s_flow.py --no-pylint run --bucket_name provenance-demo --n 1

[35m[1mMetaflow 2.7.23[0m[35m[22m executing [0m[31m[1mNASAch10ParseFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:jovyan[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m2023-09-26 23:15:03.648 [0m[1mWorkflow starting (run-id 1695770103433570):[0m
[35m2023-09-26 23:15:03.663 [0m[32m[1695770103433570/start/1 (pid 1116)] [0m[1mTask is starting.[0m
[35m2023-09-26 23:15:06.782 [0m[32m[1695770103433570/start/1 (pid 1116)] [0m[1mTask finished successfully.[0m
[35m2023-09-26 23:15:06.882 [0m[32m[1695770103433570/parse_ch10s/2 (pid 1155)] [0m[1mTask is starting.[0m
[35m2023-09-26 23:15:08.670 [0m[32m[1695770103433570/parse_ch10s/2 (pid 1155)] [0m[22m1/1: provenance-demo/ch10/6270a53c5cc211eeaa8d0242ac120005[0m
[35m2023-09-26 23:15:08.686 [0m[32m[1695770103433570/parse_ch10s/2 (pid 1155)] [0m[22mhere1[0m
[35m2023-09-26 23:15:08.731 [0m[32m[1695

In [17]:
small_index = weave.index.create_index.create_index_from_fs(root_dir='provenance-demo', file_system=s3)
small_index

Unnamed: 0,uuid,upload_time,parent_uuids,basket_type,label,weave_version,address,storage_type
0,7de84b145cc011ee87c00242ac120005,2023-09-26 23:00:31.151993+00:00,[],NASA_ARINC429_DTS,,0.14.1,provenance-demo/NASA_ARINC429_DTS/7de84b145cc0...,S3FileSystem
1,7dd977745cc011ee87c00242ac120005,2023-09-26 23:00:31.054644+00:00,[],NASA_MILSTD1553_DTS,,0.14.1,provenance-demo/NASA_MILSTD1553_DTS/7dd977745c...,S3FileSystem
2,7df9dd7a5cc011ee87c00242ac120005,2023-09-26 23:00:31.281083+00:00,[],ch10,,0.14.1,provenance-demo/ch10/7df9dd7a5cc011ee87c00242a...,S3FileSystem


In [18]:
!python /home/jovyan/opal/data-engineering-resources/NASA_ch10_flows/translate_nasa_ch10s_flow.py --no-pylint run --bucket_name provenance-demo --n 1
!python /home/jovyan/opal/data-engineering-resources/NASA_ch10_flows/translate_nasa_ch10s_flow.py --no-pylint run --bucket_name provenance-demo --data_type ARINC429 --n 1

[35m[1mMetaflow 2.7.23[0m[35m[22m executing [0m[31m[1mNASAch10TranslateFlow[0m[35m[22m[0m[35m[22m for [0m[31m[1muser:jovyan[0m[35m[22m[K[0m[35m[22m[0m
[35m[22mValidating your flow...[K[0m[35m[22m[0m
[32m[1m    The graph looks good![K[0m[32m[1m[0m
[35m2023-09-26 23:02:57.272 [0m[1mWorkflow starting (run-id 1695769377093582):[0m
[35m2023-09-26 23:02:57.287 [0m[32m[1695769377093582/start/1 (pid 465)] [0m[1mTask is starting.[0m
[35m2023-09-26 23:03:00.385 [0m[32m[1695769377093582/start/1 (pid 465)] [0m[1mTask finished successfully.[0m
[35m2023-09-26 23:03:00.433 [0m[32m[1695769377093582/get_dts_file/2 (pid 510)] [0m[1mTask is starting.[0m
[35m2023-09-26 23:03:03.648 [0m[32m[1695769377093582/get_dts_file/2 (pid 510)] [0m[1mTask finished successfully.[0m
[35m2023-09-26 23:03:03.706 [0m[32m[1695769377093582/translate_parsed/3 (pid 555)] [0m[1mTask is starting.[0m
[35m2023-09-26 23:03:05.873 [0m[32m[1695769377093582/tr

In [19]:
small_index = weave.index.create_index.create_index_from_fs(root_dir='provenance-demo', file_system=s3)
small_index

Unnamed: 0,uuid,upload_time,parent_uuids,basket_type,label,weave_version,address,storage_type
0,7de84b145cc011ee87c00242ac120005,2023-09-26 23:00:31.151993+00:00,[],NASA_ARINC429_DTS,,0.14.1,provenance-demo/NASA_ARINC429_DTS/7de84b145cc0...,S3FileSystem
1,7dd977745cc011ee87c00242ac120005,2023-09-26 23:00:31.054644+00:00,[],NASA_MILSTD1553_DTS,,0.14.1,provenance-demo/NASA_MILSTD1553_DTS/7dd977745c...,S3FileSystem
2,7df9dd7a5cc011ee87c00242ac120005,2023-09-26 23:00:31.281083+00:00,[],ch10,,0.14.1,provenance-demo/ch10/7df9dd7a5cc011ee87c00242a...,S3FileSystem


In [22]:
# arinc_basket = small_index[small_index.basket_type == 'ch10_translated_ARINC429']
# arinc_path = arinc_basket.address.iloc[0]
arincx_basket = small_index[small_index["basket_type"] == "ch10_translated_ARINC429"]

print(arinc_basket)
arinc_metadata_path = os.path.join(arinc_path, 'basket_metadata.json')
with s3.open(arinc_metadata_path, 'rb') as file:
    arinc_metadata = json.load(file)
arinc_metadata

Empty DataFrame
Columns: [uuid, upload_time, parent_uuids, basket_type, label, weave_version, address, storage_type]
Index: []


NameError: name 'arinc_path' is not defined

### A look back at the complete index.

In [15]:
index_df

Unnamed: 0,uuid,upload_time,parent_uuids,basket_type,label,weave_version,address,storage_type
0,02f5c1d6093511eebd3c0242c0a82011,2023-06-12 15:22:58,[],NASA_ARINC429_DTS,,<0.13.0,basket-data/NASA_ARINC429_DTS/02f5c1d6093511ee...,S3FileSystem
1,14a09eec093511eebd3c0242c0a82011,2023-06-12 15:23:27,[],NASA_MILSTD1553_DTS,,<0.13.0,basket-data/NASA_MILSTD1553_DTS/14a09eec093511...,S3FileSystem
2,38721480093111ee9c260242c0a82011,2023-06-12 14:55:51,[],ch10,652200101100441,<0.13.0,basket-data/ch10/38721480093111ee9c260242c0a82011,S3FileSystem
3,39fe2b54093111ee9c260242c0a82011,2023-06-12 14:55:54,[],ch10,652200101121118,<0.13.0,basket-data/ch10/39fe2b54093111ee9c260242c0a82011,S3FileSystem
4,3b028fc2093111ee9c260242c0a82011,2023-06-12 14:55:54,[],ch10,652200101121218,<0.13.0,basket-data/ch10/3b028fc2093111ee9c260242c0a82011,S3FileSystem
...,...,...,...,...,...,...,...,...
497,77a2f0da093511eeba3d0242c0a82011,2023-06-12 15:26:14,"[f8d0fe8e093211ee8e980242c0a82011, 14a09eec093...",ch10_translated_MILSTD1553,652200101281241,<0.13.0,basket-data/ch10_translated_MILSTD1553/77a2f0d...,S3FileSystem
498,78c70f5a093511eeba3d0242c0a82011,2023-06-12 15:26:16,"[fc7cb67c093211ee8e980242c0a82011, 14a09eec093...",ch10_translated_MILSTD1553,652200101281527,<0.13.0,basket-data/ch10_translated_MILSTD1553/78c70f5...,S3FileSystem
499,79902b60093511eeba3d0242c0a82011,2023-06-12 15:26:17,"[fcbcb980093111ee8e980242c0a82011, 14a09eec093...",ch10_translated_MILSTD1553,652200101100441,<0.13.0,basket-data/ch10_translated_MILSTD1553/79902b6...,S3FileSystem
500,7a7024a4093511eeba3d0242c0a82011,2023-06-12 15:26:18,"[ff648906093111ee8e980242c0a82011, 14a09eec093...",ch10_translated_MILSTD1553,652200101121118,<0.13.0,basket-data/ch10_translated_MILSTD1553/7a7024a...,S3FileSystem


<img src="./resources/ch10_flow.png"  width = "600" height="5500">

In [16]:
my_translated_data = index_df[index_df.basket_type == 'ch10_translated_MILSTD1553'].sample()
my_translated_data

Unnamed: 0,uuid,upload_time,parent_uuids,basket_type,label,weave_version,address,storage_type
410,2ca4123a093511eeba3d0242c0a82011,2023-06-12 15:24:08,"[114faf56093211ee8e980242c0a82011, 14a09eec093...",ch10_translated_MILSTD1553,652200101131142,<0.13.0,basket-data/ch10_translated_MILSTD1553/2ca4123...,S3FileSystem


In [17]:
my_parents = index_df[index_df.uuid.isin(my_translated_data.parent_uuids.iloc[0])]
my_parents

Unnamed: 0,uuid,upload_time,parent_uuids,basket_type,label,weave_version,address,storage_type
1,14a09eec093511eebd3c0242c0a82011,2023-06-12 15:23:27,[],NASA_MILSTD1553_DTS,,<0.13.0,basket-data/NASA_MILSTD1553_DTS/14a09eec093511...,S3FileSystem
110,114faf56093211ee8e980242c0a82011,2023-06-12 15:01:54,[44856e98093111ee9c260242c0a82011],ch10_parsed,652200101131142.0,<0.13.0,basket-data/ch10_parsed/114faf56093211ee8e9802...,S3FileSystem


In [18]:
parsed_data = my_parents.iloc[1]
original_ch10 = index_df[index_df.uuid.isin(parsed_data.parent_uuids)]
original_ch10

Unnamed: 0,uuid,upload_time,parent_uuids,basket_type,label,weave_version,address,storage_type
12,44856e98093111ee9c260242c0a82011,2023-06-12 14:56:11,[],ch10,652200101131142,<0.13.0,basket-data/ch10/44856e98093111ee9c260242c0a82011,S3FileSystem


# How do I access my data?

### View a portion of the data contained in a ch10.

<img src="resources/weave to pandas.drawio.png">

In [19]:
my_data = index_df[index_df.label == '652200104211052']
my_data

Unnamed: 0,uuid,upload_time,parent_uuids,basket_type,label,weave_version,address,storage_type


In [20]:
path_1553 = os.path.join(my_data[my_data.basket_type == 'ch10_translated_MILSTD1553'].address.iloc[0], 'parsed_data_translated', 'NAV.parquet', '00.parquet')
df_1553 = pd.read_parquet(path_1553, filesystem = s3)
df_1553

IndexError: single positional indexer is out-of-bounds

In [21]:
# Filter out invalid lat/long/altitude measurements
where_valid = df_1553[df_1553["NAV-0110"] & df_1553["NAV-0111"]]
plt.rcParams.update({"font.size":18})

# plot
fig, ax = plt.subplots()
ax = where_valid.plot(
    kind="scatter", title="Aircraft Position (1553)",
    x="NAV-23", y="NAV-21", c="NAV-25", s=1,
    cmap="viridis", figsize=(15, 10), ax=ax)

ax.set_aspect("equal")
ax.set_xlabel("Longitude [deg]")
ax.set_ylabel("Latitude [deg]")
plt.gcf().get_axes()[1].set_ylabel("Altitude [ft]")
plt.show()

NameError: name 'df_1553' is not defined

### Clear Bucket

In [22]:
if s3.exists('provenance-demo'):
    s3.rm('s3://provenance-demo', recursive = True)