In [None]:
import s3fs
import os
import pandas as pd
import numpy as np
import dask.dataframe as dd



# S3 connection settings, read once from the environment. The same mapping is
# handed to s3fs here and passed as `storage_options` by the cells below.
options = {
    "key": os.environ["S3_KEY"],
    "secret": os.environ["S3_SECRET"],
    "token": os.environ["S3_SESSION"],
    "client_kwargs": {
        "endpoint_url": os.environ['S3_ENDPOINT']
    },
}

# Filesystem handle configured with exactly the settings in `options`.
fs = s3fs.S3FileSystem(**options)

## Accessing the catalog to get the dataset metadata

In [None]:
%%time

# Import the module under an alias so that the name `kinds` refers to only one
# thing (the loaded catalog) instead of being rebound from module to object.
from opal import kinds as kinds_module
kinds = kinds_module.load()

translated_kind = kinds.lookup("tip_translated")

# Group the translated instances by the parsed dataset they were derived from:
#   datasets[parsed_id][translated_type] -> translated instance id
datasets = {}
for t_id in translated_kind.list_instances():
    t_meta = translated_kind.read_instance_metadata(t_id)
    parsed_id = t_meta["derived_from"]
    datasets.setdefault(parsed_id, {})[t_meta["translated_type"]] = t_id

## Creating list of S3 paths to pass to the distributed cluster

In [None]:
def path_to(translated_kind, instance_id, dataset):
    """Return the S3 URI of one dataset file inside a translated instance.

    Args:
        translated_kind: catalog kind object providing
            ``read_instance_metadata`` and ``get_instance_path``.
        instance_id: id of the translated instance to look up.
        dataset: file name of the dataset inside the instance folder
            (e.g. ``"NAV.parquet"``).

    Returns:
        A string of the form
        ``s3://data/<instance path>/<folder_name>/<dataset>``, where
        ``folder_name`` comes from the instance metadata.
    """
    instance_meta = translated_kind.read_instance_metadata(instance_id)
    instance_path = translated_kind.get_instance_path(instance_id)
    folder_name = instance_meta['folder_name']
    return f"s3://data/{instance_path}/{folder_name}/{dataset}"

# For every parsed dataset, pair the S3 path of its 1553 NAV table with the S3
# path of its ARINC429 N1 RPM table. A dataset missing either translated type
# raises KeyError here.
path_tuples = [
    (
        path_to(translated_kind, ds['1553'], 'NAV.parquet'),
        path_to(translated_kind, ds['arinc429'], 'Engine_Fan_RPM_N1_ACTUAL_40.parquet'),
    )
    for ds in datasets.values()
]

## Defining the function the distributed cluster will run

In [None]:


# Column names used by the analysis below and by the plotting cells.
altitude = "NAV-25"          # altitude column read from the 1553 NAV table
altitude_valid = "NAV-0111"  # flag column used to keep only valid altitude rows
rpm = "N1_RPM_ACTUAL"        # RPM column read from the ARINC429 table


def get_rpm_at_cruise_altitude(t_1553, t_429, options):
    """Compute mean altitude and engine RPM for each cruise segment of a flight.

    Args:
        t_1553: path of the parquet file holding the 1553 NAV table
            (must contain "time", the altitude column and the validity flag).
        t_429: path of the parquet file holding the ARINC429 table
            (must contain "time" and the RPM column).
        options: storage options forwarded to ``dd.read_parquet``.

    Returns:
        A DataFrame indexed by cruise-segment number ("group") with one row
        per segment and two columns: mean altitude and mean RPM.
    """
    # get the 1553 and ARINC429 data, fully loaded as pandas DataFrames
    df_1553 = dd.read_parquet(t_1553, storage_options=options).compute()
    df_429 = dd.read_parquet(t_429, storage_options=options).compute()

    # filter 1553 by the 'altitude valid' bit
    df_1553 = df_1553[df_1553[altitude_valid]]

    # join 1553 and ARINC429 data by time: interleave the rows of both tables,
    # carry the last seen RPM forward onto the altitude rows, then drop every
    # row that still lacks an altitude or an RPM value
    joined = pd.concat([df_1553[["time", altitude]], df_429[["time", rpm]]]).sort_values("time")
    joined[rpm] = joined[rpm].ffill()
    joined = joined.dropna().copy()

    # calculate change in altitude over time (time is in nanoseconds)
    joined.loc[:, "diff_altitude"] = joined[altitude].diff() / (joined["time"].diff() / 10**9)
    # smooth out the altitude derivative with a rolling mean
    # NOTE(review): the window of int(1 / 0.04) samples presumably corresponds
    # to one second of data at a 0.04 s sample period - confirm
    joined.loc[:, "diff_altitude"] = joined["diff_altitude"].rolling(int(1 / 0.04)).mean()

    # select only where difference in altitude is small enough
    # and the aircraft is sufficiently above what we think is ground level
    # (ground level is estimated from the first/last 10 remaining samples)
    joined = joined[joined["diff_altitude"].abs() < 0.25]
    start_altitude = joined[altitude].iloc[:10].mean()
    end_altitude = joined[altitude].iloc[-10:].mean()
    min_altitude = max(start_altitude, end_altitude)
    joined = joined[joined[altitude] > min_altitude * 2]
    joined = joined.dropna()

    # group periods of time when it was cruising, take the average;
    # a gap of more than 10 seconds between samples starts a new group
    joined['group'] = (joined['time'].diff() > 10 * 10**9).cumsum()
    grouped = joined.groupby('group').mean()
    return grouped[[altitude, rpm]]

## Creating the distributed Coiled cluster 

In [None]:
%%time
# Create a Coiled cluster that uses our conda environment
import os

import coiled
from dask.distributed import Client

# Forward this session's S3 credentials and endpoint to the workers so they
# see the same environment variables as the notebook kernel.
worker_env = {
    var_name: os.environ[var_name]
    for var_name in ("S3_KEY", "S3_SECRET", "S3_SESSION", "S3_ENDPOINT")
}

cluster = coiled.Cluster(
    name="opal-dask",
    software="usaf/demo",
    n_workers=40,
    account='usaf',
    environ=worker_env,
)

# Connect a Dask client to the cluster and show where to monitor it
client = Client(cluster)
print('Dashboard:', client.dashboard_link)

## Running the pre-defined function on the cluster

In [None]:
%%time
from dask.distributed import Client, progress
import dask.bag
import dask.dataframe


def _rpm_for_flight(paths):
    """Run the cruise analysis for one (1553 path, ARINC429 path) pair."""
    path_1553, path_429 = paths
    return get_rpm_at_cruise_altitude(path_1553, path_429, options)


# One bag element per flight; each element is processed independently
ds_bag = dask.bag.from_sequence(path_tuples).map(_rpm_for_flight)

# Submit to the cluster, show a text progress bar, then gather the per-flight
# frames into a single DataFrame
future = client.compute(ds_bag)
progress(future, notebook=False)
df = pd.concat(future.result())

## Plotting the results with matplotlib

In [None]:
# Static scatter of mean cruise altitude against mean engine RPM, one point
# per cruise segment across all flights
df.plot.scatter(
    x=altitude,
    y=rpm,
    xlabel="Altitude (ft)",
    ylabel="Engine RPM",
    title="Engine RPM over altitude when cruising (all flights)",
    figsize=(15, 10),
)

## Interactive scatter plot with Bokeh

In [None]:
from bokeh.plotting import figure, output_file, show
from bokeh.resources import INLINE
import bokeh.io
# NOTE(review): wildcard import kept for compatibility; nothing in this cell
# uses a name it provides - consider removing it after checking later cells.
from bokeh import *
bokeh.io.output_notebook(INLINE)

# output to static HTML file
# NOTE(review): with notebook output also enabled above, show() presumably
# both renders inline and writes line.html - confirm this is intended.
output_file("line.html")

p = figure(title="Trends of Engine RPM with Altitude", width=1000, height=400)
p.xaxis.axis_label = "Altitude"
p.yaxis.axis_label = "Engine RPM"
# add a circle-marker scatter renderer with a size, color, and alpha;
# columns are referenced through the shared `altitude` / `rpm` constants so
# this plot stays in sync with the analysis and the matplotlib plot
p.scatter(df[altitude], df[rpm], size=3, color="navy", alpha=0.2)
# show the results
show(p)