In [None]:
import copy
import pathlib

import numpy as np
import pandas as pd

from tiled.client import from_uri
from tiled.examples.xdi import read_xdi
from tiled.queries import Key

from aimmdb.schemas import ExperimentalXASMetadata

In [None]:
# now we will ingest the newville dataset
# first download and unpack the raw data
!curl -L https://github.com/XraySpectroscopy/XASDataLibrary/archive/master.tar.gz | tar xz

In [None]:
# we will enforce that XAS metadata satisfies the following schema
ExperimentalXASMetadata.schema()

In [None]:
# we can check if the metadata satisfies the schema locally by running
# ExperimentalXASMetadata.parse_obj(metadata)

In [None]:
# example of valid metadata
metadata = {
    "dataset": "example",
    "element": {"edge": "K", "symbol": "Fe"},
    "facility": {"name": "ALS"},
    "beamline": {"name": "8.0.1"},
}
ExperimentalXASMetadata.parse_obj(metadata)

In [None]:
# examples of invalid metadata

In [None]:
# missing dataset
# the schema rejects this metadata; catch the error and print it so that the
# demonstration does not abort a "Restart & Run All"
metadata = {
    "element": {"edge": "K", "symbol": "Fe"},
    "facility": {"name": "ALS"},
    "beamline": {"name": "8.0.1"},
}
try:
    ExperimentalXASMetadata.parse_obj(metadata)
except ValueError as err:  # pydantic's ValidationError subclasses ValueError
    print(err)

In [None]:
# invalid edge
# the schema rejects this metadata; catch the error and print it so that the
# demonstration does not abort a "Restart & Run All"
metadata = {
    "dataset": "example",
    "element": {"edge": "FOO", "symbol": "Fe"},
    "facility": {"name": "ALS"},
    "beamline": {"name": "8.0.1"},
}
try:
    ExperimentalXASMetadata.parse_obj(metadata)
except ValueError as err:  # pydantic's ValidationError subclasses ValueError
    print(err)

In [None]:
# invalid element symbol
# the schema rejects this metadata; catch the error and print it so that the
# demonstration does not abort a "Restart & Run All"
metadata = {
    "dataset": "example",
    "element": {"edge": "K", "symbol": "FOO"},
    "facility": {"name": "ALS"},
    "beamline": {"name": "8.0.1"},
}
try:
    ExperimentalXASMetadata.parse_obj(metadata)
except ValueError as err:  # pydantic's ValidationError subclasses ValueError
    print(err)

In [None]:
# invalid facility name
# the schema rejects this metadata; catch the error and print it so that the
# demonstration does not abort a "Restart & Run All"
metadata = {
    "dataset": "example",
    "element": {"edge": "K", "symbol": "Fe"},
    "facility": {"name": "FOO"},
    "beamline": {"name": "8.0.1"},
}
try:
    ExperimentalXASMetadata.parse_obj(metadata)
except ValueError as err:  # pydantic's ValidationError subclasses ValueError
    print(err)

In [None]:
# getting existing data to fit into this schema requires some munging...

In [None]:
def lower_case_dict(d):
    """
    Return a copy of ``d`` in which every string key is lower-cased.

    Nested dicts are processed recursively; values of any other type are
    carried over unchanged. The input dict is not mutated.

    Parameters
    ----------
    d : dict
        Dictionary whose keys should be normalised.

    Returns
    -------
    tuple of (dict, bool)
        The rebuilt dictionary and a flag that is True when at least one key,
        at any nesting depth, differs from its lower-cased form.

    Notes
    -----
    Keys that differ only by case end up on the same entry; the value that is
    iterated last wins.
    """
    out = {}
    modified = False

    for k, v in d.items():
        if isinstance(v, dict):
            v, child_modified = lower_case_dict(v)
            modified = modified or child_modified
        if isinstance(k, str):
            lowered = k.lower()
            # compare the keys themselves: str.islower() is also False for keys
            # without any cased character (e.g. "123" or ""), which lower()
            # leaves untouched and which therefore are not a modification
            if lowered != k:
                modified = True
            k = lowered
        out[k] = v

    return out, modified

def load_newville(data_path):
    """
    Load newville dataset into a dataframe parsing sample information from metadata

    Parameters
    ----------
    data_path : pathlib.Path
        Directory that is searched recursively for ``*.xdi`` files.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``name`` (file stem), ``file``
        (path as a string), ``metadata`` (dict) and ``columns`` (tuple of the
        column labels of the measurement).

    Notes
    -----
    The entries stored under the ``"fields"`` key of the parsed metadata are
    moved to the top level with lower-cased keys. ``sample.prep`` defaults to
    None, and a ``facility`` / ``beamline`` entry without a readable ``name``
    is replaced by ``{"name": None}`` (any other keys of that entry are
    dropped).
    """
    files = list(data_path.rglob("*.xdi"))
    print(f"found {len(files)} xdi files to ingest")

    data_list = []

    for f in files:
        df_xas, metadata = read_xdi(str(f))
        # lower_case_dict already lower-cases the top-level keys, so the result
        # can be merged into the metadata directly
        fields, _ = lower_case_dict(metadata.pop("fields"))
        metadata.update(fields)
        metadata["sample"].setdefault("prep", None)

        for section in ("facility", "beamline"):
            try:
                metadata[section]["name"]
            except (KeyError, TypeError):
                # section missing, not subscriptable by key, or without a name;
                # only these lookup failures are handled so that unrelated
                # errors (and KeyboardInterrupt) are no longer swallowed
                metadata[section] = {"name": None}

        data_list.append(
            {
                "name": f.stem,
                "file": str(f),
                "metadata": metadata,
                "columns": tuple(df_xas),
            }
        )

    df = pd.DataFrame(data_list)

    return df

In [None]:
# read through all the files and extract some metadata
newville = load_newville(pathlib.Path("./XASDataLibrary-master/data/"))
newville

In [None]:
# not all records have facility name
newville["metadata"].apply(lambda x : x["facility"]["name"]).unique()

In [None]:
missing_facility_name = newville["metadata"].apply(lambda x : x["facility"]["name"]).isnull()

In [None]:
# we can infer the facility name from the beamline names
newville[missing_facility_name].apply(lambda x : x["metadata"]["beamline"]["name"], axis=1).unique()

In [None]:
# this is a good example of data munging necessary to fit real data into explicit schemas
def beamline_to_facility(beamline_name):
    """
    Infer the facility name from a beamline name.

    Parameters
    ----------
    beamline_name : str or None
        Beamline name as found in the metadata.

    Returns
    -------
    str
        ``"APS"`` for ``"20BM"`` or names starting with ``"APS"``,
        ``"SSRL"`` for names starting with ``"SSRL"``.

    Raises
    ------
    KeyError
        If the facility cannot be inferred, including when ``beamline_name``
        is not a string (e.g. None for records without a beamline name).
    """
    # guard against non-string input so a missing beamline name ends up in the
    # KeyError below instead of an AttributeError on ``.startswith``
    if isinstance(beamline_name, str):
        if beamline_name == "20BM" or beamline_name.startswith("APS"):
            return "APS"
        if beamline_name.startswith("SSRL"):
            return "SSRL"
    raise KeyError(f"unable to infer facility name from beamline name {beamline_name}")

In [None]:
# fill in the missing facility information

def f(x):
    """
    Set ``x["facility"]["name"]`` to the facility inferred from
    ``x["beamline"]["name"]``; ``x`` is updated in place and returned.
    """
    beamline = x["beamline"]["name"]
    x["facility"]["name"] = beamline_to_facility(beamline)
    return x

newville.loc[missing_facility_name, "metadata"] = newville[missing_facility_name]["metadata"].map(f)

In [None]:
# now all records have a facility name
newville["metadata"].apply(lambda x : x["facility"]["name"]).unique()

In [None]:
# create columns so we can group measurements by sample
newville.loc[:, "sample.name"] = newville.apply(lambda x : x["metadata"]["sample"]["name"], axis=1)
newville.loc[:, "sample.prep"] = newville.apply(lambda x : x["metadata"]["sample"]["prep"], axis=1)

In [None]:
# print out all the samples
# NOTE groupby drops groups with a missing key by default; dropna=False keeps the
# samples whose prep is missing (None) in the listing
for (name, prep), g in newville.groupby(["sample.name", "sample.prep"], dropna=False):
    print(f"{name}: {prep}, [{len(g)}]")

In [None]:
# check that all our metadata satisfies the ExperimentalXAS schema
# this is enforced by the server but we can check locally as well

for metadata in newville["metadata"]:
  metadata = copy.deepcopy(metadata)
  metadata["dataset"] = "dataset"
  ExperimentalXASMetadata.parse_obj(metadata)

In [None]:
# now we are ready to ingest the data into the tiled server

In [None]:
# create tiled client object
# because we have installed the aimmdb package, tiled automatically constructs an AIMMCatalog instance
# this is a subclass of the generic tiled client providing a more expressive repr and methods for uploading data to the AIMM database
c = from_uri("https://aimm-staging.lbl.gov/api")
c

In [None]:
# login to gain authenticated access
c.login()

In [None]:
c["uid"]

In [None]:
### DANGER ###
# delete everything so we start fresh
# NOTE we need to explicitly iterate the node using list to gather the list of uids BEFORE we start deleting
for k in list(c["uid"]):
    del c["uid"][k]

In [None]:
# catalog should be empty now
c["uid"]

In [None]:
# the server supports writing array data using the write_array method
x = np.random.rand(100, 100)
metadata = {"dataset" : "sandbox", "foo" : "bar"}
node = c["uid"].write_array(x, metadata=metadata)
node

In [None]:
c["uid"]

In [None]:
# we can retrieve the uid associated with the object
key = node.item["id"]
key

In [None]:
node = c["uid"][key]
node

In [None]:
# we can check that we get back the same data we uploaded
np.testing.assert_array_equal(node.read(), x)

In [None]:
node.metadata

In [None]:
# it is also possible to write dataframes
df = pd.DataFrame({"a" : np.random.rand(100), "b" : np.random.rand(100)})
metadata = {"dataset" : "sandbox", "foo" : "bar"}
node = c["uid"].write_dataframe(df, metadata=metadata)
node

In [None]:
c["uid"]

In [None]:
key = node.item["id"]

In [None]:
node = c["uid"][key]
node

In [None]:
# we can check that we get back the same data we uploaded
pd.testing.assert_frame_equal(node.read(), df)

In [None]:
node.metadata

In [None]:
# when submitting data you MUST include a 'dataset' key in the metadata
# this is used to allow providing access control and enforcement of schemas
metadata = {"foo" : "bar"}
node = c["uid"].write_array(x, metadata=metadata)

In [None]:
# the server can be configured to apply custom validation to data in a particular dataset
# we have configured a 'newville' dataset which requires that the metadata conform to the ExperimentalXASMetadata schema
# submitting arbitrary data to this dataset will be rejected by the server

In [None]:
# no spec specified
df = pd.DataFrame({"a" : np.random.rand(100), "b" : np.random.rand(100)})
metadata = {"dataset" : "newville", "foo" : "bar"}
key = c["uid"].write_dataframe(df, metadata=metadata)

In [None]:
# correct spec but metadata fails serverside validation
df = pd.DataFrame({"a" : np.random.rand(100), "b" : np.random.rand(100)})
metadata = {"dataset" : "newville", "foo" : "bar"}
key = c["uid"].write_dataframe(df, metadata=metadata, specs=["XAS"])

In [None]:
# wrong structure family
x = np.random.rand(100, 100)
metadata = {"dataset" : "newville", "foo" : "bar"}
key = c["uid"].write_array(x, metadata=metadata, specs=["XAS"])

In [None]:
# with the correct metadata we can write to the server
# NOTE this doesn't prevent you from writing garbage but does help
df = pd.DataFrame({"a" : np.random.rand(100), "b" : np.random.rand(100)})
metadata = {"dataset" : "newville", "foo" : "bar", "element" : {"symbol" : "Au", "edge" : "K"}, "facility" : {"name" : "ALS"}, "beamline" : {"name" : "8.0.1"}}
node = c["uid"].write_dataframe(df, metadata=metadata, specs=["XAS"])
node

In [None]:
c["uid"]

In [None]:
c["dataset"]

In [None]:
c["dataset"]["newville"]["uid"]

In [None]:
key = node.item["id"]
key

In [None]:
# since we just submitted some garbage let's delete it
del c["uid"][key]

In [None]:
c["dataset"]["newville"]["uid"]

In [None]:
# aimmdb also provides functionality for associating metadata and measurements with samples
# use write_sample to write some metadata to the sample database and get a unique id
sample_metadata = {"name" : "NiO", "description" : "Jos's Nickle Oxide Sample"}
sample_key = c.write_sample(sample_metadata)
sample_key

In [None]:
# we can include sample_id in the metadata when submitting measurements

df = pd.DataFrame({"a" : np.random.rand(100), "b" : np.random.rand(100)})
metadata = {"dataset" : "sandbox", "element" : {"symbol" : "Ni", "edge" : "K"}, "sample_id" : sample_key}
node = c["uid"].write_dataframe(df, metadata=metadata)

df = pd.DataFrame({"a" : np.random.rand(100), "b" : np.random.rand(100)})
metadata = {"dataset" : "sandbox", "element" : {"symbol" : "Ni", "edge" : "K"}, "sample_id" : sample_key}
node = c["uid"].write_dataframe(df, metadata=metadata)

In [None]:
c["sample"]

In [None]:
# we can then retrieve all measurements associated with that sample_id
c["sample"][sample_key]["uid"]

In [None]:
node = c["sample"][sample_key]["uid"].values().first()
node

In [None]:
# the metadata from the sample database is merged into the measurement metadata
node.metadata

In [None]:
# samples can also be deleted
# NOTE this only prevents new data from being associated with this sample_key
# existing records are not deleted and retain the sample metadata merged at time of ingestion
c.delete_sample(sample_key)

In [None]:
# we are now ready to ingest the newville dataset

In [None]:
def ingest_newville(c, df, verbose=False):
    """
    Upload the newville dataset to database

    One sample record is written per distinct (``sample.name``,
    ``sample.prep``) pair, and every measurement of that pair is uploaded as a
    dataframe with the ``XAS`` spec.

    Parameters
    ----------
    c : client object
        Must provide ``write_sample(metadata)`` and
        ``c["uid"].write_dataframe(df, metadata=..., specs=...)``.
    df : pandas.DataFrame
        Needs the columns ``file``, ``metadata``, ``sample.name`` and
        ``sample.prep``. The metadata dicts stored in it are not modified.
    verbose : bool, optional
        Print one line per sample group when True.
    """
    # dropna=False: groupby drops groups with a missing key by default, which
    # would silently skip every measurement whose "sample.prep" is None
    for (name, prep), g in df.groupby(["sample.name", "sample.prep"], dropna=False):
        if pd.isna(prep):
            # the missing key may come back as NaN; store it as None
            prep = None

        if verbose:
            print(f"{name}: {prep}, {len(g)}")

        sample_id = c.write_sample({"name": name, "prep": prep})

        for _index, row in g.iterrows():
            xas_df, _ = read_xdi(row["file"])
            # build a new dict instead of mutating the one stored in ``df`` so
            # the caller's dataframe does not pick up dataset/sample_id keys
            metadata = {
                **row["metadata"],
                "dataset": "newville",
                "sample_id": sample_id,
            }
            c["uid"].write_dataframe(xas_df, metadata=metadata, specs=["XAS"])

In [None]:
print("starting ingestion...")
ingest_newville(c, newville, verbose=True)
print("finished.")

In [None]:
# now we can see the newville data on the server
c["dataset"]["newville"]["uid"]

In [None]:
c["dataset"]["newville"]["uid"].values()[:10]

In [None]:
# because we have the aimmdb package installed, tiled automatically constructs an XASClient instance when we select a single measurement
# this is a subclass of the generic tiled dataframe client providing a more expressive repr which shows the sample name and edge information for the measurement
x = c["dataset"]["newville"]["uid"].values().first()
x

In [None]:
# the measurement uid can be accessed as a property
x.uid

In [None]:
# equivalent to
x.item["id"]

In [None]:
# the measurement itself is stored in a dataframe which can be obtained using the .read() method
df = x.read()
df

In [None]:
# once the data is ingested it can be queried in a variety of ways
# below we show a non-exhaustive set of examples:

In [None]:
# the data can be grouped by sample
c["dataset"]["newville"]["sample"]

In [None]:
# get the sample key
k = c["dataset"]["newville"]["sample"].keys().first()
k

In [None]:
# get all measurements associated with the sample_key
c["dataset"]["newville"]["sample"][k]["uid"].values()[:]

In [None]:
# get all absorbing elements in dataset
c["dataset"]["newville"]["element"].keys()[:]

In [None]:
# get all absorbing elements in dataset (alternative more flexible syntax)
c["dataset"]["newville"]["uid"].distinct("element.symbol", counts=True)

In [None]:
# get all measurements for a particular absorbing element
c["dataset"]["newville"]["element"]["Au"]["uid"].values()[:]

In [None]:
# get all measurements for a particular absorbing element (alternative more flexible syntax)
c["dataset"]["newville"]["uid"].search(Key("element.symbol") == "Au").values()[:]

In [None]:
# get all edges
c["dataset"]["newville"]["edge"]

In [None]:
# alt
c["dataset"]["newville"]["uid"].distinct("element.edge", counts=True)

In [None]:
# get all K edge measurements
c["dataset"]["newville"]["edge"]["K"]["uid"]

In [None]:
# alt
c["dataset"]["newville"]["uid"].search(Key("element.edge") == "K")

In [None]:
# get all beamlines (NOTE not accessible using tree)
c["dataset"]["newville"]["uid"].distinct("beamline.name", counts=True)

In [None]:
# get data from a particular beamline
c["dataset"]["newville"]["uid"].search(Key("beamline.name") == "SSRL 4-1")