In [1]:
import itertools
import json
import os
import pathlib
import re
from pprint import pprint

import numpy as np
import pandas as pd
import tiled
from tiled.client import from_uri
from tiled.examples.xdi import read_xdi
from tqdm import tqdm

In [2]:
import aimmdb
from aimm_adapters.heald_labview import (
    mangle_dup_names,
    normalize_dataframe,
    parse_heald_labview,
)

In [3]:
def lower_case_dict(d):
    """
    Return a copy of ``d`` whose string keys are lower-cased, recursing into dict values.

    Parameters
    ----------
    d : dict
        Mapping to normalize. It is not mutated; nested dicts are rebuilt as well.
        Non-string keys are kept as they are.

    Returns
    -------
    (dict, bool)
        The normalized dict and a flag that is True only when at least one key,
        at any nesting level, was actually changed.
    """
    out = {}
    modified = False

    for k, v in d.items():
        if isinstance(v, dict):
            v, nested_modified = lower_case_dict(v)
            modified = modified or nested_modified

        # Compare with the lowered key instead of relying on ``str.islower``:
        # ``islower`` is False for keys without cased characters ("1", "_"),
        # which used to flag a modification although the key stayed the same.
        new_key = k.lower() if isinstance(k, str) else k
        if new_key != k:
            modified = True
        out[new_key] = v

    return out, modified

In [5]:
# Connect to the AIMM API endpoint and log in (the login prompts for an access code).
c = from_uri("https://aimm.lbl.gov/api")
c.login()

# Alternative endpoint for a server running on localhost:
# c = from_uri("http://localhost:8000/api")


Navigate web browser to this address to obtain access code:

https://orcid.org/oauth/authorize?response_type=code&scope=openid&client_id=APP-0ROS9DU5F717F7XN&redirect_uri=https://aimm.lbl.gov/api/auth/provider/orcid/login




Access code (quotes optional):  ········


You have logged in with ORCID as 0000-0003-3670-0431


In [6]:
# Display the catalog node stored under the "uid" key.
c["uid"]

<AIMMCatalog {'95NqB8p5xEd', 'ZF5xybczh8f', 'Hi4mWQe24iK', ...} ~5009 entries>

In [7]:
# Root directory of the raw data sets. Set the AIMM_DATA_ROOT environment
# variable to run the notebook against a copy stored elsewhere; the original
# local path remains the default so existing setups keep working.
data_root = pathlib.Path(
    os.environ.get(
        "AIMM_DATA_ROOT",
        "/run/media/joseph/seagate/jkleinhenz/projects/aimm/data",
    )
)

In [None]:
def load_newville(data_path):
    """
    Build a table describing every ``*.xdi`` file found recursively under ``data_path``.

    For each file the metadata returned by ``read_xdi`` is flattened: its
    "fields" entry is removed, passed through ``lower_case_dict`` and merged
    back into the top level. The sample's "name" and "prep" are popped out of
    ``metadata["sample"]`` into their own columns.

    Parameters
    ----------
    data_path : pathlib.Path
        Directory to search.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns "name" (file stem), "file",
        "sample.name", "sample.prep" (None when absent), "facility.name",
        "beamline.name" (``pd.NA`` when absent) and "metadata".

    Raises
    ------
    KeyError
        If a file has no "fields" entry, no "sample" entry or no sample name.
    """
    files = list(data_path.rglob("*.xdi"))
    print(f"found {len(files)} xdi files to ingest")

    def _nested_name(metadata, section):
        # Only "entry missing" / "entry is not a mapping" mean "no name here";
        # anything else (e.g. KeyboardInterrupt) must not be swallowed.
        try:
            return metadata[section]["name"]
        except (LookupError, TypeError):
            return pd.NA

    data_list = []

    for f in files:
        _, metadata = read_xdi(str(f))
        fields = metadata.pop("fields")
        fields, _ = lower_case_dict(fields)
        metadata.update(fields)
        sample = metadata["sample"]
        # NOTE: pop() removes name/prep from the nested sample dict that is
        # also stored in the "metadata" column.
        name = sample.pop("name")
        prep = sample.pop("prep", None)

        # FIXME extract formula if present
        data_list.append(
            {
                "name": f.stem,
                "file": str(f),
                "sample.name": name,
                "sample.prep": prep,
                "facility.name": _nested_name(metadata, "facility"),
                "beamline.name": _nested_name(metadata, "beamline"),
                "metadata": metadata,
            }
        )

    df = pd.DataFrame(data_list)

    return df

In [None]:
# Build the per-file table for the newville data and display it.
df = load_newville(data_root / "newville" / "data")
df

In [None]:
# Distinct values of the "facility.name" column.
df["facility.name"].unique()

In [None]:
# Rows that have no facility name. Missing values must be selected with
# isna(): an equality comparison against pd.NA never matches any row.
df[df["facility.name"].isna()]

In [None]:
# Distinct beamline names among the rows whose facility name is missing.
df[df["facility.name"].isnull()]["beamline.name"].unique()

In [None]:
# Facility name of every row as a plain Python list.
list(df["facility.name"])

In [None]:
# Inspect the metadata stored for the row at position 100.
df.iloc[100].metadata

In [None]:
def ingest_newville(c, df, verbose=False):
    """
    Upload the newville dataset to database

    One sample is written per distinct ("sample.name", "sample.prep") pair via
    ``c.write_sample``; every row of that group is then re-read with
    ``read_xdi`` and written through ``c["uid"].write_dataframe`` with the
    "ExperimentalXAS" spec.

    Parameters
    ----------
    c : client object providing ``write_sample`` and ``c["uid"]``
    df : pandas.DataFrame
        Table with the columns "sample.name", "sample.prep", "file" and
        "metadata" (as produced by ``load_newville``).
    verbose : bool
        When True, print one line per sample group and the metadata of every
        uploaded file.
    """

    c_uid = c["uid"]
    # dropna=False: "sample.prep" is None for files without a prep entry and
    # groupby would otherwise silently drop those rows from the upload.
    for (name, prep), g in df.groupby(["sample.name", "sample.prep"], dropna=False):
        # The missing-prep group key comes back as NaN; store it as None again.
        if pd.isna(prep):
            prep = None

        if verbose:
            print(f"{name}: {prep}, {len(g)}")

        sample_id = c.write_sample({"name": name, "prep": prep})

        for _, row in g.iterrows():
            xas_df, _ = read_xdi(row["file"])

            metadata = {"dataset": "newville", "sample_id": sample_id, **row["metadata"]}
            if verbose:
                print(metadata)
            c_uid.write_dataframe(xas_df, metadata, specs=["ExperimentalXAS"])

In [None]:
# Run the newville upload with verbose output, bracketed by start/finish messages.
print("starting ingestion...")
ingest_newville(c, df, verbose=True)
print("finished")

In [8]:
# Directory holding the NCM data; stop here if it does not exist.
data_path = data_root / "NCM"
assert data_path.exists()

In [9]:
def ingest_aimm_ncm_samples(c, data_path):
    """
    Write every sample listed in ``data_path / "ncm_samples.json"``.

    Each entry of the JSON list is passed unchanged to ``c.write_sample``.

    Returns
    -------
    dict
        Maps each entry's "name" to the value returned by ``c.write_sample``.
    """
    samples_file = data_path / "ncm_samples.json"
    with open(samples_file, "r") as handle:
        sample_records = json.load(handle)

    return {record["name"]: c.write_sample(record) for record in sample_records}

In [10]:
# Write the NCM samples and keep the mapping from sample name to the returned value.
sample_ids = ingest_aimm_ncm_samples(c, data_path)

In [11]:
# Display the sample name mapping built above.
sample_ids

{'BM_NCM622': 'FVHEqkxTqz8',
 'BM_NCM712': 'SNH7Dg7PR9h',
 'BM_NCMA': 'f6pVatZS3D9'}

In [12]:
def get_aimm_ncm_params():
    """
    Return the fixed list of charge descriptions, in order.

    Each item is a dict with the keys "cycle", "voltage" and "state".
    """
    columns = ("cycle", "voltage", "state")
    # one row per entry: cycle, voltage, charge state
    table = (
        (0, 0, "DC"),
        (1, 4.3, "C"),
        (1, 4.8, "C"),
        (1, 3.0, "DC"),
        (2, 4.3, "C"),
        (2, 4.8, "C"),
        (10, 4.8, "C"),
        (10, 3.0, "DC"),
    )
    return [{key: value for key, value in zip(columns, row)} for row in table]

In [13]:
def ingest_aimm_ncm_chenjun(c, sample_ids, data_path):
    """
    Upload the numbered ``NCMBM24<atom>.NNNN`` scans for Ni, Mn and Co.

    For each atom, the first whitespace-separated token of
    ``NCMBM24<atom>.last`` gives the number of scans N. Scans 1..N are paired,
    in order, with the repeating product of ``get_aimm_ncm_params()`` and
    ``sample_ids.items()`` (charge description varies slowest). Each scan is
    parsed with ``parse_heald_labview``, normalized with
    ``normalize_dataframe(standardize=True)``, extended with the columns
    "mutrans" = log(i0 / itrans) and "murefer" = log(i0 / irefer), and written
    through ``c["uid"].write_dataframe`` with the specs "XAS_trans" and
    "HasBatteryChargeData".

    Parameters
    ----------
    c : client object providing ``c["uid"]``
    sample_ids : dict
        Sample name -> sample id, iterated in insertion order.
    data_path : pathlib.Path
        Directory containing the ``.last`` and scan files.
    """
    aimm_ncm_params = get_aimm_ncm_params()

    c_uid = c["uid"]

    for atom in ["Ni", "Mn", "Co"]:
        # Close the counter file before the scan loop; it used to stay open and
        # its handle name was shadowed by the per-scan handle.
        with open(data_path / f"NCMBM24{atom}.last", "r") as last_file:
            N = int(last_file.read().split()[0])
        print(f"{atom}: {N=}")

        assignments = itertools.cycle(
            itertools.product(aimm_ncm_params, sample_ids.items())
        )
        for i, (charge, sample) in zip(range(1, N + 1), assignments):
            path = data_path / f"NCMBM24{atom}.{i:04d}"
            fname = path.name
            print(fname, charge, sample)

            with open(path) as scan_file:
                # NOTE(review): the metadata parsed from the file and the
                # column translation are not uploaded; only the fixed metadata
                # below is written (same as before) - confirm this is intended.
                df, _parsed_metadata = parse_heald_labview(scan_file)
                df, _translation = normalize_dataframe(df, standardize=True)

            df["mutrans"] = np.log(df["i0"] / df["itrans"])
            df["murefer"] = np.log(df["i0"] / df["irefer"])

            # sample is a (name, id) pair from sample_ids.items()
            sample_id = sample[1]

            metadata = {
                "dataset": "nmc",
                "fname": fname,
                "facility": {"name": "APS"},
                "beamline": {"name": "20 BM"},
                "element": {"symbol": atom, "edge": "K"},
                "sample_id": sample_id,
                "charge": charge,
            }

            c_uid.write_dataframe(
                df, metadata, specs=["XAS_trans", "HasBatteryChargeData"]
            )

In [14]:
# Upload the scans stored in the "chenjun" subdirectory of the NCM data.
ingest_aimm_ncm_chenjun(c, sample_ids, data_path / "chenjun")

Ni: N=48
NCMBM24Ni.0001 {'cycle': 0, 'voltage': 0, 'state': 'DC'} ('BM_NCM622', 'FVHEqkxTqz8')
NCMBM24Ni.0002 {'cycle': 0, 'voltage': 0, 'state': 'DC'} ('BM_NCM712', 'SNH7Dg7PR9h')
NCMBM24Ni.0003 {'cycle': 0, 'voltage': 0, 'state': 'DC'} ('BM_NCMA', 'f6pVatZS3D9')
NCMBM24Ni.0004 {'cycle': 1, 'voltage': 4.3, 'state': 'C'} ('BM_NCM622', 'FVHEqkxTqz8')
NCMBM24Ni.0005 {'cycle': 1, 'voltage': 4.3, 'state': 'C'} ('BM_NCM712', 'SNH7Dg7PR9h')
NCMBM24Ni.0006 {'cycle': 1, 'voltage': 4.3, 'state': 'C'} ('BM_NCMA', 'f6pVatZS3D9')
NCMBM24Ni.0007 {'cycle': 1, 'voltage': 4.8, 'state': 'C'} ('BM_NCM622', 'FVHEqkxTqz8')
NCMBM24Ni.0008 {'cycle': 1, 'voltage': 4.8, 'state': 'C'} ('BM_NCM712', 'SNH7Dg7PR9h')
NCMBM24Ni.0009 {'cycle': 1, 'voltage': 4.8, 'state': 'C'} ('BM_NCMA', 'f6pVatZS3D9')
NCMBM24Ni.0010 {'cycle': 1, 'voltage': 3.0, 'state': 'DC'} ('BM_NCM622', 'FVHEqkxTqz8')
NCMBM24Ni.0011 {'cycle': 1, 'voltage': 3.0, 'state': 'DC'} ('BM_NCM712', 'SNH7Dg7PR9h')
NCMBM24Ni.0012 {'cycle': 1, 'voltage': 3.

In [15]:
def read_header(f):
    """
    Consume lines of ``f`` up to and including the one starting with "Time (s)".

    Parameters
    ----------
    f : iterable of str (e.g. an open text file)
        Left positioned just after the header line, so the caller can keep
        reading the data rows from it.

    Returns
    -------
    list of str
        The tab-separated column names of the header line, without the line
        terminator (previously the last name kept its trailing newline).

    Raises
    ------
    ValueError
        If no line starts with "Time (s)" (previously None was returned
        implicitly).
    """
    for line in f:
        if line.startswith("Time (s)"):
            return line.rstrip("\r\n").split("\t")

    raise ValueError('no header line starting with "Time (s)" found')


def read_wanli(f):
    """
    Read one tab-separated scan from the open text file ``f``.

    The column names come from ``read_header`` (passed through
    ``mangle_dup_names``); the remaining lines are parsed as data. Only the
    translated columns are kept, in the order energy, i0, tey, tfy, i0_alt,
    followed by the ratios mu_tfy = tfy / i0 and mu_tey = tey / i0.

    Returns
    -------
    pandas.DataFrame
    """
    column_names = mangle_dup_names(read_header(f))
    raw = pd.read_csv(f, sep="\t", names=column_names)

    # instrument column name -> name used in the output
    translation = {
        "Mono Energy": "energy",
        "Counter 3": "i0",
        "Counter 1": "tey",
        "Counter 2": "tfy",
        "Counter 0": "i0_alt",
    }
    kept_columns = list(translation.values())
    selected = raw.rename(columns=translation)[kept_columns]

    return selected.assign(
        mu_tfy=selected["tfy"] / selected["i0"],
        mu_tey=selected["tey"] / selected["i0"],
    )


# NOTE this hardcodes BM prefix
def parse_filename(name):
    """
    Derive the sample name and charge description from a scan file name.

    The sample is chosen by the first of the markers "622", "NCMA", "712"
    contained in ``name``. A name containing "Pristine" gets the charge
    (0, 0.0, "DC"); otherwise the cycle comes from "1st"/"2nd"/"10th" and the
    voltage from the first "<digits>V" token ("43V", "48V" or "3V").

    Parameters
    ----------
    name : str

    Returns
    -------
    (str, dict)
        Sample name and a dict with the keys "cycle", "voltage", "state".

    Raises
    ------
    KeyError
        If the sample, cycle or voltage cannot be determined. A missing
        voltage token now also raises KeyError (it used to surface as a
        TypeError that the ingest loop does not catch).
    """
    sample_markers = (("622", "BM_NCM622"), ("NCMA", "BM_NCMA"), ("712", "BM_NCM712"))
    for marker, sample in sample_markers:
        if marker in name:
            break
    else:
        raise KeyError(f"unable to parse sample from {name}")

    keys = ["cycle", "voltage", "state"]

    if "Pristine" in name:
        return sample, dict(zip(keys, (0, 0.0, "DC")))

    for marker, cycle in (("1st", 1), ("2nd", 2), ("10th", 10)):
        if marker in name:
            break
    else:
        raise KeyError(f"unable to parse cycle from {name}")

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    match = re.search(r"(\d*)V", name)
    if match is None:
        raise KeyError(f"unable to parse voltage from {name}")
    voltage_str = match[0]

    # voltage token -> (voltage, state)
    voltages = {"43V": (4.3, "C"), "48V": (4.8, "C"), "3V": (3.0, "DC")}
    if voltage_str not in voltages:
        raise KeyError(f"unable to parse voltage from {voltage_str}")
    voltage, state = voltages[voltage_str]

    return sample, dict(zip(keys, (cycle, voltage, state)))


def ingest_aimm_ncm_wanli(c, sample_ids, data_path):
    """
    Upload every ``*.txt`` scan directly inside ``data_path``.

    The sample and charge description are taken from the file name with
    ``parse_filename``; files for which that raises KeyError are reported and
    skipped. Each remaining file is read with ``read_wanli`` and written via
    ``c["uid"].write_dataframe`` with the specs "XAS_TEY", "XAS_TFY" and
    "HasBatteryChargeData".

    Parameters
    ----------
    c : client object providing ``c["uid"]``
    sample_ids : dict
        Sample name -> sample id.
    data_path : pathlib.Path
    """
    catalog = c["uid"]

    for txt_path in list(data_path.glob("*.txt")):
        fname = txt_path.name
        print(fname)

        try:
            sample_name, charge = parse_filename(fname)
        except KeyError:
            print(f"failed to extract sample from {fname}")
            continue

        sample_id = sample_ids[sample_name]

        with open(txt_path, "r") as handle:
            spectrum = read_wanli(handle)

        catalog.write_dataframe(
            spectrum,
            {
                "dataset": "nmc",
                "fname": fname,
                "charge": charge,
                "facility": {"name": "ALS"},
                "beamline": {"name": "8.0.1"},
                "element": {"symbol": "Ni", "edge": "L3"},
                "sample_id": sample_id,
            },
            specs=["XAS_TEY", "XAS_TFY", "HasBatteryChargeData"],
        )

In [16]:
# Upload the Ni L3 scans from the "wanli" subdirectory of the NCM data.
ingest_aimm_ncm_wanli(
    c, sample_ids, data_path / "wanli" / "Unimodal NCM622_712Al-doped_NCMA_Ni L3"
)

622_Pristine.txt
712_Al-doped_Pristine.txt
NCMA_Pristine.txt
1st_43V_NCMA_1.txt
1st_43V_NCMA_2.txt
1st_48V_622_1.txt
1st_48V_622_2.txt
1st_48V_NCMA.txt
Ni metal.txt
failed to extract sample from Ni metal.txt
1st_43V_622.txt
1st_3V_622.txt
1st_3V_NCMA.txt
2nd_43V_622.txt
2nd_43V_NCMA.txt
2nd_48V_622.txt
2nd_48V_NCMA.txt
10th_3V_622.txt
10th_3V_NCMA.txt
10th_48V_622.txt
10th_48V_712_Al-doped.txt
10th_48V_NCMA.txt
1st_3V_712_Al-doped.txt
1st_43V_712_Al-doped.txt
1st_48V_712_Al-doped.txt
2nd_43V_712_Al-doped.txt
2nd_48V_712_Al-doped.txt
10th_3V_712_Al-doped.txt


In [None]:
# Read back the last entry listed under the "nmc" dataset.
c["dataset"]["nmc"]["uid"].values().last().read()

In [None]:
def ingest_aimm_core_wanli_oxygen_k(c, data_path):
    """
    Upload every ``*.txt`` file found in ``data_path / "O_K"``.

    Each file is read as a headerless, tab-separated table with the columns
    "energy" and "mu". A sample named after the file stem is written with
    ``c.write_sample`` and the table is written with ``c["uid"].write_xas``.

    Parameters
    ----------
    c : client object providing ``write_sample`` and ``c["uid"]``
    data_path : pathlib.Path
    """
    catalog = c["uid"]

    for txt_path in list((data_path / "O_K").glob("*.txt")):
        sample_name = txt_path.stem
        print(sample_name)

        sample_id = c.write_sample({"name": sample_name})

        spectrum = pd.read_csv(
            txt_path, header=None, delimiter="\t", names=["energy", "mu"]
        )

        catalog.write_xas(
            spectrum,
            {
                "dataset": "aimm_core",
                "fname": txt_path.name,
                "facility": {"name": "ALS"},
                "beamline": {"name": "8.0.1"},
                "element": {"symbol": "O", "edge": "K"},
                "sample_id": sample_id,
            },
        )

In [None]:
# Upload the O K-edge files located under wanli/core/O_K.
ingest_aimm_core_wanli_oxygen_k(c, data_root / "wanli" / "core")

In [None]:
def read_csv_helper(file):
    """
    Read a two-column, tab-separated spectrum into a frame with "energy" and "mu".

    The first line decides whether the file has a header: two
    whitespace-separated tokens mean it is already data, a single token means
    it is a header line that is replaced by the fixed column names.

    Parameters
    ----------
    file : path-like

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the first line has neither one nor two tokens, or if the parsed
        table contains missing values. (These used to be bare ``assert``
        statements, which are removed when Python runs with ``-O``.)
    """
    # check if there is a header
    with open(file, "r") as f:
        head = f.readline()

    n_tokens = len(head.split())
    if n_tokens == 2:
        header = None
    elif n_tokens == 1:
        # header=0 together with names= replaces the existing header row
        header = 0
    else:
        raise ValueError(
            f"cannot tell whether {file} has a header; first line is {head!r}"
        )

    df = pd.read_csv(file, header=header, delimiter="\t", names=["energy", "mu"])
    if df.isnull().values.any():
        raise ValueError(f"{file} contains missing values")

    return df


def ingest_aimm_core_wanli_tm_l(c, data_path):
    """
    Upload the ``*.txt`` files found in the subdirectories of ``data_path / "TM_L"``.

    Plain files directly inside "TM_L" are skipped, as are files whose stem
    starts with "IgorPlot". The subdirectory name is used as the element
    symbol. Each file is read with ``read_csv_helper``; a sample named after
    the file stem is written with ``c.write_sample`` and the table with
    ``c["uid"].write_xas``.

    Parameters
    ----------
    c : client object providing ``write_sample`` and ``c["uid"]``
    data_path : pathlib.Path
    """
    catalog = c["uid"]

    for element_dir in (data_path / "TM_L").iterdir():
        if element_dir.is_file():
            continue

        symbol = element_dir.stem

        for txt_path in element_dir.glob("*.txt"):
            sample_name = txt_path.stem
            if sample_name.startswith("IgorPlot"):
                continue

            print(sample_name)

            sample_id = c.write_sample({"name": sample_name})
            spectrum = read_csv_helper(txt_path)

            catalog.write_xas(
                spectrum,
                {
                    "dataset": "aimm_core",
                    "fname": txt_path.name,
                    "facility": {"name": "ALS"},
                    "beamline": {"name": "8.0.1"},
                    "element": {"symbol": symbol, "edge": "L"},
                    "sample_id": sample_id,
                },
            )

In [None]:
# Upload the files located under the subdirectories of wanli/core/TM_L.
ingest_aimm_core_wanli_tm_l(c, data_root / "wanli" / "core")

In [None]:
def ingest_aimm_core_ni_metal(c, data_path):
    """
    Upload the single file
    ``data_path / "Unimodal NCM622_712Al-doped_NCMA_Ni L3" / "Ni metal.txt"``.

    The file must exist (checked with an assertion). It is read with
    ``read_wanli``; a sample named "Ni metal" is written with
    ``c.write_sample`` and the table with ``c["uid"].write_xas``.

    Parameters
    ----------
    c : client object providing ``write_sample`` and ``c["uid"]``
    data_path : pathlib.Path
    """
    ni_path = data_path.joinpath(
        "Unimodal NCM622_712Al-doped_NCMA_Ni L3", "Ni metal.txt"
    )
    assert ni_path.exists()

    print(ni_path.stem)

    sample_id = c.write_sample({"name": "Ni metal"})

    with open(ni_path, "r") as handle:
        spectrum = read_wanli(handle)

    c["uid"].write_xas(
        spectrum,
        {
            "dataset": "aimm_core",
            "fname": ni_path.name,
            "facility": {"name": "ALS"},
            "beamline": {"name": "8.0.1"},
            "element": {"symbol": "Ni", "edge": "L3"},
            "sample_id": sample_id,
        },
    )

In [None]:
# Upload the "Ni metal.txt" reference file stored with the NCM wanli data.
ingest_aimm_core_ni_metal(c, data_root / "NCM" / "wanli")

In [None]:
def ingest_iss_spectra(c, data_path):
    """
    Upload every ``<uid>.json`` / ``<uid>.csv`` pair found in ``data_path``.

    The JSON file provides the metadata, the CSV file the table; its columns
    "Energy" and "mu_norm" are renamed to "energy" and "mu". Tables with fewer
    than 5 rows are skipped. "Sample_name" and "compound" are moved out of the
    metadata into the sample written with ``c.write_sample``; "Element" and
    "Edge" are moved into ``metadata["element"]``. The remaining metadata keys
    are uploaded unchanged together with the fixed dataset, beamline and
    facility entries via ``c["uid"].write_xas``.

    Parameters
    ----------
    c : client object providing ``write_sample`` and ``c["uid"]``
    data_path : pathlib.Path
        Must exist (checked with an assertion).
    """
    catalog = c["uid"]

    assert data_path.exists()

    for json_path in data_path.glob("*.json"):
        with open(json_path) as handle:
            metadata = json.load(handle)

        with open(json_path.with_suffix(".csv")) as handle:
            spectrum = pd.read_csv(handle)

        spectrum = spectrum.rename(columns={"Energy": "energy", "mu_norm": "mu"})

        # too short to be a usable spectrum
        if len(spectrum) < 5:
            continue

        sample = {
            "name": metadata.pop("Sample_name"),
            "compound": metadata.pop("compound"),
        }
        print(sample["name"])

        sample_id = c.write_sample(sample)

        symbol = metadata.pop("Element")
        edge = metadata.pop("Edge")

        metadata.update(
            {
                "dataset": "iss",
                "beamline": {"name": "iss"},
                "facility": {"name": "NSLS-II"},
                "element": {"symbol": symbol, "edge": edge},
                "sample_id": sample_id,
            }
        )

        catalog.write_xas(spectrum, metadata)

In [None]:
# Upload the json/csv pairs stored in the "iss-spectra" directory.
ingest_iss_spectra(c, data_root / "iss-spectra")

In [None]:
def ingest_aimm_ncm_eli(c, data_path, sample_ids):
    """
    Upload every ``*.json`` spectrum found directly inside ``data_path``.

    Each file holds a 4-item JSON list ``[metadata, sample, proc, data]``; only
    the first and last items are used. The table is built from
    ``data["Energy"]`` and ``data["mu_flat"]``. The sample, charge cycle,
    voltage and element are parsed from the file stem, which must look like
    ``"<sample> <digit> Charging cycle <digit> <voltage>V <element>"``.

    Parameters
    ----------
    c : client object providing ``c["uid"]``
    data_path : pathlib.Path
    sample_ids : dict
        Maps "BM_NCM622" / "BM_NCM712" / "BM_NCMA" to sample ids.

    Raises
    ------
    ValueError
        If a file name does not match the expected pattern (previously an
        opaque TypeError).
    KeyError
        If the sample or voltage parsed from the name is unknown, or an
        expected JSON key is missing.
    AssertionError
        If the element in the file name disagrees with the JSON metadata.
    """
    c_uid = c["uid"]

    # Raw string: "\d" / "\S" in a plain literal are invalid escape sequences.
    stem_pattern = re.compile(r"(.*) \d Charging cycle (\d) (\S*)V (\S*)")
    # sample label used in the file names -> key of sample_ids
    sample_names = {"NMC 622": "BM_NCM622", "NMC 712": "BM_NCM712", "NMCA": "BM_NCMA"}
    # voltage -> charge state
    states = {4.3: "C", 4.8: "C", 3.0: "DC"}

    for f in data_path.glob("*.json"):
        print(f.name)

        m = stem_pattern.match(f.stem)
        if m is None:
            raise ValueError(f"unable to parse sample and charge from {f.name}")

        with open(f, "r") as x:
            metadata_, _sample, _proc, data = json.load(x)

        df = pd.DataFrame({"energy": data["Energy"], "mu": data["mu_flat"]})

        beamline_id = metadata_["beamline_id"]
        edge = metadata_["edge"]
        symbol = metadata_["element"]

        sample = sample_names[m[1]]
        charge_cycle = int(m[2])
        voltage = float(m[3])
        atom = m[4]
        assert atom == symbol, f"{f.name}: element {atom!r} != metadata {symbol!r}"

        state = states[voltage]
        sample_id = sample_ids[sample]

        metadata = {
            "beamline": {"name": "iss", "id": beamline_id},
            # Spelled like in ingest_iss_spectra so that the same beamline is
            # not filed under two different facility names.
            "facility": {"name": "NSLS-II"},
            "element": {"symbol": symbol, "edge": edge},
            "charge": {"cycle": charge_cycle, "voltage": voltage, "state": state},
            "sample_id": sample_id,
            "dataset": "aimm",
        }

        c_uid.write_xas(df, metadata)

In [None]:
# Upload the json spectra stored in NCM/eli/dataset0.
ingest_aimm_ncm_eli(c, data_root / "NCM/eli/dataset0", sample_ids)