In [None]:
from bw_processing.constants import DEFAULT_LICENSES
from bw_processing.filesystem import clean_datapackage_name, safe_filename
from bw_processing.utils import check_name
from pathlib import Path
from typing import Optional
import datetime
import json
import pandas as pd
import uuid

Note: the `Indicators` sheet originally has the same number of rows as the `CFs` sheet, so each indicator appears many times. Joining against it directly multiplies the rows, in effect producing a cross product for each indicator. One could in theory join the two `DataFrames` on their indices, but it is safer to call `drop_duplicates` first and then join on the actual data attributes (`Method`, `Category`, `Indicator`).

In [None]:
def load_formatted_dataframe(excel_fp, separator="|"):
    """Load the LCIA workbook as one flat table of characterization factors.

    Reads the ``CFs`` and ``Indicators`` sheets, attaches each indicator's
    unit to its characterization factors, and collapses the hierarchical
    label columns into single ``separator``-joined strings.

    Args:
        excel_fp: Location of the workbook, passed straight to
            ``pandas.read_excel``.
        separator: Text placed between compartment and subcompartment, and
            between category and indicator, in the combined columns.

    Returns:
        A ``DataFrame`` with one row per row of the ``CFs`` sheet. ``Name``
        and ``CF`` are renamed to ``Flowable`` and ``Characterization factor``;
        ``Context`` holds ``Compartment<separator>Subcompartment``;
        ``Indicator`` holds ``Category<separator>Indicator``;
        ``Indicator unit`` comes from the ``Indicators`` sheet. The
        ``Compartment``, ``Subcompartment`` and ``Category`` columns are
        removed.

    Raises:
        AssertionError: If the join changed the number of rows, i.e. the
            de-duplicated ``Indicators`` sheet still has several rows for one
            ``(Method, Category, Indicator)`` combination.
        TypeError: If a label cell cannot be concatenated with ``separator``
            (for example an empty cell, which pandas reads as a float NaN).
    """
    cfs = pd.read_excel(excel_fp, sheet_name="CFs").rename(
        columns={'CF': 'Characterization factor', 'Name': 'Flowable'}
    )
    # The raw sheet repeats every indicator many times; without dropping the
    # duplicates the left join below would multiply the rows of `cfs`.
    indicators = (
        pd.read_excel(excel_fp, sheet_name="Indicators")
        .drop_duplicates()
        .rename(columns={'Unit': 'Indicator unit'})
    )
    merged = pd.merge(cfs, indicators, how='left', on=['Method', 'Category', 'Indicator'])
    assert len(merged) == len(cfs), (
        "Joining the indicator units changed the number of rows "
        f"({len(cfs)} -> {len(merged)}); check the Indicators sheet for conflicting rows"
    )

    # Build the labels from the merged frame itself (no reliance on index
    # alignment with `cfs`). A plain element-wise loop also works on an empty
    # sheet, where a row-wise `apply` would return a DataFrame, not a column.
    merged['Context'] = [
        compartment + separator + subcompartment
        for compartment, subcompartment in zip(merged['Compartment'], merged['Subcompartment'])
    ]
    merged['Indicator'] = [
        category + separator + indicator
        for category, indicator in zip(merged['Category'], merged['Indicator'])
    ]
    return merged.drop(columns=['Compartment', 'Subcompartment', 'Category'])

In [None]:
# Metadata of the LCIA methods registered so far, keyed by method name.
# Read inside a `with` block so the file handle is closed right away.
with open("methods.json") as methods_file:
    METHOD_METADATA = json.load(methods_file)

In [None]:
# Maps (method name, indicator label) to the UUID stored for that indicator.
# Read inside a `with` block so the file handle is closed right away.
with open("indicators.json") as indicators_file:
    INDICATOR_MAPPING = {
        (row['method'], row['indicator']): row['uuid']
        for row in json.load(indicators_file)
    }

In [None]:
def add_to_method_metadata(df, method_metadata, filepath="methods.json"):
    """Register the methods in ``df`` that ``method_metadata`` does not know yet.

    Every new method gets a placeholder entry (version ``"1.0"``, empty
    description and URL) with a freshly generated UUID. If anything was added,
    the complete registry is written to ``filepath`` as JSON, sorted by method
    name. Nothing is written when ``df`` brings no new method.

    Args:
        df: ``DataFrame`` with a ``Method`` column.
        method_metadata: Dict keyed by method name; updated in place.
        filepath: JSON file that receives the updated registry.

    Returns:
        None.
    """
    new_data = {
        name: {
            "version": "1.0",
            "description": "",
            "url": "",
            "uuid": uuid.uuid4().hex,
        }
        for name in set(df['Method']).difference(method_metadata)
    }
    if not new_data:
        return

    method_metadata.update(new_data)
    with open(filepath, "w") as f:
        json.dump(
            # Sorted keys keep the file stable between runs.
            {key: method_metadata[key] for key in sorted(method_metadata)},
            f,
            ensure_ascii=False,
            indent=2,
        )

In [None]:
def add_to_indicator_metadata(df, indicator_metadata, filepath="indicators.json"):
    """Register the indicators in ``df`` that ``indicator_metadata`` lacks.

    Every ``(Method, Indicator)`` pair not yet present gets a freshly generated
    UUID. If anything was added, the complete mapping is written to
    ``filepath`` as a JSON list of ``{"method", "indicator", "uuid"}`` objects
    sorted by method and indicator. Nothing is written when ``df`` brings no
    new pair.

    Args:
        df: ``DataFrame`` with ``Method`` and ``Indicator`` columns.
        indicator_metadata: Dict mapping ``(method, indicator)`` tuples to
            UUID strings; updated in place.
        filepath: JSON file that receives the updated mapping.

    Returns:
        None.
    """
    # A set of pairs removes the duplicates without building another frame.
    pairs = set(zip(df['Method'], df['Indicator']))
    new_data = {
        pair: uuid.uuid4().hex
        for pair in pairs
        if pair not in indicator_metadata
    }
    if not new_data:
        return

    indicator_metadata.update(new_data)
    records = [
        {'method': method, 'indicator': indicator, 'uuid': value}
        for (method, indicator), value in indicator_metadata.items()
    ]
    records.sort(key=lambda record: (record['method'], record['indicator']))
    with open(filepath, "w") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

In [None]:
def get_cleaned_dataframe(version, source_dir: Optional[Path] = None):
    """Build the table of characterization factors for one ecoinvent release.

    Locates the LCIA implementation workbook for ``version``, loads it,
    registers any new method or indicator in ``METHOD_METADATA`` and
    ``INDICATOR_MAPPING`` (which may rewrite their JSON files), attaches the
    UUIDs, and joins the elementary flow list on ``Flowable`` and ``Context``.
    Rows whose flow has no ``Flow UUID`` in that list are dropped, and the
    number of dropped rows is printed.

    Args:
        version: ecoinvent release label, e.g. ``"3.9"``.
        source_dir: Directory holding the workbook. Defaults to the original
            author's local
            ``/Users/chrismutel/Sync/Documents/LCA/Ecoinvent/<version>/LCIA``
            folder; pass another directory to run elsewhere.

    Returns:
        A ``DataFrame`` restricted to the columns written to the data package,
        in their output order.

    Raises:
        ValueError: If none of the known workbook file names exists in
            ``source_dir``; the message lists every path that was tried.
    """
    if source_dir is None:
        source_dir = Path(f"/Users/chrismutel/Sync/Documents/LCA/Ecoinvent/{version}/LCIA")
    source_dir = Path(source_dir)

    # The workbook name changed between releases; take the first that exists.
    candidates = [
        source_dir / f"LCIA Implementation {version}.xlsx",
        source_dir / f"LCIA Implementation v{version}.xlsx",
        source_dir / f"LCIA_Implementation_{version}.xlsx",
    ]
    fp = next((path for path in candidates if path.exists()), None)
    if fp is None:
        raise ValueError("File not found: " + ", ".join(str(path) for path in candidates))

    df = load_formatted_dataframe(fp)
    flows = pd.read_csv(Path.cwd().parent / "Elementary flow mapping" / "outputs" / f"ecoinvent-{version}" / f"ecoinvent-{version}.csv")

    add_to_method_metadata(df, METHOD_METADATA)
    add_to_indicator_metadata(df, INDICATOR_MAPPING)

    # Element-wise lookups: same KeyError on an unknown key as before, but
    # without the cost of a row-wise `apply`.
    df['Method UUID'] = [METHOD_METADATA[method]['uuid'] for method in df['Method']]
    df['Indicator UUID'] = [
        INDICATOR_MAPPING[(method, indicator)]
        for method, indicator in zip(df['Method'], df['Indicator'])
    ]
    df = pd.merge(df, flows, how='left', on=['Flowable', 'Context'])

    missing_mask = df['Flow UUID'].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing:
        print("Dropping {} flows not mapped in elementary flow list".format(n_missing))
        df = df[~missing_mask]

    # Selecting the output columns already discards the flow list's auxiliary
    # columns, so no separate in-place drop on the filtered slice is needed.
    return df[[
        'Method', 'Method UUID', 'Indicator', 'Indicator UUID',
        'Indicator unit', 'Flowable', 'Flow UUID', 'Context',
        'Unit', 'CAS No', 'Characterization factor'
    ]]

Split into separate dataframes for each method family:

In [None]:
def clean(s):
    """Return ``s`` with every space turned into an underscore and every comma removed."""
    # One translation pass: " " -> "_", "," -> deleted.
    return s.translate(str.maketrans(" ", "_", ","))

In [None]:
def get_resources_and_metadata(df):
    """Split ``df`` by method and describe each part as a tabular data resource.

    Args:
        df: ``DataFrame`` with a ``Method`` column; every method must already
            be present in ``METHOD_METADATA``.

    Returns:
        ``(resources, metadata)``: two lists in the same order (methods sorted
        by name). ``resources[i]`` holds the rows of one method and
        ``metadata[i]`` is the resource descriptor for its CSV file.
    """
    # (column name, field type, True when the value is a "|"-joined label)
    column_spec = (
        ('Method', 'string', False),
        ('Method UUID', 'string', False),
        ('Indicator', 'string', True),
        ('Indicator UUID', 'string', False),
        ('Indicator unit', 'string', False),
        ('Flowable', 'string', False),
        ('Flow UUID', 'string', False),
        ('Context', 'string', True),
        ('Unit', 'string', False),
        ('CAS No', 'string', False),
        ('Characterization factor', 'number', False),
    )

    def describe(filename):
        """Build a fresh descriptor so no resource shares mutable state with another."""
        fields = []
        for column, kind, is_separated in column_spec:
            field = {'name': column, 'type': kind}
            if is_separated:
                field['separated'] = True
            fields.append(field)

        # TODO: Could specify a specific resource profile just for LCIA data
        # to avoid repeating columns, and for data validation
        return {
            "path": f"{filename}.csv",
            "profile": "tabular-data-resource",
            "mediatype": "text/csv",
            "separator": "|",
            "schema": {
                "fields": fields,
            },
        }

    resources, metadata = [], []
    for method in sorted(df['Method'].unique()):
        assert method in METHOD_METADATA
        metadata.append(describe(safe_filename(clean(method), add_hash=False)))
        resources.append(df[df['Method'] == method])

    return resources, metadata

In [None]:
# License entry for the ecoinvent EULA, in the name/path/title form used for
# the `licenses` list of a data package.
ECOINVENT_EULA_LICENSE = dict(
    name="ecoinvent-eula-2022.05.01",
    path="https://ecoinvent.org/wp-content/uploads/2022/04/ecoinvent_new-db-eula_01_04_2022.pdf",
    title="ecoinvent End User Licence Agreement effect 2022.05.01",
)

In [None]:
def to_datapackage(
    dirpath: Path,
    resources: list,
    resources_metadata: list,
    name: str,
    author: str,
    description: str,
    elementary_flow_lists: Optional[list] = None,
    version: Optional[str] = None,
    id_: Optional[str] = None,
    licenses: Optional[list] = None,
):
    """Write a tabular data package (``metadata.json`` plus one CSV per resource).

    Args:
        dirpath: Output directory; created with its parents if missing.
        resources: ``DataFrame`` objects, one per resource.
        resources_metadata: Resource descriptors in the same order as
            ``resources``; each needs a ``path`` entry naming its CSV file.
        name: Package name; normalized with ``clean_datapackage_name`` and
            validated with ``check_name``.
        author: Author of the data; stored under ``author``.
        description: Free-text description of the package.
        elementary_flow_lists: Names of the elementary flow lists the data
            refers to; stored under ``elementary_flow_lists`` when given.
        version: Package version; stored under ``version`` when given.
        id_: Package identifier; a random UUID hex string when omitted.
        licenses: License entries; ``DEFAULT_LICENSES`` when omitted or empty.

    Raises:
        ValueError: If ``resources`` and ``resources_metadata`` differ in
            length (``zip`` would otherwise skip the surplus silently).
    """
    if len(resources) != len(resources_metadata):
        raise ValueError(
            f"Got {len(resources)} resources but {len(resources_metadata)} resource descriptors"
        )

    dirpath = Path(dirpath)
    dirpath.mkdir(parents=True, exist_ok=True)

    name = clean_datapackage_name(name)
    check_name(name)

    # Same naive-UTC "...Z" text as before, without the deprecated `utcnow()`.
    created = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    metadata = {
        "profile": "tabular-data-package",  # https://dataprotocols.org/tabular-data-package/
        "name": name,
        "description": description,
        "id": id_ or uuid.uuid4().hex,
        "licenses": licenses or DEFAULT_LICENSES,
        "resources": resources_metadata,
        "created": created.isoformat("T") + "Z",
        # Previously accepted but never written to the package metadata.
        "author": author,
    }
    if version is not None:
        metadata["version"] = version
    if elementary_flow_lists is not None:
        metadata["elementary_flow_lists"] = elementary_flow_lists

    # The `with` block closes the file; UTF-8 matches `ensure_ascii=False`.
    with open(dirpath / "metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    for df, meta in zip(resources, resources_metadata):
        df.to_csv(dirpath / meta['path'], index=False)

In [None]:
# One entry per ecoinvent release to export: (version label, package description).
CONFIG = [
    # ecoinvent 3.9
    ("3.9", "Implementation of selected LCIA methods as described in https://v39.ecoquery.ecoinvent.org/File/File?fileName=ecoinvent+3.9+(2022)%2c+current%5csupporting+documents%5cecoinvent+3.9_LCIA_implementation.7z&hash=85940519&type=Files"),
    # ecoinvent 3.8
    ("3.8", "Implementation of selected LCIA methods as described in https://v39.ecoquery.ecoinvent.org/File/File?fileName=ecoinvent+3.8+(2021)%2c+outdated%5csupporting+documents%5cecoinvent+3.8_LCIA_implementation.7z&hash=1032948235&type=Files"),
]

In [None]:
# Build one data package per configured release under ./outputs/<version>.
# The loop variables keep their names because they stay in the notebook namespace.
for version, description in CONFIG:
    df = get_cleaned_dataframe(version)
    resources, metadata = get_resources_and_metadata(df)
    to_datapackage(
        Path.cwd().joinpath("outputs", version),
        resources,
        metadata,
        name=f"ecoinvent-{version}-lcia",
        author="Thomas Sonderegger",
        description=description,
        elementary_flow_lists=[f"ecoinvent-{version}"],
        version="1.0",
        licenses=[ECOINVENT_EULA_LICENSE],
    )