In [1]:
import os
from numbers import Number
from pathlib import Path

import bw2io as bi
import numpy as np
import pandas as pd
from bw2data.utils import recursive_str_to_unicode
from lxml import objectify

# CAS Numbers

All information from https://www.cas.org/support/documentation/chemical-substances/checkdig

CAS numbers have the form `A`-`B`-`C`, where:

* `A` has between 2 and 7 digits
* `B` has 2 digits
* `C` is a single check digit

To calculate the check digit:

Each digit, *starting from the right* and ignoring both the hyphens and the check digit itself, is given a weight equal to its ordinal position (1-indexed). The check digit is the ones digit of the sum of the weighted values (i.e. the sum modulo 10). For example, for `107-07-3`, the sum would be:

$$ 1 \cdot 7 + 2 \cdot 0 + 3 \cdot 7 + 4 \cdot 0 + 5 \cdot 1 = 33$$

And the check digit would be 3 (the value in the ones position). Similarly, for `110-63-4`:

$$ 1 \cdot 3 + 2 \cdot 6 + 3 \cdot 0 + 4 \cdot 1 + 5 \cdot 1 = 24$$

And indeed we get 4 as the check digit.

In [2]:
# Reference flow list; used further down to look up attributes missing from the XML.
reference = pd.read_csv(Path.cwd().joinpath("inputs", "ecoinventEFv3.7.csv"))
reference

Unnamed: 0,Flowable,CASNo,Formula,Synonyms,Unit,Class,ExternalReference,Preferred,Context,FlowUUID,AltUnit,Unnamed: 11,Second CAS
0,"1,3-Dioxolan-2-one",000096-49-1,,,kg,chemical,,,water/unspecified,5b7d620e-2238-5ec9-888a-6999218b6974,,,96-49-1
1,"1,4-Butanediol",000110-63-4,,Butylene glycol,kg,chemical,,,"air/low population density, long-term",d21da01e-f96f-4db5-9746-7b70db8a1f2c,,,36684-44-3; 28324-25-6; 110-63-4; 110-63-4
2,"1,4-Butanediol",000110-63-4,,Butylene glycol,kg,chemical,,,air/lower stratosphere + upper troposphere,90653a29-2f53-4b1b-88bd-9ae2fe64a8d6,,,36684-44-3; 28324-25-6; 110-63-4; 110-63-4
3,"1,4-Butanediol",000110-63-4,,Butylene glycol,kg,chemical,,,air/non-urban air or from high stacks,83bafcf1-2f2e-4a32-89a0-f1f16ca10626,,,36684-44-3; 28324-25-6; 110-63-4; 110-63-4
4,"1,4-Butanediol",000110-63-4,,Butylene glycol,kg,chemical,,,air/unspecified,09db39be-d9a6-4fc3-8d25-1f80b23e9131,,,36684-44-3; 28324-25-6; 110-63-4; 110-63-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4305,Zirconium-95,013967-71-0,,,kBq,chemical,,,"water/ground-, long-term",ffd4940c-b652-4622-b7d0-d4d861b083c3,,,13967-71-0
4306,Zirconium-95,013967-71-0,,,kBq,chemical,,,water/ocean,aa7f8d81-9670-4a5c-af68-b50fbe247958,,,13967-71-0
4307,Zirconium-95,013967-71-0,Zr,,kBq,chemical,,,water/surface water,8db2694c-9a3f-4f8d-a915-cd8a380f5394,,,13967-71-0
4308,Zirconium-95,013967-71-0,,,kBq,chemical,,,water/unspecified,b71d0281-e4b2-4831-a2bd-aed221d2a15f,,,13967-71-0


It seems like, at least in this reference data, the column `CASNo` is zero-padded (i.e. zeros are added in front of `A`), while `Second CAS` has the leading zeros removed. Note that the rows shown above pad `A` to six digits, whereas the maximum length of `A` is seven digits.

In [3]:
# Rows whose unpadded `Second CAS` is exactly '96-49-1'.
reference.loc[reference['Second CAS'] == '96-49-1']

Unnamed: 0,Flowable,CASNo,Formula,Synonyms,Unit,Class,ExternalReference,Preferred,Context,FlowUUID,AltUnit,Unnamed: 11,Second CAS
0,"1,3-Dioxolan-2-one",000096-49-1,,,kg,chemical,,,water/unspecified,5b7d620e-2238-5ec9-888a-6999218b6974,,,96-49-1


We set up convenience functions for these two forms:

In [4]:
def check_digit(s):
    """Validate the trailing check digit of the CAS number string ``s``.

    Hyphens are ignored. The digits to the left of the check digit are read
    from right to left and weighted 1, 2, 3, ... (at most nine digits are
    used). The ones digit of the weighted sum must equal the last character
    of ``s``.

    Returns ``None`` when the check digit matches; raises ``ValueError``
    when it does not.
    """
    digits = s.replace("-", "")
    # Drop the check digit and reverse, so the rightmost digit gets weight 1.
    reversed_body = digits[-2::-1]

    weighted_sum = 0
    for weight, digit in zip(range(1, 10), reversed_body):
        weighted_sum += weight * int(digit)

    expected = weighted_sum % 10
    given = s[-1]
    if expected != int(given):
        message = "CAS Check Digit error: CAS '{}' has check digit of {}, but it should be {}"
        raise ValueError(message.format(s, given, expected))
                

def check_cas(s):
    """Check that ``s`` looks like a CAS number with a valid check digit.

    Returns ``None`` for a missing value (``None`` or an empty string) and
    ``True`` for a valid CAS number. Raises ``AssertionError`` if ``s`` does
    not contain exactly two hyphens, and lets the ``ValueError`` raised by
    ``check_digit`` propagate when the check digit is wrong.
    """
    if s:
        assert s.count("-") == 2
        check_digit(s)
        return True
    return None


def zero_pad_cas(s):
    """Left-pad the CAS number ``s`` with zeros to a total length of 12.

    Twelve characters is seven digits, two digits and one check digit plus
    the two hyphens. Falsy input (``None`` or ``""``) is returned unchanged,
    as is any string that is already 12 characters or longer.
    """
    return s.rjust(12, "0") if s else s
    
    
def no_padding_cas(s):
    """Strip all leading zeros from the CAS number ``s``.

    Falsy input (``None`` or ``""``) is returned unchanged.
    """
    if s:
        return s.lstrip("0")
    return s

Check our functions:

In [5]:
# Padding and stripping must be inverses, and a known-good CAS must validate.
assert "0000096-49-1" == zero_pad_cas('96-49-1')
assert '96-49-1' == no_padding_cas("0000096-49-1")
assert check_cas('96-49-1') is True

Should raise an error:

In [6]:
# The error is the point of this cell, so catch and display it instead of
# letting it abort "Restart & Run All" for every cell that follows.
try:
    check_cas('96-49-2')
except ValueError as err:
    print("ValueError: {}".format(err))
else:
    # Reaching this branch means the validator silently accepted bad input.
    raise AssertionError("check_cas accepted a CAS number with a wrong check digit")

ValueError: CAS Check Digit error: CAS '96-49-2' has check digit of 2, but it should be 1

Unfortunately, we also have some bad data (found because we check the check digit). Some of these have been fixed in 3.9, but we will fix them in the other files as well.

This is a bit tricky, as ideally we would present the data strictly as given in the input files. However, these are cases where the provided values are simply false, and this could hinder matching, which is the main purpose of generating the elementary flow lists.

In [7]:
# Corrections for CAS numbers that are wrong in the input data.
# Key: the (unpadded) CAS number as given; value: the CAS number to use instead.
# Each entry names the substance and links the PubChem record for it.
fixed_cas_values = {
    '7727-34-7': '7727-43-7',  # Barite; https://pubchem.ncbi.nlm.nih.gov/compound/24414
    '439-94-3': '7439-94-3',   # Lutetium; https://pubchem.ncbi.nlm.nih.gov/compound/23929
    '117-15-3': '107-15-3',    # Ethylenediamine; https://pubchem.ncbi.nlm.nih.gov/compound/3301
    '75-89-5': '74-89-5',      # Methyl amine; https://pubchem.ncbi.nlm.nih.gov/compound/6329
}

In [8]:
def fix_cas(s):
    """Normalise a raw CAS value and apply the known corrections.

    Surrounding whitespace and leading zeros are removed; a falsy input
    (``None`` or ``""``) becomes ``None``. If the normalised value is a key of
    ``fixed_cas_values`` the corrected number is returned, otherwise the
    normalised value itself.
    """
    stripped = s.strip() if s else None
    unpadded = no_padding_cas(stripped)
    return fixed_cas_values.get(unpadded, unpadded)

There are a few attributes which are not provided in the XML, and which we need to retrieve from the reference data. They are:

* `Class`: String, has 12 possible values, like `chemical` or `energy`
* `Preferred`: Mostly missing, but sometimes `0` or `1`
* `Second CAS`: List of strings, separated in the reference data by `;`

We can make lookup dictionaries using the name and context.

In [9]:
# Lookup key for each reference row: (flow name, context).
reference['Compound'] = [
    (flowable, context)
    for flowable, context in zip(reference['Flowable'], reference['Context'])
]

In [10]:
# One lookup dict per attribute, all keyed by the (name, context) tuple.
class_mapping, preferred_mapping, second_cas_mapping = (
    dict(zip(reference['Compound'], reference[column]))
    for column in ('Class', 'Preferred', 'Second CAS')
)

Change values in `preferred_mapping` to integers when possible:

In [11]:
# Values equal to 0 or 1 become plain ints; everything else is kept as-is.
preferred_mapping = {
    compound: (int(flag) if flag in {0.0, 1.0} else flag)
    for compound, flag in preferred_mapping.items()
}

We need to clean up `second_cas_mapping` to remove duplicate entries and use a single, consistent separator.

In [12]:
def clean_cas_multiple(obj):
    """Split a ``;``-separated string of CAS numbers into a set of unique values.

    Each fragment has surrounding whitespace and leading zeros removed.
    Blank fragments (for example from a trailing or doubled ``;``) are
    dropped, so they do not end up as an empty-string member of the result.

    Returns ``None`` when ``obj`` is falsy, is a non-string NaN, or contains
    no usable CAS number; otherwise returns a non-empty ``set`` of strings.
    """
    if not obj or (not isinstance(obj, str) and np.isnan(obj)):
        return None
    fragments = (part.strip() for part in obj.split(";"))
    cleaned = {no_padding_cas(part) for part in fragments if part}
    # A fragment made only of zeros strips down to "", which is not a CAS number.
    cleaned.discard("")
    return cleaned or None

In [13]:
# Build from `reference` rather than from `second_cas_mapping` itself, so that
# re-running this cell does not feed already-cleaned sets back into
# `clean_cas_multiple`.
second_cas_mapping = {
    compound: clean_cas_multiple(raw_value)
    for compound, raw_value in zip(reference['Compound'], reference['Second CAS'])
}

Code to read in the XML and format to UNEP format

In [14]:
def extract_flow_data(o, separator="|"):
    """Convert one elementary exchange element into a flat dict of flow attributes.

    ``o`` is an element yielded by ``root.iterchildren()`` of the parsed
    ``ElementaryExchanges.xml`` (see ``extract_elem_flows_xml``). ``Class``,
    ``Preferred`` and ``Second CAS`` are not read from the XML; they are looked
    up in the reference mappings by ``(name, "compartment/subcompartment")``
    and are ``None`` when the flow is not in the reference data.

    ``separator`` joins multi-valued fields (``Synonyms``, ``Second CAS``) and
    the two parts of ``Context``.

    Raises whatever ``check_cas`` raises if the (corrected) CAS number is
    malformed or has a wrong check digit.
    """
    cas = fix_cas(o.get("casNumber"))
    flow_name = o.name.text
    compartments = (
        o.compartment.compartment.text,
        o.compartment.subcompartment.text,
    )
    # The reference data writes contexts as "compartment/subcompartment".
    key = (flow_name, "/".join(compartments))

    synonyms = (
        syn.text
        for syn in o.iterchildren()
        if syn.tag == "{http://www.EcoInvent.org/EcoSpold02}synonym" and syn.text
    )
    # The mapping stores sets, whose iteration order for strings changes
    # between interpreter runs; sort so the written CSV is reproducible.
    second_cas = sorted(second_cas_mapping.get(key) or ())

    data = {
        "Flowable": flow_name,
        'CASNo': zero_pad_cas(cas),
        "Formula": o.get("formula"),
        "Synonyms": separator.join(synonyms),
        "Unit": o.unitName.text,
        "Class": class_mapping.get(key),
        "ExternalReference": None,
        'Preferred': preferred_mapping.get(key),
        # NOTE(review): Context is joined with `separator` ("|" by default),
        # while the reference data and `key` use "/" - confirm this is intended.
        "Context": separator.join(compartments),
        "FlowUUID": o.get("id"),
        "AltUnit": None,
        'Second CAS': separator.join(second_cas) or None
    }
    check_cas(cas)
    return data
    
    
def extract_elem_flows_xml(fp):
    """Read an ``ElementaryExchanges.xml`` file into a sorted DataFrame.

    ``fp`` is a ``Path`` or anything ``Path()`` accepts. Every child of the
    XML root is converted with ``extract_flow_data``; the resulting rows are
    sorted by ``Flowable`` and then ``Context``.

    Raises ``AssertionError`` if ``fp`` is not an existing file.
    """
    fp = Path(fp)
    assert fp.is_file(), "Not a file: {}".format(fp)

    # Use a context manager so the file handle is closed once parsing is done.
    with open(fp, encoding="utf-8") as xml_file:
        root = objectify.parse(xml_file).getroot()

    flow_data = recursive_str_to_unicode(
        [extract_flow_data(ds) for ds in root.iterchildren()]
    )
    return pd.DataFrame(sorted(flow_data, key=lambda x: (x['Flowable'], x['Context'])))

In [15]:
# Root directory holding one sub-directory per ecoinvent release.
# Set the ECOINVENT_DATA_DIR environment variable to run this notebook on
# another machine; the default is the original author's local path.
BASE_DATA_DIR = Path(
    os.environ.get(
        "ECOINVENT_DATA_DIR",
        "/Users/chrismutel/Sync/Documents/LCA/Ecoinvent/",
    )
)

In [16]:
# ecoinvent 3.6 (cutoff) -> outputs/ecoinventEFv3.6.csv
df = extract_elem_flows_xml(
    BASE_DATA_DIR.joinpath("3.6", "cutoff", "MasterData", "ElementaryExchanges.xml")
)
df.to_csv(Path.cwd().joinpath("outputs", "ecoinventEFv3.6.csv"), index=False)

In [17]:
# ecoinvent 3.7 (cutoff) -> outputs/ecoinventEFv3.7.csv
df = extract_elem_flows_xml(
    BASE_DATA_DIR.joinpath("3.7", "cutoff", "MasterData", "ElementaryExchanges.xml")
)
df.to_csv(Path.cwd().joinpath("outputs", "ecoinventEFv3.7.csv"), index=False)

In [18]:
# ecoinvent 3.8 (cutoff) -> outputs/ecoinventEFv3.8.csv
df = extract_elem_flows_xml(
    BASE_DATA_DIR.joinpath("3.8", "cutoff", "MasterData", "ElementaryExchanges.xml")
)
df.to_csv(Path.cwd().joinpath("outputs", "ecoinventEFv3.8.csv"), index=False)

In [19]:
# ecoinvent 3.9 (cutoff) -> outputs/ecoinventEFv3.9.csv
df = extract_elem_flows_xml(
    BASE_DATA_DIR.joinpath("3.9", "cutoff", "MasterData", "ElementaryExchanges.xml")
)
df.to_csv(Path.cwd().joinpath("outputs", "ecoinventEFv3.9.csv"), index=False)