In [1]:
import pandas as pd

# Import external modules for data processing

from filters import CompartmentFilter, CASNumberFilter, ExcludeCompartmentFilter, VOCFilter, SpecificCASFilter, NameFilter, apply_filter, normalize_cas, has_carbon
from data_fetching import fetch_ar6_ghg_data, fetch_hodnebrog_data
from data_enrichment import enrich_ar6_with_manual_ghg, enrich_ar6_with_hodnebrog
from flow_matching import match_flows

# for dummy database
from dummy import make_dummy_database

In [2]:
# Step 0: Initialize and extract biosphere3 database

from biosphere_export import ensure_database_exists, extract_biosphere_flows

# Ensure biosphere3 database exists
ensure_database_exists("biosphere3")

# Extract all flows from biosphere3 into a DataFrame
df = extract_biosphere_flows("biosphere3")
print(f"Loaded {len(df)} flows from biosphere3 database")

biosphere3 already exists in current project.
Loaded 4709 flows from biosphere3 database


In [3]:
# df = make_dummy_database()

In [4]:
# Step 1: Fetch IPCC AR6 GHG data from GitHub

df_ar6, ar6_cols = fetch_ar6_ghg_data()

# Step 2: Add manual entries for CO2, CH4, N2O with precise values

df_ar6 = enrich_ar6_with_manual_ghg(df_ar6, ar6_cols)

# Step 3: Fetch and enrich with Hodnebrog et al. (2020) data

df_hodne, hodne_cols = fetch_hodnebrog_data()

# Enrich AR6 data with Hodnebrog values
df_ar6, update_counts = enrich_ar6_with_hodnebrog(df_ar6, df_hodne, ar6_cols, hodne_cols)

Total rows in CSV: 249
  Updated 'Carbon dioxide' -> CAS 124-38-9, Radiative_Efficiency 1.33e-05, Molar_Mass 44.01 g/mol
  Updated 'Methane' -> CAS 74-82-8, Radiative_Efficiency 0.000388, Molar_Mass 16.04 g/mol
  Updated 'Nitrous oxide' -> CAS 10024-97-2, Radiative_Efficiency 0.0032, Molar_Mass 44.01 g/mol
Updated radiative efficiency from Hodnebrog for 246 rows.
Updated molar mass from Hodnebrog for 246 rows.

Remaining entities with no molar mass: 0


In [5]:
print(df_ar6.to_string())

                             # Name         CASRN                Acronym                          Formula Lifetime [years]  Radiative_efficiency [W m-2 ppb-1] Molar_mass [g mol-1]
0                    Carbon dioxide      124-38-9                                                     CO2              nan                            0.000013                44.01
1                           Methane       74-82-8                                                     CH4           11.800                            0.000388                16.04
2                     Nitrous oxide    10024-97-2                                                     N2O          109.000                            0.003200                44.01
3            Trichlorofluoromethane       75-69-4                 CFC-11                            CCl3F           52.000                            0.259410               137.36
4           Dichlorodifluoromethane       75-71-8                 CFC-12                           C

In [6]:
# Step 4: Filter for air compartment, making sure to exclude natural resource, soil, and water

df_air = apply_filter(df, CompartmentFilter("air"))
df_air = apply_filter(df_air, ExcludeCompartmentFilter(["natural resource", "soil", "water"]))
print(f"Total flows in 'air' compartment: {len(df_air)}")

Total flows in 'air' compartment: 1867


In [7]:
# Filter for carbon dioxide (CAS: 124-38-9) using SpecificCASFilter
from filters import SpecificCASFilter

df_air_reset = df_air.reset_index().rename(columns={'index': 'orig_index'})

cas_co2 = ["124-38-9"]
df_co2_raw = apply_filter(df_air_reset, SpecificCASFilter(cas_co2))
matched_idx = df_co2_raw['orig_index']

df_nonco2 = (
    df_air_reset.loc[~df_air_reset['orig_index'].isin(matched_idx)]
    .drop(columns=['orig_index'])
    .reset_index(drop=True)
 )

df_co2 = (
    df_co2_raw.drop(columns=['orig_index'])
    .reset_index(drop=True)
 )

print(f"Flows with CO2 (by CAS 124-38-9): {len(df_co2)}")
print(f"Flows excluding CO2: {len(df_nonco2)}")

Flows with CO2 (by CAS 124-38-9): 15
Flows excluding CO2: 1852


In [8]:
# Split non-CO2/non-CH4 flows by CAS presence
df_cas = apply_filter(df_nonco2, CASNumberFilter(cas=True))
df_no_cas = apply_filter(df_nonco2, CASNumberFilter(cas=False))

print(f"Non-CO2/non-CH4 flows WITH CAS: {len(df_cas)}")
print(f"Non-CO2/non-CH4 flows WITHOUT CAS: {len(df_no_cas)}")
print(f"Total non-CO2/non-CH4 flows: {len(df_nonco2)}")

Non-CO2/non-CH4 flows WITH CAS: 1719
Non-CO2/non-CH4 flows WITHOUT CAS: 133
Total non-CO2/non-CH4 flows: 1852


In [18]:
df = df_cas.copy().reset_index(drop=True)
df["CAS"] = df["CAS"].apply(normalize_cas)

# One classifier instance (online enabled)
clf = VOCFilter("UNKNOWN", online_lookup=False, cache_db="voc_cache.sqlite")

# Import count_carbon_atoms at the top
from filters import count_carbon_atoms

def classify_with_details(cas):
    cas = normalize_cas(cas)
    status = clf._classify_flow(cas)  # triggers offline + (if needed) online
    entry = clf._classification_cache.get(cas, {})
    formula = entry.get("formula")
    return pd.Series({
        "voc_status": status,
        "formula": formula,
        "bp_c": entry.get("bp_c"),
        "source": entry.get("source"),
        "carbon_atoms": count_carbon_atoms(formula),
    })

details = df["CAS"].apply(classify_with_details)
for col in details.columns:
    df[col] = details[col].values

# Now split cleanly
df_voc = df[df["voc_status"] == "VOC"]
df_not_voc = df[df["voc_status"] == "NOT_VOC"]
df_unknown = df[df["voc_status"] == "UNKNOWN"]

print("VOC:", len(df_voc))
print("NOT_VOC:", len(df_not_voc))
print("UNKNOWN:", len(df_unknown))
print("Total:", len(df_voc) + len(df_not_voc) + len(df_unknown))

# This is the "error visibility" you expected:
print("\nOnline failure messages (count):")
mask_fail = df["source"].fillna("").str.startswith("online lookup failed")
print(df.loc[mask_fail, "source"].value_counts().to_string())

VOC: 697
NOT_VOC: 608
UNKNOWN: 414
Total: 1719

Online failure messages (count):
Series([], )


In [16]:
# Find CAS column in biosphere data
flow_cas_col = None
for col in df.columns:
    if 'cas' in col.lower():
        flow_cas_col = col
        break

# Match VOC, NOT_VOC and UNKNOWN flows against AR6 table

# VOC matches -> df_voc_lf
# VOC non-matches -> df_voc_nolf
# NOT_VOC + UNKNOWN matches -> df_other_ghg
# NOT_VOC non-matches -> df_no_ghg
# UNKNOWN non-matches -> df_unknown

# Match VOC
matched_voc, not_matched_voc = match_flows(
    df_voc,
    df_ar6,
    flow_cas_col=flow_cas_col,
    ar6_cas_col=ar6_cols['cas'],
    ar6_cols={
        'lifetime': ar6_cols['lifetime'],
        'rad_eff': ar6_cols['rad_eff'],
        'molar_mass': ar6_cols['molar_mass']
    }
)

# Match NOT_VOC
matched_not, not_matched_not = match_flows(
    df_not_voc,
    df_ar6,
    flow_cas_col=flow_cas_col,
    ar6_cas_col=ar6_cols['cas'],
    ar6_cols={
        'lifetime': ar6_cols['lifetime'],
        'rad_eff': ar6_cols['rad_eff'],
        'molar_mass': ar6_cols['molar_mass']
    }
)

# Match UNKNOWN
matched_unknown, not_matched_unknown = match_flows(
    df_unknown,
    df_ar6,
    flow_cas_col=flow_cas_col,
    ar6_cas_col=ar6_cols['cas'],
    ar6_cols={
        'lifetime': ar6_cols['lifetime'],
        'rad_eff': ar6_cols['rad_eff'],
        'molar_mass': ar6_cols['molar_mass']
    }
)

# Store VOC sets
df_voc_lf = matched_voc
df_voc_nolf = not_matched_voc.reset_index(drop=True)

# Combine matches from NOT_VOC + UNKNOWN into df_other_ghg
df_other_ghg = pd.concat([matched_not, matched_unknown], ignore_index=True)

# Store non-matching sets
df_no_ghg = not_matched_not.reset_index(drop=True)
# Overwrite df_unknown with still-unmatched UNKNOWN rows
df_unknown = not_matched_unknown.reset_index(drop=True)

print(f"VOC with lifetime: {len(df_voc_lf)}")
print(f"VOC without lifetime: {len(df_voc_nolf)}")
print(f"Other GHGs: {len(df_other_ghg)}")
print(f"Not relevant flows: {len(df_no_ghg)}")
print(f"Remaining UNKNOWN: {len(df_unknown)}")

VOC with lifetime: 197
VOC without lifetime: 500
Other GHGs: 15
Not relevant flows: 593
Remaining UNKNOWN: 414


In [11]:
print(df_voc_lf.to_string())

                                                   name                                     categories      unit      type location    database                                  code reference_product comment        CAS   formula voc_status    bp_c                            source  Lifetime [years]  Radiative_Efficiency [W m-2 ppb-1]  Molar_Mass [g mol-1]
0                                          Bromopropane                                         (air,)  kilogram  emission     None  biosphere3  f2e4394f-61bb-5493-983d-d3d5b7b96a41              None    None   106-94-5    C3H7Br        VOC   70.80  BP:chemicals | Formula:chemicals             0.041                            0.002340                123.00
1                                                Butane                                         (air,)  kilogram  emission     None  biosphere3  982b0510-96ac-4bcb-a758-e98006b95f4d              None    None   106-97-8     C4H10        VOC   -0.50  BP:chemicals | Formula:chemicals   

In [12]:
unspecified_voc = ["VOC", 
                   "Aldehydes",
                   "Hydrocarbons",
                   "NMVOC"]  


df_voc_unspecified = apply_filter(df_no_cas, NameFilter(unspecified_voc))

In [13]:
df_voc_no_lifetime = pd.concat([df_voc_nolf, df_voc_unspecified], ignore_index=True)

print({len(df_voc_nolf)})
print({len(df_voc_unspecified)})
print({len(df_voc_no_lifetime)})

print(df_voc_no_lifetime.to_string())

{500}
{37}
{537}
                                              name                                     categories      unit      type location    database                                  code reference_product comment          CAS      formula voc_status    bp_c                            source
0                              1,2-Dichlorobenzene                                         (air,)  kilogram  emission     None  biosphere3  b1c36287-329c-49f0-93c2-68246d34007c              None    None      95-50-1      C6H4Cl2        VOC  180.20  BP:chemicals | Formula:chemicals
1                                   1,4-Butanediol                                         (air,)  kilogram  emission     None  biosphere3  09db39be-d9a6-4fc3-8d25-1f80b23e9131              None    None     110-63-4      C4H10O2        VOC  229.50  BP:chemicals | Formula:chemicals
2                                       1-Pentanol                                         (air,)  kilogram  emission     None  biosphere

In [14]:
print("Overview of classification")
print(f"CO2 flows: {len(df_co2)}")
print(f"CH4 flows: {len(df_ch4)}")
print(f"VOC flows: {len(df_voc)}")
print(f"Other GHGs flows: {len(df_other)}")
print(f"Flows where information for classification is insufficient: {len(df_unknown)}")

Overview of classification
CO2 flows: 15


NameError: name 'df_ch4' is not defined

In [None]:
print(df_voc.to_string())