In [1]:
import pandas as pd

# Import external modules for data processing

from filters import CompartmentFilter, CASNumberFilter, ExcludeCompartmentFilter, NMVOCFilter, SpecificCASFilter, apply_filter, normalize_cas
from data_fetching import fetch_ar6_ghg_data, fetch_hodnebrog_data
from data_enrichment import enrich_ar6_with_manual_ghg, enrich_ar6_with_hodnebrog
from flow_matching import match_flows
from biosphere_export import ensure_database_exists, extract_biosphere_flows

# for dummy database
from dummy import make_dummy_database

In [2]:
# Step 0: Initialize and extract biosphere3 database
# ensure_database_exists / extract_biosphere_flows are imported together with the
# other project modules in the notebook's import cell.

# Single source of truth for the database name used throughout this cell.
BIOSPHERE_DB = "biosphere3"

# Ensure the database exists in the current project before reading from it
ensure_database_exists(BIOSPHERE_DB)

# Extract all flows from the database into a DataFrame
df = extract_biosphere_flows(BIOSPHERE_DB)
print(f"Loaded {len(df)} flows from {BIOSPHERE_DB} database")

biosphere3 already exists in current project.
Loaded 4709 flows from biosphere3 database


In [None]:
# Optional: uncomment to replace `df` with the output of make_dummy_database()
# (presumably a small stand-in for biosphere3 for quick runs — TODO confirm).
# Left disabled so the notebook uses the real biosphere3 flows loaded above.
# df = make_dummy_database()

In [3]:
# Steps 1-3 build the AR6 reference table in stages. Each stage gets its own name
# so the lineage df_ar6_fetched -> df_ar6_manual -> df_ar6 is explicit.

# Step 1: Fetch IPCC AR6 GHG data from GitHub
df_ar6_fetched, ar6_cols = fetch_ar6_ghg_data()

# Step 2: Add manual entries for CO2, CH4, N2O with precise values
df_ar6_manual = enrich_ar6_with_manual_ghg(df_ar6_fetched, ar6_cols)

# Step 3: Fetch Hodnebrog et al. (2020) data ...
df_hodne, hodne_cols = fetch_hodnebrog_data()

# ... and use it to enrich the AR6 table; `update_counts` is whatever bookkeeping
# enrich_ar6_with_hodnebrog returns alongside the enriched frame.
df_ar6, update_counts = enrich_ar6_with_hodnebrog(
    df_ar6_manual,
    df_hodne,
    ar6_cols,
    hodne_cols,
)

Total rows in CSV: 249
  Updated 'Carbon dioxide' -> CAS 124-38-9, Radiative_Efficiency 1.33e-05, Molar_Mass 0.04401
  Updated 'Methane' -> CAS 74-82-8, Radiative_Efficiency 0.000388, Molar_Mass 0.01604
  Updated 'Nitrous oxide' -> CAS 10024-97-2, Radiative_Efficiency 0.0032, Molar_Mass 0.04401
Updated radiative efficiency from Hodnebrog for 246 rows.
Updated molar mass from Hodnebrog for 246 rows.

Remaining entities with no molar mass: 0


In [4]:
# Step 4: Keep flows in the air compartment, then explicitly drop anything that is
# also tagged as natural resource, soil, or water.

air_only = CompartmentFilter("air")
drop_non_air = ExcludeCompartmentFilter(["natural resource", "soil", "water"])

df_air = apply_filter(apply_filter(df, air_only), drop_non_air)
print(f"Total flows in 'air' compartment: {len(df_air)}")

Total flows in 'air' compartment: 1867


In [5]:
# Split the air flows into CO2, CH4 and "everything else" by CAS number.
# SpecificCASFilter is already imported in the notebook's import cell, so it is
# not re-imported here.

CO2_CAS = "124-38-9"  # carbon dioxide
CH4_CAS = "74-82-8"   # methane

# Carry the original row index along as a column so the rows matched by the filter
# can be excluded from the remainder below.
df_air_reset = df_air.reset_index().rename(columns={'index': 'orig_index'})


def select_by_cas(cas_numbers):
    """Apply SpecificCASFilter for `cas_numbers` to df_air_reset and return the result."""
    return apply_filter(df_air_reset, SpecificCASFilter(cas_numbers))


def drop_orig_index(flows):
    """Remove the helper 'orig_index' column and renumber the rows from 0."""
    return flows.drop(columns=['orig_index']).reset_index(drop=True)


cas_targets = [CO2_CAS, CH4_CAS]
df_co2_ch4 = select_by_cas(cas_targets)
matched_idx = df_co2_ch4['orig_index']

# Everything that was NOT matched as CO2 or CH4
df_nonco2_nonch4 = drop_orig_index(
    df_air_reset[~df_air_reset['orig_index'].isin(matched_idx)]
)

df_co2 = drop_orig_index(select_by_cas([CO2_CAS]))
df_ch4 = drop_orig_index(select_by_cas([CH4_CAS]))

# Sanity check: the three subsets must account for every air flow exactly once.
n_split = len(df_co2) + len(df_ch4) + len(df_nonco2_nonch4)
assert n_split == len(df_air), (
    f"CO2/CH4/other split covers {n_split} flows but df_air has {len(df_air)}"
)

print(f"Flows with CO2 (by CAS 124-38-9): {len(df_co2)}")
print(f"Flows with CH4 (by CAS 74-82-8): {len(df_ch4)}")
print(f"Flows excluding CO2 or CH4: {len(df_nonco2_nonch4)}")

Flows with CO2 (by CAS 124-38-9): 15
Flows with CH4 (by CAS 74-82-8): 15
Flows excluding CO2 or CH4: 1837


In [6]:
# Partition the non-CO2/non-CH4 flows into those that carry a CAS number and
# those that do not.
has_cas = CASNumberFilter(cas=True)
lacks_cas = CASNumberFilter(cas=False)

df_cas = apply_filter(df_nonco2_nonch4, has_cas)
df_no_cas = apply_filter(df_nonco2_nonch4, lacks_cas)

# One print call, one line per count (same text as three separate prints).
print(
    f"Non-CO2/non-CH4 flows WITH CAS: {len(df_cas)}",
    f"Non-CO2/non-CH4 flows WITHOUT CAS: {len(df_no_cas)}",
    f"Total non-CO2/non-CH4 flows: {len(df_nonco2_nonch4)}",
    sep="\n",
)

Non-CO2/non-CH4 flows WITH CAS: 1704
Non-CO2/non-CH4 flows WITHOUT CAS: 133
Total non-CO2/non-CH4 flows: 1837


In [None]:
# Normalize CAS numbers (on a copy, so the frame produced by the previous cell is
# not modified in place)
df_cas = df_cas.copy()
df_cas['CAS'] = df_cas['CAS'].apply(normalize_cas)

# Settings shared by every NMVOC classification pass.
# threshold_c is presumably a boiling-point cut-off in degrees Celsius — TODO confirm
# against NMVOCFilter.
NMVOC_FILTER_KWARGS = dict(
    cache_db="nmvoc_cache.sqlite",
    threshold_c=250.0,
    allow_estimates=False,
)

# NOTE(review): only the UNKNOWN pass enables online_lookup. If an online lookup
# resolves a flow that the two offline passes could not classify, that flow may end
# up in none of the three frames. The values are kept as they were; confirm whether
# all passes should share one setting.
NMVOC_ONLINE_LOOKUP = {
    "NMVOC": False,
    "NOT_NMVOC": False,
    "UNKNOWN": True,
}


def classify_nmvoc(flows, status):
    """Apply NMVOCFilter for the given `status` label to `flows` and return the result."""
    nmvoc_filter = NMVOCFilter(
        status,
        online_lookup=NMVOC_ONLINE_LOOKUP[status],
        **NMVOC_FILTER_KWARGS,
    )
    return apply_filter(flows, nmvoc_filter)


# Apply NMVOCFilter to classify flows
df_nmvoc = classify_nmvoc(df_cas, "NMVOC")
df_not_nmvoc = classify_nmvoc(df_cas, "NOT_NMVOC")
df_unknown = classify_nmvoc(df_cas, "UNKNOWN")

n_classified = len(df_nmvoc) + len(df_not_nmvoc) + len(df_unknown)

print("\nClassification results for non-CO2/non-CH4 flows:")
print(f"  NMVOC: {len(df_nmvoc)}")
print(f"  NOT_NMVOC: {len(df_not_nmvoc)}")
print(f"  UNKNOWN: {len(df_unknown)}")
print(f"  Total: {n_classified}")

# The three classes are expected to account for every flow with a CAS number.
# Flag a mismatch loudly instead of letting flows silently drop out of the pipeline.
if n_classified != len(df_cas):
    print(
        f"WARNING: classified {n_classified} flows but df_cas has {len(df_cas)}; "
        "the NMVOC / NOT_NMVOC / UNKNOWN sets do not partition the input "
        "(check the online_lookup settings and the cache state, then re-run)."
    )


Classification results for non-CO2/non-CH4 flows:
  NMVOC: 682
  NOT_NMVOC: 608
  UNKNOWN: 414
  Total: 1704


In [8]:
# Find the CAS column in the biosphere data: the first column whose name contains
# "cas" (case-insensitive).
flow_cas_col = next((col for col in df.columns if 'cas' in col.lower()), None)
if flow_cas_col is None:
    # Fail here with a clear message rather than handing None to match_flows.
    raise ValueError(
        "No column containing 'cas' found in the biosphere flows; "
        "cannot match flows against the AR6 table"
    )

# Match NOT_NMVOC and UNKNOWN flows against AR6 table
# All matches (from both sets) -> df_other
# Non-matching from NOT_NMVOC -> df_not_other
# Non-matching from UNKNOWN -> overwrite df_unknown

# df_other_not is never assigned again in this cell; the name is kept so that any
# cell relying on it still finds it defined.
df_not_other, df_other_not = None, None  # placeholders for clarity

# AR6 columns handed to match_flows, looked up once from the ar6_cols mapping.
ar6_match_cols = {
    key: ar6_cols[key] for key in ('lifetime', 'rad_eff', 'molar_mass')
}


def match_against_ar6(flows):
    """Run match_flows for `flows` against df_ar6 on CAS number.

    Returns the pair produced by match_flows, unpacked below as
    (matched rows, non-matching rows).
    """
    return match_flows(
        flows,
        df_ar6,
        flow_cas_col=flow_cas_col,
        ar6_cas_col=ar6_cols['cas'],
        ar6_cols=dict(ar6_match_cols),  # fresh dict per call, as in the original
    )


# Match NOT_NMVOC
matched_not, not_matched_not = match_against_ar6(df_not_nmvoc)

# Match UNKNOWN
matched_unknown, not_matched_unknown = match_against_ar6(df_unknown)

# Combine matches from both into df_other
df_other = pd.concat([matched_not, matched_unknown], ignore_index=True)

# Store non-matching sets
df_not_other = not_matched_not
# Overwrite df_unknown with still-unmatched UNKNOWN rows.
# NOTE: this makes the cell non-idempotent. Re-running it without re-running the
# classification cell matches against the already-reduced df_unknown, so
# matched_unknown (and therefore df_other) would lose the earlier UNKNOWN matches.
df_unknown = not_matched_unknown.reset_index(drop=True)

print(f"Matched from NOT_NMVOC: {len(matched_not)}")
print(f"Matched from UNKNOWN: {len(matched_unknown)}")
print(f"Total matched: {len(df_other)}")
print(f"Remaining NOT_NMVOC: {len(df_not_other)}")
print(f"Remaining UNKNOWN: {len(df_unknown)}")

Matched from NOT_NMVOC: 15
Matched from UNKNOWN: 0
Total matched: 15
Remaining NOT_NMVOC: 593
Remaining UNKNOWN: 414


In [11]:
# Inspect the NOT_NMVOC flows. Show a bounded preview plus the row count instead of
# dumping every row as text, which bloats the saved notebook.
NOT_NMVOC_PREVIEW_ROWS = 20  # raise this (or use len(df_not_nmvoc)) to see more
print(f"NOT_NMVOC flows: {len(df_not_nmvoc)} rows (showing first {NOT_NMVOC_PREVIEW_ROWS})")
print(df_not_nmvoc.head(NOT_NMVOC_PREVIEW_ROWS).to_string())

                                                         name                                     categories            unit      type location    database                                  code reference_product comment          CAS formula nmvoc_status
0                                    2,4-D dimethylamine salt                                         (air,)        kilogram  emission     None  biosphere3  1dfb1f01-807b-5379-94f7-88e3ae421f57              None    None    2008-39-1    None    NOT_NMVOC
1                                         2-Nitrobenzoic acid                                         (air,)        kilogram  emission     None  biosphere3  98d3f159-b420-4e43-aab5-4fd093b82429              None    None     552-16-9    None    NOT_NMVOC
2                                                   Abamectin                                         (air,)        kilogram  emission     None  biosphere3  47da714a-ed02-5e6e-9e50-ef26a1d13a9d              None    None   71751-41-2    No

In [9]:
# Summary: one line per classification bucket, in pipeline order.
print("Overview of classification")

overview_rows = [
    ("CO2 flows", df_co2),
    ("CH4 flows", df_ch4),
    ("NMVOC flows", df_nmvoc),
    ("Other GHGs flows", df_other),
    ("Flows where information for classification is insufficient", df_unknown),
]
for label, flows in overview_rows:
    print(f"{label}: {len(flows)}")

Overview of classification
CO2 flows: 15
CH4 flows: 15
NMVOC flows: 682
Other GHGs flows: 15
Flows where information for classification is insufficient: 414


In [None]:
# Full plain-text listing of the flows still lacking enough information to classify
# (df_unknown as left by the AR6 matching cell).
unknown_listing = df_unknown.to_string()
print(unknown_listing)