In [None]:
# Brightway packages
import bw2io as bi
import bw2data as bd
import pandas as pd
# Brightway type hints
from bw2io import SingleOutputEcospold2Importer
# import/export
import os

In [2]:
from bw2data import projects, databases
from bw2io import create_default_biosphere3

projects.set_current("PB_LCIA")

if "biosphere3" not in databases:
    create_default_biosphere3()
    print("biosphere3 created.")
else:
    print("biosphere3 already exists.")

print("Databases now:", list(databases))


biosphere3 already exists.
Databases now: ['biosphere3']


In [3]:
bio = bd.Database("biosphere3")

rows = []
for flow in bio:
    # Each flow is a dict-like dataset
    rows.append({
        "name": flow.get("name"),
        "categories": flow.get("categories"), # compartment
        "unit": flow.get("unit"),
        "type": flow.get("type"),          # e.g., "emission", "resource", etc.
        "location": flow.get("location"),  # often None for biosphere flows
        "database": flow.get("database"),  # should be "biosphere3"
        "code": flow.get("code"),          # unique identifier inside the database
        "reference_product": flow.get("reference product"),
        "comment": flow.get("comment"),
        "CAS": flow.get("CAS number"),
        "formula": flow.get("formula"),
    })

df = pd.DataFrame(rows)

# Optional: sort for readability
df = df.sort_values(["categories", "name"], na_position="last").reset_index(drop=True)

# 3) Write to Excel
# output_path = "biosphere3_export.xlsx"
# df.to_excel(output_path, index=False)

# print(f"Exported {len(df)} biosphere flows to: {output_path}")

In [4]:
from filters import CompartmentFilter, CASNumberFilter, NameFilter, apply_filter, normalize_cas

In [5]:
import requests
from io import StringIO

# Step 1: Filter for air compartment
print("=" * 60)
print("STEP 1: Filter for Air Compartment")
print("=" * 60)

df_air = apply_filter(df, CompartmentFilter("air"))
print(f"Total flows in 'air' compartment: {len(df_air)}")

STEP 1: Filter for Air Compartment
Total flows in 'air' compartment: 1877


In [6]:
# Step 2: Split air flows by CAS presence
print("\n" + "=" * 60)
print("STEP 2: Split Air Flows by CAS Presence")
print("=" * 60)

df_air_with_cas = apply_filter(df_air, CASNumberFilter(cas=True))
# Normalize CAS numbers in df_air_with_cas
df_air_with_cas['CAS'] = df_air_with_cas['CAS'].apply(normalize_cas)

df_air_without_cas = apply_filter(df_air, CASNumberFilter(cas=False))

print(f"Air flows WITH CAS number: {len(df_air_with_cas)}")
print(f"Air flows WITHOUT CAS number: {len(df_air_without_cas)}")




STEP 2: Split Air Flows by CAS Presence
Air flows WITH CAS number: 1742
Air flows WITHOUT CAS number: 135


In [27]:
# Step 3: Fetch CSV from GitHub
print("\n" + "=" * 60)
print("STEP 3: Fetching CSV from GitHub")
print("=" * 60)

csv_url = "https://raw.githubusercontent.com/chrisroadmap/ar6/main/data_input/ghg_properties/metrics_supplement.csv"
response = requests.get(csv_url)
response.raise_for_status()

df_csv = pd.read_csv(StringIO(response.text))

# Keep only first 6 columns
df_csv = df_csv.iloc[:, :6]
print(f"\nCSV loaded and trimmed to first 6 columns: {df_csv.columns.tolist()}")
print(f"Total rows in CSV: {len(df_csv)}")

# Find the correct column names (case-insensitive search)
name_col = None
cas_col = None
lifetime_col = None
rad_eff_col = None

for col in df_csv.columns:
    col_lower = col.lower()
    if 'name' in col_lower:
        name_col = col
    if 'cas' in col_lower:
        cas_col = col
    if 'lifetime' in col_lower:
        lifetime_col = col
    if 'radiative' in col_lower or ('rad' in col_lower and 'eff' in col_lower):
        rad_eff_col = col

print(f"\nDetected column names:")
print(f"  Name column: {name_col}")
print(f"  CAS column: {cas_col}")
print(f"  Lifetime column: {lifetime_col}")
print(f"  Radiative Efficiency column: {rad_eff_col}")

if cas_col is None:
    print("\n⚠️ ERROR: Could not find CAS column. Available columns:")
    for col in df_csv.columns:
        print(f"  - {col}")
    raise ValueError("CAS column not found in CSV")

# Normalize CAS numbers in CSV using the function from filters.py
df_csv[cas_col] = df_csv[cas_col].apply(normalize_cas)

print(df_csv.to_string())


STEP 3: Fetching CSV from GitHub

CSV loaded and trimmed to first 6 columns: ['# Name', 'CASRN', 'Acronym', 'Formula', 'Lifetime', 'Radiative_efficiency']
Total rows in CSV: 249

Detected column names:
  Name column: # Name
  CAS column: CASRN
  Lifetime column: Lifetime
  Radiative Efficiency column: Radiative_efficiency
                             # Name         CASRN                Acronym                          Formula    Lifetime  Radiative_efficiency
0                    Carbon dioxide             0                                                     CO2         nan                 0.000
1                           Methane             0                                                     CH4      11.800                 0.000
2                     Nitrous oxide             0                                                     N2O     109.000                 0.003
3            Trichlorofluoromethane       75-69-4                 CFC-11                            CCl3F      52.0

In [28]:
# Step 4: Add manual CAS entries for common GHGs
print("\n" + "=" * 60)
print("STEP 4: Add Manual GHG CAS Entries")
print("=" * 60)

import re

# Manual CAS entries for common GHGs
ghg_cas_mapping = {
    "Carbon dioxide": "124-38-9",
    "Methane": "74-82-8",
    "Nitrous oxide": "10024-97-2"
}

for substance_name, cas_number in ghg_cas_mapping.items():

    # Iterate through rows in Name column only
    for idx, row in df_csv.iterrows():
        if name_col and pd.notna(row[name_col]):
            # Check if substance name matches exactly (case-insensitive)
            if str(row[name_col]).strip().lower() == substance_name.lower():
                # Update CAS number for this row
                df_csv.loc[idx, cas_col] = cas_number
                print(f"\n  Updated CAS from '{substance_name}' to '{cas_number}'")

print(df_csv.to_string())


STEP 4: Add Manual GHG CAS Entries

  Updated CAS from 'Carbon dioxide' to '124-38-9'

  Updated CAS from 'Methane' to '74-82-8'

  Updated CAS from 'Nitrous oxide' to '10024-97-2'
                             # Name         CASRN                Acronym                          Formula    Lifetime  Radiative_efficiency
0                    Carbon dioxide      124-38-9                                                     CO2         nan                 0.000
1                           Methane       74-82-8                                                     CH4      11.800                 0.000
2                     Nitrous oxide    10024-97-2                                                     N2O     109.000                 0.003
3            Trichlorofluoromethane       75-69-4                 CFC-11                            CCl3F      52.000                 0.291
4           Dichlorodifluoromethane       75-71-8                 CFC-12                           CCl2F2     102.000 

In [33]:
# Step 5: Match air flows with CAS to CSV data
print("\n" + "=" * 60)
print("STEP 4: Matching Air Flows (with CAS) to CSV")
print("=" * 60)

matching_flows = []
non_matching_flows = []

for idx, row in df_air_with_cas.iterrows():
    flow_cas_normalized = normalize_cas(row['CAS'])
    
    # Find if this CAS matches any in the CSV
    csv_match = df_csv[df_csv['CASRN'] == flow_cas_normalized]
    
    if not csv_match.empty:
        # Found a match
        match_row = csv_match.iloc[0]
        flow_data = row.to_dict()
        flow_data['Lifetime'] = pd.to_numeric(match_row.get(lifetime_col), errors='coerce') if lifetime_col else None
        flow_data['Radiative_Efficiency'] = pd.to_numeric(match_row.get(rad_eff_col), errors='coerce') if rad_eff_col else None
        matching_flows.append(flow_data)
    else:
        # No match found
        non_matching_flows.append(row)

# Create output dataframes
df_air_matched = pd.DataFrame(matching_flows) if matching_flows else pd.DataFrame()
df_air_not_matched = pd.DataFrame(non_matching_flows) if non_matching_flows else pd.DataFrame()

print(f"\nAir flows with CAS that MATCH CSV: {len(df_air_matched)}")
print(f"Air flows with CAS that DON'T MATCH CSV: {len(df_air_not_matched)}")

if len(df_air_matched) > 0:
    print("\nMatching air flows (first 5):")
    print(df_air_matched[['name', 'CAS', 'Lifetime', 'Radiative_Efficiency']].to_string())
else:
    print("\nNo matching air flows")

if len(df_air_not_matched) > 0:
    print("\nNon-matching air flows with CAS (first 5):")
    print(df_air_not_matched[['name', 'CAS']].head())
else:
    print("\nAll air flows with CAS matched!")

print(f"\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"Total air flows: {len(df_air)}")
print(f"  ├─ With CAS: {len(df_air_with_cas)}")
print(f"  │  ├─ Matched with CSV: {len(df_air_matched)}")
print(f"  │  └─ Not matched: {len(df_air_not_matched)}")
print(f"  └─ Without CAS: {len(df_air_without_cas)}")


STEP 4: Matching Air Flows (with CAS) to CSV

Air flows with CAS that MATCH CSV: 229
Air flows with CAS that DON'T MATCH CSV: 1513

Matching air flows (first 5):
                                                   name         CAS   Lifetime  Radiative_Efficiency
0                                          Bromopropane    106-94-5      0.041                 0.002
1                                                Butane    106-97-8      0.019                 0.000
2                                Carbon dioxide, fossil    124-38-9        NaN                 0.000
3            Carbon dioxide, from soil or biomass stock    124-38-9        NaN                 0.000
4                            Carbon dioxide, non-fossil    124-38-9        NaN                 0.000
5                                            Chloroform     67-66-3      0.501                 0.074
6                                   Dinitrogen monoxide  10024-97-2    109.000                 0.003
7                            

STEP 1: Filter for Air Compartment
Total flows in 'air' compartment: 1877

STEP 2: Split Air Flows by CAS Presence
Air flows WITH CAS number: 1742
Air flows WITHOUT CAS number: 135

STEP 3: Fetching CSV from GitHub
CSV loaded with columns: ['# Name', 'CASRN', 'Acronym', 'Formula', 'Lifetime', 'Radiative_efficiency', 'AGWP20', 'GWP20', 'AGWP100', 'GWP100', 'AGWP500', 'GWP500', 'AGTP50', 'GTP50', 'AGTP100', 'GTP100', 'CGTP50', 'CGTP100']
Total rows in CSV: 249
                           # Name                  CASRN  \
0                  Carbon dioxide                          
1                         Methane                          
2                   Nitrous oxide                          
3          Trichlorofluoromethane                75-69-4   
4         Dichlorodifluoromethane                75-71-8   

                 Acronym                          Formula  Lifetime  \
0                                                     CO2       nan   
1                                 

In [23]:
print(f"CSV loaded with columns: {df_csv.columns.tolist()}")
print(f"Total rows in CSV: {len(df_csv)}")
print(df_csv.head())

EXAMPLE 1: Single Filters

Flows in 'air' compartment: 1877
                      name categories          CAS
0      1,2-Dichlorobenzene     (air,)  000095-50-1
1           1,4-Butanediol     (air,)  000110-63-4
2               1-Pentanol     (air,)  000071-41-0
3                1-Pentene     (air,)  000109-67-1
4  2,2,4-Trimethyl pentane     (air,)     540-84-1

Flows with CAS number: 4120
                      name categories          CAS
0      1,2-Dichlorobenzene     (air,)  000095-50-1
1           1,4-Butanediol     (air,)  000110-63-4
2               1-Pentanol     (air,)  000071-41-0
3                1-Pentene     (air,)  000109-67-1
4  2,2,4-Trimethyl pentane     (air,)     540-84-1

Flows without CAS number: 589


In [11]:
print(df_with_cas[["name", "categories", "CAS"]].head())

                      name categories          CAS
0      1,2-Dichlorobenzene     (air,)  000095-50-1
1           1,4-Butanediol     (air,)  000110-63-4
2               1-Pentanol     (air,)  000071-41-0
3                1-Pentene     (air,)  000109-67-1
4  2,2,4-Trimethyl pentane     (air,)     540-84-1


In [15]:
# Example: Apply filters sequentially
print("=" * 60)
print("EXAMPLE 2: Sequential Filter Application")
print("=" * 60)


# Start with all flows
df_filtered = df.copy()
print(f"\nStart with: {len(df_filtered)} flows")

# Apply first filter: air compartment
df_filtered = apply_filter(df_filtered, CompartmentFilter("air"))
print(f"After filtering for 'air' compartment: {len(df_filtered)} flows")

# Apply second filter: has CAS number
df_filtered = apply_filter(df_filtered, CASNumberFilter(cas=True))
print(f"After filtering for CAS number: {len(df_filtered)} flows")

print("\nFinal results (air compartment AND has CAS):")
print(df_filtered[["name", "categories", "CAS"]].head(10))


EXAMPLE 2: Sequential Filter Application

Start with: 4709 flows
After filtering for 'air' compartment: 1877 flows
After filtering for CAS number: 1742 flows

Final results (air compartment AND has CAS):
                       name categories          CAS
0       1,2-Dichlorobenzene     (air,)  000095-50-1
1            1,4-Butanediol     (air,)  000110-63-4
2                1-Pentanol     (air,)  000071-41-0
3                 1-Pentene     (air,)  000109-67-1
4   2,2,4-Trimethyl pentane     (air,)     540-84-1
5  2,4-D dimethylamine salt     (air,)  002008-39-1
6           2-Aminopropanol     (air,)  002749-11-3
7       2-Methyl-1-propanol     (air,)  000078-83-1
8         2-Methyl-2-butene     (air,)  000513-35-9
9           2-Methylpentane     (air,)  000107-83-5


In [21]:
# Example: Different sequential combinations
print("=" * 60)
print("EXAMPLE 3: Different Filter Combinations")
print("=" * 60)

# Combination 1: Water compartment WITHOUT CAS number
df_combo1 = df.copy()
df_combo1 = apply_filter(df_combo1, CompartmentFilter("water"))
df_combo1 = apply_filter(df_combo1, CASNumberFilter(cas=False))
print(f"\nWater compartment WITHOUT CAS number: {len(df_combo1)} flows")
print(df_combo1[["name", "categories", "CAS"]].head(5))

# Combination 2: Flows with 'carbon' in name AND with CAS number
df_combo2 = df.copy()
df_combo2 = apply_filter(df_combo2, NameFilter("carbon"))
df_combo2 = apply_filter(df_combo2, CASNumberFilter(cas=True))
print(f"\n'Carbon' in name AND has CAS number: {len(df_combo2)} flows")
print(df_combo2[["name", "CAS"]].head(10))

# Combination 3: Soil compartment
df_combo3 = df.copy()
df_combo3 = apply_filter(df_combo3, CompartmentFilter("soil"))
print(f"\nSoil compartment: {len(df_combo3)} flows")
print(df_combo3[["name", "categories"]].head(10))


EXAMPLE 3: Different Filter Combinations

Water compartment WITHOUT CAS number: 174 flows
                                                name  \
0  Energy, potential (in hydropower reservoir), c...   
1                         Volume occupied, reservoir   
2                    AOX, Adsorbable Organic Halogen   
3                Actinides, radioactive, unspecified   
4                                       Antimony ion   

                     categories   CAS  
0  (natural resource, in water)  None  
1  (natural resource, in water)  None  
2                      (water,)  None  
3                      (water,)  None  
4                      (water,)  None  

'Carbon' in name AND has CAS number: 95 flows
                                          name          CAS
0                           Ammonium carbonate  000506-87-6
1                       Carbon dioxide, fossil  000124-38-9
2   Carbon dioxide, from soil or biomass stock  000124-38-9
3                   Carbon dioxide, non-fossil

In [20]:
df_combo2_sorted = df_combo2.sort_values("CAS")
print(df_combo2_sorted[["name", "CAS"]].to_string())

                                               name          CAS
71                                      Bicarbonate  000071-52-3
17                                 Carbon disulfide  000075-15-0
50                                 Carbon disulfide  000075-15-0
72                                 Carbon disulfide  000075-15-0
77                                 Carbon disulfide  000075-15-0
82                                 Carbon disulfide  000075-15-0
28                                 Carbon disulfide  000075-15-0
91                                 Carbon disulfide  000075-15-0
4                                  Carbon disulfide  000075-15-0
39                                 Carbon disulfide  000075-15-0
87                                 Carbon disulfide  000075-15-0
58                           Carbon dioxide, in air  000124-38-9
38                       Carbon dioxide, non-fossil  000124-38-9
27                       Carbon dioxide, non-fossil  000124-38-9
26       Carbon dioxide, 