# Preparing Electrification emissions into NEI-SMOKE format

Caleb prepared two sets of emission files (e.g., facility_easyhard (with EIS_ID and SCC) and powerplant_easyhard (with only EIS_ID)). 

 - For "facility" data, it will be similar to the CCS emission processing (matching EIS_ID and SCC).

 - For the powerplant data, I need to process the facility-level emissions using the 2020 NEI emissions (per EIS ID and per SCC): each facility total is split across its SCCs in proportion to the NEI 2020 emission weights. 



## Step 1: Read raw emissions and NEI-SMOKE all point source shapefile

In [None]:
import geopandas as gpd
from pyproj import CRS
import os, sys

# Make the local LOCAETA_AQ package importable for this notebook session
package_path = os.path.abspath('/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/LOCAETA_AQ')
if sys.path.count(package_path) == 0:
    sys.path.append(package_path)

import emission_processing

# Load the combined NEI 2020 point-source shapefile and give it a clean
# 0..n-1 index so later comparisons line up row by row
original_emis ='/Users/yunhalee/Documents/LOCAETA/RCM/INMAP/evaldata_v1.6.1/2020_nei_emissions/combined_NEI2020_pt_oilgas_ptegu_ptnonipm_w_sectors.shp'
nei_all_pt = gpd.read_file(original_emis).reset_index(drop=True)

# Tag the NEI pollutant columns with an "_nei" suffix so the un-suffixed
# names are free for the scenario emissions computed later
pollutant_cols = ['NOx','PM2_5', 'VOC','NH3', 'SOx']
col_dict = {poll: f'{poll}_nei' for poll in pollutant_cols}
nei_all_pt = nei_all_pt.rename(columns=col_dict)

nei_all_pt.head()

In [None]:
import pandas as pd
import geopandas as gpd
import os
import numpy as np

def reformat_powerplant(df):
    """Collapse a power-plant emission table to one row per EIS facility.

    Columns whose name contains ``_tons_final`` or ``_tons_base`` are treated
    as emission columns. Rows without an ``eis`` value are discarded, ``eis``
    is renamed to ``EIS_ID`` and the emission columns are summed per
    ``EIS_ID``; the first non-null ``cambium_gea`` of each facility is kept.

    Args:
        df: pandas.DataFrame holding at least ``eis``, ``cambium_gea``,
            ``DOE/EIA ORIS plant or facility code`` and the emission columns.

    Returns:
        pandas.DataFrame with ``EIS_ID``, the summed emission columns and
        ``cambium_gea``.
    """
    oris_label = 'DOE/EIA ORIS plant or facility code'

    # Emission columns are recognised purely by their name
    pollutant_cols = [name for name in df.columns if '_tons_final' in name]
    base_cols = [name for name in df.columns if '_tons_base' in name]
    emission_cols = pollutant_cols + base_cols

    # Keep only the emissions plus the identifier columns
    trimmed = df[emission_cols + ['eis', 'cambium_gea', oris_label]]

    # Totals over every row, including rows that lack an EIS id
    total_before = trimmed[emission_cols].sum()

    # Rows without an EIS id cannot be matched to NEI, so they are removed
    with_eis = trimmed.dropna(subset=['eis'])

    # Totals once the id-less rows are gone (still before any grouping)
    total_after = with_eis[emission_cols].sum()

    # Integer ids, then the names used by the rest of the notebook
    typed = with_eis.astype({'eis': 'int64', oris_label: 'int64'})
    typed = typed.rename(columns={'eis': 'EIS_ID', oris_label: "oris_ID"})

    # One row per facility: add up emissions, keep the first cambium_gea
    how_to_combine = dict.fromkeys(emission_cols, 'sum')
    how_to_combine['cambium_gea'] = 'first'
    df_grouped = typed.groupby('EIS_ID').agg(how_to_combine).reset_index()

    # Report how much emission mass was lost with the id-less rows
    print('before :', total_before, 'after: ', total_after) 

    return df_grouped

from itertools import combinations

def find_minimal_unique_identifier_columns(df, max_combination_size=30):
    """
    Search for the smallest group of columns whose values never repeat.

    Column groups are tried from size 1 upwards, in the order produced by
    itertools.combinations over the DataFrame's column order, and the first
    group without duplicated rows wins.

    Args:
        df: pandas.DataFrame
        max_combination_size: int, largest group size to try (bounds runtime)

    Returns:
        List of column names, or None when no group within the size limit
        identifies every row uniquely
    """
    column_names = df.columns.tolist()
    widest = min(len(column_names), max_combination_size)

    # Lazily enumerate candidates, narrowest groups first
    candidates = (
        group
        for width in range(1, widest + 1)
        for group in combinations(column_names, width)
    )

    return next(
        (list(group) for group in candidates if not df.duplicated(subset=group).any()),
        None,
    )

def mapping_powerplant_to_nei(nei_with_powerplant, nei_all_pt, unique_identifier_columns, is_base):
    """Spread facility-level power-plant emissions over the NEI rows of each facility.

    For every pollutant the facility total is split across the NEI rows that
    share an EIS_ID, in proportion to each row's ``<pollutant>_nei`` value.
    When a facility's NEI total is zero but its power-plant emission is
    non-zero, the emission is split equally over the facility's rows. The
    result is left-merged onto ``nei_all_pt``; pollutant values that are
    still missing after the merge are taken from the ``_nei`` columns.

    Note: ``nei_with_powerplant`` is modified in place (``was_mapped``,
    ``<pollutant>_total_by_eis``, ``<pollutant>_split`` and the un-suffixed
    pollutant columns are written to it).

    Args:
        nei_with_powerplant: NEI rows of the power-plant facilities, already
            joined with the ``*_tons_base`` / ``*_tons_final`` columns and
            ``cambium_gea``.
        nei_all_pt: full NEI table with ``<pollutant>_nei`` columns.
        unique_identifier_columns: list of columns used as the merge key.
        is_base: truthy to use the ``*_tons_base`` columns, otherwise the
            ``*_tons_final`` columns.

    Returns:
        ``nei_all_pt`` extended with ``was_mapped``, ``cambium_gea`` and the
        five pollutant columns.
    """
    # NEI pollutant name -> power-plant column for the requested case
    suffix = 'base' if is_base else 'final'
    pollutant_map = {
        'NOx': f'NOx_tons_{suffix}',
        'PM2_5': f'PM2.5_tons_{suffix}',
        'VOC': f'VOC_tons_{suffix}',
        'NH3': f'NH3_tons_{suffix}',
        'SOx': f'SO2_tons_{suffix}',
    }

    # Flag so these rows can be recognised after merging back
    nei_with_powerplant['was_mapped'] = True

    for species, plant_col in pollutant_map.items():

        print (species, plant_col)
        nei_name = f'{species}_nei'
        share_col = f'{species}_split'

        # NEI total of the pollutant per facility, repeated on each row
        facility_total = nei_with_powerplant.groupby('EIS_ID')[nei_name].transform('sum')
        nei_with_powerplant[f'{species}_total_by_eis'] = facility_total
        zero_total = facility_total == 0

        # Share of the facility total carried by each row (NaN if total is 0)
        nei_with_powerplant[share_col] = np.where(
            zero_total,
            np.nan,
            nei_with_powerplant[nei_name] / facility_total,
        )

        # Facilities with no NEI mass but a real power-plant emission
        plant_values = nei_with_powerplant[plant_col]
        needs_fallback = zero_total & plant_values.notna() & (plant_values != 0)

        print(f"{species}: # fallback allocations due to zero NEI = {needs_fallback.sum()}")

        # Those facilities get an equal share on every one of their rows
        for facility_id in nei_with_powerplant.loc[needs_fallback, 'EIS_ID'].unique():
            same_facility = nei_with_powerplant['EIS_ID'] == facility_id
            nei_with_powerplant.loc[same_facility, share_col] = 1.0 / same_facility.sum()

        # Scaled emission stored under the plain NEI pollutant name
        nei_with_powerplant[f'{species}'] = nei_with_powerplant[share_col] * plant_values

        print(f"after {plant_col} splitting : ", nei_with_powerplant[f'{species}'].sum())

    species_cols = list(pollutant_map)
    nei_cols = ['PM2_5_nei', 'NH3_nei', 'VOC_nei', 'NOx_nei', 'SOx_nei']
    report_cols = ['PM2_5', 'NH3', 'VOC', 'NOx', 'SOx']

    # Attach the mapped values to the complete NEI table
    carried_cols = unique_identifier_columns + ["was_mapped", 'cambium_gea'] + species_cols
    nei_all_pt_final = nei_all_pt.merge(
        nei_with_powerplant[carried_cols],
        on=unique_identifier_columns,
        how='left',
    )

    gdf_subset = nei_all_pt_final[nei_all_pt_final['was_mapped'] == True]
    print("base dataframe size ", gdf_subset.shape, nei_with_powerplant.shape)
    print("before filling; subset nei sum ", gdf_subset[nei_cols].sum())
    print("before filling; subset base sum ", gdf_subset[report_cols].sum())
    print("before filling; nei_all_pt_final base sum ", nei_all_pt_final[report_cols].sum()) 

    # Missing values fall back to NEI; the difference feeds the check below
    for species in species_cols:
        nei_values = nei_all_pt_final[f'{species}_nei']
        nei_all_pt_final[f'{species}'] = nei_all_pt_final[f'{species}'].fillna(nei_values)
        nei_all_pt_final[f'{species}_diff'] = nei_all_pt_final[f'{species}'] - nei_values

    # Count mapped rows whose five pollutants all equal the NEI values
    diff_cols = ['VOC_diff', 'NH3_diff', 'NOx_diff', 'SOx_diff', 'PM2_5_diff']
    mapped_mask = nei_all_pt_final['was_mapped'] == True
    no_change_mask = (nei_all_pt_final[diff_cols] == 0).all(axis=1)
    unchanged_count = int((mapped_mask & no_change_mask).sum())

    print("Number of rows where emissions were mapped but did not change:", unchanged_count)

    # Same summary as above, now after the NEI fill
    gdf_subset = nei_all_pt_final[nei_all_pt_final['was_mapped'] == True]
    print("subset dataframe size ", gdf_subset.shape, nei_with_powerplant.shape)
    print("subset nei sum ", gdf_subset[nei_cols].sum())
    print("subset base sum ", gdf_subset[report_cols].sum())

    # The difference columns were only needed for the check
    return nei_all_pt_final.drop(columns=[f'{species}_diff' for species in species_cols])

# Diagnostic to find why eGRID sums don't match after merging with NEI

def diagnose_egrid_nei_mismatch(egrid, nei_all_pt, nei_with_powerplant):
    """
    Diagnose why eGRID sums don't match after filtering/merging with NEI data.

    Compares the sets of EIS_IDs in the three inputs, reports the base and
    final emissions carried by eGRID facilities that have no NEI rows, and
    prints a base-emission accounting of what survives the merge.

    Args:
        egrid: DataFrame with ``EIS_ID`` plus the ``*_tons_base`` and
            ``*_tons_final`` columns for PM2.5, NH3, VOC, NOx and SO2.
        nei_all_pt: DataFrame with an ``EIS_ID`` column (full NEI table).
        nei_with_powerplant: DataFrame with an ``EIS_ID`` column (merged table).

    Returns:
        dict with
          - ``egrid_not_in_nei``: set of EIS_IDs present in eGRID only,
          - ``lost_emissions``: base-emission sums of those facilities
            (None when there are none),
          - ``lost_emissions_final``: final-emission sums of those facilities
            (None when there are none),
          - ``kept_egrid``: eGRID rows whose EIS_ID is in the merged table.
    """
    print("=== eGRID-NEI EIS_ID MISMATCH DIAGNOSTIC ===")

    # Base and final columns are kept in separate lists so that each section
    # below reports the case its heading announces.
    species = ['PM2.5', 'NH3', 'VOC', 'NOx', 'SO2']
    base_cols = [f'{name}_tons_base' for name in species]
    final_cols = [f'{name}_tons_final' for name in species]

    # Get unique EIS_IDs from each dataset
    egrid_eids = set(egrid['EIS_ID'].unique())
    nei_eids = set(nei_all_pt['EIS_ID'].unique())
    merged_eids = set(nei_with_powerplant['EIS_ID'].unique())

    print(f"Unique EIS_IDs in eGRID: {len(egrid_eids)}")
    print(f"Unique EIS_IDs in NEI: {len(nei_eids)}")
    print(f"Unique EIS_IDs after merge: {len(merged_eids)}")

    # Find missing EIS_IDs
    egrid_not_in_nei = egrid_eids - nei_eids
    nei_not_in_egrid = nei_eids - egrid_eids
    egrid_lost_in_merge = egrid_eids - merged_eids

    print(f"\nEIS_IDs in eGRID but not in NEI: {len(egrid_not_in_nei)}")
    print(f"EIS_IDs in NEI but not in eGRID: {len(nei_not_in_egrid)}")
    print(f"EIS_IDs from eGRID lost after merge: {len(egrid_lost_in_merge)}")

    lost_base = None
    lost_final = None

    if egrid_not_in_nei:
        print(f"\n❌ FOUND THE PROBLEM: {len(egrid_not_in_nei)} eGRID facilities have no NEI data")

        # Emissions carried by the facilities that NEI does not know about
        missing_egrid = egrid[egrid['EIS_ID'].isin(egrid_not_in_nei)]
        lost_final = missing_egrid[final_cols].sum()
        lost_base = missing_egrid[base_cols].sum()

        print("\nFinal Emissions lost from missing EIS_IDs:")
        for col, value in lost_final.items():
            print(f"  {col}: {value}")

        print("\nBase Emissions lost from missing EIS_IDs:")
        for col, value in lost_base.items():
            print(f"  {col}: {value}")

        print(f"\nAll missing EIS_IDs: {list(egrid_not_in_nei)}")

        # Show some details about the missing facilities. Columns are read
        # individually (not via iterrows) so the EIS_ID keeps its own dtype.
        print(f"\nDetails of first few missing facilities:")
        sample_missing = missing_egrid.head()
        for eis_id, nox, pm25 in zip(
            sample_missing['EIS_ID'],
            sample_missing['NOx_tons_base'],
            sample_missing['PM2.5_tons_base'],
        ):
            print(f"  EIS_ID {eis_id}: NOx={nox}, PM2.5={pm25}")

    if len(egrid_lost_in_merge) > len(egrid_not_in_nei):
        print(f"\n❌ ADDITIONAL PROBLEM: More EIS_IDs lost in merge than expected")
        extra_lost = egrid_lost_in_merge - egrid_not_in_nei
        print(f"Extra lost EIS_IDs: {len(extra_lost)}")
        print(f"Sample extra lost: {list(extra_lost)[:5]}")

    # Verify the math, always on the base columns
    original_sum = egrid[base_cols].sum()
    kept_egrid = egrid[egrid['EIS_ID'].isin(merged_eids)]
    kept_sum = kept_egrid[base_cols].sum()

    print(f"\n=== EMISSION ACCOUNTING ===")
    print("Original eGRID sums:")
    for col, val in original_sum.items():
        print(f"  {col}: {val}")

    print("\nSums for EIS_IDs that made it through merge:")
    for col, val in kept_sum.items():
        print(f"  {col}: {val}")

    print("\nDifference (lost emissions):")
    diff = original_sum - kept_sum
    for col, val in diff.items():
        print(f"  {col}: {val}")

    return {
        'egrid_not_in_nei': egrid_not_in_nei,
        'lost_emissions': lost_base,
        'lost_emissions_final': lost_final,
        'kept_egrid': kept_egrid
    }

# Quick function to check EIS_ID formats
def check_eis_id_formats(egrid, nei_all_pt):
    """Print a side-by-side look at the EIS_ID columns of eGRID and NEI.

    Shows the first ten ids, the column dtypes, whether any id (as text)
    starts or ends with a space, and the distinct text lengths of the ids.
    Nothing is returned; the report goes to stdout.
    """
    print("=== EIS_ID FORMAT CHECK ===")

    egrid_ids = egrid['EIS_ID']
    nei_ids = nei_all_pt['EIS_ID']

    # First ten ids of each table
    egrid_sample = egrid_ids.head(10).tolist()
    nei_sample = nei_ids.head(10).tolist()
    print("Sample eGRID EIS_IDs:", egrid_sample)
    print("Sample NEI EIS_IDs:", nei_sample)

    # Column dtypes
    print(f"\neGRID EIS_ID dtype: {egrid_ids.dtype}")
    print(f"NEI EIS_ID dtype: {nei_ids.dtype}")

    # Text form of the ids, shared by the two checks below
    egrid_text = egrid_ids.astype(str)
    nei_text = nei_ids.astype(str)

    # A space at the very start or very end of an id
    padded = '^ | $'
    egrid_spaces = egrid_text.str.contains(padded, regex=True).any()
    nei_spaces = nei_text.str.contains(padded, regex=True).any()

    if not (egrid_spaces or nei_spaces):
        print("✅ No leading/trailing spaces found")
    else:
        print("❌ Found leading/trailing spaces in EIS_IDs")
        print(f"  eGRID has spaces: {egrid_spaces}")
        print(f"  NEI has spaces: {nei_spaces}")

    # Distinct id lengths
    egrid_lengths = egrid_text.str.len().unique()
    nei_lengths = nei_text.str.len().unique()

    print(f"\neGRID EIS_ID lengths: {sorted(egrid_lengths)}")
    print(f"NEI EIS_ID lengths: {sorted(nei_lengths)}")

In [None]:


overall_scenario = 'Food_Agr'
Scenario_dir_path = f'/Users/yunhalee/Documents/LOCAETA/Electrification/{overall_scenario}/'
Scenario_list = ["current_easyhard", "2050_easyhard_noIRA_111D", "2050_easyhard_decarb95"] 

# scenario name -> emission name used in the output file names; regional
# runs carry the region as a suffix, the national run does not
if overall_scenario != 'Full_USA':
    scen_emis_list = { scen: f'{scen}_{overall_scenario}' for scen in Scenario_list}
else:
    scen_emis_list = { scen: f'{scen}' for scen in Scenario_list}

# Base-emission columns printed at each bookkeeping step below
egrid_base_cols = ['PM2.5_tons_base', 'NH3_tons_base', 'VOC_tons_base', 'NOx_tons_base', 'SO2_tons_base']

for Scenario_name, Emis_name in scen_emis_list.items():

    print ("processing ", Scenario_name)
    Facilities_file = os.path.join(Scenario_dir_path, f'pp_{Scenario_name}.csv')

    egrid = pd.read_csv(Facilities_file)

    print("original data", egrid[egrid_base_cols].sum())
    egrid = reformat_powerplant(egrid)
    print("after grouping", egrid[egrid_base_cols].sum())

    # Filter NEI rows to only those that exist in eGRID
    nei_with_powerplant = nei_all_pt[nei_all_pt['EIS_ID'].isin(egrid['EIS_ID'])].copy()

    # Subset only for necessary columns
    nei_with_powerplant.drop(columns=['height', 'diam',
        'temp', 'velocity'], inplace=True)

    # The merge key used to put the mapped rows back into the full NEI table
    unique_identifier_columns = find_minimal_unique_identifier_columns(nei_with_powerplant)

    if unique_identifier_columns:
        print("Columns that uniquely identify rows:", unique_identifier_columns)
    else:
        print("No combination of columns uniquely identifies rows.")
        # Without a key the mapping step cannot merge its results back, so
        # stop here with an explicit message instead of failing later on
        # `None + [...]` inside mapping_powerplant_to_nei.
        raise ValueError(
            f"No unique identifier columns found for scenario {Scenario_name}; "
            "cannot merge power-plant emissions back into NEI"
        )

    print("filtering", nei_with_powerplant.shape)

    # Merge eGRID emissions
    nei_with_powerplant = nei_with_powerplant.merge(egrid, on='EIS_ID', how='left')
    # Facility totals are repeated on every NEI row, so count each once
    true_sum = nei_with_powerplant.groupby('EIS_ID')[egrid_base_cols].first().sum()
    print("intial nei_with_powerplant sum", true_sum)

    print("Merging egrid", nei_with_powerplant.shape)

    # DEBUGGING
    check_eis_id_formats(egrid, nei_all_pt)
    diagnostic_results = diagnose_egrid_nei_mismatch(egrid, nei_all_pt, nei_with_powerplant)
    print(diagnostic_results)

    # Base case first, then the final (scenario) case
    for is_base_emission in [True, False]: 
        nei_all_pt_final = mapping_powerplant_to_nei(nei_with_powerplant, nei_all_pt, unique_identifier_columns, is_base = is_base_emission)

        # Split into two GeoDataFrames based on was_mapped
        mapped_df = nei_all_pt_final[nei_all_pt_final['was_mapped'] == True].copy()
        unmapped_df = nei_all_pt_final[nei_all_pt_final['was_mapped'] != True].copy()

        print("after filling; mapped_df sum ", mapped_df[['PM2_5', 'NH3', 'VOC', 'NOx', 'SOx']].sum()) 

        print("final size ", mapped_df.shape, unmapped_df.shape)

        # Save the mapped data; base and final differ only in the file suffix
        output_suffix = "_pp_base" if is_base_emission else "_pp"
        if not mapped_df.empty:
            mapped_filename = os.path.join(Scenario_dir_path, f"{Emis_name}{output_suffix}.shp")
            mapped_df.to_file(mapped_filename, driver='ESRI Shapefile')
            print(f"Saved mapped data to {mapped_filename}")

        print(mapped_df['source_fil'].unique())
        print(unmapped_df['source_fil'].unique())

## Read facility data using CCS script

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import warnings
import geopandas as gpd
from pyproj import CRS

# Suppress all warnings
warnings.filterwarnings('ignore')

CCS_raw_file = f'/Users/yunhalee/Documents/LOCAETA/Electrification/{overall_scenario}/facility_easyhard.csv'
Scenario_name, Emis_name = next(iter(scen_emis_list.items())) # get the first item-key which is current_easyhard

# The facility file is only processed under the current_easyhard scenario name
if Scenario_name == 'current_easyhard':
    print("Good to go")
else:
    raise Exception("Wrong scenario!!")

# process CCS emissions to merge it with NEI emissions
cs_emis = pd.read_csv(CCS_raw_file)

# ensure scc column to be integer
cs_emis['scc'] = cs_emis['scc'].astype(int)

cs_emis.rename(columns={'eis': 'EIS_ID', 'scc': 'SCC'}, inplace = True)
# Find all duplicate rows, including the first occurrence
all_duplicates = cs_emis[cs_emis.duplicated(keep=False)]
if all_duplicates.empty:
    print("Great, no duplicate")
else:
    print("Arg..there are duplicates")
    print("All duplicate rows:")
    print(all_duplicates)
    
print("cs_emis:", cs_emis.head())

# Check if EIS ID in cs_emis also exist in eGRID
common_ids = cs_emis['EIS_ID'][cs_emis['EIS_ID'].isin(egrid['EIS_ID'])].unique()

# common_ids is a NumPy array, so its emptiness is tested through its size;
# `not common_ids` is ambiguous for arrays and raises for more than one element.
if common_ids.size == 0:
    print("Great, no duplicated EIS_ID between cs_emis and egrid")
    # use the unmapped nei dataframe for cs_emis emission processing 
    nei_df = unmapped_df.copy()
else:
    print("Wait, there is duplicated EIS_ID between cs_emis and egrid")
    print(common_ids)
    nei_df = nei_all_pt.copy()

print("nei df:", nei_df.head())

In [None]:
def mapping_non_powerplant_to_nei(nei_with_non_powerplant, nei_df, unique_identifier_columns, is_base):
    """Spread (EIS_ID, SCC)-level facility emissions over the matching NEI rows.

    For every pollutant the emission of an (EIS_ID, SCC) pair is split across
    the NEI rows of that pair in proportion to their ``<pollutant>_nei``
    value. When the NEI total of a pair is zero but its facility emission is
    non-zero, the emission is split equally over the pair's rows. The result
    is left-merged onto ``nei_df``; rows that were not mapped take their
    pollutant values from the ``_nei`` columns, while mapped rows keep
    whatever the split produced (including NaN).

    Note: ``nei_with_non_powerplant`` is modified in place (``was_mapped``,
    ``<pollutant>_split`` and the un-suffixed pollutant columns are written).

    Args:
        nei_with_non_powerplant: NEI rows of the facilities, already joined
            with the ``*_tons_base`` / ``*_tons_final`` columns and
            ``cambium_gea``.
        nei_df: NEI table the results are merged onto; needs the
            ``<pollutant>_nei`` columns.
        unique_identifier_columns: list of columns used as the merge key.
        is_base: truthy to use the ``*_tons_base`` columns, otherwise the
            ``*_tons_final`` columns.

    Returns:
        ``nei_df`` extended with ``was_mapped``, ``cambium_gea`` and the five
        pollutant columns.
    """
    # NEI pollutant name -> facility emission column for the requested case
    suffix = 'base' if is_base else 'final'
    pollutant_map = {
        'NOx': f'NOx_tons_{suffix}',
        'PM2_5': f'PM2.5_tons_{suffix}',
        'VOC': f'VOC_tons_{suffix}',
        'NH3': f'NH3_tons_{suffix}',
        'SOx': f'SO2_tons_{suffix}',
    }

    # add Boolean to track back the mapped rows later
    nei_with_non_powerplant['was_mapped'] = True  

    # Compute and apply split factors per pollutant
    for nei_col, powerplant_col in pollutant_map.items():
        print(nei_col, powerplant_col)

        # NEI total per (EIS_ID, SCC), repeated on each row of the pair
        total_by_group = nei_with_non_powerplant.groupby(['EIS_ID', 'SCC'])[f'{nei_col}_nei'].transform('sum')

        # Default split factor within (EIS_ID, SCC). Zero totals become
        # np.nan (not pd.NA) so the column stays a float column.
        split_col = f'{nei_col}_split'
        nei_with_non_powerplant[split_col] = (
            nei_with_non_powerplant[f'{nei_col}_nei'] / total_by_group.replace(0, np.nan)
        )

        # Handle zero-NEI cases where facility emissions exist
        mask_zero_total = (
            (total_by_group == 0) &
            nei_with_non_powerplant[powerplant_col].notna() &
            (nei_with_non_powerplant[powerplant_col] != 0)
        )

        print(f"{nei_col}: # fallback allocations due to zero NEI = {mask_zero_total.sum()}")

        # Those pairs get an equal share on every one of their rows
        for (eid, scc) in (
            nei_with_non_powerplant.loc[mask_zero_total, ['EIS_ID', 'SCC']]
            .drop_duplicates()
            .itertuples(index=False)
        ):
            match_rows = (nei_with_non_powerplant['EIS_ID'] == eid) & (nei_with_non_powerplant['SCC'] == scc)
            n_rows = match_rows.sum()
            nei_with_non_powerplant.loc[match_rows, split_col] = 1.0 / n_rows

        # Scale emissions and overwrite pollutant column
        nei_with_non_powerplant[nei_col] = (
            nei_with_non_powerplant[split_col] * nei_with_non_powerplant[powerplant_col]
        )

    # Merge results back into the full NEI dataset
    nei_all_pt_final = nei_df.merge(
        nei_with_non_powerplant[ 
            unique_identifier_columns + ["was_mapped", 'cambium_gea'] + list(pollutant_map)
        ],
        on=unique_identifier_columns,
        how='left'
    )

    gdf_subset = nei_all_pt_final[nei_all_pt_final['was_mapped'] == True]
    print("base dataframe size ", gdf_subset.shape, nei_with_non_powerplant.shape)
    print("before filling; subset nei sum ", gdf_subset[['PM2_5_nei', 'NH3_nei', 'VOC_nei', 'NOx_nei', 'SOx_nei']].sum())
    print("before filling; subset base sum ", gdf_subset[['PM2_5', 'NH3', 'VOC', 'NOx', 'SOx']].sum())
    print("before filling; nei_all_pt_final base sum ", nei_all_pt_final[['PM2_5', 'NH3', 'VOC', 'NOx', 'SOx']].sum()) 

    # Fill the rows that were not mapped with the NEI values; the mask does
    # not depend on the pollutant, so it is built once.
    unmapped_mask = nei_all_pt_final['was_mapped'] != True
    for k in pollutant_map:
        nei_all_pt_final.loc[unmapped_mask, k] = nei_all_pt_final.loc[unmapped_mask, k].fillna(nei_all_pt_final.loc[unmapped_mask, f'{k}_nei'])

    return nei_all_pt_final 
 

In [None]:


# Keep only the NEI rows whose (EIS_ID, SCC) pair appears in the facility data
nei_with_non_powerplant = nei_df.merge(
    cs_emis[['EIS_ID','SCC']], on=['EIS_ID','SCC'], how='inner'
)

# Subset only for necessary columns
nei_with_non_powerplant.drop(columns=['height', 'diam',
    'temp', 'velocity'], inplace=True)
# 'cambium_gea' is only present when nei_df was taken from the power-plant
# step's unmapped_df; when nei_df is a copy of nei_all_pt there is nothing to
# drop, so a missing column is tolerated here.
nei_with_non_powerplant.drop(columns=['cambium_gea'], inplace=True, errors='ignore')

print("cs_emis original base sum ", cs_emis.filter(like='tons_base', axis=1).sum())
print("cs_emis original final sum ", cs_emis.filter(like = 'tons_final', axis=1).sum()) 

print("filtering", nei_df.shape, nei_with_non_powerplant.shape, cs_emis.shape)

# Attach the facility (cs_emis) emissions to the matching NEI rows
nei_with_non_powerplant = nei_with_non_powerplant.merge(cs_emis, on=['EIS_ID', 'SCC'], how='left')

# Pair totals are repeated on every NEI row, so count each pair once
true_sum = nei_with_non_powerplant.groupby(['EIS_ID','SCC'])[['PM2.5_tons_base', 'NH3_tons_base', 'VOC_tons_base', 'NOx_tons_base', 'SO2_tons_base']].first().sum()
print("intial nei_with_non_powerplant sum", true_sum)

print("Merging egrid", nei_with_non_powerplant.shape)

# NOTE(review): the results are merged onto nei_all_pt rather than nei_df, so
# the "rest" output below also contains the power-plant rows with their NEI
# values - confirm this is intended.
for is_base_emission in [False, True ]: 
    nei_all_pt_final = mapping_non_powerplant_to_nei(nei_with_non_powerplant, nei_all_pt, unique_identifier_columns, is_base = is_base_emission)

    # nei_all_pt_final can't have NULL
    nei_all_pt_final = nei_all_pt_final.fillna(0)

    # Split into two GeoDataFrames based on was_mapped
    mapped_df = nei_all_pt_final[nei_all_pt_final['was_mapped'] == True].copy()
    unmapped_df = nei_all_pt_final[nei_all_pt_final['was_mapped'] != True].copy()

    print("after filling; mapped_df sum ", mapped_df[['PM2_5', 'NH3', 'VOC', 'NOx', 'SOx']].sum()) 

    print("final size ", mapped_df.shape, unmapped_df.shape)

    # Save the mapped data; base and final differ only in the file suffix
    output_suffix = "_base" if is_base_emission else ""
    if not mapped_df.empty:
        mapped_filename = os.path.join(Scenario_dir_path, f"{Emis_name}{output_suffix}.shp")
        mapped_df.to_file(mapped_filename, driver='ESRI Shapefile')
        print(f"Saved mapped data to {mapped_filename}")

    # The rest (unmapped data) is written once, for the final case only
    if not is_base_emission and not unmapped_df.empty:
        rest_filename = os.path.join(Scenario_dir_path, f"{Emis_name}_rest_NEI.shp")
        unmapped_df.to_file(rest_filename, driver='ESRI Shapefile')
        print(f"Saved unmapped NEI data to {rest_filename}")

    print(mapped_df['source_fil'].unique())
    print(unmapped_df['source_fil'].unique())  

In [None]:
# Base-emission columns compared between the raw facility table and the
# table obtained after joining it with NEI
facility_base_cols = ['PM2.5_tons_base','NH3_tons_base','VOC_tons_base','NOx_tons_base','SO2_tons_base']

# Total of the raw facility table
cs_sum = cs_emis[facility_base_cols].sum()

# Total of the joined table; each (EIS_ID, SCC) pair is counted once because
# its emission is repeated on every NEI row of the pair
one_row_per_pair = nei_with_non_powerplant.groupby(['EIS_ID','SCC'])[facility_base_cols].first()
merged_sum = one_row_per_pair.sum()

print("cs_emis sum:\n", cs_sum)
print("merged sum:\n", merged_sum)
print("difference:\n", merged_sum - cs_sum)

## Evaluate new emissions formatted for NEI-SMOKE style

In [None]:
import geopandas as gpd
import os
import matplotlib.pyplot as plt


#overall_scenario = 'Food&Agr'
emis_dir_path = f'/Users/yunhalee/Documents/LOCAETA/Electrification/{overall_scenario}/'

# Column mapping between NEI and eGRID
pollutant_final_map = {
    'NOx': 'NOx_tons_final',
    'PM2_5': 'PM2.5_tons_final',
    'VOC': 'VOC_tons_final',
    'NH3': 'NH3_tons_final',
    'SOx': 'SO2_tons_final'
}

# Column mapping between NEI and eGRID
pollutant_base_map = {
    'NOx': 'NOx_tons_base',
    'PM2_5': 'PM2.5_tons_base',
    'VOC': 'VOC_tons_base',
    'NH3': 'NH3_tons_base',
    'SOx': 'SO2_tons_base'
}

# Switch between checking the base files and the final files
is_base_emission = True

# nei emissions column names
pollutant_cols = ['NOx','PM2_5', 'VOC','NH3', 'SOx']

pollutant_map = pollutant_base_map if is_base_emission else pollutant_final_map
file_suffix = '_base' if is_base_emission else ''

# Original-file columns listed in the same order as pollutant_cols, so the
# relabelling of the sums below cannot pair a value with the wrong pollutant
original_cols = [pollutant_map[poll] for poll in pollutant_cols]

# --- non-powerplant data: identical for every scenario, so read once ---
non_pp_stem = 'current_easyhard' if overall_scenario == 'Full_USA' else f'current_easyhard_{overall_scenario}'
file_path2 = os.path.join(emis_dir_path, f'{non_pp_stem}{file_suffix}.shp')
final_emis2 = gpd.read_file(file_path2)
final_emis2.reset_index(drop=True, inplace=True)

original_file2 = os.path.join(emis_dir_path, f'facility_easyhard.csv')
original_emis2 = pd.read_csv(original_file2)

# ensure scc column to be integer
original_emis2['scc'] = original_emis2['scc'].astype(int)
# the facility file names its id column 'eis' (same rename as when the file
# is first processed earlier in the notebook)
original_emis2.rename(columns={'eis': 'EIS_ID', 'scc': 'SCC'}, inplace = True)

sum_final2 = final_emis2[pollutant_cols].sum()
sum_orig2 = original_emis2[original_cols].sum()
sum_orig2.index = pollutant_cols

for scen_name, emis_name in scen_emis_list.items():

    print ("processing ", emis_name)

    # read the formatted power-plant emissions of this scenario
    file_path1 = os.path.join(emis_dir_path, f'{emis_name}_pp{file_suffix}.shp')
    final_emis1 = gpd.read_file(file_path1) 

    # Reset index to ensure proper comparison
    final_emis1.reset_index(drop=True, inplace=True)

    # read the raw power-plant emissions of this scenario
    original_file1 = os.path.join(emis_dir_path, f'pp_{scen_name}.csv')
    original_emis1 = pd.read_csv(original_file1) 
    original_emis1 = reformat_powerplant(original_emis1)

    # --- sum by pollutant for each dataset ---
    # final_emis already has pollutant_cols
    sum_final1 = final_emis1[pollutant_cols].sum()

    # original_emis needs mapped column names, relabelled to pollutant_cols
    sum_orig1 = original_emis1[original_cols].sum()
    sum_orig1.index = pollutant_cols

    # --- create one DataFrame for plotting ---
    df_compare = pd.DataFrame({
        'pp_base': sum_final1,
        'original_pp': sum_orig1,
        'non_pp_base': sum_final2,
        'original_non_pp': sum_orig2
    })

    print(df_compare)

    # --- plotting ---
    ax = df_compare.plot(
        kind='bar',
        figsize=(10,6),
        width=0.8
    )

    # Add values on top of bars
    for p in ax.patches:
        value = p.get_height()
        if not pd.isna(value):
            ax.annotate(
                f"{value:,.0f}",               # format with commas, no decimals
                (p.get_x() + p.get_width() / 2, value),
                ha='center', va='bottom',
                fontsize=8, rotation=90, xytext=(0, 2), textcoords="offset points"
            )

    plt.title(f"Pollutant Sum Comparison for {emis_name}")
    plt.ylabel("Emissions (tons)")
    plt.xlabel("Pollutants")
    plt.xticks(rotation=0)
    plt.legend(title="Dataset")
    plt.tight_layout()
    plt.show()

## Compare base and final for powerplants and non-powerplant emissions

In [None]:
import geopandas as gpd
import os
import matplotlib.pyplot as plt
import pandas as pd


def _compare_base_final(base_path, final_path, label, pollutants):
    """Total the pollutant columns of a base/final shapefile pair.

    Args:
        base_path: Path of the base-case emission shapefile.
        final_path: Path of the final-case emission shapefile.
        label: Name written to the ``source`` column and used as the prefix
            of the ``<label>_base`` / ``<label>_final`` column names.
        pollutants: Emission column names to sum in both files.

    Returns:
        A DataFrame indexed by pollutant with the columns ``<label>_base``,
        ``<label>_final``, ``final-base`` (final minus base) and ``source``.
    """
    base_total = gpd.read_file(base_path)[pollutants].sum()
    final_total = gpd.read_file(final_path)[pollutants].sum()

    summary = pd.DataFrame({
        f'{label}_base': base_total,
        f'{label}_final': final_total,
        'final-base': final_total - base_total,
    })
    summary['source'] = label
    return summary


emis_dir_path = f'/Users/yunhalee/Documents/LOCAETA/Electrification/{overall_scenario}/'

pollutant_cols = ['NOx','PM2_5','VOC','NH3','SOx']

# storage for results: one comparison frame per source
compare_all = []

# --- case 1: powerplants (one base/final file pair per scenario) ---
for emis_name in scen_emis_list.values():
    compare_all.append(
        _compare_base_final(
            os.path.join(emis_dir_path, f'{emis_name}_pp_base.shp'),
            os.path.join(emis_dir_path, f'{emis_name}_pp.shp'),
            f'powerplants: {emis_name}',
            pollutant_cols,
        )
    )

# --- case 2: non-powerplants (only once, outside the scenario loop) ---
# The file names carry the scenario name for every run except 'Full_USA'.
scenario_suffix = '' if overall_scenario == 'Full_USA' else f'_{overall_scenario}'
compare_all.append(
    _compare_base_final(
        os.path.join(emis_dir_path, f'current_easyhard{scenario_suffix}_base.shp'),
        os.path.join(emis_dir_path, f'current_easyhard{scenario_suffix}.shp'),
        'others_facilities: same in all runs',
        pollutant_cols,
    )
)

# --- combine into one DataFrame ---
# Rows are stacked per source; the '<label>_base' / '<label>_final' columns are
# source-specific, so they are NaN on rows that belong to other sources.
df_all = pd.concat(compare_all)



In [None]:
# Figures are written to a folder named after the first scenario in scen_emis_list.
first_scenario = list(scen_emis_list.values())[0]
output_dir = '/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/' + first_scenario

try:
    os.makedirs(output_dir, exist_ok=True)
except OSError as err:
    print(f"Problem in creating directory: {err}")
else:
    print(f"Directory {output_dir} is created or already exists")


# Wide table: one row per pollutant, one column per source, values = final - base.
final_diff_emis = df_all.pivot_table(
    index=df_all.index,
    columns="source",
    values="final-base",
)

# Column order for the plot: every powerplant scenario first, other facilities last.
new_cols = [f'powerplants: {scen_name}' for scen_name in scen_emis_list.values()]
new_cols += ['others_facilities: same in all runs']

print(f"checking plot bar legend names: {new_cols}")

final_diff_emis = final_diff_emis[new_cols]

# --- plotting ---
ax = final_diff_emis.plot(kind="bar", figsize=(10,6), width=0.8)

# Write each bar's value at the bar's horizontal center and half its height.
for bar in ax.patches:
    height = bar.get_height()
    if pd.isna(height):
        continue
    x_center = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f"{height:,.0f}",
        (x_center, height/2),
        ha='center',
        va='bottom',
        fontsize=10,
        rotation=90,
        xytext=(0, 2),
        textcoords="offset points",
    )

plt.title(f"Pollutant Emissions Change (Final - Base) for {overall_scenario}")
plt.ylabel("Emissions change (tons)")
plt.xlabel("Pollutants")
plt.xticks(rotation=0)
plt.legend(title="")
plt.tight_layout()
plt.savefig(f"{output_dir}/Total_Difference.png", dpi=300, bbox_inches='tight')
plt.show()

## DON'T USE YET - Plot the current_2020_base emissions based on GEA regions

Note that the current_2020_base emissions differ from the NEI emissions.

In [None]:
import os
import geopandas as gpd
import pandas as pd

run_name = 'current_easyhard'

# Input shapefiles for this run: the facility file and its '_pp' counterpart.
Facilities_emis_file = f'{Scenario_dir_path}/{run_name}.shp'
Facilities_emis_file_pp = f'{Scenario_dir_path}/{run_name}_pp.shp'

# Folder that receives the figures for this run.
final_output_dir = f'/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/{run_name}/'
os.makedirs(final_output_dir, exist_ok=True)

# Load both files, each with a fresh 0..n-1 index.
gdf_emis, gdf_emis_pp = (
    gpd.read_file(shp_path).reset_index(drop=True)
    for shp_path in (Facilities_emis_file, Facilities_emis_file_pp)
)

# Stack the two tables and tag the result with the CRS of the first file.
gdf_emis_all = gpd.GeoDataFrame(
    pd.concat([gdf_emis, gdf_emis_pp], ignore_index=True),
    crs=gdf_emis.crs,
)

print("Combined shape:", gdf_emis_all.shape)
gdf_emis_all.head()


In [None]:
import matplotlib.pyplot as plt 
import numpy as np 

# Scenario emission columns and their '<pollutant>_nei' counterparts.
cs_pollutants = ['NOx','PM2_5', 'VOC','NH3', 'SOx']
nei_pollutants = [f'{poll}_nei' for poll in cs_pollutants]

width = 0.35 # bar width

# One figure per pollutant: scenario vs. NEI totals, side by side per region.
# NOTE(review): only gdf_emis is grouped here; the combined gdf_emis_all built
# in the previous cell is not used -- confirm this is intended.
for cs_col, nei_col in zip(cs_pollutants, nei_pollutants):
    # total both columns within each 'cambium_ge' group
    region_totals = gdf_emis.groupby('cambium_ge')[[cs_col, nei_col]].sum()
    grouped_sum = region_totals.reset_index()

    print(grouped_sum)

    regions = grouped_sum['cambium_ge'].tolist()
    x = np.arange(len(regions))

    fig, ax = plt.subplots(figsize = (10, 6))
    bar_groups = [
        ax.bar(x-width/2, grouped_sum[cs_col], width, label =f'{cs_col}_electrification'),
        ax.bar(x+width/2, grouped_sum[nei_col], width, label = f'{nei_col}'),
    ]

    # Label every bar with its value, anchored at half the bar height.
    for bar in (rect for group in bar_groups for rect in group):
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width()/2
        ax.annotate(
            f'{height:,.0f}',
            xy = (x_center, height/2),
            xytext = (0, 3),
            textcoords = 'offset points',
            ha = 'center',
            va = 'bottom',
            fontsize = 9,
            rotation = 90,
        )

    ax.set_xlabel('Cambium_gea region')
    ax.set_ylabel('Total Emissions [tons/yr]')
    ax.set_title(f'Total {cs_col} Emissions by Cambium_gea Region')
    ax.set_xticks(x)
    ax.set_xticklabels(regions, rotation =45, ha='right')
    ax.legend()

    plt.tight_layout()
    plt.savefig(f'{final_output_dir}Total_Difference_{cs_col}_by_regions.png', dpi=300, bbox_inches='tight')
    plt.show()
