# Preparing NEI-SMOKE format emissions using eGRID-based output

This notebook handles the emission processing for the second "prong" of LOCAETA's three decarbonization strategies: electrification. The goal of the study is to assess the impact of changes in energy demand under a given grid scenario (e.g., the current grid and a 2050 grid). For example, how much will emissions increase at power plants in the region of the facility or facilities? We explored a hypothetical scenario which assumes that each power plant in the region (with the region defined by NREL’s Cambium model) marginally increases its output to collectively meet the additional 300 MW load that would be incurred if the data center were connected to the grid.

The eGRID-based emissions are computed for each power plant facility (the EIS ID is the unique identifier). To include these emissions in NEI-SMOKE formatted emissions, I need to split each facility's emissions using the 2020 NEI emissions (per EIS ID and per SCC) as weights.

Here are the emissions scenarios considered and the strategy I use to prepare NEI-SMOKE style emissions for each scenario:

* current_2020  - emissions can be prepared by splitting into each SCC by the NEI 2020 emissions weight
* decarb95_2050 - can be prepared by splitting into each SCC by the NEI 2020 emissions weight
* highREcost_2050 -  can be prepared by splitting into each SCC by the NEI 2020 emissions weight


## Step 1: Read eGRID emissions and NEI-SMOKE all point source shapefile

In [None]:
import pandas as pd
import geopandas as gpd
import os

def reformat_egrid(egrid):
    """Trim an eGRID table to the emission and identifier columns used downstream.

    Keeps every column whose name contains ``_tons_final`` or ``_tons_base``
    plus the three identifier columns, drops rows without a
    ``ghgrp_facility_id``, casts the identifiers to ``int64`` and renames
    ``eis`` -> ``EIS_ID`` and ``oris`` -> ``oris_id``.

    Args:
        egrid: pandas.DataFrame holding the raw eGRID-based emissions. It must
            contain the ``ghgrp_facility_id``, ``oris`` and ``eis`` columns.

    Returns:
        A new pandas.DataFrame; the input frame is left unmodified. The first
        rows of the result are also printed for inspection.
    """
    id_columns = ['ghgrp_facility_id', 'oris', 'eis']

    # Scenario ("final") emission columns first, then the baseline ones.
    final_columns = [name for name in egrid.columns if '_tons_final' in name]
    baseline_columns = [name for name in egrid.columns if '_tons_base' in name]

    trimmed = egrid[final_columns + baseline_columns + id_columns]

    # Rows without a GHGRP facility id are discarded (per the original author
    # note, these are the renewable-energy rows).
    trimmed = trimmed.dropna(subset=['ghgrp_facility_id'])

    # Identifiers are stored as integers so they can be matched against NEI.
    trimmed = trimmed.astype(dict.fromkeys(id_columns, 'int64'))

    # Use the NEI spelling of the facility key.
    trimmed = trimmed.rename(columns={'eis': 'EIS_ID', 'oris': 'oris_id'})

    print(trimmed.head())

    return trimmed

from itertools import combinations

def find_minimal_unique_identifier_columns(df, max_combination_size=30):
    """
    Search for the smallest group of columns whose values never repeat across rows.

    Column groups are tried in order of increasing size (and, within a size,
    in the order produced by ``itertools.combinations`` over ``df.columns``);
    the first group without duplicated rows is returned.

    Args:
        df: pandas.DataFrame
        max_combination_size: int, largest group size to try. The number of
            groups grows combinatorially with this value, so keep it small for
            wide frames.

    Returns:
        List of column names, or None when no group within the size limit
        identifies every row uniquely.
    """
    column_names = df.columns.tolist()
    largest_size = min(len(column_names), max_combination_size)

    # Lazily enumerate candidates, smallest groups first.
    candidates = (
        candidate
        for size in range(1, largest_size + 1)
        for candidate in combinations(column_names, size)
    )

    for candidate in candidates:
        has_repeats = df.duplicated(subset=candidate).any()
        if has_repeats:
            continue
        return list(candidate)

    return None

def mapping_egrid_to_nei(nei_with_egrid, nei_all_pt, unique_identifier_columns, is_base):
    """Spread facility-level eGRID emissions over NEI rows and merge them back.

    For each pollutant the eGRID value attached to a row is multiplied by a
    split factor: the row's share of the NEI total for its ``EIS_ID``. When the
    NEI total of a facility is zero but eGRID reports a non-zero value, the
    eGRID value is divided equally among the facility's rows instead.

    Args:
        nei_with_egrid: NEI rows already joined with the eGRID columns. It is
            modified in place: ``<pollutant>_total_by_eis``,
            ``<pollutant>_split`` and ``<pollutant>`` columns are added, and
            callers may inspect them afterwards.
        nei_all_pt: the full NEI table carrying ``<pollutant>_nei`` columns.
        unique_identifier_columns: list of column names that uniquely identify
            a row in both tables; used as the merge key.
        is_base: if True use the ``*_tons_base`` eGRID columns, otherwise the
            ``*_tons_final`` ones.

    Returns:
        A copy of ``nei_all_pt`` with one extra column per pollutant holding
        the eGRID-scaled emission where available and the original NEI value
        everywhere else.

    Raises:
        ValueError: if ``unique_identifier_columns`` is None or empty.
    """
    if not unique_identifier_columns:
        raise ValueError(
            "unique_identifier_columns must name at least one column; "
            "without a row key the scaled emissions cannot be merged back into NEI."
        )
    key_columns = list(unique_identifier_columns)

    # Column mapping between NEI and eGRID (note SOx <-> SO2 and PM2_5 <-> PM2.5)
    suffix = 'base' if is_base else 'final'
    pollutant_map = {
        'NOx': f'NOx_tons_{suffix}',
        'PM2_5': f'PM2.5_tons_{suffix}',
        'VOC': f'VOC_tons_{suffix}',
        'NH3': f'NH3_tons_{suffix}',
        'SOx': f'SO2_tons_{suffix}',
    }

    # Number of NEI rows per facility; needed for the equal-split fallback.
    rows_per_eis = nei_with_egrid.groupby('EIS_ID')['EIS_ID'].transform('size')

    # Compute and apply split factors per pollutant
    for nei_col, egrid_col in pollutant_map.items():

        print (nei_col, egrid_col)
        nei_values = nei_with_egrid[f'{nei_col}_nei']
        egrid_values = nei_with_egrid[egrid_col]

        # NEI total of this pollutant for the facility each row belongs to
        total_by_eis = nei_with_egrid.groupby('EIS_ID')[f'{nei_col}_nei'].transform('sum')
        nei_with_egrid[f'{nei_col}_total_by_eis'] = total_by_eis

        # Default: NEI share of the facility total. Zero totals are masked to
        # NaN (not pd.NA) so the column stays float64 instead of object.
        split_col = f'{nei_col}_split'
        nei_with_egrid[split_col] = nei_values / total_by_eis.where(total_by_eis != 0)

        # Facilities with no NEI emissions but a non-zero eGRID value get an
        # equal share per row.
        zero_total_with_egrid = (total_by_eis == 0) & egrid_values.notna() & (egrid_values != 0)
        equal_split_ids = nei_with_egrid.loc[zero_total_with_egrid, 'EIS_ID'].unique()
        equal_split_rows = nei_with_egrid['EIS_ID'].isin(equal_split_ids)
        # Assign by position so a non-unique index cannot break alignment.
        nei_with_egrid.loc[equal_split_rows, split_col] = (
            1.0 / rows_per_eis[equal_split_rows]
        ).to_numpy()

        # eGRID-scaled emissions, stored under the original NEI pollutant name
        nei_with_egrid[nei_col] = nei_with_egrid[split_col] * egrid_values

    # Merge results back into the full NEI dataset
    nei_all_pt_final = nei_all_pt.merge(
        nei_with_egrid[key_columns + list(pollutant_map)],
        on=key_columns,
        how='left',
    )

    # Rows without an eGRID-scaled value keep their original NEI emission.
    for nei_col in pollutant_map:
        nei_all_pt_final[nei_col] = nei_all_pt_final[nei_col].fillna(
            nei_all_pt_final[f'{nei_col}_nei']
        )

    return nei_all_pt_final

In [None]:
import geopandas as gpd
import os

egrid_dir_path = '/Users/yunhalee/Documents/LOCAETA/eGRID_emissions/'
nei_pt_emis_file_path = '/Users/yunhalee/Documents/LOCAETA/RCM/INMAP/evaldata_v1.6.1/2020_nei_emissions/combined_NEI2020_pt_oilgas_ptegu_ptnonipm.shp'

# Load the combined NEI point-source shapefile with a clean 0..n-1 index
# so later row comparisons line up.
nei_all_pt = gpd.read_file(nei_pt_emis_file_path).reset_index(drop=True)

print(nei_all_pt.head())

# Tag the NEI emission columns with "_nei" so the plain pollutant names are
# free for the eGRID-scaled values computed later.
pollutant_cols = ['NOx','PM2_5', 'VOC','NH3', 'SOx']
col_dict = {poll: f'{poll}_nei' for poll in pollutant_cols}
nei_all_pt = nei_all_pt.rename(columns=col_dict)

# Show the fully duplicated rows, then keep only the first occurrence of each.
is_duplicate_row = nei_all_pt.duplicated()
print(nei_all_pt[is_duplicate_row])
nei_all_pt = nei_all_pt[~is_duplicate_row]

In [None]:


# NOTE(review): "current_2020" is processed last, so the variables left behind
# by this loop (egrid, nei_with_egrid, nei_all_pt_final) describe the current
# grid when the cells below are run -- confirm this ordering is intentional.
egrids_list = ["highREcost_2050", "decarb95_2050", "current_2020"]
is_base_emission = True

# Baseline outputs carry a "_base" tag in their file names; scenario ("final")
# outputs carry none.
output_tag = "_base" if is_base_emission else ""

for egrid_name in egrids_list:

    print ("processing ", egrid_name)
    egrid_file = os.path.join(egrid_dir_path, f'ProjectX_{egrid_name}.csv')
    egrid = pd.read_csv(egrid_file)

    egrid = reformat_egrid(egrid)

    # Filter NEI rows to only those that exist in eGRID
    nei_with_egrid = nei_all_pt[nei_all_pt['EIS_ID'].isin(egrid['EIS_ID'])].copy()

    # Stack parameters are not needed for the split; they are still present in
    # nei_all_pt and therefore in the final merged output.
    nei_with_egrid.drop(columns=['height', 'diam',
        'temp', 'velocity'], inplace=True)

    unique_identifier_columns = find_minimal_unique_identifier_columns(nei_with_egrid)

    if not unique_identifier_columns:
        # Without a row key the scaled emissions cannot be merged back into
        # the full NEI table, so stop here with an explicit message.
        raise ValueError(
            f"No combination of columns uniquely identifies rows for {egrid_name}."
        )
    print("Columns that uniquely identify rows:", unique_identifier_columns)

    print("filtering", nei_with_egrid.shape)

    # Merge eGRID emissions
    nei_with_egrid = nei_with_egrid.merge(egrid, on='EIS_ID', how='left')

    print("Merging egrid", nei_with_egrid.shape)

    # Adds the split/total/scaled columns to nei_with_egrid in place and
    # returns the full NEI table with eGRID-scaled emissions.
    nei_all_pt_final = mapping_egrid_to_nei(nei_with_egrid, nei_all_pt, unique_identifier_columns, is_base = is_base_emission)

    # save output files
    filepath = os.path.join(egrid_dir_path, f"{egrid_name}{output_tag}_debugging.csv")
    nei_with_egrid.to_csv(filepath)

    filepath = os.path.join(egrid_dir_path, f"{egrid_name}{output_tag}.shp")
    nei_all_pt_final.to_file(filepath, driver='ESRI Shapefile')

    # Sanity checks: row counts should match, and the number of scaled PM2.5
    # values can be compared with the number of available eGRID values.
    print(nei_all_pt_final.shape, nei_all_pt.shape)
    print(nei_all_pt_final['PM2_5'].notna().sum())
    print(nei_with_egrid['PM2_5'].notna().sum())
    print(nei_with_egrid['PM2.5_tons_final'].notna().sum())

### Useful debugging script

In [None]:
# check the number of unique IDs
ID_name = 'EIS_ID'
org_egrid_list = egrid[ID_name].unique()
egrid_list = nei_with_egrid[ID_name].unique()
org_nei_list = nei_all_pt[ID_name].unique()
nei_list = nei_all_pt_final[ID_name].unique()

# Order: eGRID facilities, eGRID facilities found in NEI, facilities in the
# final output, facilities in the original NEI table.
print(len(org_egrid_list), len(egrid_list), len(nei_list),len(org_nei_list))


# Rows where the eGRID-scaled PM2.5 is missing although eGRID reports a value.
# The scaled emissions are stored under the NEI pollutant name ('PM2_5') by
# mapping_egrid_to_nei; no 'PM2_5_egrid' column is created.
missing_pm25_egrid = nei_with_egrid[
    nei_with_egrid['PM2_5'].isna() &
    nei_with_egrid['PM2.5_tons_final'].notna()
]

# Show only the eGRID "final" columns and the per-facility NEI totals.
# Printed explicitly because a bare expression in the middle of a cell is
# not displayed.
print(missing_pm25_egrid.filter(regex ='tons_final|total_by_eis'))


# get a certain ID facility
print(nei_all_pt[(nei_all_pt['EIS_ID'] == 1028611)])

# Compare NEI2020 against Current grid base emissions

In [None]:
import matplotlib.pyplot as plt

# Column mapping between NEI and eGRID
pollutant_map = {
    'NOx': 'NOx_tons_base',
    'PM2_5': 'PM2.5_tons_base',
    'VOC': 'VOC_tons_base',
    'NH3': 'NH3_tons_base',
    'SOx': 'SO2_tons_base'

}

# One scatter per pollutant: NEI facility total (x) against the difference
# between the eGRID base value and that total (y). Uses the nei_with_egrid
# frame left behind by the processing loop.
for nei_col, egrid_col in pollutant_map.items():
    # Scatter plot of total_by_eis vs. eGRID
    plt.figure(figsize=(6, 6))
    plt.scatter(
        nei_with_egrid[f'{nei_col}_total_by_eis'],
        nei_with_egrid[egrid_col] - nei_with_egrid[f'{nei_col}_total_by_eis'],
        alpha=0.5,
        # A label is required for plt.legend() to have something to show.
        label=f"{egrid_col} - NEI total"
    )
    plt.xlabel(f"Total {nei_col} by EIS_ID in NEI")
    plt.ylabel(f"{egrid_col} - '{nei_col}_total_by_eis'")
    plt.title(f"{nei_col} NEI Total vs difference with eGRID base Emissions")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()



## Evaluate new egrid emissions formatted for NEI-SMOKE style

In [None]:
import geopandas as gpd
import os


egrid_dir_path = '/Users/yunhalee/Documents/LOCAETA/eGRID_emissions/'
egrids_list = ["current_2020","decarb95_2050","highREcost_2050"]


# Column mapping between NEI and eGRID
pollutant_final_map = {
    'NOx': 'NOx_tons_final',
    'PM2_5': 'PM2.5_tons_final',
    'VOC': 'VOC_tons_final',
    'NH3': 'NH3_tons_final',
    'SOx': 'SO2_tons_final'
}

# Column mapping between NEI and eGRID
pollutant_base_map = {
    'NOx': 'NOx_tons_base',
    'PM2_5': 'PM2.5_tons_base',
    'VOC': 'VOC_tons_base',
    'NH3': 'NH3_tons_base',
    'SOx': 'SO2_tons_base'
}

is_base_emission = True

# nei emissions column names
pollutant_cols = ['NOx','PM2_5', 'VOC','NH3', 'SOx']

for egrid_name in egrids_list:

    # Pick the eGRID columns and the shapefile written by the processing loop
    # for the selected emission type.
    if is_base_emission:
        pollutant_map = pollutant_base_map
        file_path = os.path.join(egrid_dir_path, f'{egrid_name}_base.shp')
    else:
        pollutant_map = pollutant_final_map
        file_path = os.path.join(egrid_dir_path, f'{egrid_name}.shp')

    # read emission scenario
    final_egrid_emis = gpd.read_file(file_path)

    # Reset index to ensure proper comparison
    final_egrid_emis.reset_index(drop=True, inplace=True)

    # Subset rows where actual egrid emissions are available (egrid ≠ nei for any pollutant)
    mask = pd.concat([
        final_egrid_emis[k] != final_egrid_emis[f'{k}_nei']
        for k in pollutant_cols
    ], axis=1).any(axis=1)

    final_egrid_emis = final_egrid_emis[mask]

    # Compute group sums for each pollutant by EIS_ID
    group_sums = final_egrid_emis.groupby('EIS_ID')[pollutant_cols].sum().reset_index()

    # Printed explicitly: a bare expression inside a loop is never displayed.
    print(group_sums.head())

    print ("processing ", egrid_name)
    original_egrid_file = os.path.join(egrid_dir_path, f'ProjectX_{egrid_name}.csv')
    original_egrid = pd.read_csv(original_egrid_file)
    original_egrid = reformat_egrid(original_egrid)


    # Merge for comparison
    comparison_df = group_sums.merge(original_egrid, on='EIS_ID')
    print(comparison_df.head())

    # Scatter plots. The loop variables are deliberately not called `nei` /
    # `egrid`, so the notebook-level `egrid` DataFrame used by the debugging
    # cell is not overwritten with a column-name string.
    for nei_col, egrid_col in pollutant_map.items():
        x = comparison_df[nei_col]
        y = comparison_df[egrid_col] - comparison_df[nei_col]

        plt.figure(figsize=(6, 6))
        # A label is required for plt.legend() to have something to show.
        plt.scatter(x, y, alpha=0.6, edgecolors='k', label=f'{egrid_col} - {nei_col}')
        plt.xlabel(f'{nei_col} NEI egrid')
        plt.ylabel(f'{nei_col} original egrid - nei egrid')
        # Differences are expected to be near zero; zoom in on +/- 1.
        plt.ylim(-1, 1)
        plt.title(f'{nei_col} Comparison - {egrid_name}')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()


## Plot the total emissions (either `final` or `dif_final`) by species for all eGRID scenarios

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Dictionary to store total emissions: {pollutant: {egrid case: total}}
egrid_sum = {}

# Column mapping between NEI and eGRID
pollutant_diff_map = {
    'NOx': 'NOx_tons_dif_final',
    'PM2_5': 'PM2.5_tons_dif_final',
    'VOC': 'VOC_tons_dif_final',
    'NH3': 'NH3_tons_dif_final',
    'SOx': 'SO2_tons_dif_final'
}

# Column mapping between NEI and eGRID
pollutant_final_map = {
    'NOx': 'NOx_tons_final',
    'PM2_5': 'PM2.5_tons_final',
    'VOC': 'VOC_tons_final',
    'NH3': 'NH3_tons_final',
    'SOx': 'SO2_tons_final'
}

# Switch to pollutant_final_map to plot the final totals instead.
pollutant_map = pollutant_diff_map
emission_kind = "Difference" if pollutant_map is pollutant_diff_map else "Final"

for egrid_name in egrids_list:
    print ("processing ", egrid_name)
    original_egrid_file = os.path.join(egrid_dir_path, f'ProjectX_{egrid_name}.csv')
    original_egrid = pd.read_csv(original_egrid_file)

    # Store totals per pollutant. The loop variables are deliberately not
    # called `nei` / `egrid`, so the notebook-level `egrid` DataFrame is not
    # overwritten with a column-name string.
    for nei_col, egrid_col in pollutant_map.items():
        egrid_sum.setdefault(nei_col, {})[egrid_name] = original_egrid[egrid_col].sum()

    print(egrid_sum)

# Convert to DataFrame: rows = pollutant, columns = egrid cases
emissions_df = pd.DataFrame(egrid_sum).T  # Transpose so pollutants are rows

# Plotting
ax = emissions_df.plot(kind='bar', figsize=(10, 6))
ax.set_ylabel("Total Emissions")
# The title follows the selected map so it cannot contradict the data shown.
ax.set_title(f"Total {emission_kind} Emissions by Pollutant and eGRID Case")
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend(title="eGRID Case")
plt.grid(True)
plt.show()
