# Debugging and Checking emissions

#### This script is almost identical to the emission_analysis.ipynb under LOCAETA-reports (jupyter notebook). I created this script for debugging and checking emission scenarios. 

Author : Yunha Lee

Date: March 12, 2025 

# 1. Read emissions file

Here are the point-source facility emissions where amine-based CCS technology is applied. The case presented here covers the state of Louisiana. The data below contain the NEI2020 emissions (columns with the "_old" suffix) as well as the emissions modified by the CCS technology (columns without "_old") for major air pollutants. 

In [None]:
import geopandas as gpd
import os


# read base and sens emission scenarios
# (the commented-out paths below are alternative scenario inputs kept for quick switching)
gdf_emis = gpd.read_file(
    #'/Users/yunhalee/Documents/LOCAETA/RCM/INMAP/evaldata_v1.6.1/2020_nei_emissions/combined_NEI2020_pt_oilgas_ptegu_ptnonipm.shp')
 #'/Users/yunhalee/Documents/LOCAETA/CS_emissions/USA_point_CCS_reduced_emis.shp')

 '/Users/yunhalee/Documents/LOCAETA/CS_emissions/new_LA_point_CCS_reduced_emis.shp') # Colorado_point_CCS_reduced_emis.shp' ) #new_LA_point_CCS.shp') #)

# Directory that receives the figures and debugging CSV files written by the cells below
output_dir = '/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/LA_CCS/'
national_scale_on = True # Set it False if the scenario is only for single State

# exist_ok=True creates the directory when missing and is a no-op when it
# already exists, so no separate (race-prone) existence check is needed.
os.makedirs(output_dir, exist_ok=True)

# Reset index to ensure proper comparison
gdf_emis.reset_index(drop=True, inplace=True)

print(gdf_emis.head())
# def subset_data(final_df, state_fips=None):
#     if state_fips:
#         # Match the first two digits of FIPS with State FIPS code
#         return final_df[final_df['FIPS'].astype(str).str[:2] == str(state_fips)]
#     return final_df

# # Dictionary to loop over
# regions = {"LA": '22'} #, "Nation": None}

# final_df = gpd.GeoDataFrame(gdf_emis, geometry= "geometry")
# type(final_df)

# for region_name, state_fips in regions.items():
    
#     # Subset the DataFrame based on the chosen state or national
#     final_df_subset = subset_data(final_df, state_fips)

#     print(f"Processing data for: {region_name}")

# IMPORTANT NOTE : This subset function doesn't include total emissions by EIS_ID. Some cases,     
def subset_pollutants_with_difference(df, pollutants):
    """Return the rows of ``df`` where any pollutant differs from its ``_old`` value.

    Each name in ``pollutants`` is compared row by row against the column
    named ``<name>_old``. A pollutant is ignored when either of the two
    columns is missing from ``df``. Values are compared per row only; nothing
    is aggregated across rows.

    Args:
        df: DataFrame (or GeoDataFrame) holding the pollutant columns.
        pollutants: Iterable of pollutant column names to compare.

    Returns:
        The subset of ``df`` where at least one compared pollutant differs.
        When no pollutant has both columns present, an empty selection with
        the same columns as ``df`` is returned.
    """
    changed_flags = [
        df[name] != df[f'{name}_old']
        for name in pollutants
        if name in df.columns and f'{name}_old' in df.columns
    ]

    if not changed_flags:
        # Nothing could be compared. Indexing with a scalar False (the old
        # behaviour) raises KeyError, so return an explicit empty selection.
        return df.iloc[0:0]

    # OR the per-pollutant flags together: keep a row if any pollutant changed
    mask = changed_flags[0]
    for flags in changed_flags[1:]:
        mask = mask | flags

    return df[mask]

# Pollutant columns that are compared against their "<name>_old" counterparts
pollutants = ['VOC', 'NOx', 'NH3', 'SOx', 'PM2_5']
# Keep only the rows where at least one pollutant differs from its "_old" value
subset_df = subset_pollutants_with_difference(gdf_emis, pollutants)

# Bare expression: displayed as the cell output when run in a notebook
subset_df


In [None]:
# Bare expression: lists the column names of the emissions GeoDataFrame as the cell output
gdf_emis.columns

The cobenefits code, which processes the air quality emissions based on the CCS technology, generates higher emissions of NOx and SOx for 1-2 facilities, which appears to be a bug in the code. The dataframe below shows two facilities with higher SOx emissions with CCS. 

The dataframe below shows the facility with higher NOx emissions with CCS. The incorrect emissions are very small (a total of 0.5 tons for NOx and 4 tons for SOx), and they are unlikely to affect our overall results. Nevertheless, these emissions should be fixed in the cobenefits code in the future. Otherwise, I need to manually remove these emissions. 

In [None]:
import contextily as ctx
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import os

def plot_spatial_distribution_percent_change_with_basemap(gdf, output_dir, national_scale = False):
    """Save one percent-change map per pollutant on an OpenStreetMap basemap.

    For each of NH3, VOC, NOx, SOx and PM2_5 the per-row percent change is
    computed as ``(current - old) / old * 100``, where ``old`` is the
    ``<pollutant>_old`` column. Rows whose old value is 0 get NaN because the
    zero denominator is replaced by NaN. A pollutant whose columns are missing
    is reported with ``print`` and skipped.

    Args:
        gdf: GeoDataFrame with the pollutant columns and their ``_old``
            counterparts. The helper column is added to the frame returned by
            ``to_crs``, not to the argument itself.
        output_dir: Directory that receives the PNG files and the debugging CSV.
        national_scale: When true the basemap is requested at zoom 4,
            otherwise at zoom 10.

    Returns:
        None. One ``<pollutant>_percent_change_with_basemap.png`` is written
        per plotted pollutant.
    """
    pollutants = ['NH3', 'VOC', 'NOx', 'SOx', 'PM2_5']

    # Ensure the GeoDataFrame is in the correct CRS for basemaps
    gdf = gdf.to_crs(epsg=3857)

    # The zoom level does not depend on the pollutant, so pick it once
    basemap_zoom = 4 if national_scale else 10

    for pollutant in pollutants:
        col_current = f'{pollutant}'
        col_old = f'{pollutant}_old'

        # Check the columns BEFORE creating the figure: skipping afterwards
        # would leave an open figure behind that is never closed.
        if col_current not in gdf.columns or col_old not in gdf.columns:
            print(f'Columns {col_current} or {col_old} do not exist in the data.')
            continue

        # Use gridspec for more precise layout control
        fig = plt.figure(figsize=(10, 6))
        gs = fig.add_gridspec(1, 20)  # Create a 1×20 grid for precise width control

        # Main plot takes the first 19 of the 20 grid columns
        ax = fig.add_subplot(gs[0, :19])

        # Calculate percent change; a zero "old" value becomes NaN instead of dividing by zero
        gdf['percent_change'] = ((gdf[col_current] - gdf[col_old]) / gdf[col_old].replace(0, float('nan'))) * 100

        # debugging
        # NOTE: the same file name is used for every pollutant, so each
        # iteration overwrites the file written by the previous one.
        gdf[gdf['percent_change'] > 0].to_csv(
            os.path.join(output_dir, 'debuggin_USA_positive_PM_changes.csv'), index=False)

        # Set color scale (NH3 uses a wider range)
        vmin, vmax = -100, 100
        if pollutant == 'NH3':
            vmin, vmax = -200, 200

        # Plot data
        gdf.plot(column='percent_change', cmap='coolwarm', vmin=vmin, vmax=vmax,
                 legend=False, edgecolor='black', ax=ax, markersize=30, alpha=0.95)

        # Add basemap
        ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, zoom=basemap_zoom)

        # Set title
        ax.set_title(f'Percent Change in {pollutant} emissions by amine-based CCS')

        # Calculate statistics
        total_current = gdf[col_current].sum()
        total_old = gdf[col_old].sum()
        total_percent_change = ((total_current - total_old) / total_old) * 100
        max_percent_change = gdf['percent_change'].max()
        min_percent_change = gdf['percent_change'].min()

        # Display statistics below the map
        ax.text(0.5, -0.15, f'Total Percent Change: {total_percent_change:.2f}%\nMax Percent Change: {max_percent_change:.2f}%\nMin Percent Change: {min_percent_change:.2f}%',
                ha='center', va='center', transform=ax.transAxes, fontsize=12, color='black')

        # Create colorbar in the last column of the grid
        cbar_ax = fig.add_subplot(gs[0, 19:])
        sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=mcolors.TwoSlopeNorm(vmin=vmin, vcenter=0, vmax=vmax))
        # Public API replacement for assigning the private ``_A`` attribute
        sm.set_array([])
        cbar = fig.colorbar(sm, cax=cbar_ax)
        cbar.set_label(f'{pollutant} Percent Change', rotation=270, labelpad=15)

        # Minimize spacing between elements
        plt.subplots_adjust(wspace=0.05)  # Very small spacing between map and colorbar

        # Save figure, then close it so figures do not accumulate across pollutants
        plt.savefig(os.path.join(output_dir, f'{pollutant}_percent_change_with_basemap.png'), dpi=300, bbox_inches='tight')
        plt.close()

# Example usage: map the percent changes for the rows whose emissions differ
# from their "_old" values; the PNG files are written into output_dir
plot_spatial_distribution_percent_change_with_basemap(subset_df, output_dir, national_scale = national_scale_on)


In [None]:
import contextily as ctx
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import os

def plot_spatial_distribution_relative_difference_with_basemap(gdf, output_dir, national_scale=False):
    """Save one relative-difference map per pollutant on an OpenStreetMap basemap.

    For each of NH3, VOC, NOx, SOx and PM2_5 the per-row value is computed as
    ``(old - current) / current * 100``, i.e. the ``<pollutant>_old`` value
    relative to the current value. Rows whose current value is 0 get NaN
    because the zero denominator is replaced by NaN. A pollutant whose columns
    are missing is reported with ``print`` and skipped.

    Args:
        gdf: GeoDataFrame with the pollutant columns and their ``_old``
            counterparts. The helper column is added to the frame returned by
            ``to_crs``, not to the argument itself.
        output_dir: Directory that receives the PNG files.
        national_scale: When true the basemap is requested at zoom 4,
            otherwise at zoom 10.

    Returns:
        None. One ``<pollutant>_Relative_Difference_with_basemap.png`` is
        written per plotted pollutant.
    """
    pollutants = ['NH3', 'VOC', 'NOx',  'SOx', 'PM2_5'] #

    # Ensure the GeoDataFrame is in the correct CRS for basemaps (Web Mercator)
    gdf = gdf.to_crs(epsg=3857)

    # The zoom level does not depend on the pollutant, so pick it once
    basemap_zoom = 4 if national_scale else 10

    for pollutant in pollutants:
        col_current = f'{pollutant}'
        col_old = f'{pollutant}_old'

        # Ensure both columns exist BEFORE creating the figure: skipping
        # afterwards would leave an open figure behind that is never closed.
        if col_current not in gdf.columns or col_old not in gdf.columns:
            print(f'Columns {col_current} or {col_old} do not exist in the data.')
            continue

        # Use gridspec for more precise layout control
        fig = plt.figure(figsize=(10, 6))
        gs = fig.add_gridspec(1, 20)  # Create a 1×20 grid for precise width control

        # Main plot takes the first 19 of the 20 grid columns
        ax = fig.add_subplot(gs[0, :19])

        # Relative difference with the current value as denominator; a zero
        # current value becomes NaN instead of dividing by zero
        gdf['reverse_percent_change'] = ((gdf[col_old] - gdf[col_current]) / gdf[col_current].replace(0, float('nan'))) * 100

        # Fixed color scale from -100% to 100% (NH3 uses a wider range)
        vmin, vmax = -100, 100

        if pollutant == 'NH3':
            vmin, vmax = -200, 200

        # Plot the spatial distribution of the relative difference
        gdf.plot(column='reverse_percent_change', cmap='coolwarm', vmin=vmin, vmax=vmax, legend=False, edgecolor='black',
                 ax=ax, markersize=30, alpha=0.95)

        # Add basemap
        ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik, zoom=basemap_zoom)

        ax.set_title(f'Relative Difference in {pollutant} NEI 2020 Emissions Compared to Amine-Based CCS Emissions')

        # Calculate the total, max, and min relative difference
        total_current = gdf[col_current].sum()
        total_old = gdf[col_old].sum()
        total_percent_change = ((total_old - total_current) / total_current) * 100
        max_percent_change = gdf['reverse_percent_change'].max()
        min_percent_change = gdf['reverse_percent_change'].min()

        # Display the total, max, and min values below the map
        ax.text(0.5, -0.15, f'Total Relative Difference: {total_percent_change:.2f}%\nMax Relative Difference: {max_percent_change:.2f}%\nMin Relative Difference: {min_percent_change:.2f}%', 
                ha='center', va='center', transform=ax.transAxes, fontsize=12, color='black')

        # Create colorbar in the last column of the grid
        cbar_ax = fig.add_subplot(gs[0, 19:])
        sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=mcolors.TwoSlopeNorm(vmin=vmin, vcenter=0, vmax=vmax))
        # Public API replacement for assigning the private ``_A`` attribute
        sm.set_array([])
        cbar = fig.colorbar(sm, cax=cbar_ax)
        cbar.set_label(f'{pollutant} Relative Difference', rotation=270, labelpad=15)

        # Minimize spacing between elements
        plt.subplots_adjust(wspace=0.05)  # Very small spacing between map and colorbar

        # Save the figure for each pollutant as a separate file
        plt.savefig(os.path.join(output_dir, f'{pollutant}_Relative_Difference_with_basemap.png'), dpi=300, bbox_inches='tight')
        plt.close()  # Close the figure to avoid overlapping plots

# Map the relative difference (old vs. current emissions) for the rows whose
# emissions differ; the PNG files are written into output_dir
plot_spatial_distribution_relative_difference_with_basemap(subset_df, output_dir, national_scale = national_scale_on)

The spatial distributions of the emission changes due to the amine-based CCS technology are presented here. The maps show the percent changes, computed as the difference between the emissions with CCS and the original NEI2020 emissions, divided by the NEI2020 emissions. For primary PM2.5, NOx and SOx, the percent changes are negative because the CCS technology reduces their emissions. For VOC, the percent changes are only slightly positive. 
<table>
  <tr>
    <td><img src="/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/CO_CCS/PM2_5_percent_change_with_basemap.png" alt="PM2.5" width="400"/></td>
    <td><img src="/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/CO_CCS/VOC_percent_change_with_basemap.png" alt="VOC" width="400"/></td>
  </tr>
  <tr>
    <td><img src="/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/CO_CCS/NOx_percent_change_with_basemap.png" alt="NOx" width="400"/></td>
    <td><img src="/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/CO_CCS/SOx_percent_change_with_basemap.png" alt="SOx" width="400"/></td>
  </tr>
</table>

The amine-based CCS technology increases NH3 emissions significantly. Several point-source facilities with zero NH3 emissions in the NEI2020 emission inventory have positive NH3 emissions with CCS. Since the percent change calculation used for the other pollutants uses the NEI2020 emissions as the denominator, it drops the facilities with zero NH3 emissions in the NEI2020 inventory. Thus, the percent change calculation used for NH3 is based on the emissions with CCS instead of the NEI2020 emissions: it is computed as the difference between the original NEI2020 emissions and the emissions with CCS, divided by the CCS emissions. The percent changes for the other pollutants show how the CCS technology changes their emissions relative to the NEI2020 inventory, while the one for NH3 shows how the NEI2020 emissions differ relative to the CCS emissions. 
<td><img src="/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/CO_CCS/NH3_Relative_Difference_with_basemap.png" alt="NH3" width="800"/></td>

The changes in emissions due to the CCS technology are also presented as bar plots. They show the total emissions in tonnes for each facility (EIS_ID) for each pollutant. 

In [None]:
import numpy as np

# Sum emissions per facility (EIS_ID). numeric_only=True restricts the sum to
# numeric columns so non-numeric attribute columns are not aggregated.
grouped_df = subset_df.drop(columns='geometry').groupby('EIS_ID').sum(numeric_only=True).reset_index()

pollutants = [ 'PM2_5' , 'NOx',  'SOx','NH3', 'VOC'] # 
pollutants_old = [f'{pollutant}_old' for pollutant in pollutants]

# Calculate total sums for each pollutant:
# totals holds the "_old" columns, totals_CCS the current (with CCS) columns
totals = {pollutant: subset_df[pollutant].sum() for pollutant in pollutants_old}
totals_CCS ={pollutant: subset_df[pollutant].sum() for pollutant in pollutants}

# Plotting: one row of paired bars per pollutant
fig, axes = plt.subplots(nrows=len(pollutants), ncols=1, figsize=(20, 20) ) #, sharey=True)
bar_width = 0.35  # Width of the bars
# Bar positions depend only on the number of facilities, so compute them once
indices = np.arange(len(grouped_df))
for ax, pollutant, pollutant_old in zip(axes, pollutants, pollutants_old):
    # Plot CCS emissions
    ax.bar(indices, grouped_df[pollutant], bar_width, label=f'{pollutant}')

    # Plot NEI2020 emissions, shifted right by one bar width
    ax.bar(indices + bar_width, grouped_df[pollutant_old], bar_width, label=f'{pollutant_old}')

    total_original = totals[pollutant_old]
    total_new = totals_CCS[pollutant]
    ax.set_title(f'{pollutant}\nTotal: {total_original:.0f} | Change : {(total_new - total_original):.0f} [tons]', fontsize=20)
    ax.set_xlabel('EIS_ID')
    ax.set_ylabel('Total emissions [tons]')
    # Center each tick between the two bars of a facility
    ax.set_xticks(indices + bar_width / 2)
    ax.set_xticklabels(grouped_df['EIS_ID'], rotation=90)
    ax.legend(["with CCS tech", "w/o CCS tech"])

plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'Total_Difference.png'), dpi=300, bbox_inches='tight')

plt.show()

In [None]:
from IPython.display import display, Markdown

# Build the summary paragraph piece by piece and render it as Markdown.
# NOTE(review): the first sentence hard-codes "CO state"; confirm it matches
# the scenario that is currently loaded into subset_df.
facility_count = len(subset_df['EIS_ID'].unique())
sentences = [
    f"The number of facilties applied with the amine-based CCS technology is **{facility_count}**, which is all located in the CO state. "
]

for pollutant in pollutants:
    # Total of the "_old" column and the change with CCS relative to it
    baseline = totals[f'{pollutant}_old']
    delta = totals_CCS[f'{pollutant}'] - baseline
    sentences.append(
        f"For {pollutant}, the total NEI2020 emissions are **{baseline:.0f}** in tons, and it is changed by **{delta:.0f}** with amine-based CCS technology. "
    )

markdown_text = "".join(sentences)

# Display the entire paragraph in one go
display(Markdown(markdown_text))

# Check a facility emissions from NEI inventory



In [None]:
import geopandas as gpd

# read base and sens emission scenarios
gdf_emis = gpd.read_file('/Users/yunhalee/Documents/LOCAETA/CS_emissions/Colorado_CCS_combined_NEI_point_oilgas_ptegu_ptnonimps.shp')

# Reset index to ensure proper comparison
gdf_emis.reset_index(drop=True, inplace=True)

# Apply CCS emissions to a facility of interest
facility_eis_id = {'Suncor':1099511 } # 'Cherokee':3555811 } 'Landfill': 2001411} # 
                   #'Suncor':1099511, } # 17445711} # Suncor frs_id = '1007923'
        # Cherokee plant alone (facility ID 1007207)  EIS_ID = 17445711
        # NEI emission without landfill facility ID 1007709

species_list = ['NOx', 'SOx', 'PM2_5'] # VOC and NH3 are excluded because the input emissions has NEI emissions for VOC and NH3. 

# The loop variable is named eis_id (not id) so the builtin id() is not shadowed.
# Only the ids are used, so iterate over the dict values.
for eis_id in facility_eis_id.values():

    if isinstance(eis_id, int):
        print(f"{eis_id} is integer")
    else:
        print(f"{eis_id} must be integer")
        eis_id = int(eis_id)

    # Filter once and reuse the result for both the printout and subset_df.
    # With several facilities, subset_df keeps only the last one processed.
    subset_df = gdf_emis[gdf_emis['EIS_ID'] == eis_id]

    print(f"matching facility: {subset_df}" )

# Bare expression: total PM2_5 of the selected facility, shown as the cell output
subset_df[['PM2_5']].sum()

In [None]:
# Bare expression: total of the PM2_5_old column for the selected facility, shown as the cell output
subset_df[['PM2_5_old']].sum()

## Debugging NEI emissions (for duplicates)

Date : May 6, 2025 

For the whole-USA scenario, the combination of EIS_ID and SCC is no longer a unique identifier, so there are some duplicates, resulting in incorrect final NEI-CCS emissions. The script below checks what causes the duplicates. 

In [None]:
import geopandas as gpd
import os

# LA_CCS and CO_CCS don't have the duplicates case for EIS_ID and SCC

# Load the whole-USA CCS point-source file used for the duplicate checks
# (the commented-out path is the combined NEI2020 input kept for quick switching)
gdf_debug = gpd.read_file(
#  '/Users/yunhalee/Documents/LOCAETA/RCM/INMAP/evaldata_v1.6.1/2020_nei_emissions/combined_NEI2020_pt_oilgas_ptegu_ptnonipm.shp')
 '/Users/yunhalee/Documents/LOCAETA/CS_emissions/USA_point_CCS.shp')

# Directory for the CSV files written by the duplicate analysis
debug_output_dir = '/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/USA_CCS/'

# Start from a fresh 0..n-1 index
gdf_debug.reset_index(drop=True, inplace=True)

#print(gdf_debug[(gdf_debug['EIS_ID'].astype(int) == 715711) & (gdf_debug['SCC'].astype(int) == 10100601)]) 
#print(gdf_debug[(gdf_debug['EIS_ID'].astype(int) == 12611) & (gdf_debug['SCC'].astype(int) == 10200602)])  

# Print every row of each facility of interest, matching on the integer EIS_ID
for eis_id_of_interest in (15662811, 3982711):
    matches_facility = gdf_debug['EIS_ID'].astype(int) == eis_id_of_interest
    print(gdf_debug[matches_facility])

# ptnonipm_2 has PM2_5 0.99218
# ptegu_1 has PM2_5 9.899564

In [None]:
# Identify duplicates: (EIS_ID, SCC) pairs that occur on more than one row
duplicate_keys = (
    gdf_debug.groupby(['EIS_ID', 'SCC'])
    .size()
    .reset_index(name='count')
    .query('count > 1')[['EIS_ID', 'SCC']]
)
duplicates = gdf_debug.merge(duplicate_keys, on=['EIS_ID', 'SCC'], how='inner')
# Unique label per duplicate row. The merge produces a new index, so this is
# the row position within `duplicates`, not the index of gdf_debug. It is only
# used to assign each row to exactly one case below.
duplicates['row_key'] = duplicates.index

# Case 1: Same EIS_ID, SCC, ghgrp_faci but different PM2_5
case1_keys = (
    duplicates.groupby(['EIS_ID', 'SCC', 'ghgrp_faci'])['PM2_5']
    .nunique()
    .reset_index(name='pm25_variety')
    .query('pm25_variety > 1')[['EIS_ID', 'SCC', 'ghgrp_faci']]
)
case1 = duplicates.merge(case1_keys, on=['EIS_ID', 'SCC', 'ghgrp_faci'])
case1_row_keys = set(case1['row_key'])

# Exclude Case 1 rows before doing Case 2
case_others = duplicates[~duplicates['row_key'].isin(case1_row_keys)]

# Case 2: Multiple ghgrp_faci for the same NEI (EIS_ID + SCC)
case2_keys = (
    case_others.groupby(['EIS_ID', 'SCC'])['ghgrp_faci']
    .nunique()
    .reset_index(name='ghgrp_faci_count')
    .query('ghgrp_faci_count > 1')[['EIS_ID', 'SCC']]
)
case2 = case_others.merge(case2_keys, on=['EIS_ID', 'SCC'])
case2_row_keys = set(case2['row_key'])

# Case 3: Remaining (All IDs are same but two subparts, C and D, results in different NH3/VOC increase)
remaining_row_keys = set(duplicates['row_key']) - case1_row_keys - case2_row_keys
case3 = duplicates[duplicates['row_key'].isin(remaining_row_keys)]

# Output: the three case counts should add up to the total number of duplicate rows
print(f"Total Duplicates: {len(duplicates)} rows")
print(f"Case 1: {len(case1)} rows")
print(f"Case 2: {len(case2)} rows")
print(f"Case 3: {len(case3)} rows")
print(f"Sum of all cases: {len(case1) + len(case2) + len(case3)} rows")

#sort data based on EIS_ID and SCC
case1 = case1.sort_values(by=['EIS_ID', 'SCC'], ascending=[True, True])
case2 = case2.sort_values(by=['EIS_ID', 'SCC'], ascending=[True, True])
case3 = case3.sort_values(by=['EIS_ID', 'SCC'], ascending=[True, True])

# Make sure the output directory exists; to_csv does not create missing directories
os.makedirs(debug_output_dir, exist_ok=True)

case1.to_csv(os.path.join(debug_output_dir, 'Case1_one_ghgrp_fac_multiple_NEI_duplicates.csv'), index=False)
case2.to_csv(os.path.join(debug_output_dir, 'Case2_more_than_one_ghgrp_faci_per_NEI_duplicates.csv'), index=False)
case3.to_csv(os.path.join(debug_output_dir, 'Case3_other_duplicates.csv'), index=False)

## Debugging NEI combined point source

Date: May 16, 2025

While processing the eGRID emissions for NEI, I found some duplicates of EIS_ID and SCC in the combined NEI point source file. I am going to check how many duplicates exist and what the potential impact would be. 


In [None]:
import geopandas as gpd
import os


# Input locations and the two NEI point-source shapefiles to compare
egrid_dir_path = '/Users/yunhalee/Documents/LOCAETA/eGRID_emissions/'
nei_pt_path = '/Users/yunhalee/Documents/LOCAETA/RCM/INMAP/evaldata_v1.6.1/2020_nei_emissions/'
file1 = "NEI2020_pt_oilgas_ptegu_ptnonipm.shp" 
file2 = "combined_NEI2020_pt_oilgas_ptegu_ptnonipm.shp"

# Load the first file with a fresh 0..n-1 index and count its fully duplicated rows
nei_correct = gpd.read_file(f"{nei_pt_path}{file1}")
nei_correct = nei_correct.reset_index(drop=True)

n_duplicated_correct = nei_correct.duplicated().sum()
print("duplicate", n_duplicated_correct)


# Same check for the combined file
nei_wrong = gpd.read_file(f"{nei_pt_path}{file2}")
nei_wrong = nei_wrong.reset_index(drop=True)

n_duplicated_wrong = nei_wrong.duplicated().sum()
print("duplicate", n_duplicated_wrong)

In [None]:
# Print the rows that duplicated() flags as repeats of an earlier row, for each of the two files
print("duplicate1", nei_correct[nei_correct.duplicated()])

print("duplicate2", nei_wrong[nei_wrong.duplicated()])

