# Preparing Data Center emissions into NEI-SMOKE format


This notebook handles emission processing for the second "prong" of LOCAETA's three decarbonization strategies: electrification. The goal of the study is to assess the impact of changes in energy demand under a given grid scenario (e.g., the current grid and a 2050 grid). For example, how much will emissions increase at power plants in the region of the facility (or facilities)? We explored a hypothetical scenario which assumes that each power plant in the region (as defined by NREL’s Cambium model) marginally increases its output to collectively meet the additional 300 MW load that would be incurred if the data center were connected to the grid.

The data center emissions in the input table are computed per power plant facility (the EIS ID is the unique facility identifier). To include these emissions in NEI-SMOKE formatted emissions, the facility totals need to be split using the 2020 NEI emissions (per EIS ID and per SCC).

Here are the emission scenarios considered and the strategy used to prepare NEI-SMOKE style emissions for each scenario:

* current_2020  - emissions can be prepared by splitting into each SCC by the NEI 2020 emissions weight



## Step 1: Read Data Center emissions and NEI-SMOKE all point source shapefile

In [None]:
import geopandas as gpd
from pyproj import CRS
import os, sys

# Make the local LOCAETA_AQ package importable from this notebook
package_path = os.path.abspath('/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/LOCAETA_AQ')
if package_path not in sys.path:
    sys.path.append(package_path)

import emission_processing

# Load the combined NEI 2020 point-source shapefile and give it a clean
# 0..n-1 index so later row-wise comparisons line up
original_emis ='/Users/yunhalee/Documents/LOCAETA/RCM/INMAP/evaldata_v1.6.1/2020_nei_emissions/combined_NEI2020_pt_oilgas_ptegu_ptnonipm_w_sectors.shp'
nei_all_pt = gpd.read_file(original_emis).reset_index(drop=True)

# Suffix the NEI pollutant columns with "_nei" so the plain pollutant names
# stay free for the scenario emissions written later
pollutant_cols = ['NOx','PM2_5', 'VOC','NH3', 'SOx']
col_dict = {poll: f'{poll}_nei' for poll in pollutant_cols}
nei_all_pt = nei_all_pt.rename(columns=col_dict)

nei_all_pt.head()

In [None]:
import pandas as pd
import geopandas as gpd
import os

def reformat_DataCenter(df):
    """Aggregate a raw data center emissions table to one row per EIS facility.

    Keeps every column whose name contains '_tons_final' or '_tons_base',
    plus 'eis', 'cambium_gea' and 'DOE/EIA ORIS plant or facility code'.
    Rows without an 'eis' value are dropped, both ID columns are cast to
    int64 and renamed ('EIS_ID', 'oris_ID'), and the emission columns are
    summed per 'EIS_ID' (the first 'cambium_gea' of each facility is kept).

    The column totals before and after dropping the rows without 'eis' are
    printed so any emissions lost with those rows are visible.

    Args:
        df: pandas.DataFrame holding the columns listed above.

    Returns:
        pandas.DataFrame with 'EIS_ID', the summed emission columns and
        'cambium_gea'. 'oris_ID' is not part of the returned frame.
    """
    final_cols = [name for name in df.columns if '_tons_final' in name]
    baseline_cols = [name for name in df.columns if '_tons_base' in name]
    emission_cols = final_cols + baseline_cols
    id_cols = ['eis', 'cambium_gea', 'DOE/EIA ORIS plant or facility code']

    # Work only on the columns that are needed downstream
    subset = df[emission_cols + id_cols]
    total_before = subset[emission_cols].sum()

    # Facilities without an EIS ID cannot be matched to NEI rows
    subset = subset.dropna(subset=['eis'])
    total_after = subset[emission_cols].sum()

    subset = subset.astype(
        {'eis': 'int64', 'DOE/EIA ORIS plant or facility code': 'int64'}
    ).rename(
        columns={'eis': 'EIS_ID', 'DOE/EIA ORIS plant or facility code': "oris_ID"}
    )

    # Sum every emission column per facility; the region label is taken
    # from the first row of each facility
    aggregations = dict.fromkeys(emission_cols, 'sum')
    aggregations['cambium_gea'] = 'first'
    df_grouped = subset.groupby('EIS_ID').agg(aggregations).reset_index()

    # Totals before / after dropping the rows that have no EIS ID
    print('before :', total_before, 'after: ', total_after)

    return df_grouped

from itertools import combinations

def find_minimal_unique_identifier_columns(df, max_combination_size=30):
    """
    Search for the smallest column combination whose values are unique per row.

    Combinations are tried in order of increasing size and, within one size,
    in the DataFrame's column order; the first combination without any
    duplicated rows is returned.

    Args:
        df: pandas.DataFrame
        max_combination_size: int, largest number of columns combined in one
            candidate (bounds the runtime of the exhaustive search)

    Returns:
        List of column names, or None when no candidate is unique
    """
    column_names = df.columns.tolist()
    largest_size = min(len(column_names), max_combination_size)

    # Lazily enumerate the candidates so the search stops at the first hit
    unique_candidates = (
        list(candidate)
        for size in range(1, largest_size + 1)
        for candidate in combinations(column_names, size)
        if not df.duplicated(subset=candidate).any()
    )
    return next(unique_candidates, None)

def mapping_DataCenter_to_nei(nei_with_DataCenter, nei_all_pt, unique_identifier_columns, is_base):
    """Split facility-level data center emissions over NEI rows and merge them into the full NEI table.

    For every pollutant the facility total (column '<pollutant>_tons_base' or
    '<pollutant>_tons_final') is distributed over the facility's NEI rows in
    proportion to the row's '<pollutant>_nei' emissions. When a facility has
    zero NEI emissions for a pollutant but a non-zero facility total, the
    total is split equally over the facility's rows instead.

    Side effects: `nei_with_DataCenter` is modified in place. It gains
    'was_mapped', '<pollutant>_total_by_eis', '<pollutant>_split' and
    '<pollutant>' columns (later notebook cells read these).

    Args:
        nei_with_DataCenter: NEI rows of the affected facilities, already
            merged with the facility totals and 'cambium_gea' on 'EIS_ID'.
        nei_all_pt: full NEI table holding the '<pollutant>_nei' columns.
        unique_identifier_columns: non-empty list of columns that uniquely
            identify the rows of `nei_with_DataCenter`.
        is_base: True to map the '*_tons_base' columns, False for '*_tons_final'.

    Returns:
        `nei_all_pt` with 'was_mapped', 'cambium_gea' and the five pollutant
        columns added; rows that were not mapped keep their NEI emissions.

    Raises:
        ValueError: if `unique_identifier_columns` is None or empty.
    """
    if not unique_identifier_columns:
        raise ValueError(
            "unique_identifier_columns must be a non-empty list of columns that "
            "uniquely identify the rows of nei_with_DataCenter"
        )

    # Column mapping between NEI and the data center (eGRID-style) table
    scenario_suffix = 'base' if is_base else 'final'
    pollutant_map = {
        'NOx': f'NOx_tons_{scenario_suffix}',
        'PM2_5': f'PM2.5_tons_{scenario_suffix}',
        'VOC': f'VOC_tons_{scenario_suffix}',
        'NH3': f'NH3_tons_{scenario_suffix}',
        'SOx': f'SO2_tons_{scenario_suffix}',
    }

    # Column order used by the diagnostic printouts
    report_cols = ['PM2_5', 'NH3', 'VOC', 'NOx', 'SOx']
    report_nei_cols = [f'{k}_nei' for k in report_cols]

    # Flag used to tell mapped rows apart after the merge with the full NEI table
    nei_with_DataCenter['was_mapped'] = True

    # Number of NEI rows per facility, needed for the equal-split fallback
    rows_per_eis = nei_with_DataCenter['EIS_ID'].map(
        nei_with_DataCenter['EIS_ID'].value_counts()
    )

    # Compute and apply split factors per pollutant
    for nei_col, DataCenter_col in pollutant_map.items():

        print (nei_col, DataCenter_col)
        # NEI total of this pollutant per facility, broadcast to every row
        total_by_eis = nei_with_DataCenter.groupby('EIS_ID')[f'{nei_col}_nei'].transform('sum')
        nei_with_DataCenter[f'{nei_col}_total_by_eis'] = total_by_eis

        # Default: share of the row in the facility's NEI total. Zero totals
        # become NaN (not pd.NA) so the columns stay plain float64.
        split_col = f'{nei_col}_split'
        nei_with_DataCenter[split_col] = (
            nei_with_DataCenter[f'{nei_col}_nei'] / total_by_eis.replace(0, float('nan'))
        )

        # Rows whose facility has zero NEI emissions but a non-zero facility total
        mask_zero_total = (
            (total_by_eis == 0)
            & nei_with_DataCenter[DataCenter_col].notna()
            & (nei_with_DataCenter[DataCenter_col] != 0)
        )

        print(f"{nei_col}: # fallback allocations due to zero NEI = {mask_zero_total.sum()}")

        # For those facilities, split the total equally over all of their rows
        fallback_ids = nei_with_DataCenter.loc[mask_zero_total, 'EIS_ID'].unique()
        fallback_rows = nei_with_DataCenter['EIS_ID'].isin(fallback_ids)
        if fallback_rows.any():
            nei_with_DataCenter.loc[fallback_rows, split_col] = (
                1.0 / rows_per_eis[fallback_rows]
            ).to_numpy()

        # Scaled emissions are stored under the plain NEI pollutant name
        nei_with_DataCenter[nei_col] = nei_with_DataCenter[split_col] * nei_with_DataCenter[DataCenter_col]

    # The identifier columns are only known to be unique within the facility
    # subset; adding EIS_ID to the keys keeps rows of other facilities in the
    # full NEI table from matching by coincidence.
    merge_keys = list(dict.fromkeys(['EIS_ID', *unique_identifier_columns]))

    # Merge results back into the full NEI dataset
    nei_all_pt_final = nei_all_pt.merge(
        nei_with_DataCenter[merge_keys + ['was_mapped', 'cambium_gea'] + list(pollutant_map)],
        on=merge_keys,
        how='left'
    )

    # Unmapped rows carry NaN in 'was_mapped' after the left merge
    mapped_mask = nei_all_pt_final['was_mapped'].eq(True)

    gdf_subset = nei_all_pt_final[mapped_mask]
    print("base dataframe size ", gdf_subset.shape, nei_with_DataCenter.shape)
    print("before filling; subset nei sum ", gdf_subset[report_nei_cols].sum())
    print("before filling; subset base sum ", gdf_subset[report_cols].sum())
    print("before filling; nei_all_pt_final base sum ", nei_all_pt_final[report_cols].sum())

    # Rows without mapped emissions fall back to the NEI values
    diff_cols = [f'{k}_diff' for k in pollutant_map]
    for k, diff_col in zip(pollutant_map, diff_cols):
        nei_all_pt_final[k] = nei_all_pt_final[k].fillna(nei_all_pt_final[f'{k}_nei'])
        nei_all_pt_final[diff_col] = nei_all_pt_final[k] - nei_all_pt_final[f'{k}_nei']

    # Diagnostic: mapped rows whose emissions equal NEI for every pollutant
    no_change_mask = (nei_all_pt_final[diff_cols] == 0).all(axis=1)
    mapped_but_unchanged = nei_all_pt_final[mapped_mask & no_change_mask]
    print("Number of rows where emissions were mapped but did not change:", mapped_but_unchanged.shape[0])

    # Diagnostic totals of the mapped rows after filling (no rows are removed)
    gdf_subset = nei_all_pt_final[mapped_mask]
    print("subset dataframe size ", gdf_subset.shape, nei_with_DataCenter.shape)
    print("subset nei sum ", gdf_subset[report_nei_cols].sum())
    print("subset base sum ", gdf_subset[report_cols].sum())

    # The difference columns were only needed for the diagnostics above
    nei_all_pt_final.drop(columns=diff_cols, inplace=True)

    return nei_all_pt_final

In [None]:


#DataCenter_dir_path = '/Users/yunhalee/Documents/LOCAETA/DataCenter_emissions/'
# NOTE(review): both directory assignments were commented out, which leaves
# DataCenter_dir_path undefined on a fresh kernel. The directory below is the
# one whose name matches the 'UpdatedSCC_nowaste_pp_*' input files - confirm.
DataCenter_dir_path = '/Users/yunhalee/Documents/LOCAETA/Electrification/UpdatedSCC_noelecwaste/'
DataCenter_list = ["current_easyhard"]

base_check_cols = ['PM2.5_tons_base', 'NH3_tons_base', 'VOC_tons_base', 'NOx_tons_base', 'SO2_tons_base']

for DataCenter_name in DataCenter_list:

    print ("processing ", DataCenter_name)
    #DataCenter_file = os.path.join(DataCenter_dir_path, f'300MW_national_{DataCenter_name}.csv')
    DataCenter_file = os.path.join(DataCenter_dir_path, f'UpdatedSCC_nowaste_pp_{DataCenter_name}.csv')

    egrid = pd.read_csv(DataCenter_file)

    # Totals before and after collapsing to one row per EIS facility
    print("original data", egrid[base_check_cols].sum())
    egrid = reformat_DataCenter(egrid)
    print("after grouping", egrid[base_check_cols].sum())

    # Filter NEI rows to only those facilities that exist in the data center table
    nei_with_DataCenter = nei_all_pt[nei_all_pt['EIS_ID'].isin(egrid['EIS_ID'])].copy()

    # Stack parameters are not needed for the mapping
    nei_with_DataCenter.drop(columns=['height', 'diam',
        'temp', 'velocity'], inplace=True)

    unique_identifier_columns = find_minimal_unique_identifier_columns(nei_with_DataCenter)

    # Without unique row keys the mapped emissions cannot be merged back
    # into the full NEI table, so stop here instead of failing later.
    if not unique_identifier_columns:
        raise ValueError("No combination of columns uniquely identifies rows.")
    print("Columns that uniquely identify rows:", unique_identifier_columns)

    print("filtering", nei_with_DataCenter.shape)

    # Attach the facility totals to every NEI row of the facility
    nei_with_DataCenter = nei_with_DataCenter.merge(egrid, on='EIS_ID', how='left')

    print("Merging egrid", nei_with_DataCenter.shape)

    # for is_base_emission in [True, False]:
    for is_base_emission in [False]:
        nei_all_pt_final = mapping_DataCenter_to_nei(nei_with_DataCenter, nei_all_pt, unique_identifier_columns, is_base = is_base_emission)

        # Split into two GeoDataFrames based on was_mapped
        mapped_df = nei_all_pt_final[nei_all_pt_final['was_mapped'] == True].copy()
        unmapped_df = nei_all_pt_final[nei_all_pt_final['was_mapped'] != True].copy()

        print("final size ", mapped_df.shape, unmapped_df.shape)

        # Base runs get a "_base" suffix; the unmapped remainder gets "_rest_NEI"
        run_suffix = "_base" if is_base_emission else ""

        # Save the mapped data with the run name in the filename
        if not mapped_df.empty:
            mapped_filename = os.path.join(DataCenter_dir_path, f"{DataCenter_name}{run_suffix}.shp")
            mapped_df.to_file(mapped_filename, driver='ESRI Shapefile')
            print(f"Saved mapped data to {mapped_filename}")

        # Save the rest (unmapped data) as rest_NEI
        if not unmapped_df.empty:
            rest_filename = os.path.join(DataCenter_dir_path, f"{DataCenter_name}{run_suffix}_rest_NEI.shp")
            unmapped_df.to_file(rest_filename, driver='ESRI Shapefile')
            print(f"Saved unmapped NEI data to {rest_filename}")



# Sub region case for Data Center emissions

For the sub-region case, all regions use the "base" emissions except the selected sub-region, which uses the "final" emissions.


In [None]:
# Regions (cambium_gea values) for which a sub-region emission file is produced
subset_region_list = ["NorthernGrid_West", "MISO_South","SPP_North", "PJM_East", "MISO_Central", "CAISO" ]

DataCenter_dir_path = '/Users/yunhalee/Documents/LOCAETA/DataCenter_emissions/'
DataCenter_list = ["current_2020"]

pollutant_cols = ['NOx','PM2_5', 'VOC','NH3', 'SOx']

# NOTE: entries of all scenarios end up in one list and the CSV below is named
# after the last scenario only; fine for a single-scenario DataCenter_list.
emission_summary = []

for DataCenter_name in DataCenter_list:

    print ("processing ", DataCenter_name)
    DataCenter_file = os.path.join(DataCenter_dir_path, f'300MW_national_{DataCenter_name}.csv')
    egrid = pd.read_csv(DataCenter_file)

    # Read the mapped "base" point source emissions
    base_emis = os.path.join(DataCenter_dir_path, f'{DataCenter_name}_base.shp')
    gdf_base = gpd.read_file(base_emis)
    gdf_base.reset_index(drop=True, inplace=True)

    # Read the mapped "final" point source emissions
    final_emis = os.path.join(DataCenter_dir_path, f'{DataCenter_name}.shp')
    gdf_final = gpd.read_file(final_emis)
    gdf_final.reset_index(drop=True, inplace=True)

    for subset_region in subset_region_list:

        print("subset is happening for ", subset_region)

        # 'cambium_ge' is the column name as read back from the shapefiles
        in_region_base = gdf_base['cambium_ge'] == subset_region
        base_subset = gdf_base[gdf_base['cambium_ge'] != subset_region]
        final_subset = gdf_final[gdf_final['cambium_ge'] == subset_region]

        # Base vs. final totals inside the selected region
        base_region = gdf_base[in_region_base]
        summary_entry = {
            'Region': subset_region,
        }

        for pol in pollutant_cols:
            summary_entry[f'{pol}_tons_base'] = base_region[pol].sum()
            summary_entry[f'{pol}_tons_final'] = final_subset[pol].sum()

        emission_summary.append(summary_entry)

        # Base emissions everywhere except the region, final emissions inside it
        combined_gdf = pd.concat([base_subset, final_subset], ignore_index=True)

        print("# of rows must be same: ", gdf_base.shape, gdf_final.shape, combined_gdf.shape)

        # The combined table must cover exactly the rows of the base table.
        # (Comparing against the sum of the two parts would always succeed.)
        if combined_gdf.shape[0] == gdf_base.shape[0]:
            print (f"GOOD : # of row by {subset_region} is {final_subset.shape[0]}")
        else:
            print (f"BAD : {subset_region} doesn't result in same total rows {base_subset.shape[0]} {final_subset.shape[0]}  {combined_gdf.shape[0]}")

        filename = os.path.join(DataCenter_dir_path, f"{DataCenter_name}_{subset_region}.shp")
        combined_gdf.to_file(filename, driver='ESRI Shapefile')
        print(f"Saved {subset_region} emissions to {filename}")

# Convert to DataFrame
summary_df = pd.DataFrame(emission_summary)

# Save to CSV
output_csv = f'/Users/yunhalee/Documents/LOCAETA/DataCenter_emissions/{DataCenter_name}_emission_summary_by_region.csv'
summary_df.to_csv(output_csv, index=False)

print(f"Emission summary saved to {output_csv}")


In [None]:
# Regional totals computed directly from the data center CSV
egrid_reform = reformat_DataCenter(egrid)
egrid_reform.drop(columns=['EIS_ID'], inplace=True)
egrid_grouped = egrid_reform.groupby(by ='cambium_gea').sum().reset_index()
egrid_grouped.to_csv(f'/Users/yunhalee/Documents/LOCAETA/DataCenter_emissions/{DataCenter_name}_emission_summary_by_region_from_csv.csv')

# Keep only the regions present in the shapefile-based summary; copy so the
# in-place sorts below do not operate on a slice of egrid_grouped
egrid_subset = egrid_grouped[egrid_grouped['cambium_gea'].isin(summary_df['Region'])].copy()

# Give summary_df the CSV column names (SO2 / PM2.5) before sorting the
# columns, so both tables end up with the same column order.
summary_df.rename(columns={
    'SOx_tons_base': 'SO2_tons_base',
    'SOx_tons_final': 'SO2_tons_final',
    'PM2_5_tons_base': 'PM2.5_tons_base',
    'PM2_5_tons_final': 'PM2.5_tons_final',
    'Region': 'cambium_gea',
}, inplace=True)

egrid_subset.sort_values(by = 'cambium_gea', inplace=True)
summary_df.sort_values(by = 'cambium_gea', inplace=True)
egrid_subset.sort_index(axis=1, inplace=True)
summary_df.sort_index(axis=1, inplace=True)


In [None]:
# Display the shapefile-based regional summary
summary_df

In [None]:
# Display the CSV-based regional totals for the same regions
egrid_subset

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# summary_df is a wide table: one row per region and one
# '<pollutant>_tons_base' / '<pollutant>_tons_final' column per pollutant.
# The region column is 'Region', or 'cambium_gea' once it has been renamed.
region_col = 'Region' if 'Region' in summary_df.columns else 'cambium_gea'
value_cols = [
    col for col in summary_df.columns
    if col.endswith(('_tons_base', '_tons_final'))
]

# Melt to long format for easier plotting
df_melted = summary_df.melt(
    id_vars=[region_col],
    value_vars=value_cols,
    var_name='Series',
    value_name='Emissions'
)

# '<pollutant>_tons_<scenario>' -> pollutant name and scenario ('base' / 'final')
name_parts = df_melted['Series'].str.rsplit('_tons_', n=1, expand=True)
df_melted['Pollutant'] = name_parts[0]
df_melted['Scenario'] = name_parts[1]

# Keep a consistent pollutant order across the facets
pollutant_order = list(dict.fromkeys(df_melted['Pollutant']))

# Plot: Facet by Region
g = sns.catplot(
    data=df_melted,
    x='Pollutant', y='Emissions', hue='Scenario',
    order=pollutant_order, hue_order=['base', 'final'],
    col=region_col, col_wrap=3,
    kind='bar', height=4, aspect=1.2,
    sharey=False
)
g.set_titles("{col_name}")
g.set_axis_labels("Pollutant", "Emissions (tons)")
g.tight_layout()

# Check how much data center emissions are from ptnonipm sectors

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Pollutant columns holding the mapped data center emissions
pollutants = ['VOC', 'NOx', 'SOx', 'PM2_5', 'NH3']
#pollutants = [item +"_nei" for item in pollutants]
print(pollutants)

# Step 1: collect the distinct (EIS_ID, SCC) pairs that have at least one
# row whose source file name mentions "ptnonipm"
is_ptnonipm = nei_with_DataCenter['source_fil'].str.contains('ptnonipm', case=False, na=False)
ptnonipm_keys = nei_with_DataCenter.loc[is_ptnonipm, ['EIS_ID', 'SCC']].drop_duplicates()

# Step 2: keep every row (of any sector) that belongs to one of those pairs
all_pairs = nei_with_DataCenter.set_index(['EIS_ID', 'SCC']).index
ptnonipm_pairs = ptnonipm_keys.set_index(['EIS_ID', 'SCC']).index
mask = all_pairs.isin(ptnonipm_pairs)
filtered_df = nei_with_DataCenter[mask].copy()

# Step 3: total each pollutant per source file (sector)
emissions_by_sector = filtered_df.groupby('source_fil')[pollutants].sum()

# Step 4: bar chart with one group of bars per sector and one bar per pollutant
emissions_by_sector.plot(kind='bar', figsize=(12, 6))
plt.ylabel('Total Emissions (tons/year)')
plt.title('Total Emissions by Sector (where any EIS_ID+SCC includes "ptnonipm")')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()


### Useful debugging script

In [None]:
# check the number of unique IDs
# (names chosen so the scenario list `DataCenter_list` used by the
# processing cells is not overwritten with an array of IDs)
ID_name = 'EIS_ID'
org_DataCenter_ids = egrid[ID_name].unique()
mapped_DataCenter_ids = nei_with_DataCenter[ID_name].unique()
org_nei_ids = nei_all_pt[ID_name].unique()
final_nei_ids = nei_all_pt_final[ID_name].unique()

print(len(org_DataCenter_ids), len(mapped_DataCenter_ids), len(final_nei_ids), len(org_nei_ids))


# find the rows where one column is missing while the other is present
# NOTE(review): no visible cell creates a 'PM2_5_DataCenter' column - the
# mapping function writes the scaled emissions to 'PM2_5'; confirm the name.
missing_pm25_DataCenter = nei_with_DataCenter[
    nei_with_DataCenter['PM2_5_DataCenter'].isna() &
    nei_with_DataCenter['PM2.5_tons_final'].notna()
]

# filtering column based on string
missing_pm25_DataCenter.filter(regex ='tons_final|total_by_eis')


# get a certain ID facility
print(nei_all_pt[(nei_all_pt['EIS_ID'] == 1028611)])

# Compare NEI2020 against Current grid base emissions

In [None]:
import matplotlib.pyplot as plt

# Column mapping between NEI and eGRID
pollutant_map = {
    'NOx': 'NOx_tons_base',
    'PM2_5': 'PM2.5_tons_base',
    'VOC': 'VOC_tons_base',
    'NH3': 'NH3_tons_base',
    'SOx': 'SO2_tons_base'

}

# One scatter plot per pollutant: NEI facility total (x) against the
# difference between the base facility total and the NEI total (y).
# Relies on the '<pollutant>_total_by_eis' columns that
# mapping_DataCenter_to_nei adds to nei_with_DataCenter.
for nei_col, DataCenter_col in pollutant_map.items():
    total_col = f'{nei_col}_total_by_eis'

    plt.figure(figsize=(6, 6))
    plt.scatter(
        nei_with_DataCenter[total_col],
        nei_with_DataCenter[DataCenter_col] - nei_with_DataCenter[total_col],
        alpha=0.5,
        # labelled so that plt.legend() below has an entry to show
        label=f"{DataCenter_col} - {total_col}"
    )
    plt.xlabel(f"Total {nei_col} by EIS_ID in NEI")
    plt.ylabel(f"{DataCenter_col} - '{nei_col}_total_by_eis'")
    plt.title(f"{nei_col} NEI Total vs difference with eGRID base Emissions")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()



## Evaluate new egrid emissions formatted for NEI-SMOKE style

In [None]:
import geopandas as gpd
import os


DataCenter_dir_path = '/Users/yunhalee/Documents/LOCAETA/DataCenter_emissions/'
egrids_list = ["current_2020"]


# Column mapping between NEI and eGRID ("final" columns)
pollutant_final_map = {
    'NOx': 'NOx_tons_final',
    'PM2_5': 'PM2.5_tons_final',
    'VOC': 'VOC_tons_final',
    'NH3': 'NH3_tons_final',
    'SOx': 'SO2_tons_final'
}

# Column mapping between NEI and eGRID ("base" columns)
pollutant_base_map = {
    'NOx': 'NOx_tons_base',
    'PM2_5': 'PM2.5_tons_base',
    'VOC': 'VOC_tons_base',
    'NH3': 'NH3_tons_base',
    'SOx': 'SO2_tons_base'
}

is_base_emission = True
scenario_label = 'base' if is_base_emission else 'final'

# nei emissions column names
pollutant_cols = ['NOx','PM2_5', 'VOC','NH3', 'SOx']

for DataCenter_name in egrids_list:

    if is_base_emission:
        pollutant_map = pollutant_base_map
        file_path = os.path.join(DataCenter_dir_path, f'{DataCenter_name}_base.shp')
    else:
        pollutant_map = pollutant_final_map
        file_path = os.path.join(DataCenter_dir_path, f'{DataCenter_name}.shp')

    # read emission scenario
    final_DataCenter_emis = gpd.read_file(file_path)

    # Reset index to ensure proper comparison
    final_DataCenter_emis.reset_index(drop=True, inplace=True)

    # Subset rows where the scenario emissions differ from NEI for any pollutant
    mask = pd.concat([
        final_DataCenter_emis[k] != final_DataCenter_emis[f'{k}_nei']
        for k in pollutant_cols
    ], axis=1).any(axis=1)

    final_DataCenter_emis = final_DataCenter_emis[mask]

    # Re-aggregate the split emissions to one total per EIS_ID
    group_sums = final_DataCenter_emis.groupby('EIS_ID')[pollutant_cols].sum().reset_index()

    print ("processing ", DataCenter_name)
    original_DataCenter_file = os.path.join(DataCenter_dir_path, f'300MW_national_{DataCenter_name}.csv')
    original_DataCenter = pd.read_csv(original_DataCenter_file)
    original_DataCenter = reformat_DataCenter(original_DataCenter)

    # Merge for comparison
    comparison_df = group_sums.merge(original_DataCenter, on='EIS_ID')

    # Scatter plots. The loop variables are deliberately not called `egrid`,
    # so the notebook-level `egrid` DataFrame is not replaced by a string.
    for nei_name, source_col in pollutant_map.items():
        x = comparison_df[nei_name]
        y = comparison_df[source_col] - comparison_df[nei_name]

        plt.figure(figsize=(6, 6))
        plt.scatter(x, y, alpha=0.6, edgecolors='k', label=f'{source_col} - {nei_name}')
        plt.xlabel(f'{nei_name} NEI')
        plt.ylabel(f'{nei_name} {scenario_label} - nei')
        plt.ylim(-1, 1)
        plt.title(f'{nei_name} Comparison - {DataCenter_name}')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()


## Plot the total emissions (either final or diff_final) by Species for all egrid scenarios

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Dictionary to store total emissions: {pollutant: {scenario: total}}
DataCenter_sum = {}

# Column mapping between NEI and eGRID (difference columns)
pollutant_diff_map = {
    'NOx': 'NOx_tons_dif_final',
    'PM2_5': 'PM2.5_tons_dif_final',
    'VOC': 'VOC_tons_dif_final',
    'NH3': 'NH3_tons_dif_final',
    'SOx': 'SO2_tons_dif_final'
}

# Column mapping between NEI and eGRID (final columns)
pollutant_final_map = {
    'NOx': 'NOx_tons_final',
    'PM2_5': 'PM2.5_tons_final',
    'VOC': 'VOC_tons_final',
    'NH3': 'NH3_tons_final',
    'SOx': 'SO2_tons_final'
}

# Switch to pollutant_final_map to plot the final totals instead
pollutant_map = pollutant_diff_map

egrids_list = ["current_2020"]

for DataCenter_name in egrids_list:
    print ("processing ", DataCenter_name)
    original_DataCenter_file = os.path.join(DataCenter_dir_path, f'300MW_national_{DataCenter_name}.csv')
    original_DataCenter = pd.read_csv(original_DataCenter_file)

    # Store totals per pollutant. The loop variables are deliberately not
    # called `egrid`, so the notebook-level `egrid` DataFrame stays intact.
    for nei_name, source_col in pollutant_map.items():
        DataCenter_sum.setdefault(nei_name, {})[DataCenter_name] = original_DataCenter[source_col].sum()

    print(DataCenter_sum)

output_dir = f'/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/{egrids_list[0]}/'
os.makedirs(output_dir, exist_ok =True)

# Convert to DataFrame: rows = pollutant, columns = egrid cases
emissions_df = pd.DataFrame(DataCenter_sum).T  # Transpose so pollutants are rows

# Plotting
ax = emissions_df.plot(kind='bar', figsize=(10, 6))
ax.set_ylabel("Total Emissions")
ax.set_title("Total Difference Emissions by Pollutant and eGRID Case")
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend(title="Data Center Scenario")
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'Total_Difference.png'), dpi=300, bbox_inches='tight')
plt.show()


## Emissions changes from current_2020

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os


# NEI pollutant name -> pollutant prefix used in the data center CSV columns
_CSV_PREFIX_BY_NEI = {
    'NOx': 'NOx',
    'PM2_5': 'PM2.5',
    'VOC': 'VOC',
    'NH3': 'NH3',
    'SOx': 'SO2',
}

# Column mapping between NEI and eGRID: difference columns
pollutant_diff_map = {
    nei_name: f'{prefix}_tons_dif_final'
    for nei_name, prefix in _CSV_PREFIX_BY_NEI.items()
}

# Column mapping between NEI and eGRID: final columns
pollutant_final_map = {
    nei_name: f'{prefix}_tons_final'
    for nei_name, prefix in _CSV_PREFIX_BY_NEI.items()
}

# Column mapping between NEI and eGRID: base columns
pollutant_base_map = {
    nei_name: f'{prefix}_tons_base'
    for nei_name, prefix in _CSV_PREFIX_BY_NEI.items()
}

egrids_list = ["current_2020"] # ,"decarb95_2050","highREcost_2050"]

def convert_DataCenter_csv_to_df(egrids_list, pollutant_map):
    """Total the data center CSV emissions per pollutant and scenario.

    For every scenario name the file '300MW_national_<name>.csv' is read from
    the notebook-level `DataCenter_dir_path`, and each CSV column named in
    `pollutant_map` is summed. As a side effect the output directory of the
    first scenario is created.

    Args:
        egrids_list: list of scenario names.
        pollutant_map: dict mapping the NEI pollutant name to the CSV column to sum.

    Returns:
        pandas.DataFrame with one row per pollutant and one column per scenario.
    """
    totals_by_pollutant = {}

    for scenario in egrids_list:
        print ("processing ", scenario)
        csv_path = os.path.join(DataCenter_dir_path, f'300MW_national_{scenario}.csv')
        scenario_df = pd.read_csv(csv_path)

        # One total per pollutant for this scenario
        for nei_name, csv_col in pollutant_map.items():
            if nei_name not in totals_by_pollutant:
                totals_by_pollutant[nei_name] = {}
            totals_by_pollutant[nei_name][scenario] = scenario_df[csv_col].sum()

        print(totals_by_pollutant)

    # The path stays local to this function; only the directory is created here
    output_dir = f'/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/{egrids_list[0]}/'
    os.makedirs(output_dir, exist_ok =True)

    # Outer keys are pollutants, so transpose to get pollutants as rows
    return pd.DataFrame(totals_by_pollutant).T

emissions_diff = convert_DataCenter_csv_to_df(egrids_list, pollutant_diff_map)

# convert_DataCenter_csv_to_df creates this directory but keeps the path in a
# local variable, so it is rebuilt here instead of relying on an `output_dir`
# left over from an earlier cell.
output_dir = f'/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/{egrids_list[0]}/'

# Plotting
ax = emissions_diff.plot(kind='bar', figsize=(10, 6))
ax.set_ylabel("Total Emissions")
ax.set_title("Total Difference Emissions by Pollutant and eGRID Case")
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend(title="eGRID Case")
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'Total_Difference.png'), dpi=300, bbox_inches='tight')
plt.show()



In [None]:

emissions_base = convert_DataCenter_csv_to_df(egrids_list, pollutant_base_map)

# Compare every other scenario in egrids_list against the reference case.
# With only the reference case listed there is nothing to compare.
reference_case = 'current_2020'
future_cases = [name for name in egrids_list if name != reference_case]

if not future_cases:
    print(f"egrids_list holds no scenario other than {reference_case}; nothing to compare.")
else:
    for future_case in future_cases:
        emissions_base[f'{future_case} - {reference_case}'] = (
            emissions_base[future_case] - emissions_base[reference_case]
        )

    # Keep only the difference columns
    emissions_base.drop(columns=egrids_list, inplace=True)

    output_dir = f'/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/{future_cases[0]}_base/'
    os.makedirs(output_dir, exist_ok =True)

    # Plotting
    ax = emissions_base.plot(kind='bar', figsize=(10, 6))
    ax.set_ylabel("Total Emissions")
    ax.set_title("Total Difference Emissions: future eGRID scenario - current_2020")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.legend(title="eGRID scenarios")
    plt.grid(True)
    plt.savefig(os.path.join(output_dir, 'Total_Difference_based_on_current_2020.png'), dpi=300, bbox_inches='tight')
    plt.show()

## Plot the current_2020_base emissions based on GEA regions

Note that the current_2020_base has different emissions than NEI.

In [None]:
import pandas as pd
import geopandas as gpd
import os

run_name = 'current_2020_base'
DataCenter_emis_file = f'/Users/yunhalee/Documents/LOCAETA/DataCenter_emissions/{run_name}.shp'
final_output_dir = f'/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/{run_name}/'

# Make sure the figure directory for this run exists
os.makedirs(final_output_dir, exist_ok=True)

# Read the emission scenario and give it a clean 0..n-1 index
gdf_emis = gpd.read_file(DataCenter_emis_file).reset_index(drop=True)
gdf_emis.head()


In [None]:
import matplotlib.pyplot as plt 
import numpy as np 

# emissions column names: scenario columns and their NEI counterparts
cs_pollutants = ['NOx','PM2_5', 'VOC','NH3', 'SOx']
nei_pollutants = [f'{poll}_nei' for poll in cs_pollutants]

# One grouped bar chart per pollutant: scenario vs. NEI totals per region
for cs_col, nei_col in zip(cs_pollutants, nei_pollutants):
    # grouping and summing emissions by each cambium_gea region
    # ('cambium_ge' is the column name as read back from the shapefile)
    grouped_sum = gdf_emis.groupby('cambium_ge')[[cs_col, nei_col]].sum().reset_index()

    print(grouped_sum)

    regions = grouped_sum['cambium_ge'].tolist()
    x = np.arange(len(regions))
    width = 0.35 # bar width

    fig, ax = plt.subplots(figsize = (10, 6))
    # The scenario bars are labelled with the plain column name; the former
    # '_ccs' suffix did not describe these data center emissions.
    bars1 = ax.bar(x-width/2, grouped_sum[cs_col], width, label =f'{cs_col}')
    bars2 = ax.bar(x+width/2, grouped_sum[nei_col], width, label = f'{nei_col}')

    # Add rotated value labels, anchored at half the bar height
    for bars in [bars1, bars2]:
        for bar in bars:
            height = bar.get_height()
            ax.annotate(f'{height:,.0f}',
                        xy = (bar.get_x() + bar.get_width()/2, height/2),
                        xytext = (0, 3),
                        textcoords = 'offset points',
                        ha = 'center',
                        va = 'bottom', fontsize=9, rotation = 90,
                        )

    ax.set_xlabel('Cambium_gea region')
    ax.set_ylabel('Total Emissions [tons/yr]')
    ax.set_title(f'Total {cs_col} Emissions by Cambium_gea Region')
    ax.set_xticks(x)
    ax.set_xticklabels(regions, rotation =45, ha='right')
    ax.legend()

    plt.tight_layout()
    plt.savefig(final_output_dir + f'Total_Difference_{cs_col}_by_regions.png', dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
import pandas as pd
import geopandas as gpd
import os
import matplotlib.pyplot as plt 
import numpy as np 

def _label_bars(ax, bar_groups, *, inside):
    """Annotate every bar in *bar_groups* with its height (no decimals).

    With ``inside=True`` the text is anchored at half the bar height and
    rotated 90 degrees; with ``inside=False`` it is anchored at the bar top
    and left unrotated.
    """
    for bars in bar_groups:
        for bar in bars:
            height = bar.get_height()
            x_center = bar.get_x() + bar.get_width()/2
            if inside:
                ax.annotate(f'{height:,.0f}',
                            xy=(x_center, height/2),
                            xytext=(0, 3),
                            textcoords='offset points',
                            ha='center',
                            va='bottom', fontsize=9, rotation=90)
            else:
                ax.annotate(f'{height:,.0f}',
                            xy=(x_center, height),
                            xytext=(0, 3),
                            textcoords='offset points',
                            ha='center', va='bottom', fontsize=9)


def plot_diff_DC_emis(gdf_emis, output_dir, emis_region):
    """Plot base vs. sensitivity emission totals and save them as a PNG.

    For each pollutant (PM2_5, NOx, SOx, VOC, NH3) the ``<pollutant>_base``
    and ``<pollutant>_sens`` columns are summed per ``cambium_ge_sens``
    value.  Pollutants whose grouping yields more than one region get one
    bar-chart panel each (regions sorted by descending base total);
    pollutants with a single region are combined into one chart with the
    pollutants on the x axis.  The current figure is written to
    ``<output_dir>/Total_Difference.png`` and shown.

    Parameters
    ----------
    gdf_emis : DataFrame or GeoDataFrame
        Must contain ``cambium_ge_sens`` plus the ``_base``/``_sens`` columns
        listed above.
    output_dir : str
        Existing directory that receives ``Total_Difference.png``.
    emis_region : str
        Label used in the title of the single-region chart.

    Returns
    -------
    None
        If there are no rows (or no groups) to plot, a message is printed and
        nothing is drawn or saved.
    """
    # Nothing differs -> nothing to draw; avoid saving a blank image.
    if gdf_emis.empty:
        print(f'No emission rows to plot for {emis_region}; skipping figure.')
        return

    # emissions column names
    pollutants = ['PM2_5', 'NOx','SOx', 'VOC','NH3']
    base_pollutants = [f'{poll}_base' for poll in pollutants]
    sens_pollutants = [f'{poll}_sens' for poll in pollutants]

    # Single-region totals, one row, columns added pollutant by pollutant
    grouped_combined = pd.DataFrame()
    # pollutant position -> grouped totals, for pollutants spanning >1 region
    # (cached so the groupby is not repeated when plotting)
    multi_region = {}

    for i, (base_col, sens_col) in enumerate(zip(base_pollutants, sens_pollutants)):
        # grouping and summing emissions by each cambium_gea region
        grouped_sum = gdf_emis.groupby('cambium_ge_sens')[[base_col, sens_col]].sum().reset_index()
        print(grouped_sum)

        if len(grouped_sum) > 1:
            multi_region[i] = grouped_sum
            continue

        # First time: initialize with region column
        if grouped_combined.empty:
            grouped_combined['cambium_ge_sens'] = grouped_sum['cambium_ge_sens']

        grouped_combined[base_col] = grouped_sum[base_col].values
        grouped_combined[sens_col] = grouped_sum[sens_col].values

    # E.g. every region key is missing, so the groupby produced no groups.
    if not multi_region and grouped_combined.empty:
        print(f'No regional totals to plot for {emis_region}; skipping figure.')
        return

    width = 0.35 # bar width

    if multi_region:
        n = len(multi_region)
        # squeeze=False always returns a 2-D array of Axes; without it a
        # single panel comes back as a bare Axes that cannot be iterated.
        fig, axes = plt.subplots(nrows=n, ncols=1, figsize=(10, 6*n), squeeze=False)

        for (idx, grouped_sum), ax in zip(multi_region.items(), axes[:, 0]):
            base_col = base_pollutants[idx]
            sens_col = sens_pollutants[idx]
            grouped_sum = grouped_sum.sort_values(by=base_col, ascending=False)
            x = np.arange(len(grouped_sum))

            bars1 = ax.bar(x-width/2, grouped_sum[base_col], width, label=base_col)
            bars2 = ax.bar(x+width/2, grouped_sum[sens_col], width, label=sens_col)

            # Value labels inside the bars
            _label_bars(ax, [bars1, bars2], inside=True)

            ax.set_xlabel('Cambium_gea region')
            ax.set_ylabel('Total Emissions [tons/yr]')
            col = base_col.replace("_base", "")
            ax.set_title(f'Total {col} Emissions by Cambium Regions')
            ax.set_xticks(x)
            ax.set_xticklabels(grouped_sum['cambium_ge_sens'], rotation =45, ha='right')
            ax.legend()

    if not grouped_combined.empty:

        base_vals = [grouped_combined[f'{p}_base'].iloc[0] for p in pollutants]
        sens_vals = [grouped_combined[f'{p}_sens'].iloc[0] for p in pollutants]

        x = np.arange(len(pollutants))

        fig, ax = plt.subplots(figsize=(10, 6))
        bars1 = ax.bar(x-width/2,  base_vals, width, label ='Base')
        bars2 = ax.bar(x+width/2, sens_vals, width, label = 'Sens')

        # Value labels above the bars
        _label_bars(ax, [bars1, bars2], inside=False)

        ax.set_xlabel('Pollutant')
        ax.set_ylabel('Total Emissions [tons/yr]')
        ax.set_title(f'Total Emissions by Pollutant (Base vs Sens) at {emis_region}')
        ax.set_xticks(x)
        ax.set_xticklabels(pollutants, rotation =45, ha='right')
        ax.legend()

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'Total_Difference.png'), dpi=300, bbox_inches='tight')
    plt.show()


# Plot the base vs. each region's emissions

In [None]:
DataCenter_emis_dir = '/Users/yunhalee/Documents/LOCAETA/DataCenter_emissions/' 
emis_output_dir = '/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/'
base_emis = 'current_2020_base.shp'
emis_regions = ['All','CAISO','PJM_East','MISO_Central','MISO_South','NorthernGrid_West','SPP_North'] # 

# Columns to merge on: together they match a base row to its sensitivity row
key_cols = ['EIS_ID', 'SCC', 'rel_point_', 'source_fil', 'was_mapped']

# read the base emission scenario once; every region is compared against it
gdf_base_emis = gpd.read_file(os.path.join(DataCenter_emis_dir, base_emis))
gdf_base_emis.reset_index(drop=True, inplace=True)
print(gdf_base_emis.head())


for emis_region in emis_regions: 
    # 'All' maps to the un-suffixed scenario; any other entry is a
    # region-specific scenario file and output directory.
    scenario = 'current_2020' if emis_region == 'All' else f'current_2020_{emis_region}'
    gdf_sens_emis = gpd.read_file(os.path.join(DataCenter_emis_dir, f'{scenario}.shp'))
    final_output_dir = os.path.join(emis_output_dir, scenario)
    gdf_sens_emis.reset_index(drop=True, inplace=True)
    print(gdf_sens_emis.head())

    # Make the output dir if it does not exist
    os.makedirs(final_output_dir, exist_ok=True)

    # Columns to compare (everything except keys + geometry)
    compare_cols = gdf_sens_emis.columns.difference(key_cols + ['geometry'])

    # Merge on the key columns; every compared column appears twice, with
    # `_base` and `_sens` suffixes.
    # NOTE(review): this is an inner merge, so rows present in only one of the
    # two files are dropped from the comparison -- confirm that is intended.
    merged = gdf_base_emis[key_cols + list(compare_cols)].merge(
        gdf_sens_emis[key_cols + list(compare_cols)],
        on=key_cols,
        suffixes=('_base', '_sens')
    )

    # Create mask where any compare column differs.  NaN != NaN evaluates to
    # True, so a value missing on both sides is explicitly excluded; otherwise
    # every row with a missing field would be reported as changed.
    base_vals = merged[[f"{col}_base" for col in compare_cols]].to_numpy()
    sens_vals = merged[[f"{col}_sens" for col in compare_cols]].to_numpy()
    both_missing = pd.isna(base_vals) & pd.isna(sens_vals)
    diff_mask = ((base_vals != sens_vals) & ~both_missing).any(axis=1)

    # Get only differing rows
    diff_combined = merged[diff_mask]

    # Bring back geometry from base emissions
    diff_combined = diff_combined.merge(
        gdf_base_emis[key_cols + ['geometry']],
        on=key_cols,
        how='left'
    )
    # Make it a GeoDataFrame again
    diff_combined = gpd.GeoDataFrame(diff_combined, geometry='geometry')

    plot_diff_DC_emis(diff_combined, final_output_dir, emis_region)