# Preparing Data Center emissions into NEI-SMOKE format


This is the emission processing for the second "prong" of LOCAETA's three decarbonization strategies, electrification. The goal of the study is to assess the impact of changes in energy demand under a given grid scenario (e.g., the current and 2050 grids). For example, how much will emissions increase at power plants in the region of the facility(ies)? We explored a hypothetical scenario which assumes that each power plant in the region (region defined by NREL’s Cambium model) marginally increases its output to collectively meet the additional 300 MW load that would be incurred if the data center were connected to the grid.

The data center emissions in the input dataframe are computed for each power plant facility (EIS ID is a unique identifier). To include these emissions in NEI-SMOKE formatted emissions, I need to split them using the 2020 NEI emissions (per EIS ID and per SCC). 

Here are the emission scenarios considered and the strategy I use to prepare NEI-SMOKE style emissions for each scenario: 

* current_2020  - emissions can be prepared by splitting into each SCC by the NEI 2020 emissions weight



## Step 1: Read Data Center emissions and NEI-SMOKE all point source shapefile

In [None]:
import geopandas as gpd
from pyproj import CRS
import os, sys

# Add the path to the main package directory
package_path = os.path.abspath('/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/LOCAETA_AQ')
if package_path not in sys.path:
    sys.path.append(package_path)

import emission_processing

# Load the combined NEI 2020 point-source shapefile and give it a clean 0..n-1 index
original_emis ='/Users/yunhalee/Documents/LOCAETA/RCM/INMAP/evaldata_v1.6.1/2020_nei_emissions/combined_NEI2020_pt_oilgas_ptegu_ptnonipm_w_sectors.shp'
nei_all_pt = gpd.read_file(original_emis).reset_index(drop=True)

# Tag the original NEI pollutant columns with "_nei" so the plain pollutant
# names are free to hold the re-scaled emissions computed later
pollutant_cols = ['NOx','PM2_5', 'VOC','NH3', 'SOx']
col_dict = {poll: f'{poll}_nei' for poll in pollutant_cols}
nei_all_pt = nei_all_pt.rename(columns=col_dict)

nei_all_pt.head()

In [None]:
import pandas as pd
import geopandas as gpd
import os

def reformat_DataCenter(df):
    """
    Trims a raw data center emissions table down to the columns used downstream.

    Rows without an 'eis' value are dropped, only the '*_tons_final' and
    '*_tons_base' columns plus 'eis' and 'cambium_gea' are kept, 'eis' is cast
    to int64 and renamed to 'EIS_ID'. The first rows of the result are printed.

    Args:
        df: pandas.DataFrame containing at least 'eis' and 'cambium_gea'

    Returns:
        A new pandas.DataFrame; the input frame is left untouched
    """
    final_cols = [name for name in df.columns if '_tons_final' in name]
    baseline_cols = [name for name in df.columns if '_tons_base' in name]
    wanted = [*final_cols, *baseline_cols, 'eis', 'cambium_gea']

    cleaned = (
        df.dropna(subset=['eis'])[wanted]   # facilities without an EIS id cannot be matched
        .astype({'eis': 'int64'})           # safe only after the NaN rows are gone
        .rename(columns={'eis': 'EIS_ID'})  # same key name as the NEI table
    )

    print(cleaned.head())

    return cleaned

from itertools import combinations

def find_minimal_unique_identifier_columns(df, max_combination_size=30):
    """
    Finds the minimal set of columns that uniquely identify rows in a DataFrame.

    Combinations are tried in order of increasing size (and, within a size, in
    column order), so the first match is returned.

    Args:
        df: pandas.DataFrame
        max_combination_size: int, maximum number of columns to consider in combinations (avoid long runtime)

    Returns:
        List of column names or None
    """
    cols = df.columns.tolist()
    if not cols:
        return None

    # If two rows agree on every column, no subset of columns can tell them
    # apart. Bail out now instead of walking an exponential number of
    # combinations that are all guaranteed to fail.
    if df.duplicated().any():
        return None

    for r in range(1, min(len(cols), max_combination_size) + 1):
        for combo in combinations(cols, r):
            # Pass a list: pandas treats a tuple as a single label when such a
            # label exists among the columns.
            if not df.duplicated(subset=list(combo)).any():
                return list(combo)
    return None

def mapping_DataCenter_to_nei(nei_with_DataCenter, nei_all_pt, unique_identifier_columns, is_base):
    """
    Splits facility-level data center emissions across NEI rows and merges them into the full NEI table.

    For every pollutant, each row's share of its facility (EIS_ID) total in the
    '<pollutant>_nei' column is used as a split factor for the facility-level
    data center value. Facilities whose NEI total is zero but whose data center
    value is non-zero are split equally across their rows.

    Side effect: `nei_with_DataCenter` is modified in place. It gains
    '<pollutant>_total_by_eis', '<pollutant>_split' and '<pollutant>' columns,
    which callers use for debugging output.

    Args:
        nei_with_DataCenter: DataFrame of NEI rows already merged with the data
            center columns on 'EIS_ID'
        nei_all_pt: full NEI DataFrame holding the '<pollutant>_nei' columns
        unique_identifier_columns: list of column names that uniquely identify
            rows of `nei_with_DataCenter`; used as the merge key
        is_base: bool, use the '*_tons_base' columns if True, otherwise '*_tons_final'

    Returns:
        Copy of `nei_all_pt` with one '<pollutant>' column per pollutant: the
        split data center emission where available, else the NEI value

    Raises:
        ValueError: if `unique_identifier_columns` is None or empty
    """
    if not unique_identifier_columns:
        raise ValueError(
            "unique_identifier_columns is empty: cannot merge the split emissions "
            "back into the NEI table without a unique row key"
        )
    unique_identifier_columns = list(unique_identifier_columns)

    # Column mapping between NEI pollutant names and the data center columns
    suffix = 'tons_base' if is_base else 'tons_final'
    pollutant_map = {
        'NOx': f'NOx_{suffix}',
        'PM2_5': f'PM2.5_{suffix}',
        'VOC': f'VOC_{suffix}',
        'NH3': f'NH3_{suffix}',
        'SOx': f'SO2_{suffix}',
    }

    # Number of NEI rows per facility, needed for the equal-split fallback
    rows_per_eis = nei_with_DataCenter.groupby('EIS_ID')['EIS_ID'].transform('size')

    # Compute and apply split factors per pollutant
    for nei_col, DataCenter_col in pollutant_map.items():

        print (nei_col, DataCenter_col)
        # Group sum for each pollutant by EIS_ID
        total_by_eis = nei_with_DataCenter.groupby('EIS_ID')[f'{nei_col}_nei'].transform('sum')
        nei_with_DataCenter[f'{nei_col}_total_by_eis'] = total_by_eis

        # Default split factor: NEI share of the facility total. Zero totals
        # become NaN via .where(), which keeps the column a float dtype
        # (replacing with pd.NA can turn it into object dtype).
        split = nei_with_DataCenter[f'{nei_col}_nei'] / total_by_eis.where(total_by_eis != 0)

        # Facilities with a zero NEI total but a non-zero data center value
        # have no weights to use, so split equally across all of their rows.
        mask_zero_total = (
            (total_by_eis == 0)
            & nei_with_DataCenter[DataCenter_col].notna()
            & (nei_with_DataCenter[DataCenter_col] != 0)
        )
        equal_split_ids = nei_with_DataCenter.loc[mask_zero_total, 'EIS_ID'].unique()
        equal_split_rows = nei_with_DataCenter['EIS_ID'].isin(equal_split_ids)
        split = split.mask(equal_split_rows, 1.0 / rows_per_eis)

        split_col = f'{nei_col}_split'
        nei_with_DataCenter[split_col] = split

        # Scaled emissions are stored under the original NEI pollutant name
        nei_with_DataCenter[nei_col] = split * nei_with_DataCenter[DataCenter_col]

    # Merge results back into the full NEI dataset
    nei_all_pt_final = nei_all_pt.merge(
        nei_with_DataCenter[unique_identifier_columns + list(pollutant_map)],
        on=unique_identifier_columns,
        how='left'
    )

    # Rows without a data center value keep their NEI emission
    for k in pollutant_map:
        nei_all_pt_final[k] = nei_all_pt_final[k].fillna(nei_all_pt_final[f'{k}_nei'])

    return nei_all_pt_final

In [None]:


DataCenter_dir_path = '/Users/yunhalee/Documents/LOCAETA/DataCenter_emissions/'
DataCenter_list = ["current_2020"] 
is_base_emission = False

# Optional: Set region to a specific cambium_gea value, or set to None to skip subsetting
subset_region = "NorthernGrid_West"  # "MISO_South" # "SPP_North" #  "PJM_East" # "MISO_Central" # "CAISO"  #  ,  # Example: "WECC", "ERCOT", etc. Set to None to process all regions

# Marks outputs built from the '*_tons_base' columns
emission_tag = "_base" if is_base_emission else ""

for DataCenter_name in DataCenter_list:

    print ("processing ", DataCenter_name)
    DataCenter_file = os.path.join(DataCenter_dir_path, f'300MW_national_{DataCenter_name}.csv')

    egrid = pd.read_csv(DataCenter_file) 
    egrid = reformat_DataCenter(egrid)

    if subset_region:
        egrid = egrid[egrid['cambium_gea'] == subset_region]
        if egrid.empty:
            print(f"No records found for region '{subset_region}' in {DataCenter_name}")
            continue
        # The suffix carries its own leading underscore; it is appended to the
        # file names below without adding another one.
        output_suffix = f"_{subset_region}"
    else:
        output_suffix = ""

    # Filter NEI rows to only those that exist in eGRID
    nei_with_DataCenter = nei_all_pt[nei_all_pt['EIS_ID'].isin(egrid['EIS_ID'])].copy()

    # Subset only for necessary columns
    nei_with_DataCenter.drop(columns=['height', 'diam',
        'temp', 'velocity'], inplace=True)

    unique_identifier_columns = find_minimal_unique_identifier_columns(nei_with_DataCenter)

    if unique_identifier_columns:
        print("Columns that uniquely identify rows:", unique_identifier_columns)
    else:
        # Without a unique key the split emissions cannot be merged back into
        # the full NEI table, so skip this scenario.
        print("No combination of columns uniquely identifies rows.")
        continue

    print("filtering", nei_with_DataCenter.shape)

    # Merge eGRID emissions
    nei_with_DataCenter = nei_with_DataCenter.merge(egrid, on='EIS_ID', how='left')

    print("Merging egrid", nei_with_DataCenter.shape)

    nei_all_pt_final = mapping_DataCenter_to_nei(nei_with_DataCenter, nei_all_pt, unique_identifier_columns, is_base = is_base_emission)

    # Save outputs with region suffix. With no region the names reduce to
    # '<scenario>.shp' / '<scenario>_base.shp', the names the evaluation cell reads.
    filepath = os.path.join(DataCenter_dir_path, f"{DataCenter_name}{emission_tag}_debugging{output_suffix}.csv")
    nei_with_DataCenter.to_csv(filepath)

    filepath = os.path.join(DataCenter_dir_path, f"{DataCenter_name}{emission_tag}{output_suffix}.shp")
    nei_all_pt_final.to_file(filepath, driver='ESRI Shapefile')

    print(nei_all_pt_final.shape, nei_all_pt.shape)
    print(nei_all_pt_final['PM2_5'].notna().sum())
    print(nei_with_DataCenter['PM2_5'].notna().sum())
    print(nei_with_DataCenter['PM2.5_tons_final'].notna().sum())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# NEI pollutant columns (the originals carry the "_nei" suffix)
pollutants = [f"{name}_nei" for name in ('VOC', 'NOx', 'SOx', 'PM2_5', 'NH3')]
print(pollutants)

# Step 1: collect the distinct (EIS_ID, SCC) pairs that appear in a "ptnonipm" source file
is_ptnonipm = nei_with_DataCenter['source_fil'].str.contains('ptnonipm', case=False, na=False)
ptnonipm_keys = nei_with_DataCenter.loc[is_ptnonipm, ['EIS_ID', 'SCC']].drop_duplicates()

# Step 2: keep every row (from any source file) whose (EIS_ID, SCC) pair is in that set
row_keys = pd.MultiIndex.from_frame(nei_with_DataCenter[['EIS_ID', 'SCC']])
mask = row_keys.isin(pd.MultiIndex.from_frame(ptnonipm_keys))
filtered_df = nei_with_DataCenter[mask].copy()

# Step 3: total each pollutant per source file
emissions_by_sector = filtered_df.groupby('source_fil')[pollutants].sum()

# Step 4: grouped bar chart, one bar per pollutant within each source file
ax = emissions_by_sector.plot(kind='bar', figsize=(12, 6))
ax.set_ylabel('Total Emissions (tons/year)')
ax.set_title('Total Emissions by Sector (where any EIS_ID+SCC includes "ptnonipm")')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()


In [None]:
# Show the rows kept by the ptnonipm (EIS_ID, SCC) filter for visual inspection
filtered_df

### Useful debugging script

In [None]:
# check the number of unique IDs
# (names are chosen so the scenario list `DataCenter_list` used by the
# processing loop is not overwritten with an array of IDs)
ID_name = 'EIS_ID'
org_DataCenter_ids = egrid[ID_name].unique()
matched_DataCenter_ids = nei_with_DataCenter[ID_name].unique()
org_nei_ids = nei_all_pt[ID_name].unique()
nei_ids = nei_all_pt_final[ID_name].unique()

print(len(org_DataCenter_ids), len(matched_DataCenter_ids), len(nei_ids), len(org_nei_ids))


# find the rows where a data center value exists but no split emission was computed
# ('PM2_5' is the column written by mapping_DataCenter_to_nei)
missing_pm25_DataCenter = nei_with_DataCenter[
    nei_with_DataCenter['PM2_5'].isna() & 
    nei_with_DataCenter['PM2.5_tons_final'].notna()
]

# filtering column based on string
print(missing_pm25_DataCenter.filter(regex ='tons_final|total_by_eis'))


# get a certain ID facility
print(nei_all_pt[(nei_all_pt['EIS_ID'] == 1028611)])

# Compare NEI2020 against Current grid base emissions

In [None]:
import matplotlib.pyplot as plt

# Column mapping between NEI and eGRID
pollutant_map = {
    'NOx': 'NOx_tons_base',
    'PM2_5': 'PM2.5_tons_base',
    'VOC': 'VOC_tons_base',
    'NH3': 'NH3_tons_base',
    'SOx': 'SO2_tons_base'

}

# One scatter plot per pollutant: NEI facility total (x) against the
# difference between the eGRID base value and that total (y)
for nei_col, DataCenter_col in pollutant_map.items():
    total_col = f'{nei_col}_total_by_eis'

    plt.figure(figsize=(6, 6))
    plt.scatter(
        nei_with_DataCenter[total_col],
        nei_with_DataCenter[DataCenter_col] - nei_with_DataCenter[total_col],
        alpha=0.5,
        # Labelled so plt.legend() has an entry to show
        label=f"{DataCenter_col} - {total_col}"
    )
    plt.xlabel(f"Total {nei_col} by EIS_ID in NEI")
    plt.ylabel(f"{DataCenter_col} - '{nei_col}_total_by_eis'")
    plt.title(f"{nei_col} NEI Total vs difference with eGRID base Emissions")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()



## Evaluate new egrid emissions formatted for NEI-SMOKE style

In [None]:
import geopandas as gpd
import os


DataCenter_dir_path = '/Users/yunhalee/Documents/LOCAETA/DataCenter_emissions/'
egrids_list = ["current_2020"]


# NEI pollutant name -> '*_tons_final' column of the data center CSV
pollutant_final_map = {
    'NOx': 'NOx_tons_final',
    'PM2_5': 'PM2.5_tons_final',
    'VOC': 'VOC_tons_final',
    'NH3': 'NH3_tons_final',
    'SOx': 'SO2_tons_final'
}

# NEI pollutant name -> '*_tons_base' column of the data center CSV
pollutant_base_map = {
    'NOx': 'NOx_tons_base',
    'PM2_5': 'PM2.5_tons_base',
    'VOC': 'VOC_tons_base',
    'NH3': 'NH3_tons_base',
    'SOx': 'SO2_tons_base'
}

is_base_emission = True

# nei emissions column names
pollutant_cols = ['NOx','PM2_5', 'VOC','NH3', 'SOx']

for DataCenter_name in egrids_list:

    if is_base_emission: 
        pollutant_map = pollutant_base_map
        emission_kind = 'base'
        file_path = os.path.join(DataCenter_dir_path, f'{DataCenter_name}_base.shp')  
    else:
        pollutant_map = pollutant_final_map
        emission_kind = 'final'
        file_path = os.path.join(DataCenter_dir_path, f'{DataCenter_name}.shp')  

    # read emission scenario
    final_DataCenter_emis = gpd.read_file(file_path) 

    # Reset index to ensure proper comparison
    final_DataCenter_emis.reset_index(drop=True, inplace=True)

    # Subset rows where actual egrid emissions are available (egrid ≠ nei for any pollutant)
    mask = pd.concat([
        final_DataCenter_emis[k] != final_DataCenter_emis[f'{k}_nei']
        for k in pollutant_cols
    ], axis=1).any(axis=1)

    final_DataCenter_emis = final_DataCenter_emis[mask]

    # Compute group sums for each pollutant by EIS_ID
    group_sums = final_DataCenter_emis.groupby('EIS_ID')[pollutant_cols].sum().reset_index()

    print ("processing ", DataCenter_name)
    original_DataCenter_file = os.path.join(DataCenter_dir_path, f'300MW_national_{DataCenter_name}.csv')
    original_DataCenter = pd.read_csv(original_DataCenter_file) 
    original_DataCenter = reformat_DataCenter(original_DataCenter)


    # Merge for comparison
    comparison_df = group_sums.merge(original_DataCenter, on='EIS_ID')

    # Scatter plots. The loop variable is not called `egrid` so the `egrid`
    # DataFrame created by the processing cell is not overwritten.
    for nei, DataCenter_col in pollutant_map.items():
        x = comparison_df[nei]
        y = comparison_df[DataCenter_col] - comparison_df[nei]

        plt.figure(figsize=(6, 6))
        # Labelled so plt.legend() has an entry to show
        plt.scatter(x, y, alpha=0.6, edgecolors='k', label=f'{DataCenter_col} - {nei}')
        plt.xlabel(f'{nei} NEI')
        plt.ylabel(f'{nei} {emission_kind} - nei')
        plt.ylim(-1, 1)
        plt.title(f'{nei} Comparison - {DataCenter_name}')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()


## Plot the total emissions (either final or diff_final) by Species for all egrid scenarios

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Dictionary to store total emissions: {pollutant: {scenario: total}}
DataCenter_sum = {}

# NEI pollutant name -> '*_tons_dif_final' column of the data center CSV
pollutant_diff_map = {
    'NOx': 'NOx_tons_dif_final',
    'PM2_5': 'PM2.5_tons_dif_final',
    'VOC': 'VOC_tons_dif_final',
    'NH3': 'NH3_tons_dif_final',
    'SOx': 'SO2_tons_dif_final'
}

# NEI pollutant name -> '*_tons_final' column of the data center CSV
pollutant_final_map = {
    'NOx': 'NOx_tons_final',
    'PM2_5': 'PM2.5_tons_final',
    'VOC': 'VOC_tons_final',
    'NH3': 'NH3_tons_final',
    'SOx': 'SO2_tons_final'
}

# Switch to pollutant_final_map to plot the final totals instead of the differences
pollutant_map = pollutant_diff_map

egrids_list = ["current_2020"]

for DataCenter_name in egrids_list: 
    print ("processing ", DataCenter_name)
    # DataCenter_dir_path is expected to be defined by an earlier cell
    original_DataCenter_file = os.path.join(DataCenter_dir_path, f'300MW_national_{DataCenter_name}.csv')
    original_DataCenter = pd.read_csv(original_DataCenter_file) 

    # Store totals per pollutant. The loop variable is not called `egrid` so
    # the `egrid` DataFrame created by the processing cell is not overwritten.
    for nei, DataCenter_col in pollutant_map.items():
        DataCenter_sum.setdefault(nei, {})[DataCenter_name] = original_DataCenter[DataCenter_col].sum()

    print(DataCenter_sum)

output_dir = f'/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/{egrids_list[0]}/'    
os.makedirs(output_dir, exist_ok =True)

# Convert to DataFrame: rows = pollutant, columns = egrid cases
emissions_df = pd.DataFrame(DataCenter_sum).T  # Transpose so pollutants are rows

# Plotting
ax = emissions_df.plot(kind='bar', figsize=(10, 6))
ax.set_ylabel("Total Emissions")
ax.set_title("Total Difference Emissions by Pollutant and eGRID Case")
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend(title="Data Center Scenario")
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'Total_Difference.png'), dpi=300, bbox_inches='tight')
plt.show()


## Emissions changes from current_2020

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os


# NEI pollutant name -> species prefix used in the data center CSV column names
_species_prefix = {
    'NOx': 'NOx',
    'PM2_5': 'PM2.5',
    'VOC': 'VOC',
    'NH3': 'NH3',
    'SOx': 'SO2',
}

# NEI pollutant name -> '*_tons_dif_final' column
pollutant_diff_map = {nei: f'{prefix}_tons_dif_final' for nei, prefix in _species_prefix.items()}

# NEI pollutant name -> '*_tons_final' column
pollutant_final_map = {nei: f'{prefix}_tons_final' for nei, prefix in _species_prefix.items()}

# NEI pollutant name -> '*_tons_base' column
pollutant_base_map = {nei: f'{prefix}_tons_base' for nei, prefix in _species_prefix.items()}

egrids_list = ["current_2020"] # ,"decarb95_2050","highREcost_2050"]

def convert_DataCenter_csv_to_df(egrids_list, pollutant_map):
    """
    Sums the requested data center emission columns for every scenario.

    Reads '300MW_national_<scenario>.csv' from the module-level
    `DataCenter_dir_path` for each scenario, totals the columns named in
    `pollutant_map`, and prints the running totals. It also creates the
    emissions output directory for the first scenario; the directory path
    itself is not returned.

    Args:
        egrids_list: list of scenario names (must not be empty)
        pollutant_map: dict mapping NEI pollutant name to the CSV column to sum

    Returns:
        pandas.DataFrame with pollutants as rows and scenarios as columns
    """
    totals_by_pollutant = {}

    for scenario in egrids_list:
        print ("processing ", scenario)
        csv_path = os.path.join(DataCenter_dir_path, f'300MW_national_{scenario}.csv')
        scenario_df = pd.read_csv(csv_path)

        # One total per (pollutant, scenario)
        for pollutant, source_col in pollutant_map.items():
            if pollutant not in totals_by_pollutant:
                totals_by_pollutant[pollutant] = {}
            totals_by_pollutant[pollutant][scenario] = scenario_df[source_col].sum()

        print(totals_by_pollutant)

    os.makedirs(
        f'/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/{egrids_list[0]}/',
        exist_ok=True,
    )

    # Outer dict keys become columns, so transpose to put pollutants on the rows
    return pd.DataFrame(totals_by_pollutant).T

emissions_diff = convert_DataCenter_csv_to_df(egrids_list, pollutant_diff_map)

# The helper above does not return its output directory, so define the
# directory here instead of relying on a variable left over from another cell.
output_dir = f'/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/{egrids_list[0]}/'
os.makedirs(output_dir, exist_ok=True)

# Plotting
ax = emissions_diff.plot(kind='bar', figsize=(10, 6))
ax.set_ylabel("Total Emissions")
ax.set_title("Total Difference Emissions by Pollutant and eGRID Case")
plt.xticks(rotation=45)
plt.tight_layout()
plt.legend(title="eGRID Case")
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'Total_Difference.png'), dpi=300, bbox_inches='tight')
plt.show()



In [None]:

emissions_base = convert_DataCenter_csv_to_df(egrids_list, pollutant_base_map)

# Differences are taken against this scenario; every other entry of
# egrids_list is treated as a future scenario.
baseline_name = 'current_2020'
future_names = [name for name in egrids_list if name != baseline_name]

if baseline_name not in emissions_base.columns or not future_names:
    # Nothing to compare, e.g. when egrids_list holds only the baseline
    print(f"Need '{baseline_name}' plus at least one other scenario in egrids_list; got {egrids_list}")
else:
    for name in future_names:
        emissions_base[f'{name} - {baseline_name}'] = emissions_base[name] - emissions_base[baseline_name]

    # Keep only the difference columns
    emissions_base.drop(columns=egrids_list, inplace=True)

    output_dir = f'/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/{future_names[0]}_base/' 
    os.makedirs(output_dir, exist_ok =True)

    # Plotting
    ax = emissions_base.plot(kind='bar', figsize=(10, 6))
    ax.set_ylabel("Total Emissions")
    ax.set_title("Total Difference Emissions: future eGRID scenario - current_2020")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.legend(title="eGRID scenarios")
    plt.grid(True)
    plt.savefig(os.path.join(output_dir, 'Total_Difference_based_on_current_2020.png'), dpi=300, bbox_inches='tight')
    plt.show()