## Debugging NEI emissions (for duplicates)

Date : May 6, 2025 

In the whole-USA scenario, the combination of EIS_ID and SCC is no longer a unique identifier, so some rows are duplicated, which results in incorrect final NEI-CCS emissions. The script below checks what causes the duplicates.

In [None]:
import geopandas as gpd
import os

# Note: the LA_CCS and CO_CCS scenarios have no duplicated (EIS_ID, SCC) pairs

# Load the emission scenario to inspect
# (alternative input, kept for reference)
#  '/Users/yunhalee/Documents/LOCAETA/RCM/INMAP/evaldata_v1.6.1/2020_nei_emissions/combined_NEI2020_pt_oilgas_ptegu_ptnonipm.shp'
gdf_debug = gpd.read_file('/Users/yunhalee/Documents/LOCAETA/CS_emissions/USA_point_CCS.shp')

debug_output_dir = '/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/emissions/USA_CCS/'

# Replace the index with a fresh 0..N-1 range so rows can be compared by position
gdf_debug = gdf_debug.reset_index(drop=True)

# Spot checks of individual facilities. Earlier checks filtered on both IDs:
#print(gdf_debug[(gdf_debug['EIS_ID'].astype(int) == 715711) & (gdf_debug['SCC'].astype(int) == 10100601)])
#print(gdf_debug[(gdf_debug['EIS_ID'].astype(int) == 12611) & (gdf_debug['SCC'].astype(int) == 10200602)])
print(gdf_debug.loc[gdf_debug['EIS_ID'].astype(int).eq(15662811)])

print(gdf_debug.loc[gdf_debug['EIS_ID'].astype(int).eq(3982711)])

# ptnonipm_2 has PM2_5 0.99218
# ptegu_1 has PM2_5 9.899564

In [None]:
# Identify (EIS_ID, SCC) pairs that appear on more than one row
duplicate_keys = (
    gdf_debug.groupby(['EIS_ID', 'SCC'])
    .size()
    .reset_index(name='count')
    .query('count > 1')[['EIS_ID', 'SCC']]
)

# Record each row's gdf_debug index label in 'row_key' BEFORE merging.
# merge() returns a frame with a fresh 0..N-1 index, so reading the index
# after the merge would not identify rows of gdf_debug, and any later removal
# of these rows from gdf_debug by row_key would drop the wrong rows.
duplicates = (
    gdf_debug.assign(row_key=gdf_debug.index)
    .merge(duplicate_keys, on=['EIS_ID', 'SCC'], how='inner')
)

# Case 1: Same EIS_ID, SCC, ghgrp_faci but different PM2_5
case1_keys = (
    duplicates.groupby(['EIS_ID', 'SCC', 'ghgrp_faci'])['PM2_5']
    .nunique()
    .reset_index(name='pm25_variety')
    .query('pm25_variety > 1')[['EIS_ID', 'SCC', 'ghgrp_faci']]
)
case1 = duplicates.merge(case1_keys, on=['EIS_ID', 'SCC', 'ghgrp_faci'])
case1_row_keys = set(case1['row_key'])

# Exclude Case 1 rows before doing Case 2
case_others = duplicates[~duplicates['row_key'].isin(case1_row_keys)]

# Case 2: Multiple ghgrp_faci for the same NEI (EIS_ID + SCC)
case2_keys = (
    case_others.groupby(['EIS_ID', 'SCC'])['ghgrp_faci']
    .nunique()
    .reset_index(name='ghgrp_faci_count')
    .query('ghgrp_faci_count > 1')[['EIS_ID', 'SCC']]
)
case2 = case_others.merge(case2_keys, on=['EIS_ID', 'SCC'])
case2_row_keys = set(case2['row_key'])

# Case 3: Remaining (All IDs are same but two subparts, C and D, results in different NH3/VOC increase)
remaining_row_keys = set(duplicates['row_key']) - case1_row_keys - case2_row_keys
case3 = duplicates[duplicates['row_key'].isin(remaining_row_keys)]

# Output: the three cases partition the duplicates, so the last two lines
# printed below should report the same number of rows as the first
print(f"Total Duplicates: {len(duplicates)} rows")
print(f"Case 1: {len(case1)} rows")
print(f"Case 2: {len(case2)} rows")
print(f"Case 3: {len(case3)} rows")
print(f"Sum of all cases: {len(case1) + len(case2) + len(case3)} rows")

# Sort each case by EIS_ID and SCC so rows of the same source sit together
case1 = case1.sort_values(by=['EIS_ID', 'SCC'], ascending=[True, True])
case2 = case2.sort_values(by=['EIS_ID', 'SCC'], ascending=[True, True])
case3 = case3.sort_values(by=['EIS_ID', 'SCC'], ascending=[True, True])

# Save each case for manual inspection
case1.to_csv(debug_output_dir + 'Case1_one_ghgrp_fac_multiple_NEI_duplicates.csv', index=False)
case2.to_csv(debug_output_dir + 'Case2_more_than_one_ghgrp_faci_per_NEI_duplicates.csv', index=False)
case3.to_csv(debug_output_dir + 'Case3_other_duplicates.csv', index=False)

In [None]:
import pandas as pd

def solve_case1_duplicates(case1_df):
    """
    Case 1 Solution: Split CCS emissions proportionally based on NEI emissions weights

    For the same EIS_ID, SCC, and ghgrp_faci, NEI-SMOKE processed emissions are split
    into multiple rows, each carrying the same CCS output value. Every CCS output
    column is multiplied by the row's share of the group total of the matching NEI
    column, so the group's CCS values add up to the original value again. If the
    group total of the NEI column is not positive, the value is split equally.

    Args:
        case1_df: DataFrame with 'EIS_ID', 'SCC', 'ghgrp_faci', the NEI columns
            ('VOC', 'NOx', 'NH3', 'SOx', 'PM2_5') and any of the CCS output columns
            ('VOC_out_su', 'NOX_out_su', 'NH3_out_su', 'SO2_out_su', 'PM25_out_s').

    Returns:
        DataFrame with one row per grouped input row (groups in sorted key order,
        original index labels kept). NEI columns are unchanged. Rows whose group
        key contains a missing value are not returned, because groupby skips them.
        For an input without rows, an empty copy of the input is returned so the
        column layout is preserved.
    """
    print("Solving Case 1 duplicates...")

    # CCS output column -> NEI-SMOKE column that supplies the split weights
    emission_mapping = {
        'VOC_out_su': 'VOC',
        'NOX_out_su': 'NOx',
        'NH3_out_su': 'NH3',
        'SO2_out_su': 'SOx',
        'PM25_out_s': 'PM2_5'
    }

    solved_rows = []

    for _, group_df in case1_df.groupby(['EIS_ID', 'SCC', 'ghgrp_faci']):
        if len(group_df) == 1:
            # Single row, no splitting needed
            solved_rows.append(group_df.iloc[0])
            continue

        # The group totals do not depend on the row, so compute them once per group
        group_totals = {
            new_col: group_df[orig_col].sum()
            for new_col, orig_col in emission_mapping.items()
            if new_col in group_df.columns
        }
        equal_share = 1 / len(group_df)

        for _, row in group_df.iterrows():
            new_row = row.copy()

            for new_col, total_orig_emission in group_totals.items():
                if pd.isna(new_row[new_col]):
                    continue  # nothing to split for this species

                if total_orig_emission > 0:
                    weight = row[emission_mapping[new_col]] / total_orig_emission
                else:
                    weight = equal_share  # Equal split if no original emission

                new_row[new_col] = new_row[new_col] * weight

            # NEI columns are left untouched
            solved_rows.append(new_row)

    if not solved_rows:
        # Keep the schema instead of returning a frame without columns
        return case1_df.iloc[0:0].copy()

    return pd.DataFrame(solved_rows)


# CCS "increase" columns that are added together when duplicate rows are merged
_SUMMED_INCREASE_COLS = ('NH3_out_su', 'VOC_out_su')

# CCS output column -> NEI column, for the species that are NOT summed
_UNSUMMED_TO_ORIGINAL_COL = {
    'NOX_out_su': 'NOx',
    'SO2_out_su': 'SOx',
    'PM25_out_s': 'PM2_5'
}


def _warn_if_inconsistent(group_key, group_df):
    """Print a warning for each un-summed species whose values differ within the group."""
    for new_col, orig_col in _UNSUMMED_TO_ORIGINAL_COL.items():
        if new_col not in group_df.columns or orig_col not in group_df.columns:
            continue

        new_values = group_df[new_col].dropna().unique()
        orig_values = group_df[orig_col].dropna().unique()

        if len(new_values) > 1 or len(orig_values) > 1:
            print(f"Warning: Inconsistent values in group {group_key} for {new_col}/{orig_col}")
            print(f"  {new_col} values: {new_values}")
            print(f"  {orig_col} values: {orig_values}")


def _consolidate_by_nei_source(df, label_col=None):
    """
    Merge the rows of every (EIS_ID, SCC) group of `df` into a single row.

    The merged row starts as the group's first row, so every column that is not
    summed keeps its first value. NH3/VOC increase columns are replaced by the
    group sum. When `label_col` is given, that column receives the ';'-joined
    unique values of the group so the merged sources stay traceable.
    Single-row groups are passed through unchanged.
    """
    solved_rows = []

    for group_key, group_df in df.groupby(['EIS_ID', 'SCC']):
        if len(group_df) == 1:
            # Single row, no consolidation needed
            solved_rows.append(group_df.iloc[0])
            continue

        _warn_if_inconsistent(group_key, group_df)

        # Object dtype lets the row hold the joined label string even when
        # all columns of the frame are numeric
        consolidated_row = group_df.iloc[0].astype(object)

        for col in _SUMMED_INCREASE_COLS:
            if col in group_df.columns:
                consolidated_row[col] = group_df[col].sum()

        if label_col is not None:
            consolidated_row[label_col] = ';'.join(group_df[label_col].astype(str).unique())

        solved_rows.append(consolidated_row)

    if not solved_rows:
        # Keep the schema instead of returning a frame without columns
        return df.iloc[0:0].copy()

    return pd.DataFrame(solved_rows)


def solve_case2_duplicates(case2_df):
    """
    Case 2 Solution: Sum NH3 and VOC increase emissions across multiple ghgrp facilities

    For the same EIS_ID and SCC, multiple ghgrp facilities may be linked. Each such
    group is collapsed into one row: 'NH3_out_su' and 'VOC_out_su' are summed, all
    other columns keep the value of the group's first row, and 'ghgrp_faci' becomes
    the ';'-joined list of the combined facility IDs.

    A warning is printed when NOx/SO2/PM2.5 values (CCS output or NEI column) differ
    within a group; the data is not changed by that check.

    Args:
        case2_df: DataFrame with 'EIS_ID', 'SCC' and 'ghgrp_faci' columns.

    Returns:
        DataFrame with one row per (EIS_ID, SCC) group, or an empty copy of the
        input when it has no rows.
    """
    print("Solving Case 2 duplicates...")
    return _consolidate_by_nei_source(case2_df, label_col='ghgrp_faci')


def solve_case3_duplicates(case3_df):
    """
    Case 3 Solution: Sum NH3 and VOC increase emissions from both subparts

    Handled like Case 2: this duplicate comes from having two subparts (C and D),
    which result in different NH3 and VOC emissions increase. Each (EIS_ID, SCC)
    group is collapsed into one row: 'NH3_out_su' and 'VOC_out_su' are summed and
    all other columns keep the value of the group's first row. If a 'subpart'
    column exists, it becomes the ';'-joined list of the combined subparts.

    A warning is printed when NOx/SO2/PM2.5 values (CCS output or NEI column) differ
    within a group; the data is not changed by that check.

    Args:
        case3_df: DataFrame with 'EIS_ID' and 'SCC' columns.

    Returns:
        DataFrame with one row per (EIS_ID, SCC) group, or an empty copy of the
        input when it has no rows.
    """
    print("Solving Case 3 duplicates...")
    label_col = 'subpart' if 'subpart' in case3_df.columns else None
    return _consolidate_by_nei_source(case3_df, label_col=label_col)


def apply_duplicate_solutions(gdf_debug, debug_output_dir):
    """
    Main function to apply all duplicate solutions and update the original dataframe

    Rows of `gdf_debug` whose (EIS_ID, SCC) pair occurs more than once are removed
    and replaced by the solved Case 1, Case 2 and Case 3 rows. The solved cases are
    also written to 'Case1_solved.csv', 'Case2_solved.csv' and 'Case3_solved.csv'
    under `debug_output_dir`, and row counts are printed for each step.

    Relies on the notebook-level frames `case1`, `case2` and `case3`, which must
    have been derived from the same `gdf_debug`.

    Args:
        gdf_debug: emissions table containing 'EIS_ID' and 'SCC' columns.
        debug_output_dir: output directory path as a string; file names are
            appended directly, so it must end with a path separator.

    Returns:
        DataFrame of the non-duplicated rows followed by the solved rows, with a
        new 0..N-1 index.
    """
    key_cols = ['EIS_ID', 'SCC']

    # Flag the duplicated rows directly on gdf_debug rather than through row
    # keys taken from a merged frame: a merge result carries its own 0..N-1
    # index, so its labels do not identify rows of gdf_debug.
    # Rows with a missing key are never flagged, matching the groupby-based
    # duplicate detection, which skips missing keys.
    has_complete_key = gdf_debug[key_cols].notna().all(axis=1)
    is_duplicate = gdf_debug.duplicated(subset=key_cols, keep=False) & has_complete_key

    # Remove duplicates from original dataframe
    gdf_clean = gdf_debug[~is_duplicate].copy()

    # Solve each case, then drop the helper 'row_key' column if present
    # (it is only used for tracking rows while classifying the duplicates)
    case1_solved, case2_solved, case3_solved = (
        solved.drop(columns='row_key', errors='ignore')
        for solved in (
            solve_case1_duplicates(case1),
            solve_case2_duplicates(case2),
            solve_case3_duplicates(case3),
        )
    )

    # Combine solved cases back to clean dataframe
    gdf_final = pd.concat([gdf_clean, case1_solved, case2_solved, case3_solved],
                         ignore_index=True)

    # Save solved cases for verification
    case1_solved.to_csv(debug_output_dir + 'Case1_solved.csv', index=False)
    case2_solved.to_csv(debug_output_dir + 'Case2_solved.csv', index=False)
    case3_solved.to_csv(debug_output_dir + 'Case3_solved.csv', index=False)

    print(f"Original dataframe: {len(gdf_debug)} rows")
    print(f"After removing duplicates: {len(gdf_clean)} rows")
    print(f"Case 1 solved: {len(case1_solved)} rows")
    print(f"Case 2 solved: {len(case2_solved)} rows")
    print(f"Case 3 solved: {len(case3_solved)} rows")
    print(f"Final dataframe: {len(gdf_final)} rows")

    return gdf_final




In [None]:
# Run the case-specific fixes and rebuild the full emissions table
gdf_final = apply_duplicate_solutions(gdf_debug, debug_output_dir)

# Count the rows per (EIS_ID, SCC) pair and keep the pairs that still occur more than once.
# NOTE(review): Case 1 rows are split, not merged, so their pairs presumably still show up here - confirm.
final_duplicates = (
    gdf_final.groupby(['EIS_ID', 'SCC'])
    .size()
    .reset_index(name='count')
    .loc[lambda pair_counts: pair_counts['count'] > 1]
)

print(f"Remaining duplicates after solution: {len(final_duplicates)} groups")

# Write the cleaned table to disk
gdf_final.to_csv(debug_output_dir + 'gdf_final_no_duplicates.csv', index=False)