## Current_2020 regional PM2.5 analysis

In [None]:
import geopandas as gpd
import numpy as np
import sys
import os

# Make the local LOCAETA_AQ package importable. The membership guard keeps
# sys.path free of duplicate entries when this cell is re-run.
package_path = os.path.abspath('/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/LOCAETA_AQ')
if package_path not in sys.path:
    sys.path.append(package_path)

import inmap_analysis

# Root folder of the InMAP outputs and root folder for the analysis products.
inmap_run_dir = '/Users/yunhalee/Documents/LOCAETA/RCM/INMAP/inmap-1.9.6-gridsplit/outputs/'
output_dir = '/Users/yunhalee/Documents/LOCAETA/LOCAETA_AQ/outputs/model_analysis/'

# Define pairs of base and sensitivity runs.
# The shapefile paths are handed to process_run_pair together with
# inmap_run_dir (presumably resolved relative to it -- confirm in inmap_analysis).
run_pairs = {
    'current_2020': {
        'base': 'current_2020_base/2020nei_output_run_steady.shp',
        'sens': 'current_2020/2020nei_output_run_steady.shp'
    }}

# Column names passed to determine_output_type to classify the output.
inmap_columns = ['AsianD', 'BlackD', 'LatinoD', 'NativeD', 'WhitNoLatD', 'TotalPopD']
source_receptor_columns = ['deathsK', 'deathsL']

# NOTE: gdf_diff is overwritten on every iteration, so the cells below only see
# the result of the LAST entry in run_pairs.
for run_name, paths in run_pairs.items():
    gdf_diff = inmap_analysis.process_run_pair(run_name, paths, inmap_run_dir)

    columns_list, output_type, area_weight_list = inmap_analysis.determine_output_type(
        gdf_diff, inmap_columns, source_receptor_columns)
    print(f"The data is from an {output_type} output.")

    # Create a directory for each run pair. exist_ok=True makes this safe to
    # re-run and avoids the check-then-create race of os.path.exists().
    run_output_dir = os.path.join(output_dir, run_name)
    os.makedirs(run_output_dir, exist_ok=True)

    # The sanity checks below only apply to 'inmap_run' outputs.
    if output_type == 'inmap_run':

        # Compare the mortality (PopD) changes against the PM2.5 changes.
        inmap_analysis.compare_pm25_mortality_changes(gdf_diff, run_output_dir, run_name)

        # Somehow one grid has larger mortality change than population:
        # flag every row where |TotalPopD_base| exceeds |TotalPop_base| ...
        suspicious_mortality = gdf_diff['TotalPopD_base'].abs() > gdf_diff['TotalPop_base'].abs()
        to_check = gdf_diff[suspicious_mortality]
        print("Rows to be deleted due to wrong mortality:\n", to_check)

        # ... and drop those rows from the data used by the following cells.
        gdf_diff = gdf_diff.drop(to_check.index)



In [None]:
# Display the column labels of gdf_diff (the frame produced by the run-pair loop).
gdf_diff.columns

In [None]:
import pandas as pd

# Region polygons used to aggregate the gridded results.
regions = gpd.read_file("/Users/yunhalee/Documents/LOCAETA/DataCenter_emissions/gea_mappings_and_shapefiles/transgrp/transgrp.shp")
print("regions", regions.head())

# Only the polygon and its region identifier are needed below.
regions = regions[['geometry','transgrp']]

# Ensure same CRS for spatial join
gdf_diff = gdf_diff.to_crs(regions.crs)

# Step 1: Spatial join of the grid cells to regions.
# how="inner" + predicate='within' keeps only cells lying entirely inside a
# region polygon; cells that straddle a region boundary are dropped.
inmap_with_region = gpd.sjoin(gdf_diff, regions, how="inner", predicate='within')

# Cell area used as the averaging weight. GeoSeries.area is computed in CRS
# units, so in a geographic (lon/lat) CRS it would be square degrees and the
# weights would be distorted by latitude. In that case measure the area in an
# equal-area projection instead (EPSG:5070, NAD83 / CONUS Albers, metres).
# A projected CRS is used as-is, exactly as before (assumes its unit is metres,
# which the /1e6 conversion to km2 relies on -- confirm for the regions file).
EQUAL_AREA_CRS = "EPSG:5070"
cell_geometry = inmap_with_region.geometry
if inmap_with_region.crs is not None and inmap_with_region.crs.is_geographic:
    cell_geometry = cell_geometry.to_crs(EQUAL_AREA_CRS)
inmap_with_region['area_km2'] = cell_geometry.area / 1e6

# Separate PM and death columns
pm_cols = ['TotalPM25', 'PrimPM25', 'PNH4', 'PNO3', 'PSO4', 'SOA']
death_cols = ['AsianD', 'BlackD', 'LatinoD', 'NativeD', 'WhitNoLatD', 'TotalPopD']


# --- Area-weighted mean for PM columns ---
# mean = sum(value * area) / sum(area), evaluated per region.
for col in pm_cols:
    inmap_with_region[f'{col}_times_area'] = inmap_with_region[col] * inmap_with_region['area_km2']

# Weighted sums and area sums by region
weighted_pm_sums = inmap_with_region.groupby('transgrp')[[f'{col}_times_area' for col in pm_cols]].sum()
area_sums = inmap_with_region.groupby('transgrp')['area_km2'].sum()
area_weighted_means = weighted_pm_sums.div(area_sums, axis=0)
# Restore the plain pollutant names on the averaged columns.
area_weighted_means.columns = [col.replace('_times_area', '') for col in area_weighted_means.columns]

# Death columns are summed (not averaged) within each region.
death_sums = inmap_with_region.groupby('transgrp')[death_cols].sum()

# Merge PM means + death sums (both are indexed by 'transgrp')
combined_data = pd.concat([area_weighted_means, death_sums], axis=1)

# One geometry per region: the union of the grid cells assigned to it.
geometry_gdf = inmap_with_region[['transgrp', 'geometry']].dissolve(by='transgrp', as_index=True)

# Merge geometry with combined data (index-aligned on 'transgrp')
combined_gdf = geometry_gdf.join(combined_data)

# Reset index and rename 'transgrp' to 'cambium_gea'
combined_gdf = combined_gdf.reset_index().rename(columns={'transgrp': 'cambium_gea'})

# Final GeoDataFrame
combined_gdf = gpd.GeoDataFrame(combined_gdf, geometry='geometry')

In [None]:
import matplotlib.pyplot as plt

# Order the regions from the highest to the lowest TotalPM25 value.
final_gdf_sorted = (
    combined_gdf
    .sort_values(by='TotalPM25', ascending=False)
    .reset_index(drop=True)
)

# Mortality columns (kept for reference) and the PM2.5 components that are stacked.
target_cols = ['AsianD', 'BlackD', 'LatinoD', 'NativeD', 'WhitNoLatD', 'TotalPopD']
pollutants = ['PrimPM25','PNH4','PNO3', 'PSO4',  'SOA']

# Single wide figure; ax1 carries the stacked bars.
fig, ax1 = plt.subplots(figsize=(14, 6))

# Stacked bars: one bar per region, one segment per PM2.5 component.
poll_data = final_gdf_sorted.set_index('cambium_gea')[pollutants]
poll_data.plot(kind='bar', stacked=True, ax=ax1, cmap='tab10')

ax1.set(
    ylabel='PM2.5 Difference (ug m-3)',
    xlabel='Region (cambium_gea)',
    title='PM2.5 Difference and Premature Mortality by Region (Sorted by TotalPM25)',
)
ax1.tick_params(axis='x', rotation=45)

# Secondary y-axis: TotalPopD drawn as a line over the same regions.
ax2 = ax1.twinx()
ax2.plot(
    final_gdf_sorted['cambium_gea'],
    final_gdf_sorted['TotalPopD'],
    color='black',
    marker='o',
    label='TotalPopD',
)
ax2.set_ylabel('Premature Mortality', color='black')
ax2.tick_params(axis='y', labelcolor='black')

# One combined legend (bars + line), placed outside the plotting area.
bars_labels = ax1.get_legend_handles_labels()
pop_label = ax2.get_legend_handles_labels()
bar_handles, bar_names = bars_labels
line_handles, line_names = pop_label
ax1.legend(
    bar_handles + line_handles,
    bar_names + line_names,
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
)

plt.tight_layout()
plt.show()


In [None]:
# Map of the regional results, coloured by the TotalPM25 column.
combined_gdf.plot(column='TotalPM25', legend=True)

In [None]:
# Map of gdf_diff, coloured by the TotalPM25 column.
gdf_diff.plot(column='TotalPM25', legend=True)

In [None]:
# Map of the grid cells retained by the spatial join, coloured by TotalPM25.
inmap_with_region.plot(column='TotalPM25', legend=True)

## US-wide data center scenarios

This is a preliminary analysis to understand the impact of building a 300 MW data center in a region. The regions follow the Cambium definition, which uses a total of 18 regions over the contiguous United States (CONUS).

In [None]:
import geopandas as gpd
import numpy as np

# Read the census shapefile and preview the first rows.
census_shapefile = "/Users/yunhalee/Documents/LOCAETA/NEI_emissions/NEI_2020_gaftp_Jun2024/emiss_shp2020/Census/ACS_2020_5YR_BG_pop_hu.shp"
census_data = gpd.read_file(census_shapefile)
print("census data", census_data.head())

# Only the geometry and the POP2020 column are used downstream.
census_columns = ['geometry', 'POP2020']
census_data = census_data[census_columns]

In [None]:
# Read the region polygons and preview the first rows.
regions_shapefile = "/Users/yunhalee/Documents/LOCAETA/eGRID_emissions/gea_mappings_and_shapefiles/transgrp/transgrp.shp"
regions = gpd.read_file(regions_shapefile)
print("regions", regions.head())

# Keep just the polygon and the region identifier.
region_columns = ['geometry', 'transgrp']
regions = regions[region_columns]

In [None]:
import pandas as pd

# Read the emissions table for the 300 MW scenario and preview the first rows.
emissions_csv = "/Users/yunhalee/Documents/LOCAETA/eGRID_emissions/300MW_national_current_2020.csv"
emis = pd.read_csv(emissions_csv)
print("emis", emis.head())

In [None]:

# Bring the census polygons into the CRS of the regions before joining.
census_data = census_data.to_crs(regions.crs)

# Attach a region to every census polygon that lies entirely within one
# (inner join: polygons not within any region are dropped).
census_with_region = gpd.sjoin(census_data, regions, how="inner", predicate='within')

# Total POP2020 per region, as a two-column table.
pop_by_region = (
    census_with_region
    .groupby('transgrp')['POP2020']
    .sum()
    .reset_index()
)

# Left-merge the totals onto the region polygons, then expose the region id
# under the name used by the emissions table ('cambium_gea').
regions_with_pop = (
    regions
    .merge(pop_by_region, on='transgrp', how='left')
    .rename(columns={'transgrp': 'cambium_gea'})
)
regions_with_pop

In [None]:
# Emission columns to aggregate: every pollutant for each of the three
# variants (base, final, dif_final), in that order.
emission_species = ['NOx', 'SO2', 'PM2.5', 'NH3', 'VOC']
emission_variants = ['base', 'final', 'dif_final']
emissions_columns = [
    f'{species}_tons_{variant}'
    for variant in emission_variants
    for species in emission_species
]

# Sum the emissions within each cambium_gea region.
emis_sum_by_region = (
    emis
    .groupby('cambium_gea')[emissions_columns]
    .sum()
    .reset_index()
)

# Attach the regional emission totals to the region polygons / population
# (default inner merge on the shared 'cambium_gea' key).
final_gdf = regions_with_pop.merge(emis_sum_by_region, on='cambium_gea')

In [None]:
import matplotlib.pyplot as plt

# Regions ordered from the most to the least populated.
final_gdf_sorted = final_gdf.sort_values(by='POP2020', ascending=False)

# Every column holding an emissions difference.
emissions_diff_cols = [name for name in final_gdf_sorted.columns if 'tons_dif_final' in name]

# Single wide figure; ax1 carries the stacked bars.
fig, ax1 = plt.subplots(figsize=(14, 6))

# Stacked bars: one bar per region, one segment per pollutant.
emissions_data = final_gdf_sorted.set_index('cambium_gea')[emissions_diff_cols]
emissions_data.plot(kind='bar', stacked=True, ax=ax1, cmap='tab10')

ax1.set(
    ylabel='Emissions Difference (tons)',
    xlabel='Region (cambium_gea)',
    title='Emissions Difference and Population by Region (Sorted by POP2020)',
)
ax1.tick_params(axis='x', rotation=45)

# Secondary y-axis: population drawn as a line over the same regions.
ax2 = ax1.twinx()
ax2.plot(
    final_gdf_sorted['cambium_gea'],
    final_gdf_sorted['POP2020'],
    color='black',
    marker='o',
    label='POP2020',
)
ax2.set_ylabel('Population (2020)', color='black')
ax2.tick_params(axis='y', labelcolor='black')

# One combined legend (bars + line), placed outside the plotting area.
bars_labels = ax1.get_legend_handles_labels()
pop_label = ax2.get_legend_handles_labels()
bar_handles, bar_names = bars_labels
line_handles, line_names = pop_label
ax1.legend(
    bar_handles + line_handles,
    bar_names + line_names,
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
)

plt.tight_layout()
plt.show()



In [None]:
import matplotlib.pyplot as plt

# Step 1: Sort final_gdf by population once (largest first, fresh 0..n-1 index)
final_gdf_sorted = final_gdf.sort_values(by='POP2020', ascending=False).reset_index(drop=True)

# Step 2: Identify emissions difference columns
emissions_diff_cols = [col for col in final_gdf_sorted.columns if 'tons_dif_final' in col]

# Step 3: One figure per pollutant: bars = emission difference per region,
# each bar annotated with its rank, population overlaid on a second y-axis.
for col in emissions_diff_cols:
    fig, ax1 = plt.subplots(figsize=(12, 5))

    x = final_gdf_sorted['cambium_gea']
    y = final_gdf_sorted[col]

    # Bar plot
    bars = ax1.bar(x, y, color='skyblue')
    ax1.set_ylabel(f'{col} (tons)', color='skyblue')
    ax1.set_xlabel('Region (cambium_gea)')
    ax1.set_title(f'{col} and Population by Region (Sorted by Population)')
    ax1.tick_params(axis='x', rotation=90)

    # Step 4: Compute emission rankings
    emission_ranks = y.rank(ascending=False, method='min')  # smaller rank = higher emission

    # Vertical gap between a bar top and its label: 1% of the largest value.
    # It is the same for every bar, so compute it once per figure instead of
    # once per bar (missing values count as 0).
    label_offset = 0.01 * y.fillna(0).max()

    # Step 5: Label each bar with its emission rank
    # (bars and ranks are both in the row order of final_gdf_sorted).
    for bar, rank in zip(bars, emission_ranks):
        height = bar.get_height()
        # Bars with a missing value get no label.
        if pd.notnull(height):
            ax1.text(
                bar.get_x() + bar.get_width() / 2,
                height + label_offset,
                f'{int(rank)}',
                ha='center', va='bottom', fontsize=9, color='blue'
            )

    # Step 6: Add population line on secondary y-axis
    ax2 = ax1.twinx()
    ax2.plot(x, final_gdf_sorted['POP2020'], color='black', marker='o', label='POP2020')
    ax2.set_ylabel('Population (2020)', color='black')
    ax2.tick_params(axis='y', labelcolor='black')

    # Step 7: Add legend
    ax2.legend(loc='upper right')

    plt.tight_layout()
    plt.show()
    # Release the figure so a long pollutant list does not accumulate open figures.
    plt.close(fig)


In [None]:
import pandas as pd

# Regions in the same order as the plots: most populated first, fresh 0..n-1 index.
final_gdf_sorted = final_gdf.sort_values(by='POP2020', ascending=False).reset_index(drop=True)

# Every column holding an emissions difference.
emissions_diff_cols = [name for name in final_gdf_sorted.columns if 'tons_dif_final' in name]

# Ranking table: the region identifier followed by one '<column>_rank' column
# per pollutant (1 = highest emission; ties share the lowest rank).
rankings_df = pd.DataFrame({
    'cambium_gea': final_gdf_sorted['cambium_gea'],
    **{
        name + '_rank': final_gdf_sorted[name].rank(ascending=False, method='min').astype(int)
        for name in emissions_diff_cols
    },
})

# Write the table to the current working directory, without the index column.
rankings_df.to_csv('emission_rankings_by_region.csv', index=False)

print("Emission rankings saved to 'emission_rankings_by_region.csv'")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Regions ordered from the most to the least populated, fresh 0..n-1 index.
final_gdf_sorted = final_gdf.sort_values(by='POP2020', ascending=False).reset_index(drop=True)

# Every column holding an emissions difference.
emissions_diff_cols = [name for name in final_gdf_sorted.columns if 'tons_dif_final' in name]

# Rank each pollutant column across regions (1 = highest emission; ties share
# the lowest rank), with the region identifier as the row index for plotting.
rankings_df = (
    final_gdf_sorted
    .set_index('cambium_gea')[emissions_diff_cols]
    .rank(ascending=False, method='min')
)

# Heatmap: rows = regions, columns = pollutants, colour = rank.
plt.figure(figsize=(14, 8))
heatmap_options = {
    'cmap': 'Reds_r',            # reversed Reds so low ranks (1,2) are dark red (high emphasis)
    'linewidths': 0.5,
    'linecolor': 'gray',
    'cbar_kws': {'label': 'Emission Rank (1=Highest)'},
    'annot': False,              # no numbers in cells, just colors
}
sns.heatmap(rankings_df, **heatmap_options)

plt.title('Emission Rankings Heatmap by Region and Pollutant\n(Regions sorted by Population)')
plt.ylabel('Region (cambium_gea)')
plt.xlabel('Pollutant Emission Differences')
plt.yticks(rotation=0)        # keep region labels horizontal
plt.xticks(rotation=45)       # rotate pollutant names for clarity
plt.tight_layout()
plt.show()
