In [4]:
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
from sklearn.preprocessing import StandardScaler

In [5]:
df = pd.read_csv("MasterDataset.csv")

df = df.drop(columns=['snow','snwd']) # drop snow and snwd

df["month"] = pd.to_datetime(df["month"], format="%Y-%m").dt.to_period("M")

df["year"] = df["month"].dt.year
df["month_num"] = df["month"].dt.month

# fill dates from Jan 2018 and Jul 2025
states = sorted(df["state"].unique())
all_months = pd.period_range("2018-01", "2025-07", freq="M") 

base_panel = (
    pd.MultiIndex.from_product([states, all_months], names=["state", "month"])
    .to_frame(index=False)
)

# mering
full = base_panel.merge(
    df,
    on=["state", "month"],
    how="left",
    indicator=True
)

# 6. Mark which rows were originally present
full["observed"] = full["_merge"].eq("both")
full.drop(columns="_merge", inplace=True)


#sort
full = full.sort_values(["state", "month"]).reset_index(drop=True)

full.to_csv("MasterDataset_2018_2025_fullpanel.csv", index=False)
print(full.shape)
print(full.head())


(4550, 37)
     state    month     vmt      ndvi  flights   co_mean  co_max1_value  \
0  Alabama  2018-01  5101.0  0.552518   6819.0  0.270312       0.527258   
1  Alabama  2018-02  4975.0  0.541536   6417.0  0.219343       0.402143   
2  Alabama  2018-03  5952.0  0.565882   7507.0  0.226716       0.407704   
3  Alabama  2018-04  6145.0  0.679999   7541.0  0.214604       0.392311   
4  Alabama  2018-05  6253.0  0.761965   8002.0  0.302609       0.490366   

   co_max1_hour    co_aqi   no2_mean  ...  o3_max1_hour     o3_aqi      awnd  \
0     10.153226  5.209677  12.320599  ...     11.854839  30.725806  3.119355   
1     10.383929  3.946429   8.593592  ...     21.219246  22.709325  3.514286   
2      8.952151  3.747312   9.844573  ...     10.468993  40.165997  3.558065   
3      8.936111  3.705556   8.937356  ...     10.388948  45.017963  3.526667   
4      9.212366  4.946237  11.115237  ...      9.495067  45.172110  2.141935   

    prcp       tavg       tmax       tmin  year  month_nu

In [6]:
# # Count how many months are actually populated per state-year in the full panel
# month_counts = (
#     full.groupby(["state", "year"])["month"]
#         .nunique()
#         .reset_index(name="n_months")
# )
full = full.sort_values(["state", "month"])

numeric_cols = full.select_dtypes(include=["float64", "int64"]).columns.tolist()
for drop_col in ["year", "month_num", "month"]:
    if drop_col in numeric_cols:
        numeric_cols.remove(drop_col)

# polynomial interpolation per state w degree 2
full[numeric_cols] = full.groupby("state")[numeric_cols].transform(
    lambda g: g.interpolate(method="polynomial", order=2)
)

full[numeric_cols] = full.groupby("state")[numeric_cols].transform(
    lambda g: g.ffill().bfill()
)

print(full[numeric_cols].isnull().sum().sum())

full.to_csv("MasterDataset_interpolated.csv", index=False)


1820


In [7]:
## to fill the 1820 (they're fully missing so interpolation won't work
#print(full.isna().sum())

# Groups of columns that still have NA
cols_no2 = ["no2_mean", "no2_max1_value", "no2_max1_hour", "no2_aqi"]
cols_pm10 = ["pm10_mean", "pm10_max1_value", "pm10_max1_hour", "pm10_aqi"]
cols_tavg = ["tavg"]

# For each month, fill remaining NAs with monthly mean
for cols in [cols_no2, cols_pm10, cols_tavg]:
    for col in cols:
        full[col] = full.groupby("month")[col].transform(
            lambda s: s.fillna(s.mean())
        )

print("Total remaining NAs:", full.isna().sum().sum())
print(full[cols_no2 + cols_pm10 + cols_tavg].isna().sum())

#
full.to_csv("MasterDataset_interpolated_final.csv", index=False)

Total remaining NAs: 0
no2_mean           0
no2_max1_value     0
no2_max1_hour      0
no2_aqi            0
pm10_mean          0
pm10_max1_value    0
pm10_max1_hour     0
pm10_aqi           0
tavg               0
dtype: int64


In [11]:
df = full.copy()

df["month"] = df["month"].astype("period[M]")

# Sort
df = df.sort_values(["state", "month"]).reset_index(drop=True)

# Target
target_col = "pm25_mean"   

continuous_cols = [
    'vmt','ndvi','flights',
    'co_mean','co_max1_value','co_max1_hour','co_aqi',
    'no2_mean','no2_max1_value','no2_max1_hour','no2_aqi',
    'pm25_max1_value','pm25_max1_hour','pm25_aqi',
    'pm10_mean','pm10_max1_value','pm10_max1_hour','pm10_aqi',
    'so2_mean','so2_max1_value','so2_max1_hour','so2_aqi',
    'o3_mean','o3_max1_value','o3_max1_hour','o3_aqi',
    'awnd','prcp','tavg','tmax','tmin'
] 
feature_cols = continuous_cols + ['month_num']

print("len(feature_cols):", len(feature_cols))
print(feature_cols)

train_end = pd.Period("2021-12", freq="M")
val_end   = pd.Period("2023-12", freq="M")

train_df = df[df["month"] <= train_end]
val_df   = df[(df["month"] > train_end) & (df["month"] <= val_end)]
test_df  = df[df["month"] > val_end]

print("Train:", train_df["month"].min(), "->", train_df["month"].max())
print("Val:  ", val_df["month"].min(),   "->", val_df["month"].max())
print("Test: ", test_df["month"].min(),  "->", test_df["month"].max())

# scale features
scaler = StandardScaler().fit(train_df[feature_cols])

def apply_scaler(local_df):
    d = local_df.copy()
    d[feature_cols] = scaler.transform(d[feature_cols].values)
    return d

train_df_s = apply_scaler(train_df)
val_df_s   = apply_scaler(val_df)
test_df_s  = apply_scaler(test_df)


len(feature_cols): 32
['vmt', 'ndvi', 'flights', 'co_mean', 'co_max1_value', 'co_max1_hour', 'co_aqi', 'no2_mean', 'no2_max1_value', 'no2_max1_hour', 'no2_aqi', 'pm25_max1_value', 'pm25_max1_hour', 'pm25_aqi', 'pm10_mean', 'pm10_max1_value', 'pm10_max1_hour', 'pm10_aqi', 'so2_mean', 'so2_max1_value', 'so2_max1_hour', 'so2_aqi', 'o3_mean', 'o3_max1_value', 'o3_max1_hour', 'o3_aqi', 'awnd', 'prcp', 'tavg', 'tmax', 'tmin', 'month_num']
Train: 2018-01 -> 2021-12
Val:   2022-01 -> 2023-12
Test:  2024-01 -> 2025-07




In [9]:
seq_len = 12
def build_sequences(df, seq_len, feature_cols, target_col):
    sequences_X, sequences_y = [], []
    
    for state in df['state'].unique():
        state_data = df[df['state'] == state].sort_values('month')
        state_features = state_data[feature_cols].values
        state_target = state_data[target_col].values
        
        for i in range(len(state_data) - seq_len):
            sequences_X.append(state_features[i:(i + seq_len)])
            sequences_y.append(state_target[i + seq_len])
    
    return np.array(sequences_X), np.array(sequences_y)

train_X, train_y = build_sequences(train_df_s, seq_len, feature_cols, target_col)
val_X,   val_y   = build_sequences(val_df_s,   seq_len, feature_cols, target_col)
test_X,  test_y  = build_sequences(test_df_s,  seq_len, feature_cols, target_col)

print("train_X:", train_X.shape)
print("train_y:", train_y.shape)
print("val_X:",   val_X.shape)
print("val_y:",   val_y.shape)
print("test_X:",  test_X.shape)
print("test_y:",  test_y.shape)

train_X: (1800, 12, 32)
train_y: (1800,)
val_X: (600, 12, 32)
val_y: (600,)
test_X: (350, 12, 32)
test_y: (350,)


In [10]:
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers

n_timesteps = train_X.shape[1]
n_features  = train_X.shape[2]

l2_reg = regularizers.l2(1e-4) #regularisation

model = models.Sequential([
    layers.Input(shape=(n_timesteps, n_features)),
    layers.LSTM(64, return_sequences=True,kernel_regularizer=l2_reg, recurrent_regularizer=l2_reg),
    layers.Dropout(0.2),
    layers.LSTM(32,return_sequences=False,kernel_regularizer=l2_reg,recurrent_regularizer=l2_reg),
    layers.Dropout(0.2),
    layers.Dense(32, activation="relu",kernel_regularizer=l2_reg),
    layers.Dense(32, activation="relu",kernel_regularizer=l2_reg),
    layers.Dense(16,activation="relu"),
    layers.Dense(1) 
])

model.compile(
    loss="mse",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=["mae"]
)

model.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 12, 64)            24832     
                                                                 
 dropout (Dropout)           (None, 12, 64)            0         
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense (Dense)               (None, 32)                1056      
                                                                 
 dense_1 (Dense)             (None, 32)                1056      
                                                                 
 dense_2 (Dense)             (None, 16)                5

In [12]:
## training

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True
)

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=7,
    min_lr=1e-5
)

history = model.fit(
    train_X, train_y,
    validation_data=(val_X, val_y),
    epochs=150,
    batch_size=32,
    callbacks=[early_stop,reduce_lr],
    verbose=1
)


Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
from sklearn.metrics import mean_squared_error
from collections import Counter

def build_lstm_model(n_timesteps, n_features, l2_lambda=1e-4):
    l2_reg = regularizers.l2(l2_lambda)
    model = models.Sequential([
        layers.Input(shape=(n_timesteps, n_features)),
        layers.LSTM(64, return_sequences=True,
                    kernel_regularizer=l2_reg,
                    recurrent_regularizer=l2_reg),
        layers.Dropout(0.2),
        layers.LSTM(32, return_sequences=False,
                    kernel_regularizer=l2_reg,
                    recurrent_regularizer=l2_reg),
        layers.Dropout(0.2),
        layers.Dense(32, activation="relu", kernel_regularizer=l2_reg),
        layers.Dense(32, activation="relu", kernel_regularizer=l2_reg),
        layers.Dense(16, activation="relu"),
        layers.Dense(1)
    ])

    model.compile(
        loss="mse",
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        metrics=["mae"]
    )
    return model

def permutation_importance_lstm(model, X_val, y_val, feature_cols):
    """
    Returns dict: {feature_name: ΔMSE} for validation set.
    """
    # Baseline performance
    y_pred = model.predict(X_val, verbose=0)
    baseline_mse = mean_squared_error(y_val, y_pred)

    importances = {}

    for j, fname in enumerate(feature_cols):
        X_perm = X_val.copy()
        for t in range(X_perm.shape[1]):
            np.random.shuffle(X_perm[:, t, j])

        y_pred_perm = model.predict(X_perm, verbose=0)
        mse_perm = mean_squared_error(y_val, y_pred_perm)

        importances[fname] = mse_perm - baseline_mse

    return importances, baseline_mse

top_k = 5  
feature_counter = Counter()
all_target_importances = {}  #

for target_col in feature_cols:
    print("\n" + "="*70)
    print(f"Training LSTM with target: {target_col}")
    print("="*70)

    # Build sequences for this target
    train_X, train_y = build_sequences(train_df_s, seq_len, feature_cols, target_col)
    val_X,   val_y   = build_sequences(val_df_s,   seq_len, feature_cols, target_col)

    # Skip if not enough data
    if len(train_X) == 0 or len(val_X) == 0:
        print(f"  [SKIP] Not enough sequences for target {target_col}")
        continue

    n_timesteps = train_X.shape[1]
    n_features  = train_X.shape[2]

    # Build and train model
    model = build_lstm_model(n_timesteps, n_features, l2_lambda=1e-4)

    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=5,
        restore_best_weights=True
    )
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=7,
        min_lr=1e-5
    )

    history = model.fit(
        train_X, train_y,
        validation_data=(val_X, val_y),
        epochs=150,
        batch_size=32,
        callbacks=[early_stop, reduce_lr],
        verbose=0  
    )

    # Permutation importance for this target
    importances, baseline_mse = permutation_importance_lstm(
        model, val_X, val_y, feature_cols
    )
    all_target_importances[target_col] = importances

    # Sort features by MSE (descending)
    sorted_imps = sorted(importances.items(), key=lambda x: x[1], reverse=True)

    # exclude the target itself from the ranking:
    sorted_imps = [item for item in sorted_imps if item[0] != target_col]

    # Take top-k important features for this target
    top_feats_for_target = [f for f, imp in sorted_imps[:top_k]]

    print(f"  Top {top_k} features for target {target_col}:")
    for f, imp in sorted_imps[:top_k]:
        print(f"    {f}: ΔMSE = {imp:.4f}")

    feature_counter.update(top_feats_for_target)

summary_df = pd.DataFrame({
    "feature": list(feature_counter.keys()),
    "count_in_topk": list(feature_counter.values())
}).sort_values("count_in_topk", ascending=False).reset_index(drop=True)

print("\n=== Features most frequently appearing in top-{} across all targets ===".format(top_k))
print(summary_df)

summary_df.to_csv("lstm_permutation_global_importance_topk.csv", index=False)





Training LSTM with target: vmt




  Top 5 features for target vmt:
    no2_max1_value: ΔMSE = 0.0081
    o3_max1_value: ΔMSE = 0.0066
    o3_max1_hour: ΔMSE = 0.0038
    pm25_aqi: ΔMSE = 0.0037
    pm25_max1_value: ΔMSE = 0.0033

Training LSTM with target: ndvi




  Top 5 features for target ndvi:
    month_num: ΔMSE = 0.0909
    tavg: ΔMSE = 0.0216
    pm25_aqi: ΔMSE = 0.0188
    tmin: ΔMSE = 0.0163
    tmax: ΔMSE = 0.0137

Training LSTM with target: flights




  Top 5 features for target flights:
    vmt: ΔMSE = 0.1540
    o3_aqi: ΔMSE = 0.0128
    awnd: ΔMSE = 0.0110
    pm10_max1_hour: ΔMSE = 0.0085
    pm10_mean: ΔMSE = 0.0073

Training LSTM with target: co_mean




  Top 5 features for target co_mean:
    co_aqi: ΔMSE = 0.0659
    ndvi: ΔMSE = 0.0646
    co_max1_value: ΔMSE = 0.0553
    flights: ΔMSE = 0.0443
    so2_max1_hour: ΔMSE = 0.0326

Training LSTM with target: co_max1_value




  Top 5 features for target co_max1_value:
    co_aqi: ΔMSE = 0.1054
    month_num: ΔMSE = 0.0431
    ndvi: ΔMSE = 0.0218
    co_mean: ΔMSE = 0.0216
    pm10_max1_value: ΔMSE = 0.0190

Training LSTM with target: co_max1_hour




  Top 5 features for target co_max1_hour:
    month_num: ΔMSE = 0.0462
    no2_mean: ΔMSE = 0.0371
    ndvi: ΔMSE = 0.0241
    so2_mean: ΔMSE = 0.0204
    vmt: ΔMSE = 0.0194

Training LSTM with target: co_aqi




  Top 5 features for target co_aqi:
    co_mean: ΔMSE = 0.0492
    co_max1_value: ΔMSE = 0.0360
    month_num: ΔMSE = 0.0204
    ndvi: ΔMSE = 0.0203
    awnd: ΔMSE = 0.0142

Training LSTM with target: no2_mean




  Top 5 features for target no2_mean:
    month_num: ΔMSE = 0.0391
    no2_aqi: ΔMSE = 0.0326
    no2_max1_value: ΔMSE = 0.0249
    tavg: ΔMSE = 0.0147
    pm25_max1_hour: ΔMSE = 0.0114

Training LSTM with target: no2_max1_value




  Top 5 features for target no2_max1_value:
    month_num: ΔMSE = 0.0727
    no2_aqi: ΔMSE = 0.0637
    no2_mean: ΔMSE = 0.0610
    pm25_max1_hour: ΔMSE = 0.0114
    tmax: ΔMSE = 0.0095

Training LSTM with target: no2_max1_hour




  Top 5 features for target no2_max1_hour:
    ndvi: ΔMSE = 0.0176
    month_num: ΔMSE = 0.0163
    flights: ΔMSE = 0.0104
    tavg: ΔMSE = 0.0102
    o3_aqi: ΔMSE = 0.0102

Training LSTM with target: no2_aqi




  Top 5 features for target no2_aqi:
    no2_max1_value: ΔMSE = 0.0798
    month_num: ΔMSE = 0.0593
    no2_mean: ΔMSE = 0.0247
    o3_aqi: ΔMSE = 0.0176
    pm10_aqi: ΔMSE = 0.0146

Training LSTM with target: pm25_max1_value




  Top 5 features for target pm25_max1_value:
    o3_mean: ΔMSE = 0.0298
    pm25_max1_hour: ΔMSE = 0.0172
    co_max1_hour: ΔMSE = 0.0137
    o3_max1_value: ΔMSE = 0.0084
    pm25_aqi: ΔMSE = 0.0051

Training LSTM with target: pm25_max1_hour




  Top 5 features for target pm25_max1_hour:
    awnd: ΔMSE = 0.0098
    so2_max1_value: ΔMSE = 0.0064
    so2_mean: ΔMSE = 0.0060
    prcp: ΔMSE = 0.0057
    no2_max1_value: ΔMSE = 0.0050

Training LSTM with target: pm25_aqi




  Top 5 features for target pm25_aqi:
    flights: ΔMSE = 0.0835
    o3_aqi: ΔMSE = 0.0613
    ndvi: ΔMSE = 0.0424
    o3_max1_value: ΔMSE = 0.0324
    vmt: ΔMSE = 0.0316

Training LSTM with target: pm10_mean




  Top 5 features for target pm10_mean:
    o3_mean: ΔMSE = 0.0540
    o3_max1_value: ΔMSE = 0.0245
    month_num: ΔMSE = 0.0210
    vmt: ΔMSE = 0.0150
    o3_aqi: ΔMSE = 0.0128

Training LSTM with target: pm10_max1_value




  Top 5 features for target pm10_max1_value:
    month_num: ΔMSE = 0.0471
    o3_mean: ΔMSE = 0.0457
    o3_aqi: ΔMSE = 0.0269
    no2_aqi: ΔMSE = 0.0106
    o3_max1_value: ΔMSE = 0.0102

Training LSTM with target: pm10_max1_hour




  Top 5 features for target pm10_max1_hour:
    ndvi: ΔMSE = 0.0065
    pm25_aqi: ΔMSE = 0.0062
    pm25_max1_value: ΔMSE = 0.0053
    no2_max1_value: ΔMSE = 0.0038
    o3_max1_value: ΔMSE = 0.0030

Training LSTM with target: pm10_aqi




  Top 5 features for target pm10_aqi:
    month_num: ΔMSE = 0.0602
    o3_aqi: ΔMSE = 0.0179
    flights: ΔMSE = 0.0155
    o3_mean: ΔMSE = 0.0154
    o3_max1_value: ΔMSE = 0.0141

Training LSTM with target: so2_mean




  Top 5 features for target so2_mean:
    pm10_aqi: ΔMSE = 0.0363
    pm10_mean: ΔMSE = 0.0301
    ndvi: ΔMSE = 0.0210
    o3_mean: ΔMSE = 0.0200
    o3_max1_hour: ΔMSE = 0.0141

Training LSTM with target: so2_max1_value




  Top 5 features for target so2_max1_value:
    so2_mean: ΔMSE = 0.7076
    so2_aqi: ΔMSE = 0.0878
    o3_aqi: ΔMSE = 0.0383
    pm10_max1_hour: ΔMSE = 0.0175
    month_num: ΔMSE = 0.0150

Training LSTM with target: so2_max1_hour
  Top 5 features for target so2_max1_hour:
    pm25_max1_hour: ΔMSE = 0.0147
    pm10_max1_hour: ΔMSE = 0.0129
    o3_mean: ΔMSE = 0.0113
    o3_max1_value: ΔMSE = 0.0094
    awnd: ΔMSE = 0.0088

Training LSTM with target: so2_aqi




  Top 5 features for target so2_aqi:
    so2_mean: ΔMSE = 0.0234
    pm10_aqi: ΔMSE = 0.0190
    flights: ΔMSE = 0.0143
    co_aqi: ΔMSE = 0.0116
    o3_max1_value: ΔMSE = 0.0108

Training LSTM with target: o3_mean




  Top 5 features for target o3_mean:
    month_num: ΔMSE = 0.0752
    o3_max1_hour: ΔMSE = 0.0399
    ndvi: ΔMSE = 0.0301
    o3_max1_value: ΔMSE = 0.0179
    co_mean: ΔMSE = 0.0147

Training LSTM with target: o3_max1_value




  Top 5 features for target o3_max1_value:
    o3_max1_hour: ΔMSE = 0.0893
    month_num: ΔMSE = 0.0631
    o3_aqi: ΔMSE = 0.0262
    ndvi: ΔMSE = 0.0248
    o3_mean: ΔMSE = 0.0205

Training LSTM with target: o3_max1_hour




  Top 5 features for target o3_max1_hour:
    month_num: ΔMSE = 0.1284
    vmt: ΔMSE = 0.0613
    pm10_max1_value: ΔMSE = 0.0460
    co_max1_hour: ΔMSE = 0.0251
    pm10_mean: ΔMSE = 0.0189

Training LSTM with target: o3_aqi




  Top 5 features for target o3_aqi:
    month_num: ΔMSE = 0.1105
    o3_max1_hour: ΔMSE = 0.0659
    ndvi: ΔMSE = 0.0359
    tavg: ΔMSE = 0.0308
    pm10_aqi: ΔMSE = 0.0196

Training LSTM with target: awnd




  Top 5 features for target awnd:
    month_num: ΔMSE = 0.0449
    pm10_mean: ΔMSE = 0.0183
    flights: ΔMSE = 0.0139
    pm25_aqi: ΔMSE = 0.0133
    prcp: ΔMSE = 0.0106

Training LSTM with target: prcp




  Top 5 features for target prcp:
    pm25_aqi: ΔMSE = 0.0252
    ndvi: ΔMSE = 0.0209
    co_max1_hour: ΔMSE = 0.0169
    month_num: ΔMSE = 0.0110
    o3_max1_value: ΔMSE = 0.0073

Training LSTM with target: tavg




  Top 5 features for target tavg:
    month_num: ΔMSE = 0.1175
    tmax: ΔMSE = 0.0138
    no2_max1_hour: ΔMSE = 0.0121
    no2_aqi: ΔMSE = 0.0119
    o3_mean: ΔMSE = 0.0111

Training LSTM with target: tmax




  Top 5 features for target tmax:
    month_num: ΔMSE = 0.2211
    o3_mean: ΔMSE = 0.0145
    tavg: ΔMSE = 0.0107
    vmt: ΔMSE = 0.0103
    tmin: ΔMSE = 0.0093

Training LSTM with target: tmin




  Top 5 features for target tmin:
    month_num: ΔMSE = 0.1907
    o3_mean: ΔMSE = 0.0133
    tmax: ΔMSE = 0.0120
    tavg: ΔMSE = 0.0097
    vmt: ΔMSE = 0.0065

Training LSTM with target: month_num
  Top 5 features for target month_num:
    prcp: ΔMSE = 0.0140
    so2_max1_value: ΔMSE = 0.0094
    tavg: ΔMSE = 0.0071
    tmin: ΔMSE = 0.0039
    tmax: ΔMSE = 0.0033

=== Features most frequently appearing in top-5 across all targets ===
            feature  count_in_topk
0         month_num             21
1              ndvi             12
2     o3_max1_value             11
3           o3_mean             10
4            o3_aqi              9
5              tavg              7
6               vmt              7
7          pm25_aqi              6
8           flights              6
9    no2_max1_value              5
10     o3_max1_hour              5
11             tmax              5
12   pm25_max1_hour              4
13          no2_aqi              4
14         so2_mean              4


In [35]:
test_loss, test_mae = model.evaluate(test_X, test_y, verbose=0)
print("Test MSE:", test_loss)
print("Test MAE:", test_mae)

val_loss, val_mae = model.evaluate(val_X, val_y)
print("Validation MAE:", val_mae)

# Make predictions
y_pred = model.predict(val_X)

Test MSE: 2.5730667114257812
Test MAE: 1.1804025173187256
Validation MAE: 1.7542928457260132


In [3]:
!python --version

Python 3.8.20


In [15]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers

# Build AR dataset: pm25_mean + selected features

df_ar = full.copy()  
df_ar["month"] = df_ar["month"].dt.to_timestamp()
df_ar = df_ar.sort_values(["state", "month"]).reset_index(drop=True)

target_col = "pm25_mean"

# Time features
df_ar["month_num"] = df_ar["month"].dt.month
df_ar["month_sin"] = np.sin(2 * np.pi * df_ar["month_num"] / 12)
df_ar["month_cos"] = np.cos(2 * np.pi * df_ar["month_num"] / 12)

ar_feature_cols = ["pm25_mean", "month_sin", "month_cos"]
print("Autoregressive feature cols:", ar_feature_cols)

Autoregressive feature cols: ['pm25_mean', 'month_sin', 'month_cos']


In [16]:
train_end = pd.to_datetime("2021-12-01")
val_end   = pd.to_datetime("2023-12-01")

train_df_ar = df_ar[df_ar["month"] <= train_end].copy()
val_df_ar   = df_ar[(df_ar["month"] > train_end) & (df_ar["month"] <= val_end)].copy()
test_df_ar  = df_ar[df_ar["month"] > val_end].copy()

print("AR Train:", train_df_ar["month"].min(), "->", train_df_ar["month"].max())
print("AR Val:  ", val_df_ar["month"].min(),   "->", val_df_ar["month"].max())
print("AR Test: ", test_df_ar["month"].min(),  "->", test_df_ar["month"].max())


# Build AR sequences: past 12 steps -> next step

seq_len = 12

def build_ar_sequences(df, seq_len, feature_cols, target_col):
    X_list, y_list = [], []
    for state in df["state"].unique():
        g = df[df["state"] == state].sort_values("month")
        feat = g[feature_cols].values   
        targ = g[target_col].values 
        if len(g) <= seq_len:
            continue
        for i in range(len(g) - seq_len):
            X_list.append(feat[i:i+seq_len])
            y_list.append(targ[i+seq_len])
    return np.array(X_list), np.array(y_list)

train_X_ar, train_y_ar = build_ar_sequences(train_df_ar, seq_len, ar_feature_cols, target_col)
val_X_ar,   val_y_ar   = build_ar_sequences(val_df_ar,   seq_len, ar_feature_cols, target_col)
test_X_ar,  test_y_ar  = build_ar_sequences(test_df_ar,  seq_len, ar_feature_cols, target_col)

print("AR train:", train_X_ar.shape, train_y_ar.shape)
print("AR val:  ", val_X_ar.shape,   val_y_ar.shape)
print("AR test: ", test_X_ar.shape,  test_y_ar.shape)


AR Train: 2018-01-01 00:00:00 -> 2021-12-01 00:00:00
AR Val:   2022-01-01 00:00:00 -> 2023-12-01 00:00:00
AR Test:  2024-01-01 00:00:00 -> 2025-07-01 00:00:00
AR train: (1800, 12, 3) (1800,)
AR val:   (600, 12, 3) (600,)
AR test:  (350, 12, 3) (350,)


In [18]:
all_target_cols = [
    'vmt', 'ndvi', 'flights',
    'co_mean', 'co_max1_value', 'co_max1_hour', 'co_aqi',
    'no2_mean', 'no2_max1_value', 'no2_max1_hour', 'no2_aqi',
    'pm25_max1_value', 'pm25_max1_hour', 'pm25_aqi',
    'pm10_mean', 'pm10_max1_value', 'pm10_max1_hour', 'pm10_aqi',
    'so2_mean', 'so2_max1_value', 'so2_max1_hour', 'so2_aqi',
    'o3_mean', 'o3_max1_value', 'o3_max1_hour', 'o3_aqi',
    'awnd', 'prcp', 'tavg', 'tmax', 'tmin'
]

print(f"Total targets to predict: {len(all_target_cols)}")

Total targets to predict: 31


In [19]:
# AR LSTM FOR ALL FEATURES WITH TOP EXOGENOUS DRIVERS

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers

df_ar = full.copy()

# Ensure month is Timestamp
if isinstance(df_ar["month"].dtype, pd.PeriodDtype):
    df_ar["month"] = df_ar["month"].dt.to_timestamp()
else:
    df_ar["month"] = pd.to_datetime(df_ar["month"])

df_ar = df_ar.sort_values(["state", "month"]).reset_index(drop=True)

# Time features
df_ar["month_num"] = df_ar["month"].dt.month
df_ar["month_sin"] = np.sin(2 * np.pi * df_ar["month_num"] / 12)
df_ar["month_cos"] = np.cos(2 * np.pi * df_ar["month_num"] / 12)

# Targets to forecast
ar_targets = ["pm25_mean"] + all_target_cols
ar_targets = list(dict.fromkeys(ar_targets))  # remove duplicates if any

print("All targets to forecast:", ar_targets)

# Top global drivers from above LSTM
top_exog = ["month_num", "ndvi", "o3_max1_value", "o3_mean", "o3_aqi", "tavg"]

# Exogenous subset that actually needs forecasting (exclude purely time month_num)
top_exog_to_forecast = ["ndvi", "o3_max1_value", "o3_mean", "o3_aqi", "tavg"]

print("Top exogenous features (to be forecasted first):", top_exog_to_forecast)

train_end = pd.to_datetime("2021-12-01")
val_end   = pd.to_datetime("2023-12-01")

train_df = df_ar[df_ar["month"] <= train_end].copy()
val_df   = df_ar[(df_ar["month"] > train_end) & (df_ar["month"] <= val_end)].copy()
test_df  = df_ar[df_ar["month"] > val_end].copy()

print("Train period:", train_df["month"].min(), "->", train_df["month"].max())
print("Val period:  ", val_df["month"].min(),   "->", val_df["month"].max())
print("Test period: ", test_df["month"].min(),  "->", test_df["month"].max())

seq_len = 12  # past 12 months

def build_univariate_sequences(df, target_col, seq_len):
    """
    For univariate AR + time
    """
    feature_cols = [target_col, "month_sin", "month_cos", "month_num"]
    X_list, y_list = [], []

    for state, g in df.groupby("state"):
        g = g.sort_values("month")
        if len(g) <= seq_len:
            continue

        feat = g[feature_cols].values
        targ = g[target_col].values

        for i in range(len(g) - seq_len):
            X_list.append(feat[i:i+seq_len])
            y_list.append(targ[i+seq_len])

    X = np.array(X_list, dtype="float32")
    y = np.array(y_list, dtype="float32")
    return X, y, feature_cols


def build_multivar_sequences_for_target(df, target_col, seq_len, top_exog):
    """
    For target + time + top exogenous features.
    """
    base_feats = ["month_sin", "month_cos", "month_num"] + top_exog
    feature_cols = [target_col] + [f for f in base_feats if f != target_col]

    X_list, y_list = [], []

    for state, g in df.groupby("state"):
        g = g.sort_values("month")
        if len(g) <= seq_len:
            continue

        feat = g[feature_cols].values
        targ = g[target_col].values

        for i in range(len(g) - seq_len):
            X_list.append(feat[i:i+seq_len])
            y_list.append(targ[i+seq_len])

    X = np.array(X_list, dtype="float32")
    y = np.array(y_list, dtype="float32")
    return X, y, feature_cols

def build_lstm_model(n_timesteps, n_features, l2_lambda=1e-4):
    l2_reg = regularizers.l2(l2_lambda)
    model = models.Sequential([
        layers.Input(shape=(n_timesteps, n_features)),
        layers.LSTM(64, return_sequences=True,
                    kernel_regularizer=l2_reg,
                    recurrent_regularizer=l2_reg),
        layers.Dropout(0.2),
        layers.LSTM(32, return_sequences=False,
                    kernel_regularizer=l2_reg,
                    recurrent_regularizer=l2_reg),
        layers.Dropout(0.2),
        layers.Dense(32, activation="relu", kernel_regularizer=l2_reg),
        layers.Dense(16, activation="relu"),
        layers.Dense(1)
    ])

    model.compile(
        loss="mse",
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
        metrics=["mae"]
    )
    return model

def seed_windows(df_hist, feature_cols, seq_len):
    windows = {}
    for st, g in df_hist.sort_values(["state","month"]).groupby("state"):
        g_tail = g[feature_cols].tail(seq_len)
        Xw = g_tail.values.astype("float32")
        if len(g_tail) < seq_len:
            pad = np.repeat(Xw[:1], seq_len - len(g_tail), axis=0)
            Xw = np.vstack([pad, Xw])
        windows[st] = Xw
    return windows

def recursive_univariate_forecast(model, df_hist, future_calendar,
                                  target_col, seq_len, feature_cols):

    df_hist = df_hist.sort_values(["state","month"]).copy()
    windows = seed_windows(df_hist, feature_cols, seq_len)
    preds = []

    col_to_idx = {c: i for i, c in enumerate(feature_cols)}

    for st, g_fut in future_calendar.sort_values(["state","month"]).groupby("state"):
        if st not in windows:
            continue

        w = windows[st].copy().astype("float32")

        for _, row in g_fut.iterrows():
            x_input = w.reshape(1, seq_len, len(feature_cols)).astype("float32")
            y_hat = float(model.predict(x_input, verbose=0).ravel()[0])

            preds.append((st, row["month"], y_hat))

            new_feat = w[-1].copy()
            # update target
            new_feat[col_to_idx[target_col]] = y_hat
            # update time features
            new_feat[col_to_idx["month_num"]] = row["month_num"]
            new_feat[col_to_idx["month_sin"]] = row["month_sin"]
            new_feat[col_to_idx["month_cos"]] = row["month_cos"]

            w = np.vstack([w[1:], new_feat])

    out = pd.DataFrame(
        preds,
        columns=["state", "month", f"{target_col}_pred"]
    ).sort_values(["state", "month"]).reset_index(drop=True)
    return out


def recursive_multivar_forecast_for_target(model, df_hist, future_merge,
                                           target_col, seq_len, feature_cols, top_exog):
    """
      For future steps:
        - time features from future_merge
        - top_exog from their _pred columns in future_merge
        - target updated with model prediction
    """
    df_hist = df_hist.sort_values(["state","month"]).copy()
    windows = seed_windows(df_hist, feature_cols, seq_len)
    preds = []

    col_to_idx = {c: i for i, c in enumerate(feature_cols)}

    for st, g_fut in future_merge.sort_values(["state", "month"]).groupby("state"):
        if st not in windows:
            continue

        w = windows[st].copy().astype("float32")

        for _, row in g_fut.iterrows():
            x_input = w.reshape(1, seq_len, len(feature_cols)).astype("float32")
            y_hat = float(model.predict(x_input, verbose=0).ravel()[0])

            preds.append((st, row["month"], y_hat))

            new_feat = w[-1].copy()

            # Update target
            new_feat[col_to_idx[target_col]] = y_hat

            # Update time features
            if "month_num" in col_to_idx:
                new_feat[col_to_idx["month_num"]] = row["month_num"]
            if "month_sin" in col_to_idx:
                new_feat[col_to_idx["month_sin"]] = row["month_sin"]
            if "month_cos" in col_to_idx:
                new_feat[col_to_idx["month_cos"]] = row["month_cos"]

            # Update exogenous features from their predicted paths
            for ex in top_exog:
                if ex in col_to_idx:
                    col_pred = ex + "_pred"
                    if col_pred in row.index:
                        new_feat[col_to_idx[ex]] = row[col_pred]

            w = np.vstack([w[1:], new_feat])

    out = pd.DataFrame(
        preds,
        columns=["state", "month", f"{target_col}_pred"]
    ).sort_values(["state","month"]).reset_index(drop=True)
    return out

hist_cut     = pd.to_datetime("2025-07-01")   # last observed month
future_start = pd.to_datetime("2025-08-01")
future_end   = pd.to_datetime("2030-12-01")

future_idx = pd.MultiIndex.from_product(
    [df_ar["state"].unique(), pd.date_range(future_start, future_end, freq="MS")],
    names=["state","month"]
).to_frame(index=False)

future_idx["month_num"] = future_idx["month"].dt.month
future_idx["month_sin"] = np.sin(2 * np.pi * future_idx["month_num"] / 12)
future_idx["month_cos"] = np.cos(2 * np.pi * future_idx["month_num"] / 12)

exog_future = future_idx.copy()  # will accumulate top_exog predictions

for target_col in top_exog_to_forecast:
    print("\n" + "="*70)
    print(f"Stage 1: Univariate AR LSTM for exogenous target: {target_col}")
    print("="*70)

    if target_col not in df_ar.columns:
        print(f"  [SKIP] {target_col} not found in df_ar.columns")
        continue

    train_X, train_y, feat_cols_uni = build_univariate_sequences(train_df, target_col, seq_len)
    val_X,   val_y,   _            = build_univariate_sequences(val_df,   target_col, seq_len)
    test_X,  test_y,  _            = build_univariate_sequences(test_df,  target_col, seq_len)

    if len(train_X) == 0 or len(val_X) == 0 or len(test_X) == 0:
        print(f"  [SKIP] Not enough sequences for {target_col}")
        continue

    print(f"  Shapes: train{train_X.shape}, val{val_X.shape}, test{test_X.shape}")

    model = build_lstm_model(train_X.shape[1], train_X.shape[2], l2_lambda=1e-4)

    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=8, restore_best_weights=True
    )
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=5, min_lr=1e-5
    )

    history = model.fit(
        train_X, train_y,
        validation_data=(val_X, val_y),
        epochs=100,
        batch_size=32,
        callbacks=[early_stop, reduce_lr],
        verbose=1
    )

    test_loss, test_mae = model.evaluate(test_X, test_y, verbose=0)
    print(f"  {target_col} - Test MSE: {test_loss:.4f}, Test MAE: {test_mae:.4f}")

    df_hist_t = df_ar[df_ar["month"] <= hist_cut].copy()

    pred_df = recursive_univariate_forecast(
        model=model,
        df_hist=df_hist_t,
        future_calendar=future_idx,
        target_col=target_col,
        seq_len=seq_len,
        feature_cols=feat_cols_uni
    )

    # Merge predicted exogenous path
    exog_future = exog_future.merge(pred_df, on=["state","month"], how="left")

# Forecast remaining targets with top_exog as inputs

future_merge = exog_future.copy()

future_all = future_idx[["state","month"]].copy()

non_exog_targets = [t for t in ar_targets if t not in top_exog_to_forecast]

for target_col in non_exog_targets:
    print("\n" + "="*70)
    print(f"Stage 2: Multivariate AR LSTM for target: {target_col}")
    print("="*70)

    if target_col not in df_ar.columns:
        print(f"  [SKIP] {target_col} not found in df_ar.columns")
        continue

    train_X, train_y, feat_cols = build_multivar_sequences_for_target(
        train_df, target_col, seq_len, top_exog=top_exog_to_forecast
    )
    val_X,   val_y,   _        = build_multivar_sequences_for_target(
        val_df,   target_col, seq_len, top_exog=top_exog_to_forecast
    )
    test_X,  test_y,  _        = build_multivar_sequences_for_target(
        test_df,  target_col, seq_len, top_exog=top_exog_to_forecast
    )

    if len(train_X) == 0 or len(val_X) == 0 or len(test_X) == 0:
        print(f"  [SKIP] Not enough sequences for {target_col}")
        continue

    print(f"  Shapes: train{train_X.shape}, val{val_X.shape}, test{test_X.shape}")

    model = build_lstm_model(train_X.shape[1], train_X.shape[2], l2_lambda=1e-4)

    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=8, restore_best_weights=True
    )
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=5, min_lr=1e-5
    )

    history = model.fit(
        train_X, train_y,
        validation_data=(val_X, val_y),
        epochs=100,
        batch_size=32,
        callbacks=[early_stop, reduce_lr],
        verbose=1
    )

    test_loss, test_mae = model.evaluate(test_X, test_y, verbose=0)
    print(f"  {target_col} - Test MSE: {test_loss:.4f}, Test MAE: {test_mae:.4f}")

    # Historical data up to hist_cut for seeding
    df_hist_t = df_ar[df_ar["month"] <= hist_cut].copy()

    pred_df = recursive_multivar_forecast_for_target(
        model=model,
        df_hist=df_hist_t,
        future_merge=future_merge,
        target_col=target_col,
        seq_len=seq_len,
        feature_cols=feat_cols,
        top_exog=top_exog_to_forecast
    )

    future_all = future_all.merge(pred_df, on=["state","month"], how="left")

exog_pred_cols = [f"{c}_pred" for c in top_exog_to_forecast]
future_all = future_all.merge(
    exog_future[["state","month"] + exog_pred_cols],
    on=["state","month"],
    how="left"
)

future_all["year"]      = future_all["month"].dt.year
future_all["month_num"] = future_all["month"].dt.month
future_all["month"]     = future_all["month"].dt.to_period("M").astype(str)

pred_cols = [c for c in future_all.columns if c.endswith("_pred")]
cols_order = ["state", "year", "month"] + pred_cols
future_all = future_all[cols_order]

output_path = "all_features_future_AR_with_top_exog_2025-08_to_2030-12.csv"
future_all.to_csv(output_path, index=False)

print(f"\nSaved forecasts for ALL targets (with top exogenous features) to: {output_path}")
print(future_all.head())




All targets to forecast: ['pm25_mean', 'vmt', 'ndvi', 'flights', 'co_mean', 'co_max1_value', 'co_max1_hour', 'co_aqi', 'no2_mean', 'no2_max1_value', 'no2_max1_hour', 'no2_aqi', 'pm25_max1_value', 'pm25_max1_hour', 'pm25_aqi', 'pm10_mean', 'pm10_max1_value', 'pm10_max1_hour', 'pm10_aqi', 'so2_mean', 'so2_max1_value', 'so2_max1_hour', 'so2_aqi', 'o3_mean', 'o3_max1_value', 'o3_max1_hour', 'o3_aqi', 'awnd', 'prcp', 'tavg', 'tmax', 'tmin']
Top exogenous features (to be forecasted first): ['ndvi', 'o3_max1_value', 'o3_mean', 'o3_aqi', 'tavg']
Train period: 2018-01-01 00:00:00 -> 2021-12-01 00:00:00
Val period:   2022-01-01 00:00:00 -> 2023-12-01 00:00:00
Test period:  2024-01-01 00:00:00 -> 2025-07-01 00:00:00

Stage 1: Univariate AR LSTM for exogenous target: ndvi
  Shapes: train(1800, 12, 4), val(600, 12, 4), test(350, 12, 4)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch




Stage 1: Univariate AR LSTM for exogenous target: o3_max1_value
  Shapes: train(1800, 12, 4), val(600, 12, 4), test(350, 12, 4)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/




Stage 1: Univariate AR LSTM for exogenous target: o3_mean
  Shapes: train(1800, 12, 4), val(600, 12, 4), test(350, 12, 4)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Ep




Stage 1: Univariate AR LSTM for exogenous target: o3_aqi
  Shapes: train(1800, 12, 4), val(600, 12, 4), test(350, 12, 4)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
  o3_aqi - Test MSE: 14.4815, Test MAE: 2.9861





Stage 1: Univariate AR LSTM for exogenous target: tavg
  Shapes: train(1800, 12, 4), val(600, 12, 4), test(350, 12, 4)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
  tavg - Test MSE: 5.5243, Test MAE: 1.6769





Stage 2: Multivariate AR LSTM for target: pm25_mean
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
  pm25_mean - Test MSE: 2.8838, Test MAE: 1.2959





Stage 2: Multivariate AR LSTM for target: vmt
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
  vmt - Test MSE: 1526210.2500, Test MAE: 717.2828





Stage 2: Multivariate AR LSTM for target: flights
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
  flights - Test MSE: 1215036288.0000, Test MAE: 23753.8086





Stage 2: Multivariate AR LSTM for target: co_mean
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
  co_mean - Test MSE: 0.0085, Test M




Stage 2: Multivariate AR LSTM for target: co_max1_value
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
  co_max1_value - Test MSE: 0.0171, Test MAE: 0.0878





Stage 2: Multivariate AR LSTM for target: co_max1_hour
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
  co_max1_hour - Test MSE: 18.1192, Test MAE: 3.0047





Stage 2: Multivariate AR LSTM for target: co_aqi
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
  co_aqi - Test MSE: 0.8638, Test MAE: 0.6542

Stage 2: Multivariate AR LSTM for target: no2_mean
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
  no2_mean - Test MSE: 2.2537, Test MAE: 1.1294





Stage 2: Multivariate AR LSTM for target: no2_max1_value
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
  no2_max1_value - Test MSE: 8.8074, Test MAE: 2.1116





Stage 2: Multivariate AR LSTM for target: no2_max1_hour
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
  no2_max1_hour - Test MSE: 1.2940, Test MAE: 0.7597





Stage 2: Multivariate AR LSTM for target: no2_aqi
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
  no2_aqi - Test MSE: 7.3033, Test MAE: 1.8710





Stage 2: Multivariate AR LSTM for target: pm25_max1_value
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
  pm25_max1_value - Test MSE: 11.6469, Test MAE: 2.0375





Stage 2: Multivariate AR LSTM for target: pm25_max1_hour
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
  pm25_max1_hour - Test MSE: 0.9118, Test MAE: 0.5600





Stage 2: Multivariate AR LSTM for target: pm25_aqi
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
  pm25_aqi - Test MSE: 36.7919, Test MAE: 4.5491





Stage 2: Multivariate AR LSTM for target: pm10_mean
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
  pm10_mean - Test MSE: 38.7209, Test MAE: 3.4502





Stage 2: Multivariate AR LSTM for target: pm10_max1_value
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
  pm10_max1_value - Test MSE: 34.5055, Test MAE: 3.3108





Stage 2: Multivariate AR LSTM for target: pm10_max1_hour
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
  pm10_max1_hour - Test MSE: 5.4284, Test MAE: 1.0418





Stage 2: Multivariate AR LSTM for target: pm10_aqi
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
  pm10_aqi - Test MSE: 18.6274, Test MAE: 2.8495





Stage 2: Multivariate AR LSTM for target: so2_mean
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
  so2_mean - Test MSE: 0.0773, Test MAE: 0.1607





Stage 2: Multivariate AR LSTM for target: so2_max1_value
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
  so2_max1_value - Test MSE: 0.5212, Test MAE: 0.4070





Stage 2: Multivariate AR LSTM for target: so2_max1_hour
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
  so2_max1_hour - Test MSE: 1.3856, Test MAE: 0.8139





Stage 2: Multivariate AR LSTM for target: so2_aqi
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
  so2_aqi - Test MSE: 1.1341, Test MAE: 0.6141





Stage 2: Multivariate AR LSTM for target: o3_max1_hour
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
  o3_max1_hour - Test MSE: 2.6646, Test MAE: 1.1507





Stage 2: Multivariate AR LSTM for target: awnd
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
  awnd - Test MSE: 0.3282, Test MAE: 0.4193





Stage 2: Multivariate AR LSTM for target: prcp
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
  prcp - Test MSE: 2439.8723, Test MAE: 35.2474





Stage 2: Multivariate AR LSTM for target: tmax
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
  tmax - Test MSE: 5.6078, Test MAE: 1.7347





Stage 2: Multivariate AR LSTM for target: tmin
  Shapes: train(1800, 12, 9), val(600, 12, 9), test(350, 12, 9)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
  tmin - Test MSE: 5.4713, Test MAE: 1.7010

Saved forecasts for ALL targets (with top exogenous features) to: all_features_future_AR_with_top_exog_2025-08_to_2030-12.csv
     state  year    month  pm25_mean_pred     vmt_pred  flights_pred  \
0  Alabama  2025  2025-08        8.909400  4911.245117  25708.355469   
1  Alabama  2025  2025-09        9.204036  4892.287598  25708.355469   
2  Alabama  2025  2025-10        8.782427  4924.093750  25708.355469   
3  Alabama  2025  2025-11        8.382743  4959.955566  25708.355469   
4  Alabama  2025  2025-12        8.275459  4774.865723

In [3]:
import pandas as pd

old_path = "MasterDataset_interpolated_final.csv"     
new_path = "all_features_future_AR_with_top_exog_2025-08_to_2030-12.csv"  

old_df = pd.read_csv(old_path)
new_df = pd.read_csv(new_path)

old_df["month"] = pd.to_datetime(old_df["month"])
new_df["month"] = pd.to_datetime(new_df["month"])

pred_cols = [c for c in new_df.columns if c.endswith("_pred")]
rename_map = {c: c.replace("_pred", "") for c in pred_cols}
new_df = new_df.rename(columns=rename_map)

for col in old_df.columns:
    if col not in new_df.columns:
        new_df[col] = np.nan

new_df = new_df[old_df.columns]

combined = pd.concat([old_df, new_df], ignore_index=True)

combined = combined.drop_duplicates(subset=["state", "month"], keep="first")

combined = combined.sort_values(["state", "month"]).reset_index(drop=True)
combined["month"] = combined["month"].dt.to_period("M").astype(str)

combined.to_csv("MasterDataset_AR.csv", index=False)
print("Saved MasterDataset_with_future_AR.csv")
print(combined.head())



Saved MasterDataset_with_future_AR.csv
     state    month     vmt      ndvi  flights   co_mean  co_max1_value  \
0  Alabama  2018-01  5101.0  0.552518   6819.0  0.270312       0.527258   
1  Alabama  2018-02  4975.0  0.541536   6417.0  0.219343       0.402143   
2  Alabama  2018-03  5952.0  0.565882   7507.0  0.226716       0.407704   
3  Alabama  2018-04  6145.0  0.679999   7541.0  0.214604       0.392311   
4  Alabama  2018-05  6253.0  0.761965   8002.0  0.302609       0.490366   

   co_max1_hour    co_aqi   no2_mean  ...  o3_max1_hour     o3_aqi      awnd  \
0     10.153226  5.209677  12.320599  ...     11.854839  30.725806  3.119355   
1     10.383929  3.946429   8.593592  ...     21.219246  22.709325  3.514286   
2      8.952151  3.747312   9.844573  ...     10.468993  40.165997  3.558065   
3      8.936111  3.705556   8.937356  ...     10.388948  45.017963  3.526667   
4      9.212366  4.946237  11.115237  ...      9.495067  45.172110  2.141935   

    prcp       tavg       tma

In [None]:
import pandas as pd

df1 = pd.read_csv('MasterDataset_AR.csv')  
df2 = pd.read_csv('MasterDataset_with_health_outcome.csv')  

merged_df = pd.merge(df1, df2[['state', 'month', 'ihd_deaths', 'copd_deaths', 'asthma_deaths']], 
                     on=['state', 'month'], 
                     how='left')

merged_df.to_csv('MasterDataset_AR_health.csv', index=False)

