In [51]:
import joblib

# Location of the serialized PM2.5 regressor, relative to this notebook.
MODEL_PATH = "../Analysis_and_Model/PM2.5predictor.pkl"

# joblib.load unpickles the file, which can execute arbitrary code:
# only load model artifacts that come from a trusted source.
loaded_model = joblib.load(MODEL_PATH)

In [52]:
import pandas as pd

# Held-out data set the saved model is evaluated on, relative to this notebook.
TEST_DATA_PATH = "../Data/test_data.csv"

df = pd.read_csv(TEST_DATA_PATH)

In [53]:
# Inspect the loaded test set: Year, State, the four pollutant columns
# (CO, NO2, O3, SO2) and the PM2.5 column.
df

Unnamed: 0,Year,State,CO,NO2,O3,SO2,PM2.5
0,2016,Alabama,0.337158,11.547225,28.706667,1.081692,9.436674
1,2016,Arizona,0.334874,14.837061,33.312022,2.655121,6.810794
2,2016,Arkansas,0.296425,7.621050,28.825000,1.008650,9.043935
3,2016,California,0.325858,9.503550,30.596454,0.626877,9.611273
4,2016,Colorado,0.358489,11.945474,37.851099,1.394381,6.344656
...,...,...,...,...,...,...,...
379,2023,Virginia,0.268529,7.538555,29.867391,1.745232,8.074219
380,2023,Washington,0.256367,12.381411,27.970270,0.898613,7.701125
381,2023,West Virginia,0.247525,0.000000,26.370000,1.826565,9.747752
382,2023,Wisconsin,0.198063,9.101080,31.389516,0.513425,7.537519


In [54]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Pollutant columns that feed the PCA; Year, State and PM2.5 are left out.
numeric_columns_scale = ["CO", "NO2", "O3", "SO2"]

# Bring every pollutant column to zero mean / unit variance before PCA.
# NOTE(review): the scaler and the PCA below are both *fitted on this test
# file*. Presumably the model was trained on components produced by
# transformers fitted on the training data; if so, the persisted training-time
# scaler/PCA should be loaded and only `.transform` applied here — confirm
# against the training notebook.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df[numeric_columns_scale])

# Project the four standardized pollutants onto three principal components.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(data_scaled)

# Wrap the components in a frame and re-attach the state label
# (aligned on the default row index shared with `df`).
pca_df = pd.DataFrame(pca_result, columns=["PC1", "PC2", "PC3"]).assign(
    State=df["State"]
)

# Show the component table together with its state labels.
print(pca_df)

          PC1       PC2       PC3          State
0    1.291722 -0.635121  0.443811        Alabama
1    3.197252  1.707163  2.584591        Arizona
2    0.214035 -0.344309  0.566145       Arkansas
3    0.678231 -0.355070 -0.499472     California
4    2.413404  1.941080 -0.136629       Colorado
..        ...       ...       ...            ...
379  0.507559  0.632916  1.819979       Virginia
380  0.541318 -0.584430  0.135302     Washington
381 -1.174387  0.065896  2.932269  West Virginia
382 -0.548612  0.431337 -0.781631      Wisconsin
383 -1.086643  3.073084 -0.129890        Wyoming

[384 rows x 4 columns]


In [55]:
# One-hot encode the state label into State_<name> indicator columns,
# replacing the original "State" column.
# NOTE: this cell rebinds `pca_df` to its own output, so it is not re-runnable
# on its own — after one run the "State" column is gone; re-run the PCA cell
# above first.
# NOTE(review): the dummy columns are derived from the states present in this
# test file only; presumably they must match the columns the model was trained
# on (same set, same order) — confirm.
pca_df = pd.get_dummies(pca_df, columns=["State"], prefix="State")
pca_df

Unnamed: 0,PC1,PC2,PC3,State_Alabama,State_Arizona,State_Arkansas,State_California,State_Colorado,State_Connecticut,State_Delaware,...,State_South Dakota,State_Tennessee,State_Texas,State_Utah,State_Vermont,State_Virginia,State_Washington,State_West Virginia,State_Wisconsin,State_Wyoming
0,1.291722,-0.635121,0.443811,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,3.197252,1.707163,2.584591,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0.214035,-0.344309,0.566145,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0.678231,-0.355070,-0.499472,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2.413404,1.941080,-0.136629,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,0.507559,0.632916,1.819979,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
380,0.541318,-0.584430,0.135302,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
381,-1.174387,0.065896,2.932269,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
382,-0.548612,0.431337,-0.781631,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [56]:
# Score the test features (PC1-PC3 plus the State_* indicator columns) with
# the loaded model.
# NOTE(review): presumably the column set and order of `pca_df` must match the
# features used at training time — confirm.
predictions = loaded_model.predict(pca_df)

In [57]:
# Display the raw model outputs.
predictions

array([2.37351862, 1.5737182 , 2.36107841, 2.10723673, 1.29351164,
       1.94723957, 2.20528808, 2.35711834, 2.1469159 , 1.72112355,
       2.35619637, 2.38373616, 2.45515137, 2.24885834, 2.41626946,
       2.42311444, 2.21053806, 2.17698831, 2.13244329, 2.11387726,
       2.33188753, 2.44618726, 2.29128652, 2.06295468, 2.20833954,
       1.31618927, 2.16466405, 2.05640447, 1.52248989, 2.09225237,
       2.31136814, 2.26641485, 2.25331162, 2.22958004, 2.06022592,
       2.27817812, 1.91842271, 2.56360654, 2.12476697, 2.3077958 ,
       2.2650609 , 1.57666156, 2.10856801, 2.32179719, 1.97447153,
       2.51781996, 2.28070682, 1.72729905, 2.48741286, 1.48527016,
       2.27140812, 2.06844604, 1.31709141, 1.99602241, 2.23257504,
       2.39621731, 2.28807453, 1.63705151, 2.32636471, 2.26039241,
       2.40840389, 2.2896226 , 2.4365753 , 2.41250971, 2.19893488,
       2.23962369, 2.14729668, 2.10086796, 2.24104882, 2.46347498,
       2.22539054, 1.99292054, 2.21744605, 1.30319414, 2.18483

In [58]:
import numpy as np

# Exponentiate the raw model outputs element-wise.
# NOTE(review): presumably the model was trained on log(PM2.5), so exp() maps
# the predictions back to the original concentration scale — confirm against
# the training notebook.
og = np.exp(predictions)
og

array([10.73509863,  4.82455354, 10.60237901,  8.22548065,  3.64556603,
        7.0093121 ,  9.07286491, 10.56047584,  8.55842261,  5.59080651,
       10.5507439 , 10.84534718, 11.64819666,  9.47691025, 11.20398437,
       11.28093842,  9.1206225 ,  8.81970401,  8.43545191,  8.2802839 ,
       10.29735978, 11.54424753,  9.8876502 ,  7.86918646,  9.10059269,
        3.72918337,  8.71167475,  7.81781005,  4.5836237 ,  8.10314591,
       10.08821731,  9.64476089,  9.51920772,  9.29596129,  7.84774261,
        9.75888466,  6.81020832, 12.98255514,  8.37094658, 10.05224305,
        9.63171116,  4.83877486,  8.23643835, 10.19397833,  7.20281217,
       12.40153137,  9.78359323,  5.62543933, 12.03011223,  4.41615831,
        9.69304014,  7.91251786,  3.7325491 ,  7.35972384,  9.32384443,
       10.98155785,  9.85594209,  5.13999195, 10.24064604,  9.58685037,
       11.11620431,  9.87121163, 11.43381623, 11.16193924,  9.01540588,
        9.38979711,  8.56168213,  8.1732609 ,  9.40318839, 11.74

In [59]:
# Create the (initially empty) table that will hold State, Year and the
# predicted PM2.5 value.
predicted_pm25 = pd.DataFrame()

In [60]:
# Assemble the prediction table in one step: identifying columns come from the
# test set, and the PM2.5 column holds the back-transformed model output.
predicted_pm25 = pd.DataFrame(
    {
        "State": df["State"],
        "Year": df["Year"],
        "PM2.5": og,
    }
)

In [61]:
# Display the assembled State / Year / predicted PM2.5 table.
predicted_pm25

Unnamed: 0,State,Year,PM2.5
0,Alabama,2016,10.735099
1,Arizona,2016,4.824554
2,Arkansas,2016,10.602379
3,California,2016,8.225481
4,Colorado,2016,3.645566
...,...,...,...
379,Virginia,2023,10.401233
380,Washington,2023,7.610402
381,West Virginia,2023,13.171847
382,Wisconsin,2023,9.662820


In [62]:
# Keep only the 2023 predictions for visualisation.
# "Year" is an integer column (parsed from the CSV), so it must be compared
# with the int 2023; comparing against the string "2023" matches no row and
# silently produces an empty frame.
data_vis = predicted_pm25[predicted_pm25["Year"] == 2023]

In [63]:
# Boolean mask selecting the rows whose prediction year is 2023.
is_2023 = predicted_pm25["Year"] == 2023
year_2023_rows = predicted_pm25.loc[is_2023]

# Persist the 2023 predictions next to the notebook, without the row index.
year_2023_rows.to_csv("predicted_2023_values.csv", index=False)