#### Dataset 1
Source: Center for Air, Climate and Energy Solutions [https://www.caces.us/data]

Duration: 2000 - 2015

This dataset provides estimates of outdoor concentrations for seven pollutants (four gases: O3, CO, SO2, NO2; three aerosols: PM10, PM2.5, PN [particle number]) throughout the contiguous U.S. For this analysis, we extracted the values for the four gases and PM2.5. More detailed information about the dataset can be found here: [https://www.caces.us/_files/ugd/342c07_ebb72126333c4fd7859e1702f288bafb.pdf]

In [3]:
import pandas as pd

# Load the pollutant CSV and discard the 'fips' column in the same step;
# none of the later cells reference it.
df_with_pollutants = pd.read_csv("PM2.5withpollutants.csv").drop(columns="fips")

In [4]:
# Check the (rows, columns) dimensions after dropping 'fips'
df_with_pollutants.shape

(248640, 6)

In [5]:
# Preview the first rows of the loaded data (long format: one row per estimate)
df_with_pollutants.head()

Unnamed: 0,pollutant,year,pred_wght,state_abbr,lat,lon
0,co,2000,0.342283,AL,32.500383,-86.494186
1,co,2000,0.322647,AL,30.548923,-87.762466
2,co,2000,0.302642,AL,31.844037,-85.309929
3,co,2000,0.308587,AL,33.030918,-87.127655
4,co,2000,0.331939,AL,33.955246,-86.591402


In [6]:
# Average the 'pred_wght' estimate over every location that shares the same
# pollutant, year and state.
#
# Only 'pred_wght' is selected for aggregation, so 'lat' and 'lon' fall away
# without a separate drop step. This keeps the cell safe to re-run (no KeyError
# from dropping columns that are already gone) and prevents any other column
# left in the frame from being averaged -- or, on pandas >= 2.0, from raising a
# TypeError if it is non-numeric.
df_with_pollutants = (
    df_with_pollutants.groupby(["pollutant", "year", "state_abbr"])["pred_wght"]
    .mean()
    .reset_index()
)

In [7]:
# Check the dimensions after averaging per pollutant, year and state
df_with_pollutants.shape

(3840, 4)

In [8]:
# Preview the aggregated rows (one per pollutant, year and state)
df_with_pollutants.head()

Unnamed: 0,pollutant,year,state_abbr,pred_wght
0,co,2000,AL,0.324396
1,co,2000,AR,0.327984
2,co,2000,AZ,0.422265
3,co,2000,CA,0.462393
4,co,2000,CO,0.383861


In [9]:
# List the distinct pollutant codes present in the data
df_with_pollutants["pollutant"].unique()

array(['co', 'no2', 'o3', 'pm25', 'so2'], dtype=object)

In [10]:
# Reshape from long to wide: one row per (year, state_abbr) and one column per
# pollutant, each holding that pollutant's averaged 'pred_wght' value.
df_with_pollutants = df_with_pollutants.pivot(
    index=["year", "state_abbr"], columns="pollutant", values="pred_wght"
).reset_index()

# pivot() leaves the column axis labelled "pollutant"; clear that label so the
# frame has plain column headers.
df_with_pollutants.columns.name = None

# Missing (year, state, pollutant) combinations are deliberately left as NaN.
# Filling them with 0 would record a concentration of zero that was never
# estimated and would bias any later average or model fit. NaN is written to
# CSV as an empty field, so gaps stay visible to downstream consumers.

In [11]:
# Preview the wide table: one row per (year, state), one column per pollutant
df_with_pollutants.head()

Unnamed: 0,year,state_abbr,co,no2,o3,pm25,so2
0,2000,AL,0.324396,5.983593,57.601273,15.064598,3.251818
1,2000,AR,0.327984,6.059882,53.942551,12.076962,2.859271
2,2000,AZ,0.422265,9.915901,58.28224,7.400383,2.343078
3,2000,CA,0.462393,10.756098,48.635815,10.896779,1.930857
4,2000,CO,0.383861,8.429847,59.836639,5.501561,2.511782


In [12]:
# Lookup table from two-letter postal abbreviation to full name, covering the
# 50 states, the District of Columbia and the territory codes listed at the end.
state_mapping = {
    "AL": "Alabama",
    "AK": "Alaska",
    "AZ": "Arizona",
    "AR": "Arkansas",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DE": "Delaware",
    "DC": "District of Columbia",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin",
    "WY": "Wyoming",
    "AS": "American Samoa",
    "GU": "Guam",
    "MP": "Northern Mariana Islands",
    "PR": "Puerto Rico",
    "VI": "Virgin Islands",
    "TT": "Trust Territories",
}

# Replace each abbreviation in 'state_abbr' with its full name.
# Series.map() yields NaN for any value that is not a key of the dict, so the
# original value is used as a fallback: an unexpected code stays visible
# instead of turning into NaN, and re-running this cell while the column
# already holds full names leaves it intact rather than wiping it to NaN.
df_with_pollutants["state_abbr"] = (
    df_with_pollutants["state_abbr"]
    .map(state_mapping)
    .fillna(df_with_pollutants["state_abbr"])
)

In [13]:
# The column now holds full state names, so relabel it from 'state_abbr' to
# 'State'. The renamed frame is assigned back to the same variable.
df_with_pollutants = df_with_pollutants.rename(columns={"state_abbr": "State"})

In [14]:
# Preview the table with full state names in the 'State' column
df_with_pollutants.head()

Unnamed: 0,year,State,co,no2,o3,pm25,so2
0,2000,Alabama,0.324396,5.983593,57.601273,15.064598,3.251818
1,2000,Arkansas,0.327984,6.059882,53.942551,12.076962,2.859271
2,2000,Arizona,0.422265,9.915901,58.28224,7.400383,2.343078
3,2000,California,0.462393,10.756098,48.635815,10.896779,1.930857
4,2000,Colorado,0.383861,8.429847,59.836639,5.501561,2.511782


In [15]:
# Switch the remaining lower-case headers to their display names: upper-case
# chemical formulas for the pollutants and a capitalised 'Year'.
df_with_pollutants = df_with_pollutants.rename(
    columns={
        "year": "Year",
        "co": "CO",
        "no2": "NO2",
        "o3": "O3",
        "pm25": "PM2.5",
        "so2": "SO2",
    }
)

In [16]:
# Save the reshaped pollutant table to a CSV file; index=False omits the
# DataFrame's row index from the output.
df_with_pollutants.to_csv("pollutant_dataset.csv", index=False)