In [11]:
import pandas as pd
import numpy as np

## Zipcode Data

In [34]:
import pandas as pd
import os

# Define the path to the folder containing the files
folder_path = '../AirQuality_Zipcode'

# List of all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Function to calculate the daily mean air quality
def calculate_daily_mean(minimum, maximum, q1, q3, median, count):
    n = count
    mean = (2.2 / (2.2 + n ** 0.75)) * ((minimum + maximum) / 2) + \
           (0.7 - (0.72 / n ** 0.55)) * ((q1 + q3) / 2) + \
           (0.3 + (0.72 / n ** 0.55) - (2.2 / (2.2 + n ** 0.75))) * median
    return mean

# Function to process each file and calculate yearly means
def process_file(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Extract year from the 'date' column
    df['Year'] = pd.to_datetime(df['date']).dt.year

    # Assuming that the zipcode is part of the filename, extract it
    zipcode = file_path.split('/')[-1].split('.')[0]

    # Add zipcode to the dataframe
    df['Zipcode'] = zipcode

    # Calculate the daily mean for each row
    df['Daily_Mean'] = df.apply(lambda x: calculate_daily_mean(x['min'], x['max'], x['q1'], x['q3'], x['median'], x['count']), axis=1)

    # Group by year and zipcode and calculate the mean
    yearly_means = df.groupby(['Year', 'Zipcode'])['Daily_Mean'].mean().reset_index()

    return yearly_means

# Process each file and store the results
all_data = pd.DataFrame()
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    yearly_means = process_file(file_path)
    all_data = pd.concat([all_data, yearly_means])

# Split data by year for 2022 and 2023
data_2022 = all_data[all_data['Year'] == 2022]
data_2023 = all_data[all_data['Year'] == 2023]

# Save the data to CSV files
data_2022.to_csv('../AirQuality_Zipcode/2022_AQI_ZIPCODE.csv', index=False)
data_2023.to_csv('../AirQuality_Zipcode/2023_AQI_ZIPCODE.csv', index=False)


In [35]:
df2 = pd.read_csv('../AirQuality_Zipcode/2023_AQI_ZIPCODE.csv')
df2

Unnamed: 0,Year,Zipcode,Daily_Mean
0,2023,2128,9.646887
1,2023,2113,9.963605
2,2023,2111,9.464501
3,2023,2139,9.111052
4,2023,2135,8.870724
5,2023,2127,10.532466
6,2023,2124,9.108688
7,2023,2130,8.710598
8,2023,2118,10.577658


In [36]:
# Load the air quality data for 2022 and 2023
aqi_2022 = pd.read_csv('../AirQuality_Zipcode/2022_AQI_ZIPCODE.csv')
aqi_2023 = pd.read_csv('../AirQuality_Zipcode/2023_AQI_ZIPCODE.csv')

# Load the health data for 2022 and 2023
health_2022 = pd.read_csv('../Health_Data/Zipcode/Health_Zipcode_2022.csv')
health_2023 = pd.read_csv('../Health_Data/Zipcode/Health_Zipcode_2023.csv')

# Rename 'ZCTA5' to 'Zipcode' in health data for alignment
health_2022.rename(columns={'ZCTA5': 'Zipcode'}, inplace=True)
health_2023.rename(columns={'ZCTA5': 'Zipcode'}, inplace=True)

# Rename 'Daily_Mean' to 'PM2.5' in air quality data
aqi_2022.rename(columns={'Daily_Mean': 'PM2.5'}, inplace=True)
aqi_2023.rename(columns={'Daily_Mean': 'PM2.5'}, inplace=True)

# Merge the air quality data with the health data for each year
merged_2022 = pd.merge(health_2022, aqi_2022, on='Zipcode', how='left')
merged_2022.dropna(subset=['PM2.5'], inplace=True)

merged_2023 = pd.merge(health_2023, aqi_2023, on='Zipcode', how='left')
merged_2023.dropna(subset=['PM2.5'], inplace=True)

# Save the merged data into new CSV files
merged_2022.to_csv('../Merged_Data/2022_Health_Zipcode_final.csv', index=False)
merged_2023.to_csv('../Merged_Data/2023_Health_Zipcode_final.csv', index=False)

In [37]:
df2 = pd.read_csv('../Merged_Data/2023_Health_Zipcode_final.csv')
df2

Unnamed: 0,Zipcode,TotalPopulation,ACCESS2_CrudePrev,ACCESS2_Crude95CI,ARTHRITIS_CrudePrev,ARTHRITIS_Crude95CI,BINGE_CrudePrev,BINGE_Crude95CI,BPHIGH_CrudePrev,BPHIGH_Crude95CI,...,MOBILITY_Crude95CI,SELFCARE_CrudePrev,SELFCARE_Crude95CI,INDEPLIVE_CrudePrev,INDEPLIVE_Crude95CI,DISABILITY_CrudePrev,DISABILITY_Crude95CI,Geolocation,Year,PM2.5
0,2111,7383,5.5,"( 4.2, 7.1)",15.8,"(13.9, 17.9)",16.8,"(14.6, 19.1)",23.6,"(21.2, 26.2)",...,"( 8.4, 11.3)",2.9,"( 2.5, 3.3)",6.7,"( 5.6, 7.9)",25.3,"(21.8, 28.6)",POINT (-71.05910467 42.35013318),2023.0,9.464501
1,2113,6915,2.6,"( 2.0, 3.4)",12.3,"(10.9, 13.9)",25.6,"(22.6, 28.8)",14.6,"(13.0, 16.4)",...,"( 3.9, 5.3)",1.2,"( 1.1, 1.3)",3.8,"( 3.3, 4.4)",16.3,"(14.2, 18.7)",POINT (-71.05523575 42.36533484),2023.0,9.963605
2,2118,26498,6.8,"( 5.8, 8.1)",17.1,"(15.1, 19.3)",19.5,"(17.1, 22.3)",24.7,"(22.3, 27.4)",...,"( 9.3, 11.9)",3.7,"( 3.4, 4.1)",7.4,"( 6.6, 8.3)",26.0,"(23.2, 28.7)",POINT (-71.07068615 42.3381858),2023.0,10.577658
3,2124,47783,7.7,"( 6.6, 8.9)",22.4,"(20.0, 25.0)",16.0,"(13.9, 18.4)",32.5,"(29.7, 35.6)",...,"(13.8, 17.1)",5.3,"( 4.9, 5.8)",9.9,"( 8.9, 11.0)",31.3,"(28.3, 34.3)",POINT (-71.07137829 42.28596194),2023.0,9.108688
4,2127,31799,4.6,"( 3.9, 5.5)",16.7,"(14.8, 18.7)",22.8,"(20.1, 25.9)",20.0,"(17.9, 22.3)",...,"( 7.1, 9.1)",2.6,"( 2.3, 2.8)",5.7,"( 5.1, 6.4)",21.0,"(18.9, 23.4)",POINT (-71.05000701 42.33258472),2023.0,10.532466
5,2128,40508,13.2,"(11.4, 15.3)",17.9,"(16.0, 20.1)",20.0,"(17.5, 22.9)",23.5,"(21.2, 26.0)",...,"(10.9, 13.7)",4.2,"( 3.8, 4.5)",8.6,"( 7.7, 9.5)",30.4,"(27.7, 33.4)",POINT (-71.015624 42.36281022),2023.0,9.646887
6,2130,35401,5.1,"( 4.3, 6.1)",18.2,"(16.2, 20.5)",20.2,"(17.7, 23.0)",23.0,"(20.7, 25.4)",...,"( 8.4, 10.5)",2.9,"( 2.7, 3.2)",6.0,"( 5.4, 6.7)",22.6,"(20.3, 25.0)",POINT (-71.11485808 42.3098728),2023.0,8.710598
7,2135,42780,4.1,"( 3.4, 4.9)",14.0,"(12.4, 15.8)",22.3,"(19.6, 25.3)",18.0,"(16.1, 20.1)",...,"( 6.0, 7.7)",2.0,"( 1.8, 2.2)",5.4,"( 4.8, 6.1)",21.2,"(18.8, 23.7)",POINT (-71.15381592 42.34988596),2023.0,8.870724
8,2139,36349,3.6,"( 3.0, 4.2)",13.2,"(12.0, 14.8)",20.9,"(18.7, 23.6)",17.7,"(16.1, 19.5)",...,"( 5.0, 6.4)",1.7,"( 1.6, 1.9)",4.7,"( 4.2, 5.3)",18.4,"(16.6, 20.5)",POINT (-71.10304364 42.36253913),2023.0,9.111052


## Boston Data

In [108]:
boston_aq = pd.read_csv('../AirQuality_Boston/boston-air-quality.csv')
health_2021 = pd.read_csv('../Health_Data/Boston/Health_Boston_2021.csv')
health_2022 = pd.read_csv('../Health_Data/Boston/Health_Boston_2022.csv')
health_2023 = pd.read_csv('../Health_Data/Boston/Health_Boston_2023.csv')
health_2020 = pd.read_csv('../Health_Data/Boston/Health_Boston_2020.csv')

In [110]:
health_2022['Year'] = 2022
health_2023['Year'] = 2023
health_2021['Year'] = 2021
health_2020['Year'] = 2020

In [112]:
boston_aq['date'] = pd.to_datetime(boston_aq['date'])
boston_aq['year'] = boston_aq['date'].dt.year

# Calculating the average pm2.5 values for each year
average_pm25_per_year = boston_aq.groupby('year')['pm25'].mean().reset_index()

average_pm25_per_year

Unnamed: 0,year,pm25
0,2020,34.842541
1,2021,32.090411
2,2022,29.978082
3,2023,32.678466


In [113]:
health_2020.shape, health_2021.shape, health_2022.shape, health_2023.shape

((1, 119), (1, 127), (1, 127), (1, 155))

In [114]:
health_2021 = health_2021.drop([col for col in health_2021.columns if ('Crude95CI' in col) or ('Adj' in col)], axis=1)
health_2022 = health_2022.drop([col for col in health_2022.columns if ('Crude95CI' in col) or ('Adj' in col)], axis=1)
health_2023 = health_2023.drop([col for col in health_2023.columns if ('Crude95CI' in col) or ('Adj' in col)], axis=1)
health_2020 = health_2020.drop([col for col in health_2020.columns if ('Crude95CI' in col) or ('Adj' in col)], axis=1)

In [115]:
health_2020.shape, health_2021.shape, health_2022.shape, health_2023.shape

((1, 35), (1, 37), (1, 37), (1, 44))

In [116]:
health = pd.concat([health_2021,health_2022,health_2023,health_2020], ignore_index=True)

In [117]:
health = health.dropna(axis=1)

In [118]:
health

Unnamed: 0,StateAbbr,StateDesc,PlaceName,PlaceFIPS,TotalPopulation,ACCESS2_CrudePrev,ARTHRITIS_CrudePrev,BINGE_CrudePrev,BPHIGH_CrudePrev,BPMED_CrudePrev,...,LPA_CrudePrev,MAMMOUSE_CrudePrev,MHLTH_CrudePrev,OBESITY_CrudePrev,PHLTH_CrudePrev,SLEEP_CrudePrev,STROKE_CrudePrev,TEETHLOST_CrudePrev,Geolocation,Year
0,MA,Massachusetts,Boston,2507000,617594,11.5,17.9,22.1,22.8,69.7,...,26.9,82.2,15.5,22.8,10.8,37.5,2.6,15.6,POINT (-71.10225136 42.30916082),2021
1,MA,Massachusetts,Boston,2507000,617594,8.8,17.0,19.6,22.8,69.7,...,20.4,82.7,15.5,21.9,7.9,32.6,2.4,13.4,POINT (-71.10225136 42.30916082),2022
2,MA,Massachusetts,Boston,2507000,617594,6.4,17.5,19.6,23.8,70.0,...,22.7,82.7,17.3,23.7,8.9,32.6,2.4,13.4,POINT (-71.10225136 42.30916082),2023
3,MA,Massachusetts,Boston,2507000,617594,10.8,18.6,22.8,23.8,70.9,...,23.2,82.2,14.7,21.6,11.1,37.5,2.6,15.6,POINT (-71.10225136 42.30916082),2020


In [119]:
final = health.merge(average_pm25_per_year, left_on='Year', right_on='year')

In [124]:
final = final.drop(["year"], axis=1)