Here I expand the `data` column of the original AQI dataset so that each key-value pair becomes its own column, named after the key.

In [4]:
# Adjusting the transformation function to also handle "PM10" and ignore unexpected parameters.
def extended_transform(row):
    """Flatten one raw AQI row into a dict of scalar columns.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing
            'date', 'zip_code' and 'data'. 'data' is expected to be a JSON
            array of observation records; a missing value (NaN/None) or a
            JSON ``null`` is treated as "no records".

    Returns:
        A dict with the keys 'date', 'zip_code', 'ReportingArea',
        'StateCode', 'Latitude', 'Longitude', 'OZONEAQI', 'PM2.5AQI',
        'PM10AQI', 'CategoryNumber' and 'CategoryName'. Fields for which no
        matching record exists keep their defaults.

    Raises:
        json.JSONDecodeError: If 'data' is text but not valid JSON.
        KeyError: If a record lacks one of the fields read below.
    """
    raw = row['data']
    # pandas reads an empty CSV cell as float NaN; json.loads would raise
    # TypeError on it and abort the whole DataFrame.apply, so any non-text
    # value is treated as an empty record list instead.
    if isinstance(raw, (str, bytes, bytearray)):
        # `or []` additionally covers a JSON `null` payload.
        parsed_data = json.loads(raw) or []
    else:
        parsed_data = []

    # Defaults; AQI values left as None show up as missing in the DataFrame.
    transformed = {
        'date': row['date'],
        'zip_code': row['zip_code'],
        'ReportingArea': '',
        'StateCode': '',
        'Latitude': 0,
        'Longitude': 0,
        'OZONEAQI': None,
        'PM2.5AQI': None,
        'PM10AQI': None,
        'CategoryNumber': None,
        'CategoryName': '',
    }

    expected_parameters = ('OZONE', 'PM2.5', 'PM10')

    for record in parsed_data:
        parameter = record['ParameterName']

        # Records for any other pollutant are ignored.
        if parameter not in expected_parameters:
            continue

        # The location and category fields are copied from the first
        # matching record that carries a non-empty ReportingArea.
        if not transformed['ReportingArea']:
            transformed.update({
                'ReportingArea': record['ReportingArea'],
                'StateCode': record['StateCode'],
                'Latitude': record['Latitude'],
                'Longitude': record['Longitude'],
                'CategoryNumber': record['Category']['Number'],
                'CategoryName': record['Category']['Name'],
            })

        # Key is one of 'OZONEAQI', 'PM2.5AQI' or 'PM10AQI'.
        transformed[f'{parameter}AQI'] = record['AQI']

    return transformed


# Load the raw 2021 daily AQI dataset
new_file_path = '../Datasets/AQI/2021_daily_aqi_data.csv'
new_data = pd.read_csv(new_file_path)

# Flatten every row's JSON `data` payload into a dict of scalar columns
extended_transformed_data_2021 = new_data.apply(extended_transform, axis=1)

# Build a DataFrame from the per-row dicts (one column per dict key)
final_extended_df_2021 = pd.DataFrame(list(extended_transformed_data_2021))

# Save with the .csv extension: the averaging cell reads
# '../Datasets/AQI/2021_Daily_Aqi_Data_Cleaned.csv', so the extension-less
# name used previously left that cell without its input file.
final_extended_df_2021.to_csv('../Datasets/AQI/2021_Daily_Aqi_Data_Cleaned.csv', index=False)

# Notebook display: shape and first rows, to confirm the transformation
(final_extended_df_2021.shape, final_extended_df_2021.head())


((16400, 11),
          date  zip_code ReportingArea StateCode  Latitude  Longitude  \
 0  2021-01-01      2045      Weymouth        MA   42.2459   -70.9628   
 1  2021-01-02      2045      Weymouth        MA   42.2459   -70.9628   
 2  2021-01-03      2045      Weymouth        MA   42.2459   -70.9628   
 3  2021-01-04      2045      Weymouth        MA   42.2459   -70.9628   
 4  2021-01-05      2045      Weymouth        MA   42.2459   -70.9628   
 
    OZONEAQI  PM2.5AQI PM10AQI  CategoryNumber CategoryName  
 0       NaN      40.0    None             1.0         Good  
 1       NaN      17.0    None             1.0         Good  
 2       NaN      25.0    None             1.0         Good  
 3       NaN      28.0    None             1.0         Good  
 4       NaN      20.0    None             1.0         Good  )

Now I will calculate the average AQI for each zip code and save the result to a new CSV file.

In [16]:
# Read the cleaned daily AQI data produced by the transformation step.
path_file = '../Datasets/AQI/2021_Daily_Aqi_Data_Cleaned.csv'
df = pd.read_csv(path_file)

# Mean ozone and PM2.5 AQI per zip code; as_index=False keeps 'zip_code'
# as a regular column instead of the index.
grouped_by_zip = df.groupby('zip_code', as_index=False)
averages = grouped_by_zip[['OZONEAQI', 'PM2.5AQI']].mean()

# Render zip codes as text, left-padded with zeros to five characters.
zip_as_text = averages['zip_code'].astype(str)
averages['zip_code'] = zip_as_text.str.zfill(5)

averages.to_csv('../Datasets/AQI/2021_Avg_Aqi_Data_Cleaned.csv', index=False)