In [1]:
# Importing the required libraries and dependencies
import pandas as pd

In [2]:
# *FOR GOOGLE COLAB* - Import the `files` module to allow file uploads
#from google.colab import files

In [3]:
# *FOR GOOGLE COLAB* - Upload "consolidated_seven_ny_counties.csv" into Colab, then store in a DataFrame
#uploaded = files.upload()

In [4]:
# Load the consolidated counties CSV into a pandas DataFrame,
# using "date_local" as the index and asking pandas to parse it as dates.
new_df = pd.read_csv(
    "Resources/consolidated_seven_ny_counties.csv",
    parse_dates=True,
    low_memory=False,
    index_col="date_local",
)

# Show the first rows of the dataset
new_df.head()

Unnamed: 0_level_0,county_code,parameter_code,parameter,latitude,longitude,sample_duration_code,units_of_measure,observation_count,validity_indicator,arithmetic_mean,...,units_of_measure_81102,observation_count_81102,validity_indicator_81102,arithmetic_mean_81102,first_max_value_81102,first_max_hour_81102,aqi_81102,county_81102,city_81102,aqi_max
date_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,5.0,42101.0,Carbon monoxide,40.8679,-73.87809,1,Parts per million,43.0,Y,0.5341,...,,,,,,,,,,52.0
2013-01-02,5.0,42101.0,Carbon monoxide,40.8679,-73.87809,1,Parts per million,48.0,Y,0.427084,...,,,,,,,,,,60.0
2013-01-03,5.0,42101.0,Carbon monoxide,40.8679,-73.87809,1,Parts per million,47.0,Y,0.50009,...,,,,,,,,,,59.0
2013-01-04,5.0,42101.0,Carbon monoxide,40.8679,-73.87809,1,Parts per million,48.0,Y,0.579166,...,,,,,,,,,,64.5
2013-01-05,5.0,42101.0,Carbon monoxide,40.8679,-73.87809,1,Parts per million,48.0,Y,0.64375,...,,,,,,,,,,41.5


In [5]:
# Check the dimensions (rows, columns) of the loaded DataFrame
new_df.shape

(28736, 106)

In [6]:
# List the column names available in the loaded DataFrame
new_df.columns

Index(['county_code', 'parameter_code', 'parameter', 'latitude', 'longitude',
       'sample_duration_code', 'units_of_measure', 'observation_count',
       'validity_indicator', 'arithmetic_mean',
       ...
       'units_of_measure_81102', 'observation_count_81102',
       'validity_indicator_81102', 'arithmetic_mean_81102',
       'first_max_value_81102', 'first_max_hour_81102', 'aqi_81102',
       'county_81102', 'city_81102', 'aqi_max'],
      dtype='object', length=106)

In [7]:
# Keep only the location columns and the AQI columns, then give them consistent
# names: the un-suffixed "aqi" becomes "aqi_42101" and the "_88502" location
# columns lose their suffix. `rename` returns a new DataFrame, so `new_df` is
# left untouched and no in-place mutation is needed.
selected_columns_df = new_df[
    [
        'latitude_88502', 'longitude_88502', 'county_88502',
        'aqi', 'aqi_42401', 'aqi_44201', 'aqi_42602', 'aqi_88502', 'aqi_max',
    ]
].rename(
    columns={
        'aqi': 'aqi_42101',
        'latitude_88502': 'latitude',
        'longitude_88502': 'longitude',
        'county_88502': 'county',
    }
)
selected_columns_df.head()

Unnamed: 0_level_0,latitude,longitude,county,aqi_42101,aqi_42401,aqi_44201,aqi_42602,aqi_88502,aqi_max
date_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-01-01,40.81551,-73.88553,Bronx,4.5,9.8,22.0,23.0,52.0,52.0
2013-01-02,40.81551,-73.88553,Bronx,6.5,11.4,21.0,26.5,60.0,60.0
2013-01-03,40.81551,-73.88553,Bronx,6.5,15.0,19.375,35.5,59.0,59.0
2013-01-04,40.81551,-73.88553,Bronx,7.0,13.0,19.0,36.5,60.0,64.5
2013-01-05,,,,8.0,10.2,23.5,41.5,,41.5


In [8]:
# Count the missing (null) values in each column before any imputation
selected_columns_df.isnull().sum()

latitude       692
longitude      692
county         692
aqi_42101     7180
aqi_42401    17756
aqi_44201    12579
aqi_42602    19335
aqi_88502      692
aqi_max          0
dtype: int64

In [9]:
# Fill the blanks in the latitude, longitude, and county columns by carrying
# the last non-null value forward (forward fill).
for location_col in ('latitude', 'longitude', 'county'):
    selected_columns_df[location_col] = selected_columns_df[location_col].ffill()

# Re-check the null counts: the three location columns should now be complete
selected_columns_df.isnull().sum()

latitude         0
longitude        0
county           0
aqi_42101     7180
aqi_42401    17756
aqi_44201    12579
aqi_42602    19335
aqi_88502      692
aqi_max          0
dtype: int64

In [10]:
# AQI columns whose missing values will be imputed
aqi_columns = ["aqi_42101","aqi_42401", "aqi_44201", "aqi_42602", "aqi_88502"]
# Keys used for the group-mean imputation step. Note that 'date_local' is the
# name of the DataFrame index (set when the CSV was read), not a regular column.
groupby_columns = ['county','date_local']

In [11]:
#Function to fill in AQI values
def impute_aqi_columns(df, aqi_columns, groupby_columns):
    """
    Impute missing values in AQI columns using a cascade of fallbacks.

    For each column, in this order:
      1. forward fill, then backward fill;
      2. time-weighted interpolation (pandas requires a DatetimeIndex for this);
      3. the mean of the row's ``groupby_columns`` group;
      4. the overall column mean.

    Steps 3 and 4 only write to cells that are still missing, so observed
    values are never overwritten. This matters for rows whose group key is
    null: pandas leaves such rows out of every group, and reassigning the
    whole column from a group-wise transform would blank them out.

    Note: step 1 already fills every gap in a column that has at least one
    non-null value, so steps 2-4 only act on whatever step 1 leaves behind.

    The dataframe is modified in place and the same object is returned.

    Parameters:
    df (pd.DataFrame): The dataframe containing the data.
    aqi_columns (list): List of AQI column names to impute.
    groupby_columns (list): Column names and/or index level names to group by
        for mean imputation.

    Returns:
    pd.DataFrame: The same ``df`` object, with the AQI columns imputed.
    """
    for col in aqi_columns:
        # Step 1: carry the last observation forward, then back-fill leading gaps
        df[col] = df[col].ffill().bfill()

        # Step 2: time-weighted interpolation of anything still missing
        df[col] = df[col].interpolate(method='time')

        # Step 3: group mean, applied only to cells that are still missing.
        # Working on plain arrays avoids index alignment (the index may hold
        # duplicate dates), and the groupby is skipped when nothing is missing.
        still_missing = df[col].isna().to_numpy()
        if still_missing.any():
            group_means = df.groupby(groupby_columns)[col].transform('mean').to_numpy()
            df.loc[still_missing, col] = group_means[still_missing]

        # Step 4: overall column mean as the last resort
        df[col] = df[col].fillna(df[col].mean())

    return df



In [12]:
# Apply the imputation function to selected_columns_df.
# impute_aqi_columns fills the frame it is given and returns that same object.
selected_filled_columns_df = impute_aqi_columns(selected_columns_df, aqi_columns, groupby_columns)

# Check for missing values on the returned frame, i.e. the actual result of the
# imputation, rather than on the input variable.
print("Missing values after imputation:")
print(selected_filled_columns_df.isnull().sum())

Missing values after imputation:
latitude     0
longitude    0
county       0
aqi_42101    0
aqi_42401    0
aqi_44201    0
aqi_42602    0
aqi_88502    0
aqi_max      0
dtype: int64


In [13]:
# Move "date_local" out of the index into a regular column, so it survives
# when the data is later written with index=False.
# NOTE: impute_aqi_columns returns the frame it was passed, so this in-place
# reset also changes selected_columns_df (both names refer to the same object).
selected_filled_columns_df.reset_index(inplace = True)
selected_filled_columns_df.head()

Unnamed: 0,date_local,latitude,longitude,county,aqi_42101,aqi_42401,aqi_44201,aqi_42602,aqi_88502,aqi_max
0,2013-01-01,40.81551,-73.88553,Bronx,4.5,9.8,22.0,23.0,52.0,52.0
1,2013-01-02,40.81551,-73.88553,Bronx,6.5,11.4,21.0,26.5,60.0,60.0
2,2013-01-03,40.81551,-73.88553,Bronx,6.5,15.0,19.375,35.5,59.0,59.0
3,2013-01-04,40.81551,-73.88553,Bronx,7.0,13.0,19.0,36.5,60.0,64.5
4,2013-01-05,40.81551,-73.88553,Bronx,8.0,10.2,23.5,41.5,60.0,41.5


In [14]:
# Write the imputed data to the /Resources folder.
# The result frame is written explicitly: after reset_index its "date_local"
# is a regular column, so index=False drops only the positional row index.
selected_filled_columns_df.to_csv("Resources/consolidated_seven_ny_counties_filled.csv", index=False)

