In [71]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from keras.models import Sequential
from keras.layers import Dense, Dropout,LSTM
import aqi



## Data Extraction

In [7]:
## Importing 1 year data
inp_data =  pd.read_csv('./data/AirQualityData.csv',sep = ",")

In [8]:
# droping null values
inp_data = inp_data.dropna()
# Select the last row
last_row = inp_data.iloc[-1]
# Drop the last row
inp_data.drop(last_row.name, inplace=True)

# Check if the hour column contains valid date format
mask = inp_data['hour'].str.contains('^\d{2}:\d{2}$')

# Filter out rows with invalid date formats
inp_data = inp_data[mask]

In [109]:
inp_data['parameter name'].unique()

array(['OZONE', 'NO', 'NO2', 'PM2.5', 'SO2', 'CO', 'NOX', 'PM10', 'TEMP',
       'RHUM', 'RWD', 'RWS', 'WS', 'NOY', 'PMC', 'WD', 'BARPR', 'PRECIP',
       'SRAD', 'BC', 'NO2Y', 'UV-AETH', 'NH3'], dtype=object)

In [9]:
### Pulling required site data :
site_data = inp_data[inp_data['sitename'] == 'San Andreas'].reset_index(drop = True)

## Feature Engineering

In [10]:
###### Droping unrequired columns
site_df = site_data.copy()
site_df = site_df.drop(columns = ['AQSID','sitename','GMT offset','reporting units','datasource'])

In [11]:
## Creating datetime stamp column from input
site_df['datetime'] = pd.to_datetime(site_df.date.astype(str) + ' ' + site_df.hour.astype(str) + ':00')
site_df['datetime'] = pd.to_datetime(site_df['datetime'])

  site_df['datetime'] = pd.to_datetime(site_df.date.astype(str) + ' ' + site_df.hour.astype(str) + ':00')


In [12]:
## Drop unrequired columns
site_df.drop(columns = ['date','hour'])
# Rearrange the columns
site_df = site_df[['datetime','parameter name','value']]

In [13]:
## Creating df's for required major pollutants
no2_df = pd.DataFrame()
pm25_df = pd.DataFrame()
pm10_df = pd.DataFrame()
co_df = pd.DataFrame()
ozone_df = pd.DataFrame()
so2_df = pd.DataFrame()
if 'NO2' in site_df['parameter name'].unique():
    no2_df = site_df[site_df['parameter name'] == 'NO2'].reset_index(drop=True)
if 'PM2.5' in site_df['parameter name'].unique():
    pm25_df = site_df[site_df['parameter name'] == 'PM2.5'].reset_index(drop=True)
if 'PM10' in site_df['parameter name'].unique():
    pm10_df = site_df[site_df['parameter name'] == 'PM10'].reset_index(drop=True)
if 'CO' in site_df['parameter name'].unique():
    co_df = site_df[site_df['parameter name'] == 'CO'].reset_index(drop=True)
if 'OZONE' in site_df['parameter name'].unique():
    ozone_df = site_df[site_df['parameter name'] == 'OZONE'].reset_index(drop=True)
if 'SO2' in site_df['parameter name'].unique():
    so2_df = site_df[site_df['parameter name'] == 'SO2'].reset_index(drop=True)


In [14]:
### Combining the segregated pollutant data together:
req_df = pd.DataFrame()
temp_df = pd.DataFrame()
for i in ['NO2','SO2','OZONE','CO','PM10','PM2.5']:
    if i in site_df['parameter name'].unique():
        temp_df = site_df[site_df['parameter name'] == i ].reset_index(drop=True)
        req_df = pd.concat([req_df, temp_df]).reset_index(drop = True)

In [15]:
# Create the new dataframe with datetime as the index, parameter as the columns, and values as the cells
req_pivot = req_df.pivot_table(index='datetime', columns='parameter name', values='value')
## Sorting
df_sorted = req_pivot.sort_values(by='datetime')
## Droping null values
df_sorted = df_sorted.dropna()

In [76]:
import aqi
myaqi = aqi.to_iaqi(aqi.POLLUTANT_PM25, df_sorted['PM2.5'], algo=aqi.ALGO_EPA)


TypeError: conversion from Series to Decimal is not supported

In [74]:
df_sorted

parameter name,OZONE,PM10,PM2.5
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-04-19 00:00:00,46.0,8.0,3.0
2022-04-19 01:00:00,45.0,59.0,3.0
2022-04-19 02:00:00,39.0,9.0,6.0
2022-04-19 03:00:00,34.0,6.0,5.0
2022-04-19 04:00:00,30.0,6.0,4.0
...,...,...,...
2023-04-19 19:00:00,37.0,0.0,1.0
2023-04-19 20:00:00,39.0,2.0,0.0
2023-04-19 21:00:00,38.0,4.0,-1.0
2023-04-19 22:00:00,40.0,6.0,0.0


## Model developement for LSTM

In [16]:
## Creating required sequences function
def create_sequences(data, n_steps):
    X, y = [], []
    for i in range(len(data) - n_steps):
        X.append(data[i:i+n_steps])
        y.append(data[i+n_steps])
    return np.array(X), np.array(y)


In [32]:
## Fucntion for model prediction for each pollutant
def model_multivariate(df):
    ## Normalizing the data using MinMax Scaler
    values = df.values
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(values)

    ## Coverting scaled dataset into supervised data
    n_steps = 24
    X, Y = create_sequences(scaled_data, n_steps)

    # Split the data into training and testing sets
    train_size = int(len(df) * 0.8)
    
    X_train, X_test = X[train_size:,] , X[:train_size,] 
    print('X_train' ,X_train.shape)
    print('X_test' ,X_test.shape)
    Y_train, Y_test = Y[train_size:,] , Y[:train_size,]
    print('Y_train' ,Y_train.shape)
    print('Y_test' ,Y_test.shape)
   


    # Define the LSTM model architecture
    model = Sequential()
    model.add(LSTM(units = 50, return_sequences = True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dropout(0.2))

    model.add(LSTM(units = 50, return_sequences = True))
    model.add(Dropout(0.2))

    model.add(LSTM(units = 50))
    model.add(Dropout(0.2))

    model.add(Dense(units = 1))

    model.compile(optimizer = 'adam', loss = 'mean_squared_error') 
    model.fit(X_train, Y_train, epochs = 100, batch_size = 64)
    y_pred24 = model.predict(X_test[-24:])
    req = scaler.inverse_transform(y_pred24)
    return req

In [23]:
## Fucntion for model prediction for each pollutant
def model_per_pollutant(df):
    # Split the data into training and testing sets
    train_size = int(len(df) * 0.8)
    train_data = df.iloc[:train_size]
    test_data = df.iloc[train_size:]
    ## Normalizing the data using MinMax Scaler
    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(train_data)
    test_scaled = scaler.transform(test_data)
    # Define the number of previous time steps to use for each prediction
    n_steps = 24
    X_train, y_train = create_sequences(train_scaled, n_steps)
    X_test, y_test = create_sequences(test_scaled, n_steps)

    # Define the LSTM model architecture
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(n_steps, 1)))
    model.add(LSTM(units=50))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train, y_train, epochs=100, batch_size=64)
    y_pred24 = model.predict(X_test[-24:])
    req = scaler.inverse_transform(y_pred24)
    return req

In [35]:
## Creating the final required df
column_names = df_sorted.columns.tolist()
import pandas as pd
import datetime
#Create dataframe with 24 rows and 0 columns

date_time_index = pd.date_range(start='00:00:00', end='23:00:00', freq='1H')
final = pd.DataFrame(index=date_time_index, columns=column_names)
data = df_sorted.iloc[:,i]
temp = pd.DataFrame({'value': data})
final_mul = model_multivariate(temp)

X_train (1583, 24, 1)
X_test (6426, 24, 1)
Y_train (1583, 1)
Y_test (6426, 1)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72

In [38]:
final_mul[:,0:1].shape

(24, 1)

In [39]:
final_mul

array([[ 6.6966786 ],
       [ 7.8802047 ],
       [ 7.6067657 ],
       [ 8.499366  ],
       [ 7.8416686 ],
       [ 7.8578286 ],
       [13.242014  ],
       [ 5.500723  ],
       [ 5.45416   ],
       [ 4.663791  ],
       [ 4.3787217 ],
       [ 0.9873174 ],
       [ 1.6363006 ],
       [ 4.4173503 ],
       [-0.13570778],
       [ 2.5117126 ],
       [ 3.4908936 ],
       [ 2.5802395 ],
       [ 1.7643235 ],
       [ 2.661047  ],
       [ 2.0660582 ],
       [ 4.229162  ],
       [ 7.298787  ],
       [ 9.218445  ]], dtype=float32)

In [33]:
## Creating the final required df
column_names = df_sorted.columns.tolist()
import pandas as pd
import datetime
#Create dataframe with 24 rows and 0 columns

date_time_index = pd.date_range(start='00:00:00', end='23:00:00', freq='1H')
final = pd.DataFrame(index=date_time_index, columns=column_names)
for i in range (0,df_sorted.shape[1]):
    data = df_sorted.iloc[:,i]
    temp = pd.DataFrame({'value': data})
    final.iloc[:,i] = model_multivariate(temp)

X_train (1583, 24, 1)
X_test (6426, 24, 1)
Y_train (1583, 1)
Y_test (6426, 1)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72

In [34]:
final

Unnamed: 0,OZONE,PM10,PM2.5
2023-04-22 00:00:00,13.754711,8.982903,6.726343
2023-04-22 01:00:00,11.70812,9.60614,8.020169
2023-04-22 02:00:00,8.127414,10.534348,7.828309
2023-04-22 03:00:00,8.035173,11.267755,8.765796
2023-04-22 04:00:00,7.148194,9.027455,8.186937
2023-04-22 05:00:00,6.063726,11.010241,8.227454
2023-04-22 06:00:00,5.192498,9.020339,13.601071
2023-04-22 07:00:00,8.562148,8.164997,6.100686
2023-04-22 08:00:00,12.174021,6.28904,5.850148
2023-04-22 09:00:00,8.76031,6.82001,5.050403


In [31]:
df_sorted.tail(23)

parameter name,OZONE,PM10,PM2.5
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-04-19 00:00:00,46.0,26.0,5.0
2023-04-19 01:00:00,44.0,5.0,3.0
2023-04-19 02:00:00,41.0,8.0,3.0
2023-04-19 03:00:00,36.0,9.0,3.0
2023-04-19 04:00:00,30.0,9.0,2.0
2023-04-19 05:00:00,29.0,9.0,4.0
2023-04-19 06:00:00,26.0,9.0,4.0
2023-04-19 07:00:00,20.0,10.0,7.0
2023-04-19 08:00:00,17.0,11.0,7.0
2023-04-19 09:00:00,14.0,8.0,7.0


## Multivariate for LSTM Trial

In [40]:
df_sorted

parameter name,OZONE,PM10,PM2.5
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-04-19 00:00:00,46.0,8.0,3.0
2022-04-19 01:00:00,45.0,59.0,3.0
2022-04-19 02:00:00,39.0,9.0,6.0
2022-04-19 03:00:00,34.0,6.0,5.0
2022-04-19 04:00:00,30.0,6.0,4.0
...,...,...,...
2023-04-19 19:00:00,37.0,0.0,1.0
2023-04-19 20:00:00,39.0,2.0,0.0
2023-04-19 21:00:00,38.0,4.0,-1.0
2023-04-19 22:00:00,40.0,6.0,0.0


In [None]:
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
	X, y = list(), list()
	for i in range(len(sequences)):
		# find the end of this pattern
		end_ix = i + n_steps
		# check if we are beyond the dataset
		if end_ix > len(sequences)-1:
			break
		# gather input and output parts of the pattern
		seq_x, seq_y = sequences[i:end_ix, :], sequences[end_ix, :]
		X.append(seq_x)
		y.append(seq_y)
	return np.array(X), np.array(y)

In [101]:
## Creating the final required df

import pandas as pd
import datetime
def lstm_multivariate(df):
    values = df.values
    column_names = df.columns.tolist()
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(values)
    n_steps = 24
    X, y = split_sequences(scaled_data, n_steps)
    n_features = X.shape[2]
    # Split the data into training and testing sets
    train_size = int(len(df_sorted) * 0.8)
    X_train, X_test = X[train_size:,] , X[:train_size,] 
    # print('X_train' ,X_train.shape)
    # print('X_test' ,X_test.shape)
    Y_train, Y_test = y[train_size:,] , y[:train_size,]
    # print('Y_train' ,Y_train.shape)
    # print('Y_test' ,Y_test.shape)
    n_features = X.shape[2]
    # define model
    model = Sequential()
    model.add(LSTM(100, activation='relu', return_sequences=True, input_shape=(n_steps, n_features)))
    model.add(LSTM(100, activation='relu'))
    model.add(Dense(n_features))
    model.compile(optimizer='adam', loss='mse')
    # fit model
    model.fit(X_train, Y_train, epochs=45, verbose =1 )
    # demonstrate prediction
    x_input = X[-24:]
    x_input = x_input.reshape((24, n_steps, n_features))
    yhat = model.predict(x_input, verbose=0)
    req = scaler.inverse_transform(yhat)
    date_time_index = pd.date_range(start='00:00:00', end='23:00:00', freq='1H')
    final = pd.DataFrame(index=date_time_index, columns=column_names, data = req)
    return final


In [45]:
values = df_sorted.values
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(values)
scaled_data

array([[0.50549451, 0.03832753, 0.03184713],
       [0.49450549, 0.21602787, 0.03184713],
       [0.42857143, 0.04181185, 0.05095541],
       ...,
       [0.41758242, 0.02439024, 0.00636943],
       [0.43956044, 0.03135889, 0.01273885],
       [0.45054945, 0.02787456, 0.00636943]])

In [42]:
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
	X, y = list(), list()
	for i in range(len(sequences)):
		# find the end of this pattern
		end_ix = i + n_steps
		# check if we are beyond the dataset
		if end_ix > len(sequences)-1:
			break
		# gather input and output parts of the pattern
		seq_x, seq_y = sequences[i:end_ix, :], sequences[end_ix, :]
		X.append(seq_x)
		y.append(seq_y)
	return np.array(X), np.array(y)

In [46]:
n_steps = 24
X, y = split_sequences(scaled_data, n_steps)
print(X.shape, y.shape)
# summarize the data
for i in range(len(X)):
 print(X[i], y[i])

(8009, 24, 3) (8009, 3)
[[0.50549451 0.03832753 0.03184713]
 [0.49450549 0.21602787 0.03184713]
 [0.42857143 0.04181185 0.05095541]
 [0.37362637 0.03135889 0.04458599]
 [0.32967033 0.03135889 0.03821656]
 [0.31868132 0.02439024 0.03184713]
 [0.25274725 0.03484321 0.03184713]
 [0.20879121 0.04878049 0.03184713]
 [0.18681319 0.03832753 0.06369427]
 [0.14285714 0.02787456 0.05732484]
 [0.13186813 0.02439024 0.07006369]
 [0.14285714 0.03135889 0.05095541]
 [0.07692308 0.02787456 0.03184713]
 [0.2967033  0.02090592 0.02547771]
 [0.35164835 0.01393728 0.01273885]
 [0.42857143 0.00348432 0.00636943]
 [0.45054945 0.         0.00636943]
 [0.47252747 0.00696864 0.        ]
 [0.49450549 0.01045296 0.00636943]
 [0.50549451 0.01045296 0.01910828]
 [0.51648352 0.01045296 0.03184713]
 [0.50549451 0.01045296 0.03821656]
 [0.40659341 0.02090592 0.02547771]
 [0.30769231 0.02787456 0.04458599]] [0.30769231 0.03484321 0.03821656]
[[0.49450549 0.21602787 0.03184713]
 [0.42857143 0.04181185 0.05095541]
 [0.

In [47]:
n_features = X.shape[2]

In [48]:
n_features

3

In [49]:
# Split the data into training and testing sets
train_size = int(len(df_sorted) * 0.8)
X_train, X_test = X[train_size:,] , X[:train_size,] 
print('X_train' ,X_train.shape)
print('X_test' ,X_test.shape)
Y_train, Y_test = y[train_size:,] , y[:train_size,]
print('Y_train' ,Y_train.shape)
print('Y_test' ,Y_test.shape)

X_train (1583, 24, 3)
X_test (6426, 24, 3)
Y_train (1583, 3)
Y_test (6426, 3)


In [50]:
X[0].size

72

In [95]:
n_features = X.shape[2]
# define model
model = Sequential()
model.add(LSTM(100, activation='relu', return_sequences=True, input_shape=(n_steps, n_features)))
model.add(LSTM(100, activation='relu'))
model.add(Dense(n_features))
model.compile(optimizer='adam', loss='mse')
# fit model
model.fit(X_train, Y_train, epochs=30, verbose =1 )


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x22176b13890>

In [96]:
type(X)

numpy.ndarray

In [102]:
# demonstrate prediction
x_input = X[-24:]
x_input = x_input.reshape((24, n_steps, n_features))
yhat = model.predict(x_input, verbose=0)
print(yhat)

[[0.4003926  0.0264542  0.02095686]
 [0.4244967  0.0254151  0.02045559]
 [0.42700624 0.02669998 0.02137808]
 [0.4611526  0.0255873  0.01889857]
 [0.47254264 0.02988023 0.02262569]
 [0.5242544  0.03037697 0.02260456]
 [0.48731685 0.03948269 0.03384154]
 [0.4699505  0.04259711 0.03909778]
 [0.41698664 0.04837867 0.04750183]
 [0.38426274 0.04998185 0.05223305]
 [0.37104577 0.04544019 0.04992602]
 [0.3785908  0.03744901 0.04205777]
 [0.35166937 0.03488766 0.0408328 ]
 [0.34355026 0.0300221  0.03666275]
 [0.34322655 0.02702407 0.03482261]
 [0.34594816 0.02361698 0.03132187]
 [0.34925795 0.02041243 0.0271453 ]
 [0.3666408  0.01693646 0.02193834]
 [0.2086635  0.02968018 0.03673846]
 [0.20665348 0.02284907 0.02951769]
 [0.41447926 0.00392096 0.00718134]
 [0.40490878 0.01553722 0.01600035]
 [0.47792    0.01309831 0.00942446]
 [0.477382   0.0181471  0.01156462]]


In [103]:
req = scaler.inverse_transform(yhat)

In [104]:
req

array([[36.435726  ,  4.5923543 ,  1.2902262 ],
       [38.6292    ,  4.294133  ,  1.2115272 ],
       [38.857567  ,  4.6628933 ,  1.3563578 ],
       [41.96489   ,  4.343555  ,  0.96707475],
       [43.00138   ,  5.575626  ,  1.5522331 ],
       [47.70715   ,  5.7181916 ,  1.5489156 ],
       [44.345833  ,  8.3315325 ,  3.3131223 ],
       [42.765495  ,  9.22537   ,  4.138351  ],
       [37.945786  , 10.884678  ,  5.4577875 ],
       [34.96791   , 11.344791  ,  6.2005887 ],
       [33.765163  , 10.041335  ,  5.838385  ],
       [34.451763  ,  7.7478647 ,  4.6030693 ],
       [32.00191   ,  7.0127573 ,  4.410749  ],
       [31.263075  ,  5.616343  ,  3.7560518 ],
       [31.233616  ,  4.7559075 ,  3.467149  ],
       [31.481283  ,  3.7780735 ,  2.9175334 ],
       [31.782473  ,  2.8583665 ,  2.2618127 ],
       [33.364315  ,  1.8607637 ,  1.4443189 ],
       [18.988379  ,  5.518212  ,  3.7679386 ],
       [18.805466  ,  3.5576837 ,  2.6342769 ],
       [37.717613  , -1.8746839 , -0.872

In [118]:
date_time_index = pd.date_range(start='00:00:00', end='23:00:00', freq='1H')
final = pd.DataFrame(index=date_time_index, columns=column_names, data = req)

[Timestamp('2023-04-22 00:00:00'), Timestamp('2023-04-22 01:00:00'), Timestamp('2023-04-22 02:00:00'), Timestamp('2023-04-22 03:00:00'), Timestamp('2023-04-22 04:00:00'), Timestamp('2023-04-22 05:00:00'), Timestamp('2023-04-22 06:00:00'), Timestamp('2023-04-22 07:00:00'), Timestamp('2023-04-22 08:00:00'), Timestamp('2023-04-22 09:00:00'), Timestamp('2023-04-22 10:00:00'), Timestamp('2023-04-22 11:00:00'), Timestamp('2023-04-22 12:00:00'), Timestamp('2023-04-22 13:00:00'), Timestamp('2023-04-22 14:00:00'), Timestamp('2023-04-22 15:00:00'), Timestamp('2023-04-22 16:00:00'), Timestamp('2023-04-22 17:00:00'), Timestamp('2023-04-22 18:00:00'), Timestamp('2023-04-22 19:00:00'), Timestamp('2023-04-22 20:00:00'), Timestamp('2023-04-22 21:00:00'), Timestamp('2023-04-22 22:00:00'), Timestamp('2023-04-22 23:00:00')]


In [94]:
final

Unnamed: 0,OZONE,PM10,PM2.5
2023-04-22 00:00:00,43.612583,5.964577,1.903429
2023-04-22 01:00:00,44.496773,6.585727,1.850311
2023-04-22 02:00:00,44.758114,9.204762,2.986109
2023-04-22 03:00:00,41.597118,10.176509,3.658944
2023-04-22 04:00:00,37.150036,10.643082,4.541256
2023-04-22 05:00:00,31.308853,10.760278,5.517965
2023-04-22 06:00:00,25.196148,10.33025,6.265985
2023-04-22 07:00:00,26.337433,8.734379,5.876341
2023-04-22 08:00:00,22.633341,8.076459,6.312825
2023-04-22 09:00:00,16.399853,8.876628,7.157352


In [106]:
len(final)

24

In [114]:
final.loc[1,'OZONE']

KeyError: 1

In [107]:
column_names

['OZONE', 'PM10', 'PM2.5']

In [138]:
# Define ozone AQI breakpoints
ozone_breakpoints = {
    0: [0, 50],
    1: [51, 100],
    2: [101, 150],
    3: [151, 200],
    4: [201, 300],
    5: [301, 500]
}

# Calculate AQI for ozone
def calculate_ozone_aqi(ozone_ppm):
    aqi = 0
    for bp in ozone_breakpoints:
        if ozone_ppm >= ozone_breakpoints[bp][0] and ozone_ppm <= ozone_breakpoints[bp][1]:
            aqi = bp
    return aqi

print(calculate_ozone_aqi(43.6))

0


In [139]:
import aqi
myaqi = aqi.to_aqi([
    (aqi.POLLUTANT_PM25, '12'),
    (aqi.POLLUTANT_PM10, '24'),
    (aqi.POLLUTANT_O3_8H, '0.087')
])

In [140]:
myaqi

Decimal('129')

In [128]:
import aqi
if 'OZONE' in column_names:
    for index, row in final.iterrows():
        final.loc[index, 'OZONE'] = calculate_ozone_aqi(final.loc[index, 'OZONE'])
if 'SO2' in column_names:
    for index, row in final.iterrows():
        final.loc[index, 'SO2'] = aqi.to_iaqi(aqi.POLLUTANT_SO2_1H, final.loc[index, 'SO2'], algo=aqi.ALGO_EPA)
if 'CO' in column_names:
    for index, row in final.iterrows():
        final.loc[index, 'CO'] = aqi.to_iaqi(aqi.POLLUTANT_CO_8H, final.loc[index, 'CO'], algo=aqi.ALGO_EPA)
if 'PM10' in column_names:
    for index, row in final.iterrows():
        final.loc[index, 'PM10'] = aqi.to_iaqi(aqi.POLLUTANT_PM10, final.loc[index, 'PM10'], algo=aqi.ALGO_EPA)
if 'PM2.5' in column_names:
    for index, row in final.iterrows():
        final.loc[index, 'PM2.5'] = aqi.to_iaqi(aqi.POLLUTANT_PM25, final.loc[index, 'PM2.5'], algo=aqi.ALGO_EPA)
if 'NO2' in column_names:
    for index, row in final.iterrows():
        final.loc[index, 'NO2'] = aqi.to_iaqi(aqi.POLLUTANT_NO2_1H, final.loc[index, 'NO2'], algo=aqi.ALGO_EPA)

TypeError: unsupported operand type(s) for -: 'NoneType' and 'NoneType'

In [105]:
df_sorted.tail(47)

parameter name,OZONE,PM10,PM2.5
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-04-17 20:00:00,38.0,7.0,1.0
2023-04-17 21:00:00,41.0,12.0,1.0
2023-04-17 22:00:00,43.0,19.0,3.0
2023-04-17 23:00:00,48.0,16.0,6.0
2023-04-18 00:00:00,47.0,21.0,5.0
2023-04-18 01:00:00,46.0,14.0,5.0
2023-04-18 02:00:00,42.0,16.0,5.0
2023-04-18 03:00:00,39.0,17.0,6.0
2023-04-18 04:00:00,37.0,11.0,5.0
2023-04-18 05:00:00,37.0,8.0,3.0


In [73]:
df_sorted.tail(23)

parameter name,OZONE,PM10,PM2.5
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-04-19 00:00:00,46.0,26.0,5.0
2023-04-19 01:00:00,44.0,5.0,3.0
2023-04-19 02:00:00,41.0,8.0,3.0
2023-04-19 03:00:00,36.0,9.0,3.0
2023-04-19 04:00:00,30.0,9.0,2.0
2023-04-19 05:00:00,29.0,9.0,4.0
2023-04-19 06:00:00,26.0,9.0,4.0
2023-04-19 07:00:00,20.0,10.0,7.0
2023-04-19 08:00:00,17.0,11.0,7.0
2023-04-19 09:00:00,14.0,8.0,7.0
