
# **Loading dataset and removing unwanted columns**

In [None]:
import pandas as pd

# Load the CSV file into a pandas dataframe
df_train = pd.read_csv("Air_Quality.csv")


# Remove the "Start_date" and "unique id" columns
df_train = df_train.drop(["Start_Date", "Unique ID", "Geo Join ID", "Message", "Time Period"], axis=1)

# **Dropping independent variable**

In [92]:
df_train = df_train.sample(frac=1).reset_index(drop=True)
# Split the training data into features and labels
X_train = df_train.drop(columns=['Data Value'])
y_train = df_train['Data Value']
X_train.head()

Unnamed: 0,Indicator ID,Name,Measure,Measure Info,Geo Type Name,Geo Place Name
0,365,Fine Particulate Matter (PM2.5),Mean,mcg per cubic meter,CD,Sheepshead Bay (CD15)
1,641,Boiler Emissions- Total PM2.5 Emissions,Number per km2,number,UHF42,Flushing - Clearview
2,375,Nitrogen Dioxide (NO2),Mean,ppb,UHF34,Southwest Queens
3,375,Nitrogen Dioxide (NO2),Mean,ppb,UHF34,Bensonhurst - Bay Ridge
4,386,Ozone (O3),Mean,ppb,CD,Central Harlem (CD10)


# **Checking missing columns**

In [93]:
# Count the number of missing values in each column
missing_values = X_train.isnull().sum()

# Print the result
print(missing_values)

Indicator ID      0
Name              0
Measure           0
Measure Info      0
Geo Type Name     0
Geo Place Name    0
dtype: int64


# **Checking if any rows have same attributes**

In [94]:

# Check if any rows have all the same values
mask = (df_train.nunique(axis=1) == 1)
result = df_train[mask]

# Print the rows that have all the same values
print(result)

Empty DataFrame
Columns: [Indicator ID, Name, Measure, Measure Info, Geo Type Name, Geo Place Name, Data Value]
Index: []


# **Performing one hot encoding on categorical values**

In [95]:

# Perform one hot encoding on all columns
X_train = pd.get_dummies(X_train)
X_train.head()
# Print the encoded dataframe
print(X_train)

       Indicator ID  \
0               365   
1               641   
2               375   
3               375   
4               386   
...             ...   
16117           365   
16118           375   
16119           365   
16120           653   
16121           375   

       Name_Air Toxics Concentrations- Average Benzene Concentrations  \
0                                                      0                
1                                                      0                
2                                                      0                
3                                                      0                
4                                                      0                
...                                                  ...                
16117                                                  0                
16118                                                  0                
16119                                                  0          

# **Performing one hot encoding of column "Indicator ID"**

In [96]:
df_encoded = pd.get_dummies(X_train['Indicator ID'], prefix='Indicator ID')

# Concatenate the encoded columns with the original dataframe
X_train = pd.concat([X_train, df_encoded], axis=1)

# Drop the original unique id column
X_train = X_train.drop('Indicator ID', axis=1)

# Print the new dataframe
print(X_train)

       Name_Air Toxics Concentrations- Average Benzene Concentrations  \
0                                                      0                
1                                                      0                
2                                                      0                
3                                                      0                
4                                                      0                
...                                                  ...                
16117                                                  0                
16118                                                  0                
16119                                                  0                
16120                                                  0                
16121                                                  0                

       Name_Air Toxics Concentrations- Average Formaldehyde Concentrations  \
0                                            

In [105]:
# Display the unique values in a column
unique_values = df['Name'].unique()
print(unique_values)

['Ozone (O3)' 'Sulfur Dioxide (SO2)' 'PM2.5-Attributable Deaths'
 'Boiler Emissions- Total SO2 Emissions'
 'Boiler Emissions- Total PM2.5 Emissions'
 'Boiler Emissions- Total NOx Emissions'
 'Air Toxics Concentrations- Average Benzene Concentrations'
 'Air Toxics Concentrations- Average Formaldehyde Concentrations'
 'PM2.5-Attributable Asthma Emergency Department Visits'
 'PM2.5-Attributable Respiratory Hospitalizations (Adults 20 Yrs and Older)'
 'PM2.5-Attributable Cardiovascular Hospitalizations (Adults 40 Yrs and Older)'
 'Traffic Density- Annual Vehicle Miles Traveled'
 'O3-Attributable Cardiac and Respiratory Deaths'
 'O3-Attributable Asthma Emergency Department Visits'
 'O3-Attributable Asthma Hospitalizations'
 'Traffic Density- Annual Vehicle Miles Traveled for Cars'
 'Traffic Density- Annual Vehicle Miles Traveled for Trucks'
 'Nitrogen Dioxide (NO2)' 'Fine Particulate Matter (PM2.5)']


# **Checking data type of column values**

In [97]:
np.asarray(X_train)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

# **Converting data frame to Numpy array**

In [98]:
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

# **Checking whether dataset is linear or  non linear**

In [104]:
from statsmodels.stats.stattools import durbin_watson
from statsmodels.regression.linear_model import OLS
# Fit a linear regression model to the data
model = OLS(y_train, X_train).fit()

# Perform the Durbin-Watson test
dw_stat = durbin_watson(model.resid)

# Print the Durbin-Watson statistic
print("Durbin-Watson statistic: ", dw_stat)

# Check for autocorrelation
if dw_stat < 2 or dw_stat > 2:
    print("There is autocorrelation in the data, the dataset is likely non-linear")
else:
    print("There is no autocorrelation in the data, the dataset is likely linear")

Durbin-Watson statistic:  1.9975868543237028
There is autocorrelation in the data, the dataset is likely non-linear


# **Training using Gradient Boosting Regressor**

In [100]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.preprocessing import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam
import keras.backend as K
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))



# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)




# Define the model
model = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, max_depth=3, random_state=2)

# Fit the model to the training data
model.fit(X_train, y_train)


# Make predictions on the validation set
predictions = model.predict(X_val)

# Calculate the mean squared error
mse = mean_squared_error(y_val, predictions)

# Take the square root of the mean squared error to get the root mean squared error
rmse = np.sqrt(mse)
print("RMSE:", rmse)
# Calculate the R-squared score
r2 = r2_score(y_val, predictions)

# Print the R-squared score
print("R-squared:", r2)


RMSE: 7.048064962657481
R-squared: 0.8750949799084894


# **Training using Sequential Regressor**

In [101]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.preprocessing import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsRegressor
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam
import keras.backend as K
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
import matplotlib.pyplot as plt
from sklearn.ensemble import *
from keras.models import *
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))



# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)




# Create a Sequential model object
model = Sequential()

# Add layers to the model
model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(units=1))
# Set the learning rate hyperparameter
learning_rate = 0.001

# Compile the model with the specified learning rate
optimizer = Adam(learning_rate=learning_rate)
model.compile(loss='mean_squared_error', optimizer=optimizer)

# Train the model on the training data
model.fit(X_train, y_train, epochs=500, batch_size=32, verbose=0)


# Make predictions on the validation set
predictions = model.predict(X_val)

# Calculate the mean squared error
mse = mean_squared_error(y_val, predictions)

# Take the square root of the mean squared error to get the root mean squared error
rmse = np.sqrt(mse)
print("RMSE:", rmse)
# Calculate the R-squared score
r2 = r2_score(y_val, predictions)

# Print the R-squared score
print("R-squared:", r2)

RMSE: 6.45108247409826
R-squared: 0.9020151540972329
