In [None]:
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                        normalize=False,
                        title='Confusion matrix',
                        cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix. Rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels; one tick is placed per entry on both axes.
    normalize : bool, default False
        When True every row is divided by its row sum before printing and
        plotting. Rows whose sum is 0 are left as all zeros instead of
        turning into NaN.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The figure is drawn on the current pyplot axes.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having been executed first.
    import itertools
    import numpy as np

    # Normalize BEFORE drawing so the image/colorbar show the same numbers
    # that are printed and written into the cells.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        row_totals[row_totals == 0] = 1  # avoid 0/0 for classes with no samples
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are shown as-is; anything else gets two decimals.
    fmt = 'd' if cm.dtype.kind in 'iu' else '.2f'

    # Cells darker than the midpoint get white text so the number stays legible.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
            horizontalalignment="center",
            color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so the axis labels are included in the layout computation.
    plt.tight_layout()

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import random

import keras
from keras.models import Sequential
from keras.layers import Dense

from sklearn.compose import ColumnTransformer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

import category_encoders as ce


import itertools


def ann_main(df, predays, hl_count, hl_nodes, batch_size, epochs):
    """Train one dense network on lagged weather features and score it.

    The feature matrix holds the same-row 'temp', 'humidity', 'precip' and
    'windspeed' values followed by those four columns shifted down by
    1..predays rows (rows without enough history are filled with 0). The
    target is the 'aqi' column; it is fed to a sigmoid / binary-crossentropy
    model, so it is expected to already be 0/1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'temp', 'humidity', 'precip', 'windspeed', 'Season' and
        'aqi'. Rows are presumably in chronological order so that a row shift
        means "previous day" -- TODO confirm against the data preparation.
    predays : int
        Number of lagged copies of the weather columns to append (0 = none).
    hl_count : int
        Number of Dense layers added AFTER the first Dense layer, i.e. the
        network has hl_count + 1 relu layers of width hl_nodes.
    hl_nodes : int
        Units in every relu layer.
    batch_size : int
        Batch size passed to `fit`.
    epochs : int
        Number of training epochs passed to `fit`.

    Returns
    -------
    tuple
        (test accuracy, test F1 score) -- in that order.
    """
    weather_cols = ['temp', 'humidity', 'precip', 'windspeed']

    # This function is called once per grid point; drop the Keras state left
    # behind by earlier models so memory does not grow over a long sweep.
    keras.backend.clear_session()

    # Keep a handle on the raw weather columns before `df` is rebound below.
    raw_weather = df[weather_cols]

    # NOTE(review): the one-hot Season_* columns created here are never
    # selected into X, so Season currently has no influence on the model.
    # The step is kept unchanged; either add those columns to the features
    # or remove the encoding.
    encoder = ce.OneHotEncoder(cols='Season',handle_unknown='return_nan', return_df = True, use_cat_names=True)
    df = encoder.fit_transform(df)

    # Same-row features first, then one block per lag. shift(lag) with
    # fill_value=0 equals shifting by one row `lag` times.
    feature_blocks = [df[weather_cols].values]
    for lag in range(1, predays + 1):
        feature_blocks.append(raw_weather.shift(periods=lag, fill_value=0).values)
    X = np.concatenate(feature_blocks, axis=1)

    y = df['aqi'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

    # Fit the scaler on the training split only, then reuse it for the test split.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    ann_input_size = X.shape[1]

    ann = keras.Sequential()
    ann.add(Dense(hl_nodes, activation = 'relu', input_dim = ann_input_size))
    for _ in range(hl_count):
        ann.add(Dense(hl_nodes, activation = 'relu'))
    ann.add(Dense(1, activation = 'sigmoid'))

    ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

    ann.fit(X_train, y_train, batch_size = batch_size, epochs = epochs)

    # evaluate() returns [loss, accuracy]; accuracy is a fraction in [0, 1],
    # so it is formatted as a percentage rather than printed next to a bare '%'.
    tr_scores = ann.evaluate(X_train, y_train, verbose = 0)
    print("Acc on train data: {:.2%} \n error on train: {}".format(tr_scores[1], (1-tr_scores[1])))

    te_scores = ann.evaluate(X_test, y_test, verbose = 0)
    print("Acc on test data: {:.2%} \n error on test: {}".format(te_scores[1], 1-te_scores[1]))

    # Threshold the sigmoid output at 0.5 and flatten (n, 1) -> (n,) so the
    # sklearn metrics receive plain 1-D label arrays.
    te_pred = (ann.predict(X_test) > 0.5).astype(int).ravel()
    cm = confusion_matrix(y_true=y_test, y_pred=te_pred)

    # Plotting is left disabled; enable it to inspect a single configuration.
    #plot_confusion_matrix(cm=cm, classes=['good_aqi','bad_aqi'], title='Confusion Matrix')

    f1score = f1_score(y_true=y_test, y_pred=te_pred)
    print("f1Score: {}".format(f1score))

    return (te_scores[1], f1score)

   
    
def ann_run(filename, prevdays, hl_count_max, hl_nodes_min, hl_nodes_max, batch_min, batch_max, ep_min, ep_max):
    """Grid-search `ann_main` over lag days, depth, width, batch size and epochs.

    The CSV at `filename` is loaded, its 'aqi' column is binarized
    (1 where aqi >= 25.0, else 0), and `ann_main` is trained once for every
    combination of:

    * previous days : 0 .. prevdays
    * hl_count      : 1 .. hl_count_max
    * hl_nodes      : hl_nodes_min .. hl_nodes_max
    * batch size    : batch_min .. batch_max
    * epochs        : ep_min .. ep_max

    (all ranges inclusive).

    Returns
    -------
    pandas.DataFrame
        One row per combination with columns 'previous_days', 'hl_count',
        'hl_nodes', 'batch_size', 'epochs', 'f1_score', 'accuracy'.
    """
    df = pd.read_csv(filename)

    # Turn the numeric AQI into the binary target expected by ann_main.
    df["aqi"] = (df['aqi'] >= 25.0).astype(int)

    result_columns = ['previous_days','hl_count', 'hl_nodes','batch_size', 'epochs', 'f1_score', 'accuracy']

    # Same iteration order as five nested for-loops, outermost first.
    grid = itertools.product(
        range(prevdays+1),
        range(1, hl_count_max+1),
        range(hl_nodes_min, hl_nodes_max+1),
        range(batch_min, batch_max+1),
        range(ep_min, ep_max+1),
    )

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copies the whole frame on every call.
    records = []
    for predays, hl_count, hl_nodes, batch, ep in grid:
        # ann_main returns (accuracy, f1) -- unpack in that order so the
        # 'accuracy' and 'f1_score' columns are not swapped.
        acc, f1Score = ann_main(df = df, predays = predays, hl_count = hl_count, hl_nodes = hl_nodes,batch_size = batch, epochs = ep)
        records.append({'previous_days':predays, 'hl_count':hl_count, 'hl_nodes':hl_nodes, 'batch_size':batch, 'epochs':ep, 'f1_score':f1Score, 'accuracy':acc})

    # Passing `columns` keeps the expected header even when the grid is empty.
    df_res = pd.DataFrame(records, columns=result_columns)

    print("ANN run done")

    return (df_res)
    

    

In [None]:
# Search grid for the Data_Prep_LA dataset. ann_run treats every bound as
# inclusive: 6 lag settings (0..5) x 5 depths x 5 widths x 1 batch size x
# 1 epoch count = 150 trainings.
prevdays, hl_count_max = 5, 5
hl_nodes_min, hl_nodes_max = 8, 12
batch_min, batch_max = 10, 10
ep_min, ep_max = 10, 10

# The results are only printed here; this cell does not write a CSV.
df_res = ann_run(
    filename="../Data_Prep_LA/WeatherAndPollution_2011_2020.csv",
    prevdays=prevdays,
    hl_count_max=hl_count_max,
    hl_nodes_min=hl_nodes_min,
    hl_nodes_max=hl_nodes_max,
    batch_min=batch_min,
    batch_max=batch_max,
    ep_min=ep_min,
    ep_max=ep_max,
)
print(df_res)

In [None]:
# Search grid for the Data_Prep_NY dataset. ann_run treats every bound as
# inclusive: 6 lag settings (0..5) x 5 depths x 5 widths x 1 batch size x
# 1 epoch count = 150 trainings.
prevdays, hl_count_max = 5, 5
hl_nodes_min, hl_nodes_max = 8, 12
batch_min, batch_max = 10, 10
ep_min, ep_max = 10, 10

df_res = ann_run(
    filename="../Data_Prep_NY/WeatherAndPollution_2011_2020.csv",
    prevdays=prevdays,
    hl_count_max=hl_count_max,
    hl_nodes_min=hl_nodes_min,
    hl_nodes_max=hl_nodes_max,
    batch_min=batch_min,
    batch_max=batch_max,
    ep_min=ep_min,
    ep_max=ep_max,
)
# Persist the per-configuration scores, without the DataFrame index column.
df_res.to_csv("NY_Results_20230129.csv", index=False)
print('done')

In [None]:
# Search grid for the Data_Prep_FL dataset. ann_run treats every bound as
# inclusive: 6 lag settings (0..5) x 5 depths x 5 widths x 1 batch size x
# 1 epoch count = 150 trainings.
prevdays, hl_count_max = 5, 5
hl_nodes_min, hl_nodes_max = 8, 12
batch_min, batch_max = 10, 10
ep_min, ep_max = 10, 10

df_res = ann_run(
    filename="../Data_Prep_FL/WeatherAndPollution_2011_2020.csv",
    prevdays=prevdays,
    hl_count_max=hl_count_max,
    hl_nodes_min=hl_nodes_min,
    hl_nodes_max=hl_nodes_max,
    batch_min=batch_min,
    batch_max=batch_max,
    ep_min=ep_min,
    ep_max=ep_max,
)
# Persist the per-configuration scores, without the DataFrame index column.
df_res.to_csv("FL_Results_20230129.csv", index=False)
print('done')

In [None]:
# Search grid for the Data_Prep_SF dataset. ann_run treats every bound as
# inclusive: 6 lag settings (0..5) x 5 depths x 5 widths x 1 batch size x
# 1 epoch count = 150 trainings.
prevdays, hl_count_max = 5, 5
hl_nodes_min, hl_nodes_max = 8, 12
batch_min, batch_max = 10, 10
ep_min, ep_max = 10, 10

df_res = ann_run(
    filename="../Data_Prep_SF/WeatherAndPollution_2011_2020.csv",
    prevdays=prevdays,
    hl_count_max=hl_count_max,
    hl_nodes_min=hl_nodes_min,
    hl_nodes_max=hl_nodes_max,
    batch_min=batch_min,
    batch_max=batch_max,
    ep_min=ep_min,
    ep_max=ep_max,
)
# Persist the per-configuration scores, without the DataFrame index column.
df_res.to_csv("SF_Results_20230129.csv", index=False)
print("done")
