<a href="https://colab.research.google.com/github/Ditsuhi/Nitrogen_Dioxide_Prediction/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import all required libraries

import zipfile
from glob import glob
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from keras import layers
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from scipy.interpolate import NearestNDInterpolator
from keras.models import Sequential
from keras.layers import ConvLSTM2D, BatchNormalization
from keras.layers import Bidirectional
from keras.layers import  Conv2D 
from time import time
from sklearn.preprocessing import OneHotEncoder

In [None]:
# To calculate nearest neighbor interpolation for meteorological data

def CalcNNvalue(array_interpolate):
  """Fill the zero cells of a grid with the value of the nearest non-zero cell.

  The input is cast to float. Every cell equal to 0 is treated as missing and
  is overwritten with the value returned by a SciPy ``NearestNDInterpolator``
  built from the positions and values of all cells that are not 0.

  Returns the filled grid as a (nested) Python list.

  NOTE(review): what happens when the grid has no non-zero cell at all is
  decided by SciPy; the caller in this file catches IndexError — confirm
  against the installed SciPy version.
  """
  grid = array_interpolate.astype(float)
  missing_mask = grid == 0
  known_mask = grid != 0

  # Index positions act as the coordinates for the nearest-neighbour lookup.
  interpolator = NearestNDInterpolator(np.argwhere(known_mask), grid[known_mask])
  grid[missing_mask] = interpolator(np.argwhere(missing_mask))
  return grid.tolist()


def calc_NN_fullData(full_data):
  """Apply ``CalcNNvalue`` to every grid in ``full_data``.

  Each item is interpolated independently. When ``CalcNNvalue`` raises
  IndexError for an item, that item is kept unchanged (converted with
  ``tolist()``) instead of aborting the whole run.

  Returns a list with one entry per item of ``full_data``.
  """
  filled_grids = []
  for grid in full_data:
    try:
      filled = CalcNNvalue(grid)
    except IndexError:
      # Interpolation was not possible for this grid; keep the raw values.
      filled = grid.tolist()
    filled_grids.append(filled)
  return filled_grids


def calculate_NN_fullData_allAttributes (df_all, n_attributes=None, grid_shape=(20, 17)):
  """Nearest-neighbour interpolate every attribute of a 3-D data array.

  Parameters
  ----------
  df_all : numpy array indexed as [step, cell, attribute]. The size of the
      cell axis must equal ``grid_shape[0] * grid_shape[1]`` because each
      step is reshaped to that grid before interpolation.
  n_attributes : number of leading attributes (last axis) to interpolate.
      ``None`` (default) processes every attribute present in ``df_all``.
  grid_shape : (rows, cols) of the spatial grid; defaults to (20, 17).

  Returns
  -------
  numpy array of shape (steps, rows * cols, n_attributes) holding the
  interpolated attributes in their original order.
  """
  if n_attributes is None:
    # Previously hard-coded to 1, which silently dropped every attribute but
    # the first one even though callers pass several.
    n_attributes = df_all.shape[2]

  n_steps = df_all.shape[0]
  n_rows, n_cols = grid_shape
  n_cells = n_rows * n_cols

  filled_attributes = []
  for attr_numb in range(n_attributes):
    attr_grids = df_all[:, :, attr_numb].reshape(n_steps, n_rows, n_cols)
    attr_filled = calc_NN_fullData(attr_grids)
    # Back to the flat (step, cell) layout used by the rest of the pipeline.
    filled_attributes.append(np.reshape(attr_filled, (n_steps, n_cells)))

  # np.dstack needs a real sequence; passing a generator is rejected by
  # recent NumPy releases.
  return np.dstack(filled_attributes)

In [None]:
# Extract the zipped dataset into /content/ and collect the paths of the
# CSV files found directly in that directory.

path = '/content/AirMetTraffic_2019_2020_firstSixMonths.zip'
with zipfile.ZipFile(path, mode='r') as archive:
    archive.extractall(path='/content/')

airMetTraf = glob("/content/*.csv")



In [None]:
#sort dataset in chronological order


# Splits a string into alternating non-digit / digit runs (digit runs are kept
# because of the capturing group). Compiled once instead of on every key call.
_DIGIT_RUN = re.compile(r'(\d+)')


def sortingFiles(eachFile):
    """Convert a text chunk to ``int`` when it is a run of decimal digits.

    Any other chunk is returned unchanged. ``isdecimal`` is used rather than
    ``isdigit`` because ``isdigit`` also accepts characters such as '²' that
    ``int()`` cannot parse.
    """
    return int(eachFile) if eachFile.isdecimal() else eachFile


def natural_keys(eachFile):
    """Build a sort key that orders embedded numbers numerically.

    The string is split into digit and non-digit runs; digit runs become
    integers, so e.g. 'f2.csv' sorts before 'f10.csv'.
    """
    return [sortingFiles(c) for c in _DIGIT_RUN.split(eachFile)]

# Order the CSV paths with the numeric-aware key, then split by position:
# the first 4344 paths form the 2019 list, the remainder the 2020 list.
# NOTE(review): 4344 presumably equals 181 days x 24 hourly files
# (January-June 2019) — confirm against the dataset.
sorted_airMetTraf = list(airMetTraf)
sorted_airMetTraf.sort(key=natural_keys)
sorted_airMetTraf_2019, sorted_airMetTraf_2020 = (
    sorted_airMetTraf[:4344],
    sorted_airMetTraf[4344:],
)

In [None]:
# Features stored in each matrix file: FID, NO2, UV, windSpeed, windDir, Temp,
# Humidity, Pressure, SolarRad, Prec, intensidad, ocupacion, carga, vmed.
# Every column except FID is loaded; the header names are referenced with a
# leading space.

df_2019 = [
    pd.read_csv(
        csv_path,
        usecols=[
            ' NO2', ' UV', ' windSpeed', ' windDir', ' Temp', ' Humidity',
            ' Pressure', ' SolarRad', ' Prec', ' intensidad', ' ocupacion',
            ' carga', ' vmed',
        ],
    ).values
    for csv_path in sorted_airMetTraf_2019
]
#df_2020 = [pd.read_csv(f, usecols=[' NO2', ' UV',  ' windSpeed', ' windDir', ' Temp', ' Humidity', ' Pressure', ' SolarRad', ' Prec', ' intensidad',	' ocupacion',	' carga',	 ' vmed']).values for f in sorted_airMetTraf_2020]

In [None]:
# Combine the per-file value matrices (a Python list) into one NumPy array.
# NOTE(review): presumably indexed as [file, cell, feature] — confirm that
# every CSV yields a matrix of the same shape.
df_all_2019 = np.array(df_2019)
#df_all_2020 = np.array(df_2020)

In [None]:

# Outlier handling: locate implausible readings for temperature (indices
# stored in `res`), humidity (`reshum`) and average traffic speed (`speed`).

def tempOut(df_all):
  """Locate temperature readings below -3.

  Looks at column 4 of the last axis of ``df_all`` and returns the tuple of
  index arrays (one per remaining axis) of the entries strictly less than -3.
  """
  temperature = df_all[:, :, 4]
  below_threshold = temperature < -3
  return np.where(below_threshold)

def humOut(df_all):
  """Locate negative humidity readings.

  Looks at column 5 of the last axis of ``df_all`` and returns the tuple of
  index arrays (one per remaining axis) of the entries strictly less than 0.
  """
  humidity = df_all[:, :, 5]
  is_negative = humidity < 0
  return np.where(is_negative)

def speedOut(df_all):
  """Locate negative average-speed readings.

  Looks at column 12 of the last axis of ``df_all`` and returns the tuple of
  index arrays (one per remaining axis) of the entries strictly less than 0.
  """
  average_speed = df_all[:, :, 12]
  is_negative = average_speed < 0
  return np.where(is_negative)

def _replace_with_neighbour_mean(values, outliers, is_valid):
  """Overwrite flagged cells with the mean of their two adjacent cells.

  values   : 2-D array view that is modified in place.
  outliers : tuple of two index arrays (first axis, second axis), as
             returned by ``np.where``.
  is_valid : predicate telling whether a neighbouring value may be used.

  A flagged cell is left untouched when it lies on the first or last
  position of the second axis, or when either neighbour is not valid.
  """
  last = values.shape[1] - 1
  for row, col in zip(*outliers):
    if col == 0 or col == last:
      # One neighbour is missing. Plain indexing would wrap around to the
      # opposite edge (col - 1 == -1) or raise IndexError (col + 1).
      continue
    before = values[row, col - 1]
    after = values[row, col + 1]
    if is_valid(before) and is_valid(after):
      values[row, col] = (before + after) / 2


# Only `df_all_2019` is built by the loading step (the 2020 part is commented
# out), so it is the array processed here.
df_all = df_all_2019

res = tempOut(df_all)
reshum = humOut(df_all)
speed = speedOut(df_all)


# all values for a temperature data below -3 are converted to an average
# before and after the values.

_replace_with_neighbour_mean(df_all[:, :, 4], res, lambda value: value > -3)


# all values for a humidity data below 0 are converted to an average
# before and after the values.

_replace_with_neighbour_mean(df_all[:, :, 5], reshum, lambda value: value >= 0)


# all values for a speed data below 0 are converted to 0.

df_all[speed[0], speed[1], 12] = 0


# deleting precipitation, because most values are 0

df_all_non_prec = np.delete(df_all, 8, axis=2)
air = df_all_non_prec[:, :, 0].reshape(-1, 340, 1)
traf = df_all_non_prec[:, :, 8:12].reshape(-1, 340, 4)
# Columns 1..7 (UV .. SolarRad once precipitation is removed) are filled by
# nearest-neighbour interpolation.
NN_dataframe = calculate_NN_fullData_allAttributes(df_all_non_prec[:, :, 1:8])
# Reassemble as NO2 + interpolated meteorology + traffic. The interpolated
# block is `NN_dataframe`; the previously referenced `idw_dataframe` is not
# defined anywhere in this notebook.
df_air_NN_Met = np.concatenate((air, NN_dataframe, traf), axis=2)
not_nun = np.nan_to_num(df_air_NN_Met)
round_data = np.round(not_nun, 1)

In [None]:
# Convert wind direction (degrees) to one of eight compass sectors; the
# resulting column is one-hot encoded in the next step.

df_categ = pd.DataFrame(
    round_data.reshape(-1, 12),
    columns=['NO2', 'UV', 'windSpeed', 'windDir', 'Temp', 'Humidity',
             'Pressure', 'SolarRad', 'intensidad', 'ocupacion', 'carga', 'vmed'],
)

wind_deg = df_categ['windDir'].to_numpy()

# Conditions are evaluated in order and the first match wins, mirroring an
# if/elif chain. Anything that matches none of them (292.5 <= deg <= 337.5,
# and also negative values) falls through to 'northwest'.
sector_conditions = [
    ((wind_deg >= 0) & (wind_deg < 22.5)) | (wind_deg > 337.5),
    (wind_deg >= 22.5) & (wind_deg < 67.5),
    (wind_deg >= 67.5) & (wind_deg < 112.5),
    (wind_deg >= 112.5) & (wind_deg < 157.5),
    (wind_deg >= 157.5) & (wind_deg < 202.5),
    (wind_deg >= 202.5) & (wind_deg < 247.5),
    (wind_deg >= 247.5) & (wind_deg < 292.5),
]
sector_names = ['north', 'northeast', 'east', 'southeast', 'south', 'southwest', 'west']

# One vectorised column assignment replaces the per-row chained assignment
# (df[col][i] = ...), which is slow and is not guaranteed to write through to
# the DataFrame.
df_categ['windDir_Categ'] = np.select(sector_conditions, sector_names, default='northwest')


In [None]:
# One-hot encode the wind-direction sector.
# scikit-learn renamed the `sparse` argument to `sparse_output` and replaced
# `get_feature_names` with `get_feature_names_out`; both spellings are
# supported here so the cell runs on old and new releases.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

df_categ_encoded = pd.DataFrame(encoder.fit_transform(df_categ[['windDir_Categ']]))
if hasattr(encoder, 'get_feature_names_out'):
    df_categ_encoded.columns = encoder.get_feature_names_out(['windDir_Categ'])
else:
    df_categ_encoded.columns = encoder.get_feature_names(['windDir_Categ'])

df_categ.drop(['windDir_Categ'], axis=1, inplace=True)
OH_X_train = pd.concat([df_categ, df_categ_encoded], axis=1)

# Create the final dataset for further analyses by deleting:
#  - windDir, as it is already converted to categorical data;
#  - UV, as it is not available for June 2019 and for the whole period of 2020;
#  - carga (traffic load - by definition a combination of intensity,
#    occupancy time and capacity of the road);
#  - vmed (average traffic speed - available only for the M30 road, which is
#    15.8% of the case study).

final_dataframe = OH_X_train.drop(['windDir', 'UV', 'carga', 'vmed'], axis=1)