In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import json

In [2]:
# Set the search path for files (assuming the directory is relative to the current script)
file_path_mc124 = os.path.join("..", "..","fine_dust_complete", ".csv")
df = pd.read_csv('../data/df_fine_dust_wind_merged.csv', parse_dates=['datetime'])

# rename the wrongly named column -- NEED TO CLEAN THAT UP LATER
df = df.rename(columns={'pm10_value': 'value'})

df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 309715 entries, 172992 to 542554
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   datetime        309715 non-null  datetime64[ns]
 1   station         309715 non-null  object        
 2   core            309715 non-null  object        
 3   value           309715 non-null  float64       
 4   hour            309715 non-null  float64       
 5   day             309715 non-null  float64       
 6   month           309715 non-null  float64       
 7   year            309715 non-null  float64       
 8   day_of_week     309715 non-null  float64       
 9   is_weekend      309715 non-null  float64       
 10  wind_speed      309715 non-null  float64       
 11  wind_direction  309715 non-null  float64       
dtypes: datetime64[ns](1), float64(9), object(2)
memory usage: 30.7+ MB


In [3]:
dfs = {}

for value in df['core'].unique():
    filtered_df = df[df['core'] == value]
    dfs[value] = filtered_df

print(dfs.keys())

dict_keys(['no2', 'no', 'nox', 'pm10', 'pm2'])


In [4]:
df_no2 = dfs['no2']
df_no = dfs['no']
df_nox = dfs['nox']
df_pm10 = dfs['pm10']
df_pm2 = dfs['pm2']

In [5]:
# prep no2 for h-1 values
df_no2 = df_no2.copy()
df_no2 = df_no2.rename(columns={'value': 'no2'})

# get wind speed, wind direction and no2 value of the previous hour
df_no2.loc[:, 'wind_speed_h-1'] = df_no2.groupby(['station', 'core'])['wind_speed'].shift(-1)
df_no2.loc[:, 'wind_direction_h-1'] = df_no2.groupby(['station', 'core'])['wind_direction'].shift(-1)
df_no2.loc[:, 'no2_h-1'] = df_no2.groupby(['station', 'core'])['no2'].shift(-1)

# remove unnecessary columns which will lead to merge duplicates
df_no2 = df_no2.drop(columns=['core'])

# Verify the changes
df_no2.tail()

Unnamed: 0,datetime,station,no2,hour,day,month,year,day_of_week,is_weekend,wind_speed,wind_direction,wind_speed_h-1,wind_direction_h-1,no2_h-1
542532,2024-05-01 04:00:00,mc124,6.0,4.0,1.0,5.0,2024.0,2.0,0.0,16.1,130.0,13.2,140.0,7.0
542537,2024-05-01 03:00:00,mc124,7.0,3.0,1.0,5.0,2024.0,2.0,0.0,13.2,140.0,13.2,140.0,7.0
542542,2024-05-01 02:00:00,mc124,7.0,2.0,1.0,5.0,2024.0,2.0,0.0,13.2,140.0,12.7,150.0,14.0
542547,2024-05-01 01:00:00,mc124,14.0,1.0,1.0,5.0,2024.0,2.0,0.0,12.7,150.0,14.1,160.0,19.0
542552,2024-05-01 00:00:00,mc124,19.0,0.0,1.0,5.0,2024.0,2.0,0.0,14.1,160.0,,,


In [6]:
# prep no for h-1 values
df_no = df_no.copy()
df_no = df_no.rename(columns={'value': 'no'})

# get 'no' value for the previous hour
df_no.loc[:, 'no_h-1'] = df_no.groupby(['station', 'core'])['no'].shift(-1)

# remove unnecessary columns which will lead to merge duplicates
df_no = df_no.loc[:, ['datetime','no', 'no_h-1']]

# Verify the changes
df_no.tail()

Unnamed: 0,datetime,no,no_h-1
542533,2024-05-01 04:00:00,2.0,2.0
542538,2024-05-01 03:00:00,2.0,2.0
542543,2024-05-01 02:00:00,2.0,2.0
542548,2024-05-01 01:00:00,2.0,2.0
542553,2024-05-01 00:00:00,2.0,


In [7]:
# prep nox for h-1 values
df_nox = df_nox.copy()
df_nox = df_nox.rename(columns={'value': 'nox'})

# get nox value for the previous hour
df_nox.loc[:, 'nox_h-1'] = df_nox.groupby(['station', 'core'])['nox'].shift(-1)

# remove unnecessary columns which will lead to merge duplicates
df_nox = df_nox.loc[:, ['datetime','nox', 'nox_h-1']]

# Verify the changes
df_nox.tail()

Unnamed: 0,datetime,nox,nox_h-1
542534,2024-05-01 04:00:00,7.0,7.0
542539,2024-05-01 03:00:00,7.0,8.0
542544,2024-05-01 02:00:00,8.0,14.0
542549,2024-05-01 01:00:00,14.0,20.0
542554,2024-05-01 00:00:00,20.0,


In [8]:
# prep pm10 for h-1 values
df_pm10 = df_pm10.copy()
df_pm10 = df_pm10.rename(columns={'value': 'pm10'})

# get pm10 value for the previous hour
df_pm10.loc[:, 'pm10_h-1'] = df_pm10.groupby(['station', 'core'])['pm10'].shift(-1)

# remove unnecessary columns which will lead to merge duplicates
df_pm10 = df_pm10.loc[:, ['datetime','pm10', 'pm10_h-1']]

# Verify the changes
df_pm10.tail()

Unnamed: 0,datetime,pm10,pm10_h-1
542530,2024-05-01 04:00:00,30.0,29.0
542535,2024-05-01 03:00:00,29.0,28.0
542540,2024-05-01 02:00:00,28.0,28.0
542545,2024-05-01 01:00:00,28.0,29.0
542550,2024-05-01 00:00:00,29.0,


In [9]:
# prep pm2.5 for h-1 values
df_pm2 = df_pm2.copy()
df_pm2 = df_pm2.rename(columns={'value': 'pm2.5'})

# get pm10 value for the previous hour
df_pm2.loc[:, 'pm2.5_h-1'] = df_pm2.groupby(['station', 'core'])['pm2.5'].shift(-1)

# remove unnecessary columns which will lead to merge duplicates
df_pm2 = df_pm2.loc[:, ['datetime', 'pm2.5', 'pm2.5_h-1']]

# Verify the changes
df_pm2.tail()

Unnamed: 0,datetime,pm2.5,pm2.5_h-1
542531,2024-05-01 04:00:00,16.0,15.0
542536,2024-05-01 03:00:00,15.0,14.0
542541,2024-05-01 02:00:00,14.0,13.0
542546,2024-05-01 01:00:00,13.0,12.0
542551,2024-05-01 00:00:00,12.0,


In [10]:
# create list of dataframes
dataframes_l = [df_pm2, df_pm10, df_nox, df_no, df_no2]

# get the first df
df_m = dataframes_l[0]

# iterate through the dataframes
for df in dataframes_l[1:]:
    # perform an outer join on 'datetime'
    df_m = pd.merge(df_m, df, on='datetime', how='outer')

# Sort the merged DataFrame by 'datetime'
df_m = df_m.sort_values(by='datetime')

df_m.tail(10)


Unnamed: 0,datetime,pm2.5,pm2.5_h-1,pm10,pm10_h-1,nox,nox_h-1,no,no_h-1,station,...,day,month,year,day_of_week,is_weekend,wind_speed,wind_direction,wind_speed_h-1,wind_direction_h-1,no2_h-1
52683,2024-05-31 01:00:00,13.0,13.0,20.0,20.0,19.0,23.0,2.0,4.0,mc124,...,31.0,5.0,2024.0,4.0,0.0,11.0,120.0,8.4,110.0,17.0
52682,2024-05-31 02:00:00,14.0,13.0,21.0,20.0,10.0,19.0,1.0,2.0,mc124,...,31.0,5.0,2024.0,4.0,0.0,9.0,140.0,11.0,120.0,16.0
52681,2024-05-31 03:00:00,15.0,14.0,22.0,21.0,9.0,10.0,1.0,1.0,mc124,...,31.0,5.0,2024.0,4.0,0.0,5.9,170.0,9.0,140.0,8.0
52680,2024-05-31 04:00:00,13.0,15.0,20.0,22.0,9.0,9.0,1.0,1.0,mc124,...,31.0,5.0,2024.0,4.0,0.0,5.8,210.0,5.9,170.0,7.0
52679,2024-05-31 05:00:00,13.0,13.0,20.0,20.0,26.0,9.0,7.0,1.0,mc124,...,31.0,5.0,2024.0,4.0,0.0,4.8,200.0,5.8,210.0,8.0
52678,2024-05-31 06:00:00,16.0,13.0,24.0,20.0,44.0,26.0,17.0,7.0,mc124,...,31.0,5.0,2024.0,4.0,0.0,4.9,200.0,4.8,200.0,14.0
52677,2024-05-31 07:00:00,17.0,16.0,25.0,24.0,106.0,44.0,51.0,17.0,mc124,...,31.0,5.0,2024.0,4.0,0.0,5.1,200.0,4.9,200.0,18.0
52676,2024-05-31 08:00:00,15.0,17.0,23.0,25.0,100.0,106.0,43.0,51.0,mc124,...,31.0,5.0,2024.0,4.0,0.0,3.9,200.0,5.1,200.0,29.0
52675,2024-05-31 09:00:00,13.0,15.0,27.0,23.0,64.0,100.0,26.0,43.0,mc124,...,31.0,5.0,2024.0,4.0,0.0,3.2,200.0,3.9,200.0,34.0
52674,2024-05-31 10:00:00,10.0,13.0,22.0,27.0,67.0,64.0,25.0,26.0,mc124,...,31.0,5.0,2024.0,4.0,0.0,2.7,160.0,3.2,200.0,24.0


In [13]:
# reindexing the columns (change column order)
column_order_new = ['datetime',  'station', 'hour', 'day', 'month', 'year', 'day_of_week', 'is_weekend', 
                    'no2', 'no', 'nox', 'pm10', 'pm2.5', 'wind_speed', 'wind_direction', 
                    'no2_h-1', 'no_h-1', 'nox_h-1', 'pm10_h-1', 'pm2.5_h-1', 'wind_speed_h-1', 'wind_direction_h-1']
df_m = df_m.reindex(columns=column_order_new)
df_m.head()

Unnamed: 0,datetime,station,hour,day,month,year,day_of_week,is_weekend,no2,no,...,pm2.5,wind_speed,wind_direction,no2_h-1,no_h-1,nox_h-1,pm10_h-1,pm2.5_h-1,wind_speed_h-1,wind_direction_h-1
60480,2015-08-20 01:00:00,mc124,1.0,20.0,8.0,2015.0,3.0,0.0,31.0,18.0,...,,7.5,100.0,72.0,38.0,130.0,,,6.8,140.0
60479,2015-08-20 02:00:00,mc124,2.0,20.0,8.0,2015.0,3.0,0.0,28.0,15.0,...,,8.1,120.0,31.0,18.0,59.0,,,7.5,100.0
60478,2015-08-20 03:00:00,mc124,3.0,20.0,8.0,2015.0,3.0,0.0,23.0,7.0,...,,7.3,120.0,28.0,15.0,51.0,,,8.1,120.0
60477,2015-08-20 04:00:00,mc124,4.0,20.0,8.0,2015.0,3.0,0.0,29.0,15.0,...,,7.4,120.0,23.0,7.0,35.0,,,7.3,120.0
60476,2015-08-20 05:00:00,mc124,5.0,20.0,8.0,2015.0,3.0,0.0,34.0,24.0,...,,8.6,120.0,29.0,15.0,51.0,,,7.4,120.0


In [14]:
# export complete dataframe
df_m.to_csv("df_h-1_complete_mc124.csv", index=False)