In [None]:
# remove null values
# drops, in place, every row that has a missing value in at least one column
df.dropna(axis=0, how='any', inplace=True)
df.info()

In [None]:
# remove timezone information
# Only the first 19 characters of every timestamp are kept and parsed, so whatever
# follows them (fractional seconds, UTC offset) is discarded and the parsed values
# are timezone-naive.
# NOTE(review): assumes the text form starts with 'YYYY-MM-DD HH:MM:SS' (19 chars) - confirm.
# The column is replaced with df[...] = ... instead of df.loc[:, ...] = ...; the .loc
# form writes into the existing column and tries to keep its previous dtype, which
# makes the outcome depend on what dtype 'datetime' had before this cell.
datetime_text = df['datetime'].astype(str).str.slice(0, 19)
df['datetime'] = pd.to_datetime(datetime_text, format='mixed')
df.info()

In [1]:
# split date information from datetime
# every new column holds the zero-padded strftime text (a string, not an integer)
date_part_formats = {
    'hour': '%H',   # Hour (00-23)
    'day': '%d',    # Day of the month (01-31)
    'month': '%m',  # Month (01-12)
    'year': '%Y',   # Year with century
}
for part_name, part_format in date_part_formats.items():
    df_reduced[part_name] = df_reduced['datetime'].dt.strftime(part_format)
df_reduced.sample(3)

In [None]:
# add day of the week to dataframe
import calendar

# weekday number -> English weekday name, counting from Monday = 0 to Sunday = 6
days = dict(enumerate([
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]))

# NOTE: this binds a second name to the same DataFrame object, it does not copy it;
# every column written through df_prepped is therefore visible in df_reduced as well
df_prepped = df_reduced
# convert the 'day', 'month', and 'year' columns to integers
for date_column in ('day', 'month', 'year'):
    df_prepped[date_column] = df_reduced[date_column].astype(int)

# function to determine the day of the week
def get_day_of_week(row):
    """Return the weekday number of the date stored in ``row``.

    ``row`` is indexed with the keys 'year', 'month' and 'day'. The result is
    whatever ``calendar.weekday`` returns for that date (Monday is 0, Sunday is 6).
    """
    year, month, day = (row[part] for part in ('year', 'month', 'day'))
    return calendar.weekday(year, month, day)

# function to determine if day is weekday or weekend
def is_weekend(day_number):
    """Return 1 if ``day_number`` is 5 or greater, otherwise 0.

    With the Monday = 0 numbering, 5 and 6 are Saturday and Sunday.
    """
    return int(day_number >= 5)
    
# apply the functions to create the new columns
# get_day_of_week is called once per row (axis=1), is_weekend once per weekday number
# NOTE(review): df_prepped['datetime'].dt.dayofweek uses the same Monday = 0 numbering
# and would avoid the per-row Python call - confirm 'datetime' is still datetime-typed here
weekday_numbers = df_prepped.apply(get_day_of_week, axis=1)
df_prepped['day_of_week'] = weekday_numbers
weekend_flags = df_prepped['day_of_week'].apply(is_weekend)
df_prepped['is_weekend'] = weekend_flags

df_prepped.info()

In [None]:
#------------- prep -------------

# Columns every pollutant frame is joined on. 'station' has to be part of the key:
# joining on 'datetime' alone pairs up readings of different stations that share
# a timestamp.
merge_keys = ['datetime', 'station']


def add_previous_hour(frame, pollutant):
    """Return ``frame`` with its reading renamed and a previous-row column added.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows of a single core value; the columns 'station', 'datetime' and
        'value' are used.
    pollutant : str
        New name of the 'value' column. The added column is '<pollutant>_h-1'.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by station and datetime. '<pollutant>_h-1' holds the
        reading of the preceding row of the same station; the first row of each
        station has no predecessor and gets NaN.
    """
    # sort ascending in time so that shift(1) always reaches back to the earlier
    # reading, independent of the order the rows arrive in
    result = frame.rename(columns={'value': pollutant}).sort_values(['station', 'datetime'])
    # NOTE: this is the previous *available* row; it is exactly one hour earlier
    # only if the series of that station has no gaps
    result[f'{pollutant}_h-1'] = result.groupby('station')[pollutant].shift(1)
    return result


# one dataframe per core value, taken from the same frame that is being filtered
dfs = {core: df_prepped[df_prepped['core'] == core] for core in df_prepped['core'].unique()}

#------------- no2 -------------

# no2 keeps every column of df_prepped (hour, day, ...) except 'core'
df_no2 = add_previous_hour(dfs['no2'], 'no2').drop(columns=['core'])

#------------- no / nox / pm10 / pm2.5 -------------

# the other frames keep only the merge keys and their own two columns, so the merge
# does not produce duplicated columns
df_no = add_previous_hour(dfs['no'], 'no')[merge_keys + ['no', 'no_h-1']]
df_nox = add_previous_hour(dfs['nox'], 'nox')[merge_keys + ['nox', 'nox_h-1']]
df_pm10 = add_previous_hour(dfs['pm10'], 'pm10')[merge_keys + ['pm10', 'pm10_h-1']]
# core value 'pm2' is stored under the column names 'pm2.5' and 'pm2.5_h-1'
df_pm2 = add_previous_hour(dfs['pm2'], 'pm2.5')[merge_keys + ['pm2.5', 'pm2.5_h-1']]

#------------- merge & order -------------

# create list of dataframes
dataframes_l = [df_pm2, df_pm10, df_nox, df_no, df_no2]

# outer-join all frames one after another, starting from the first.
# The loop variable is deliberately not called `df`: that name holds the raw
# dataframe used by the earlier cells and must not be overwritten here.
df_m = dataframes_l[0]
for other in dataframes_l[1:]:
    df_m = pd.merge(df_m, other, on=merge_keys, how='outer')

# Sort the merged DataFrame by 'datetime' (and 'station' within the same timestamp)
df_m = df_m.sort_values(by=merge_keys)

# reindexing the columns (change column order)
# NOTE(review): reindex adds every listed column that is missing from the merge result
# as an all-NaN column and drops every column that is not listed. 'wind_speed_h-1' and
# 'wind_direction_h-1' are not computed in this cell - TODO confirm df_prepped supplies them.
column_order_new = ['datetime',  'station', 'hour', 'day', 'month', 'year', 'day_of_week', 'is_weekend', 
                    'no2', 'no', 'nox', 'pm10', 'pm2.5', 'wind_speed', 'wind_direction', 
                    'no2_h-1', 'no_h-1', 'nox_h-1', 'pm10_h-1', 'pm2.5_h-1', 'wind_speed_h-1', 'wind_direction_h-1']
df_m = df_m.reindex(columns=column_order_new)

df_m.head()