In [56]:
from statsmodels.tsa.stattools import adfuller, grangercausalitytests, add_constant, coint
from statsmodels.tsa.api import AutoReg, VAR
from geopy.distance import great_circle
import matplotlib.pyplot as plt
import sklearn.metrics as skm
from tqdm import tqdm
import pandas as pd
import numpy as np
import math

In [44]:
data = pd.read_csv(r"/Users/main/Vault/Thesis/Data/pm25_weer.csv")
data.drop(data.iloc[:, 0:7], axis=1, inplace=True)
data.drop(["jaar", "maand", "weeknummer", "#STN", "timestamp", "components", "dag", "tijd", "uur", "datum", "sensortype", "weekdag", "U", "H", "T", "FH"], axis=1, inplace=True)

In [45]:
grouped_df = data.groupby(["YYYYMMDD", "tag"])["pm25", "longitude", "latitude", "DD"].mean().copy().reset_index()
grouped_df.rename(columns={"U":"Wind", "DD":"Angle"}, inplace=True)
grouped_df.head(5)

  grouped_df = data.groupby(["YYYYMMDD", "tag"])["pm25", "longitude", "latitude", "DD"].mean().copy().reset_index()


Unnamed: 0,YYYYMMDD,tag,pm25,longitude,latitude,Angle
0,20190625,Amsterdam,16.602565,4.866208,52.359714,170.434783
1,20190625,Beverwijk,18.822,4.650824,52.492369,276.0
2,20190626,Amsterdam,21.974609,4.866208,52.359714,126.521739
3,20190626,Beverwijk,2.851417,4.657933,52.485871,19.166667
4,20190627,Amsterdam,4.4845,4.866208,52.359714,37.916667


In [46]:
Locations = grouped_df["tag"].unique()
LocDict = dict()

for i in range(len(Locations)):
    LocDict[Locations[i]] = (grouped_df[grouped_df.tag == Locations[i]]["latitude"].mean(), grouped_df[grouped_df.tag == Locations[i]]["longitude"].mean())

LocDict

{'Amsterdam': (52.385565471780744, 4.901327224364789),
 'Beverwijk': (52.486366756022534, 4.658584737682753),
 'Heemskerk': (52.50593616684639, 4.671953092146708),
 'Wijk aan Zee': (52.493443775747366, 4.59824578234632),
 'Velsen-Noord': (52.47382287197285, 4.647238399148998),
 'Driehuis': (52.447194827537714, 4.636977030738782),
 'IJmuiden': (52.45820717461739, 4.615567828011168),
 'Velsen-Zuid': (52.46234499999992, 4.637255999999981),
 'Zaandam': (52.45848175659801, 4.824912486862412),
 'Koog aan de Zaan': (52.46407900000035, 4.811263999999972),
 'Uithoorn': (52.23809099999961, 4.8082289999999865)}

In [47]:
def get_bearing(coor1, coor2):
    dLon = (coor2[1] - coor1[1])
    y = math.sin(dLon) * math.cos(coor2[0])
    x = math.cos(coor1[0]) * math.sin(coor2[0]) - math.sin(coor1[0]) * math.cos(coor2[0]) * math.cos(dLon)
    brng = math.atan2(y, x)
    brng = np.rad2deg(brng)
    return brng

W = np.zeros((11, 11))
AngleMatrix = np.zeros((11, 11))

for i in range(len(LocDict)):
    for j in range(len(LocDict)):
        if i != j:
            theta = get_bearing(LocDict[Locations[i]], LocDict[Locations[j]])
            W[i, j] = 1 / great_circle(LocDict[Locations[i]], LocDict[Locations[j]]).km
            AngleMatrix[i, j] = theta

In [48]:
grouped_df["Date"] = grouped_df["YYYYMMDD"].astype(str)
grouped_df.set_index("Date", inplace=True)
grouped_df.drop(columns=["YYYYMMDD", "latitude", "longitude"], inplace=True)
grouped_df.head(5)

Unnamed: 0_level_0,tag,pm25,Angle
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20190625,Amsterdam,16.602565,170.434783
20190625,Beverwijk,18.822,276.0
20190626,Amsterdam,21.974609,126.521739
20190626,Beverwijk,2.851417,19.166667
20190627,Amsterdam,4.4845,37.916667


In [49]:
UniqueNames = grouped_df.tag.unique()

PolDict = {elem : pd.DataFrame() for elem in UniqueNames}
# WindDict = {elem : pd.DataFrame() for elem in UniqueNames}
AngleDict = {elem : pd.DataFrame() for elem in UniqueNames}

for key in PolDict.keys():
    PolDict[key] = grouped_df[:][grouped_df.tag == key]
    PolDict[key].rename(columns={"pm25":key}, inplace=True)
    PolDict[key].drop(["Angle"], axis=1, inplace=True)
    del PolDict[key]["tag"]

    # WindDict[key] = grouped_df[:][grouped_df.tag == key]
    # WindDict[key].rename(columns={"Wind":key}, inplace=True)
    # WindDict[key].drop(["pm25", "Angle"], axis=1, inplace=True)
    # del WindDict[key]["tag"]

    AngleDict[key] = grouped_df[:][grouped_df.tag == key]
    AngleDict[key].rename(columns={"Angle":key}, inplace=True)
    AngleDict[key].drop(["pm25"], axis=1 , inplace=True)
    del AngleDict[key]["tag"]


AngleDict["Amsterdam"]

Unnamed: 0_level_0,Amsterdam
Date,Unnamed: 1_level_1
20190625,170.434783
20190626,126.521739
20190627,37.916667
20190628,49.583333
20190629,117.083333
...,...
20221018,176.828194
20221019,76.592357
20221020,128.169014
20221021,163.281734


In [50]:
df_pol = pd.DataFrame(PolDict["Amsterdam"].copy())
# df_wind = pd.DataFrame(WindDict["Amsterdam"].copy())
df_angle = pd.DataFrame(AngleDict["Amsterdam"].copy())

for key in PolDict:
    df_pol = df_pol.combine_first(PolDict[key])
    # df_wind = df_wind.combine_first(WindDict[key])
    df_angle = df_angle.combine_first(AngleDict[key])

df_angle

Unnamed: 0_level_0,Amsterdam,Beverwijk,Driehuis,Heemskerk,IJmuiden,Koog aan de Zaan,Uithoorn,Velsen-Noord,Velsen-Zuid,Wijk aan Zee,Zaandam
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
20190625,170.434783,276.000000,,,,,,,,,
20190626,126.521739,19.166667,,,,,,,,,
20190627,37.916667,33.013699,,,,,,,,,
20190628,49.583333,50.314961,,,,,,,,,
20190629,117.083333,118.232044,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
20221018,176.828194,180.424028,177.058824,173.281250,168.938053,,177.058824,177.058824,,177.058824,177.058824
20221019,76.592357,77.437186,77.083333,77.111111,77.083333,,77.083333,77.083333,,77.083333,77.083333
20221020,128.169014,127.835616,127.727273,127.245509,127.777778,,127.727273,127.727273,,127.727273,127.727273
20221021,163.281734,163.045685,162.916667,162.768362,161.329114,,162.916667,162.916667,,162.916667,162.916667


In [51]:
for column in df_pol:
    median_value = (df_pol[column].median(), df_angle[column].median())  #, df_wind[column].median())
    df_pol[column].fillna(value=median_value[0], inplace = True)
    df_angle[column].fillna(value=median_value[1], inplace = True)
    # df_wind[column].fillna(value=median_value[2], inplace = True)
    
df_angle

Unnamed: 0_level_0,Amsterdam,Beverwijk,Driehuis,Heemskerk,IJmuiden,Koog aan de Zaan,Uithoorn,Velsen-Noord,Velsen-Zuid,Wijk aan Zee,Zaandam
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
20190625,170.434783,276.000000,208.416667,208.263383,207.272727,206.25,211.666667,206.363636,205.982143,207.849462,206.250000
20190626,126.521739,19.166667,208.416667,208.263383,207.272727,206.25,211.666667,206.363636,205.982143,207.849462,206.250000
20190627,37.916667,33.013699,208.416667,208.263383,207.272727,206.25,211.666667,206.363636,205.982143,207.849462,206.250000
20190628,49.583333,50.314961,208.416667,208.263383,207.272727,206.25,211.666667,206.363636,205.982143,207.849462,206.250000
20190629,117.083333,118.232044,208.416667,208.263383,207.272727,206.25,211.666667,206.363636,205.982143,207.849462,206.250000
...,...,...,...,...,...,...,...,...,...,...,...
20221018,176.828194,180.424028,177.058824,173.281250,168.938053,206.25,177.058824,177.058824,205.982143,177.058824,177.058824
20221019,76.592357,77.437186,77.083333,77.111111,77.083333,206.25,77.083333,77.083333,205.982143,77.083333,77.083333
20221020,128.169014,127.835616,127.727273,127.245509,127.777778,206.25,127.727273,127.727273,205.982143,127.727273,127.727273
20221021,163.281734,163.045685,162.916667,162.768362,161.329114,206.25,162.916667,162.916667,205.982143,162.916667,162.916667


In [61]:
done_list = list()

for i in range(len(df_pol.columns)):

    for j in range(i, len(df_pol.columns)):

        if j != i:
            p_value = coint(df_pol[df_pol.columns[i]], df_pol[df_pol.columns[j]])[1]

            if p_value < 0.05:
                print(f'{df_pol.columns[i]} - {df_pol.columns[j]} are cointegrated')  

Amsterdam - Wijk aan Zee are cointegrated
Driehuis - Heemskerk are cointegrated
Driehuis - IJmuiden are cointegrated
Driehuis - Koog aan de Zaan are cointegrated
Driehuis - Uithoorn are cointegrated
Driehuis - Velsen-Noord are cointegrated
Driehuis - Velsen-Zuid are cointegrated
Driehuis - Wijk aan Zee are cointegrated
Driehuis - Zaandam are cointegrated
Heemskerk - Koog aan de Zaan are cointegrated
Heemskerk - Uithoorn are cointegrated
Heemskerk - Wijk aan Zee are cointegrated
Heemskerk - Zaandam are cointegrated
IJmuiden - Koog aan de Zaan are cointegrated
IJmuiden - Uithoorn are cointegrated
IJmuiden - Velsen-Noord are cointegrated
IJmuiden - Velsen-Zuid are cointegrated
IJmuiden - Wijk aan Zee are cointegrated
IJmuiden - Zaandam are cointegrated
Koog aan de Zaan - Uithoorn are cointegrated
Koog aan de Zaan - Velsen-Noord are cointegrated
Koog aan de Zaan - Velsen-Zuid are cointegrated
Koog aan de Zaan - Wijk aan Zee are cointegrated
Koog aan de Zaan - Zaandam are cointegrated
Velse

In [67]:
for i in range(len(df_pol.columns)):
    adfstat = adfuller(df_pol[df_pol.columns[i]])[1]
    
    if adfstat < 0.05:
        print(f'{df_pol.columns[i]} is not stationary')

Beverwijk is not stationary
Driehuis is not stationary
Heemskerk is not stationary
IJmuiden is not stationary
Koog aan de Zaan is not stationary
Velsen-Noord is not stationary
Velsen-Zuid is not stationary
Zaandam is not stationary


In [52]:
VARModel = VAR(df_pol, ).fit(trend="n")
VARModel.summary()

  self._init_dates(dates, freq)


  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Sun, 08, Jan, 2023
Time:                     17:05:56
--------------------------------------------------------------------
No. of Equations:         11.0000    BIC:                    39.5552
Nobs:                     1204.00    HQIC:                   39.2361
Log likelihood:          -42175.5    FPE:                9.04307e+16
AIC:                      39.0434    Det(Omega_mle):     8.18216e+16
--------------------------------------------------------------------
Results for equation Amsterdam
                         coefficient       std. error           t-stat            prob
--------------------------------------------------------------------------------------
L1.Amsterdam                0.728665         0.038061           19.145           0.000
L1.Beverwijk                0.052948         0.016648            3.180           0.001
L1.Driehuis                -0.06

In [53]:
for key in PolDict:
    R2 = skm.r2_score(VARModel.fittedvalues[key] + VARModel.resid[key], VARModel.fittedvalues[key])
    print(F'The R-Squared of {key} is: {R2*100:.2f}%')

The R-Squared of Amsterdam is: 52.47%
The R-Squared of Beverwijk is: 82.82%
The R-Squared of Heemskerk is: 81.87%
The R-Squared of Wijk aan Zee is: 82.69%
The R-Squared of Velsen-Noord is: 27.11%
The R-Squared of Driehuis is: 35.84%
The R-Squared of IJmuiden is: 63.73%
The R-Squared of Velsen-Zuid is: 41.34%
The R-Squared of Zaandam is: 50.12%
The R-Squared of Koog aan de Zaan is: 17.19%
The R-Squared of Uithoorn is: 77.69%


In [54]:
WY = pd.DataFrame(np.matmul(df_pol.to_numpy(), W))

i = 0
for key in PolDict:
    WY.rename(columns={i:f'{key}'}, inplace=True)
    i += 1

SVAR = VAR(WY).fit(trend="n")
SVAR.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Sun, 08, Jan, 2023
Time:                     17:06:05
--------------------------------------------------------------------
No. of Equations:         11.0000    BIC:                    15.7036
Nobs:                     1204.00    HQIC:                   15.3845
Log likelihood:          -27816.8    FPE:                3.95976e+06
AIC:                      15.1917    Det(Omega_mle):     3.58278e+06
--------------------------------------------------------------------
Results for equation Amsterdam
                         coefficient       std. error           t-stat            prob
--------------------------------------------------------------------------------------
L1.Amsterdam                0.567627         0.285212            1.990           0.047
L1.Beverwijk                0.020215         0.048619            0.416           0.678
L1.Heemskerk                0.13

In [None]:
for key in PolDict:
    R2 = skm.r2_score(SVAR.fittedvalues[key] + SVAR.resid[key], SVAR.fittedvalues[key])
    print(F'The R-Squared of {key} is: {R2*100:.2f}%')

In [None]:
WW = list()

wind = np.random.lognormal(mean=2.5, sigma=0.5, size=len(df_angle))

for i in range(len(df_angle)):
    wind_direction = np.zeros((len(df_angle.columns), len(df_angle.columns)))
    
    for j in range(len(df_angle.columns)):
        wind_direction[j, :] = AngleMatrix[j, :] - df_angle.iloc[i, j]
        wind_direction = (np.cos(wind_direction) * W)  * wind[i]
        wind_direction = np.nan_to_num(wind_direction, nan=0, posinf=0, neginf=0)

    WW.append(wind_direction)

WWY = np.zeros((len(df_pol), len(df_pol.columns)))

for i in range(len(df_pol)):
    WWY[i, :] = np.matmul(df_pol.iloc[i, :].to_numpy(), WW[i])

WWY = pd.DataFrame(WWY)

i = 0
for key in PolDict:
    WWY.rename(columns={i:f'{key}'}, inplace=True)
    i += 1

EXOG = pd.DataFrame(np.concatenate((df_pol, WWY), axis=1))

i = 0
for key in PolDict:
    EXOG.rename(columns={i:f'{key}'}, inplace=True)
    EXOG.rename(columns={i + 11:f'Spatial - {key}'}, inplace=True)
    i += 1

SWVAR = VAR(EXOG).fit(trend="n")
SWVAR.summary()

In [None]:
for key in PolDict:
    R2 = skm.r2_score(SWVAR.fittedvalues[key] + SWVAR.resid[key], SWVAR.fittedvalues[key])
    print(F'The R-Squared of {key} is: {R2*100:.2f}%')