In [46]:
from statsmodels.tsa.stattools import adfuller, grangercausalitytests, add_constant, coint, pacf
from statsmodels.tsa.vector_ar.vecm import coint_johansen, select_coint_rank
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.api import AutoReg, VAR
from geopy.distance import great_circle
import matplotlib.pyplot as plt
import sklearn.metrics as skm
from tqdm import tqdm
import pandas as pd
import numpy as np
import math

In [5]:
data = pd.read_csv(r"/Users/main/Vault/Thesis/Data/pm25_weer.csv")
data.drop(data.iloc[:, 0:7], axis=1, inplace=True)
data.drop(data.iloc[:, 1:5], axis=1, inplace=True)
data.drop(["jaar", "maand", "weeknummer", "#STN", "timestamp", "components", "dag", "tijd", "uur", "datum", "sensortype", "weekdag", "U", "H", "FH", "T"], axis=1, inplace=True)

In [7]:
grouped_df = data.groupby(by=["YYYYMMDD", "tag"]).mean().copy().reset_index()
grouped_df.rename(columns={"DD":"Angle"}, inplace=True)
# grouped_df.head(50)

In [8]:
Locations = grouped_df["tag"].unique()
LocDict = dict()

for i in range(len(Locations)):
    LocDict[Locations[i]] = (grouped_df[grouped_df.tag == Locations[i]]["latitude"].mean(), grouped_df[grouped_df.tag == Locations[i]]["longitude"].mean())

# LocDict

In [9]:
def get_bearing(coor1, coor2):
    dLon = (coor2[1] - coor1[1])
    y = math.sin(dLon) * math.cos(coor2[0])
    x = math.cos(coor1[0]) * math.sin(coor2[0]) - math.sin(coor1[0]) * math.cos(coor2[0]) * math.cos(dLon)
    brng = math.atan2(y, x)
    brng = np.rad2deg(brng)
    return brng

W = np.zeros((11, 11))
AngleMatrix = np.zeros((11, 11))

for i in range(len(LocDict)):
    for j in range(len(LocDict)):
        if i != j:
            theta = get_bearing(LocDict[Locations[i]], LocDict[Locations[j]])
            W[i, j] = 1 / great_circle(LocDict[Locations[i]], LocDict[Locations[j]]).km
            AngleMatrix[i, j] = theta

In [10]:
grouped_df["Date"] = grouped_df["YYYYMMDD"].astype(str)
grouped_df.set_index("Date", inplace=True)
grouped_df.drop(columns=["YYYYMMDD", "latitude", "longitude"], inplace=True)
# grouped_df.head(5)

In [11]:
UniqueNames = grouped_df.tag.unique()

PolDict = {elem : pd.DataFrame() for elem in UniqueNames}
AngleDict = {elem : pd.DataFrame() for elem in UniqueNames}

for key in PolDict.keys():
    PolDict[key] = grouped_df[:][grouped_df.tag == key]
    PolDict[key].rename(columns={"pm25":key}, inplace=True)
    PolDict[key].drop(["Angle"], axis=1, inplace=True)
    del PolDict[key]["tag"]

    AngleDict[key] = grouped_df[:][grouped_df.tag == key]
    AngleDict[key].rename(columns={"Angle":key}, inplace=True)
    AngleDict[key].drop(["pm25"], axis=1 , inplace=True)
    del AngleDict[key]["tag"]


# AngleDict["Amsterdam"]

In [12]:
df_pol = pd.DataFrame(PolDict["Amsterdam"].copy())
df_angle = pd.DataFrame(AngleDict["Amsterdam"].copy())

for key in PolDict:
    df_pol = df_pol.combine_first(PolDict[key])
    df_angle = df_angle.combine_first(AngleDict[key])

# df_angle

In [54]:
for column in df_pol:
    median_value = (df_pol[column].median(), df_angle[column].median())
    df_pol[column].fillna(value=median_value[0], inplace = True)
    df_angle[column].fillna(value=median_value[1], inplace = True)
    
# df_angle

In [60]:
cont = select_coint_rank(endog=df_pol, det_order=1, k_ar_diff=5, method='maxeig', signif=0.05)
cont.summary()

r_0,r_1,test statistic,critical value
0,1,173.1,73.94
1,2,153.7,67.9
2,3,139.0,61.81
3,4,85.48,55.73
4,5,66.72,49.59
5,6,43.87,43.42
6,7,38.15,37.16
7,8,28.02,30.82


In [59]:
VARModel = VAR(df_pol).fit(trend="ctt")
VARModel.summary()

  self._init_dates(dates, freq)


  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Tue, 10, Jan, 2023
Time:                     17:19:40
--------------------------------------------------------------------
No. of Equations:         11.0000    BIC:                    39.5351
Nobs:                     1204.00    HQIC:                   39.1289
Log likelihood:          -42046.3    FPE:                7.70774e+16
AIC:                      38.8836    Det(Omega_mle):     6.78732e+16
--------------------------------------------------------------------
Results for equation Amsterdam
                         coefficient       std. error           t-stat            prob
--------------------------------------------------------------------------------------
const                       7.030859         1.405380            5.003           0.000
trend                      -0.008013         0.003879           -2.066           0.039
trend**2                    0.00

In [15]:
WY = pd.DataFrame(np.matmul(df_pol.to_numpy(), W))

for i in range(len(df_pol.columns)):
    WY.rename(columns={i:f'{df_pol.columns[i]}'}, inplace=True)

SVAR = VAR(WY).fit(trend="ctt")
SVAR.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Tue, 10, Jan, 2023
Time:                     17:03:57
--------------------------------------------------------------------
No. of Equations:         11.0000    BIC:                    15.6834
Nobs:                     1204.00    HQIC:                   15.2773
Log likelihood:          -27687.6    FPE:                3.37505e+06
AIC:                      15.0319    Det(Omega_mle):     2.97201e+06
--------------------------------------------------------------------
Results for equation Amsterdam
                         coefficient       std. error           t-stat            prob
--------------------------------------------------------------------------------------
const                       3.241165         0.566846            5.718           0.000
trend                      -0.002213         0.001565           -1.414           0.157
trend**2                    0.00

In [68]:
WW = list()


for i in range(len(df_angle)):
    wind_direction = np.zeros((len(df_angle.columns), len(df_angle.columns)))
    
    for j in range(0, len(df_angle.columns)):
        wind_direction[j, :] = np.cos(AngleMatrix[j, :] - df_angle.iloc[i, j])

    wind_direction = wind_direction * W 
    wind_direction = np.nan_to_num(wind_direction, nan=0, posinf=0, neginf=0)
    WW.append(wind_direction)



WWY = np.zeros((len(df_pol), len(df_pol.columns)))
for i in range(len(df_pol)):
    WWY[i, :] = np.matmul(WW[i], df_pol.iloc[i, :].to_numpy().T)


WWY = pd.DataFrame(WWY)
for i in range(len(df_pol.columns)):
    WWY.rename(columns={i:f'{df_pol.columns[i]}'}, inplace=True)



# EXOG = pd.DataFrame(np.concatenate((df_pol, WWY), axis=1))
# for i in range(len(df_pol.columns)):
#     EXOG.rename(columns={i:f'{df_pol.columns[i]}'}, inplace=True)
#     EXOG.rename(columns={i + 11:f'Spatial - {df_pol.columns[i]}'}, inplace=True)



SWVAR = VAR(WWY).fit(maxlags=5, trend="ctt")
# SWVAR.summary()

In [70]:
for key in PolDict:
    R2 = skm.r2_score(SWVAR.fittedvalues[key] + SWVAR.resid[key], SWVAR.fittedvalues[key])
    print(F'The R-Squared of {key} is: {R2*100:.2f}%')

The R-Squared of Amsterdam is: 7.56%
The R-Squared of Beverwijk is: 5.16%
The R-Squared of Heemskerk is: 6.11%
The R-Squared of Wijk aan Zee is: 5.29%
The R-Squared of Velsen-Noord is: 10.75%
The R-Squared of Driehuis is: 7.17%
The R-Squared of IJmuiden is: 8.43%
The R-Squared of Velsen-Zuid is: 80.37%
The R-Squared of Zaandam is: 7.63%
The R-Squared of Koog aan de Zaan is: 31.35%
The R-Squared of Uithoorn is: 29.24%
