In [4]:
########################
### Import Libraries ###
########################

import pandas as pd
import geopandas as gpd
import matplotlib as plt
import numpy as np

## Adjacency Matrix of Dutch Municipalities

In [5]:
#####################################
### Load ShapeFile Municipalities ###
#####################################

# The municipality geometries come from the CBS "Wijk- en Buurtkaart 2020":
# https://www.cbs.nl/nl-nl/dossier/nederland-regionaal/geografische-data/wijk-en-buurtkaart-2020

path_cbs_municipality_shapefile = r'C:\Users\matsu\Desktop\Universiteit\MSc Statistics & Data Science\Year 3\Semester 1\Mandatory Courses\Urban Computing\Project\Code\Cloned Repositories\EpiGNN\data\WijkBuurtkaart_2020_v3\gemeente_2020_v3.shp'

# Read every record of the shapefile into a GeoDataFrame
gdf = gpd.read_file(path_cbs_municipality_shapefile)

# Keep only rows whose H2O flag is neither 'JA' (duplicate municipality entries
# that include water) nor 'B' (the small Belgian enclave in Noord-Brabant),
# then renumber the remaining rows from 0.
keep_rows = ~gdf['H2O'].isin(['JA', 'B'])
gdf = gdf.loc[keep_rows].reset_index(drop=True)

In [6]:
###############################
### Create Adjacency Matrix ###
###############################

# Two municipalities count as adjacent when their geometries "touch", i.e. they
# share at least one boundary point while their interiors do not intersect.
# NOTE(review): `touches` is strict -- if neighbouring polygons overlap even
# slightly it returns False. Confirm the CBS geometries are topologically clean.

municipality_names = gdf['GM_NAAM']
municipality_geometries = gdf.geometry

adjacency_dict = {}

# One vectorised `touches` call per municipality replaces the nested iterrows()
# loops. The same pairs are tested, without a Python-level call per pair.
for position, (name, geometry) in enumerate(zip(municipality_names, municipality_geometries)):
    touches_mask = municipality_geometries.touches(geometry).to_numpy().copy()
    touches_mask[position] = False  # a municipality is never its own neighbour
    adjacency_dict[name] = municipality_names[touches_mask].tolist()

# Create a DataFrame to represent the adjacency matrix.
# It is built directly from the scalar 0: an empty frame starts out as all-NaN,
# so requesting dtype=int there and filling afterwards does not reliably yield
# an integer matrix.
adjacency_matrix = pd.DataFrame(0, index=municipality_names, columns=municipality_names, dtype=int)

# Populate the adjacency matrix: row = municipality, columns = its neighbours
for municipality, neighbors in adjacency_dict.items():
    if neighbors:  # municipalities without any touching neighbour keep an all-zero row
        adjacency_matrix.loc[municipality, neighbors] = 1

In [35]:
#############################
### Save Adjacency Matrix ###
#############################

# Write the matrix to a CSV file; the relative file name is resolved against
# the current working directory. Row labels and column headers are included.
adjacency_matrix.to_csv(path_or_buf='adjacency_matrix.csv')

## Dataset COVID-19 Infections per Municipality

In [8]:
####################
### Load Dataset ###
####################

# Per-municipality daily COVID-19 counts; the file uses ';' as field separator.
path_covid_daily_infections = r'C:\Users\matsu\Desktop\Universiteit\MSc Statistics & Data Science\Year 3\Semester 1\Mandatory Courses\Urban Computing\Project\Code\Cloned Repositories\EpiGNN\data\COVID-19_aantallen_gemeente_per_dag_tm_03102021.csv'
df_covid_daily_infections_full = pd.read_csv(
    path_covid_daily_infections,
    sep=';',
)

In [9]:
#########################################
### Clean COVID-19 Infections Dataset ###
#########################################

# Remove redundant columns
columns_to_keep = ["Date_of_publication", "Municipality_code", "Municipality_name", "Province", "Total_reported", "Deceased"]
df_covid_daily_infections = df_covid_daily_infections_full[columns_to_keep]

# Check for NaNs in Total_reported and Deceased.
# The counts are printed explicitly: a bare expression in the middle of a cell
# is evaluated and then discarded, so it would never be shown.
nan_mask_total_reported = df_covid_daily_infections['Total_reported'].isna()
nan_mask_total_deceased = df_covid_daily_infections['Deceased'].isna()
print(
    'NaNs in Total_reported:', int(nan_mask_total_reported.sum()),
    '| NaNs in Deceased:', int(nan_mask_total_deceased.sum()),
)

# Remove those rows for which the Municipality is unknown
df_covid_daily_infections = df_covid_daily_infections.dropna(subset=['Municipality_name'])
nan_mask_total_municipality = df_covid_daily_infections['Municipality_name'].isna()
assert not nan_mask_total_municipality.any(), "rows with unknown municipality remain after dropna"

# Sort by publication date, then alphabetically by municipality name within
# each date, and renumber the rows from 0.
df_covid_daily_infections = (
    df_covid_daily_infections
    .sort_values(by=['Date_of_publication', 'Municipality_name'])
    .reset_index(drop=True)
)

# Number of distinct municipality names left in the dataset
unique_names_count = df_covid_daily_infections['Municipality_name'].nunique()
print(unique_names_count)

344


**NEXT** 

[1] Check if the adjacency matrix has the same municipalities as the dataset

[2] Convert the .csv file to a .txt file, as in the Spain example

[3] Create an additional file that lists the municipalities, as in the Spain example