In [102]:
'''
These scripts prepare raw downloaded CSV files for use with Open Tabulate.
They take the raw directory as input and output to the processed directory.
The following steps are taken:
- Format latitude and longitude as separate columns
- Remove / replace characters that are out of place
- Convert format to CSV
- Encode CSV with utf-8
- Filter out data not related to health (e.g. police stations)

-Sam Lumley
Dec 2021

'''

'\nThese scripts prepare raw downloaded CSV files for use with Open Tabulate.\nThey take the raw directory as input and output to the processed directory.\nThe following steps are taken:\n- Format latitude and longitude as separate columns\n- Remove / replace characters that are out of place\n- Convert format to CSV\n- Encode CSV with utf-8\n- Filter out data not related to health (e.g. police stations)\n\n-Sam Lumley\nDec 2021\n\n'

In [103]:
import pandas as pd

# If necessary, install openpyxl and geopandas
#import sys
#!conda install --yes --prefix {sys.prefix} openpyxl

#%pip install geopandas

In [104]:
# If necessary, generate province folder structure

# import os

# folders=['ab','bc','mb','nb','nl','ns','nt','nu','on','pe','qc','sk','yt']

# for items in folders:
#     os.mkdir(items)


In [105]:
import os
from pathlib import Path
import shutil

# Rebuild the 'processed' working copy from 'raw', then keep only the CSV files in it.
src = '/home/jovyan/data-vol-1/ODHF/LODE-ECDO/scripts/HealthFacilities/V2/1-PreProcessing/raw'
dst = '/home/jovyan/data-vol-1/ODHF/LODE-ECDO/scripts/HealthFacilities/V2/1-PreProcessing/processed'

# copytree() needs a destination that does not exist yet, so clear any previous run first.
if os.path.exists(dst):
    shutil.rmtree(dst)
# Copy unconditionally. Previously the copy only happened when dst already existed,
# so a first run left dst missing and the os.listdir(dst) below failed.
shutil.copytree(src, dst)

files_in_directory = os.listdir(dst)
# Non-CSV inputs (.geojson, .xlsx, ...) are converted by later cells, so drop their copies.
filtered_files = [file for file in files_in_directory if not file.endswith(".csv")]
for file in filtered_files:
    path_to_file = os.path.join(dst, file)
    try:
        os.remove(path_to_file)
    except OSError as err:
        # Still best effort (e.g. the entry is a sub-directory), but report it
        # instead of silently swallowing every possible exception.
        print("Could not remove", path_to_file, "-", err)

In [106]:
# AB Calgary covid 
# convert point to lat lon
# ignore empty rows

import pandas as pd

df=pd.read_csv('raw/AB_Calgary_covid_vaccination_centres.csv')

def strip_point(x):
    """Split a ``POINT (a b)`` value into its whitespace-separated tokens.

    The value is converted with ``str()`` first. A value whose text is
    ``'nan'`` (a missing cell) yields ``[None, None]``. Otherwise the
    characters ``P O I N T ( space`` are trimmed from both ends, any trailing
    ``)`` is removed, and the remainder is split on whitespace.

    Returns a list of strings (or ``[None, None]`` for a missing value).
    """
    text = str(x)
    if text == 'nan':
        return [None, None]
    return text.strip('POINT (').rstrip(')').split()

# Parse every POINT value once; the first token is kept as longitude, the second as latitude.
parsed = [strip_point(value) for value in df["POINT"]]
LONGS = [tokens[0] for tokens in parsed]
LATS = [tokens[1] for tokens in parsed]

df["LONGITUDE"] = LONGS
df["LATITUDE"] = LATS

df.to_csv('processed/AB_Calgary_covid_vaccination_centres.csv')

In [107]:
# AB Calgary healthcare 
# point to lat lon

import pandas as pd

df=pd.read_csv('raw/AB_Calgary_healthcare_facilities.csv')

def strip_point(x):
    """Split a ``POINT (a b)`` value into its two coordinate tokens.

    Parameters
    ----------
    x : object
        Cell value; converted with ``str()`` before parsing.

    Returns
    -------
    list
        The whitespace-separated tokens found between the parentheses, as
        strings, or ``[None, None]`` when the value is missing (``None``,
        NaN or an empty string). Missing values previously produced a
        one-element list, so the caller's ``[1]`` lookup raised IndexError.
    """
    # pandas loads an empty CSV cell as float NaN, whose str() is 'nan'.
    if x is None or str(x) in ('nan', 'None', ''):
        return [None, None]
    text = str(x).strip()
    # Remove the literal prefix. str.strip('POINT (') treats its argument as a
    # set of characters to trim from both ends, not as a prefix.
    if text.startswith('POINT'):
        text = text[len('POINT'):]
    return text.strip().lstrip('(').rstrip(')').split()

# Parse each "location" value once. The second token is stored as longitude and the
# first as latitude -- assumes the column lists latitude first; TODO confirm against the data.
LONGS, LATS = [], []
for raw_value in df["location"]:
    tokens = strip_point(raw_value)
    LONGS.append(tokens[1])
    LATS.append(tokens[0])

df["LONGITUDE"] = LONGS
df["LATITUDE"] = LATS

df.to_csv('processed/AB_Calgary_healthcare_facilities.csv')

In [108]:
# utf-8 encode datasets

# Re-save each of these CSVs through pandas so the copy in 'processed' is written as UTF-8.
# One loop replaces six copy-pasted read/write pairs (which also reused the name df5
# for two unrelated datasets).
utf8_datasets = [
    'BC_emergency_rooms.csv',
    'BC_hospital.csv',
    'BC_pharmacies.csv',
    'BC_urgent_care.csv',
    'BC_walk-in_clinics.csv',
    'ON_Guelph_healthcare_facilities.csv',
]

for dataset in utf8_datasets:
    df = pd.read_csv('raw/' + dataset)
    # UTF-8 is already to_csv's default; spelling it out documents the purpose of this cell.
    df.to_csv('processed/' + dataset, encoding='utf-8')

In [109]:
# Manitoba

# read geojson, output csv
# no filtering by facility type is applied: every vaccination site is kept
# rows with a null geometry get empty LONGITUDE / LATITUDE values

import geopandas as gpd
import pandas as pd

#read geojson with geopandas into geodataframe
sc1=gpd.read_file('raw/MB_COVID-19_and_Flu_-_Vaccination_Sites.geojson')

df=pd.DataFrame(sc1)


def strip_point(x):
    """Split the text form of a point geometry into whitespace-separated tokens.

    The value is converted with ``str()``. Text equal to ``'nan'`` or
    ``'None'`` (a missing geometry) yields ``[None, None]``. Otherwise the
    characters ``P O I N T ( space`` are trimmed from both ends, trailing
    ``)`` characters are removed, and the rest is split on whitespace.
    """
    text = str(x)
    if text in ('nan', 'None'):
        return [None, None]
    trimmed = text.strip('POINT (')
    return trimmed.rstrip(')').split()


# Parse each geometry once; the first token is kept as longitude, the second as latitude.
parsed = [strip_point(geom) for geom in df.geometry]
LONGS = [tokens[0] for tokens in parsed]
LATS = [tokens[1] for tokens in parsed]

df["LONGITUDE"] = LONGS
df["LATITUDE"] = LATS

df.to_csv('processed/MB_covid-19_and_flu_vaccine_sites.csv')


In [110]:
# New Brunswick
# replace characters that would not show up correctly in the processed dataset
# convert point to lat lon

df = pd.read_csv('raw/NB_nursing_homes.csv')

# Garbled character sequences and their replacements, applied in this order
# as regex substitutions over the whole frame.
character_fixes = [
    ('Ã¨', 'è'),
    ('Ã´', 'ô'),
    ('Ã‰', 'É'),
    ('Â', ''),
]
for garbled, repaired in character_fixes:
    df = df.replace(garbled, repaired, regex=True)

def strip_point(x):
    """Split a ``(a, b)`` location value into its two coordinate tokens.

    Parameters
    ----------
    x : object
        Cell value, expected to look like ``"(46.09, -64.77)"``.

    Returns
    -------
    list
        The tokens as strings with parentheses and commas removed, or
        ``[None, None]`` when the value is missing (``None``, NaN or an
        empty string). A NaN cell previously raised AttributeError because
        ``.strip`` was called directly on a float.
    """
    # pandas loads an empty CSV cell as float NaN, whose str() is 'nan'.
    if x is None or str(x) in ('nan', 'None', ''):
        return [None, None]
    return str(x).strip('(').rstrip(')').replace(",", "").split()

# Parse each "Location" value once; the second token is stored as longitude,
# the first as latitude.
LONGS, LATS = [], []
for raw_value in df["Location"]:
    tokens = strip_point(raw_value)
    LONGS.append(tokens[1])
    LATS.append(tokens[0])

df["LONGITUDE"] = LONGS
df["LATITUDE"] = LATS

df.to_csv('processed/NB_nursing_homes.csv')

In [111]:
# NL healthcare and hospital datasets
# convert from xls to csv

import openpyxl

# Read the hospital workbook with the openpyxl engine and write it back out as CSV.
df = pd.read_excel("raw/NL_Hospital.xlsx", engine='openpyxl')
df.to_csv('processed/NL_hospital.csv')

# Same conversion for the health-centre workbook.
# NOTE(review): this input is named .xls but is read with openpyxl, which targets the
# newer .xlsx format -- presumably the file is really an .xlsx workbook; confirm.
df2 = pd.read_excel("raw/NL_HealthCentre.xls", engine='openpyxl')
df2.to_csv('processed/NL_healthcare_facilities.csv')


In [112]:
# Processing for Nova Scotia
# Convert 'POINT' Geometry into lat and lon

import pandas as pd

df=pd.read_csv('raw/NS_hospital.csv')

def strip_point(x):
    """Split a ``POINT (a b)`` value into its two coordinate tokens.

    Parameters
    ----------
    x : object
        Cell value; converted with ``str()`` before parsing.

    Returns
    -------
    list
        The whitespace-separated tokens between the parentheses, as strings,
        or ``[None, None]`` for a missing value (``None``, NaN or an empty
        string). Missing values previously came back as a one-element list,
        which made the caller's ``[1]`` lookup raise IndexError.
    """
    # pandas loads an empty CSV cell as float NaN, whose str() is 'nan'.
    if x is None or str(x) in ('nan', 'None', ''):
        return [None, None]
    text = str(x).strip()
    # Remove the literal prefix; str.strip('POINT (') would trim a set of
    # characters from both ends rather than a prefix.
    if text.startswith('POINT'):
        text = text[len('POINT'):]
    return text.strip().lstrip('(').rstrip(')').split()

# Parse each "the_geom" value once; the first token is kept as longitude,
# the second as latitude.
parsed = [strip_point(value) for value in df["the_geom"]]
LONGS = [tokens[0] for tokens in parsed]
LATS = [tokens[1] for tokens in parsed]

df["LONGITUDE"] = LONGS
df["LATITUDE"] = LATS

df.to_csv('processed/NS_hospital.csv')


In [113]:
# Processing for PEI Health Facilities data
# Turn the "Location 1" column into longitude and latitude

df=pd.read_csv('raw/PE_healthcare_facilities.csv')

# df=df.loc[df["Facility Type"] != "Public Nursing Home"]

def strip_point(x):
    """Split a ``(a, b)`` location value into its two coordinate tokens.

    Parameters
    ----------
    x : object
        Cell value, expected to look like ``"(46.23, -63.13)"``.

    Returns
    -------
    list
        The tokens as strings with parentheses and commas removed, or
        ``[None, None]`` for a missing value (``None``, NaN or an empty
        string). A NaN cell previously raised AttributeError because
        ``.strip`` was called directly on a float.
    """
    # pandas loads an empty CSV cell as float NaN, whose str() is 'nan'.
    missing = x is None or str(x) in ('nan', 'None', '')
    if missing:
        return [None, None]
    inner = str(x).strip('(').rstrip(')')
    return inner.replace(",", "").split()

# Parse each "Location 1" value once; the second token is stored as longitude,
# the first as latitude.
parsed = [strip_point(value) for value in df["Location 1"]]
LONGS = [tokens[1] for tokens in parsed]
LATS = [tokens[0] for tokens in parsed]

df["LONGITUDE"] = LONGS
df["LATITUDE"] = LATS

df.to_csv('processed/PE_healthcare_facilities.csv')

In [114]:
# Processing for NB Moncton 

# read geojson, output csv
# no filtering is applied: all clinics, pharmacies and senior care facilities are kept
# also turn the 'POINT' geometry into lat and lon

import geopandas as gpd
import pandas as pd

# read each GeoJSON file with geopandas into a GeoDataFrame

def strip_point(x):
    """Split the text form of a point geometry into its two coordinate tokens.

    Returns the whitespace-separated tokens between the parentheses as
    strings, or ``[None, None]`` when the geometry is missing (``None``,
    NaN or an empty string) so that one null feature does not abort the file.
    """
    if x is None or str(x) in ('nan', 'None', ''):
        return [None, None]
    text = str(x).strip()
    # Remove the literal prefix; str.strip('POINT (') would trim a set of
    # characters from both ends rather than a prefix.
    if text.startswith('POINT'):
        text = text[len('POINT'):]
    return text.strip().lstrip('(').rstrip(')').split()


files = ['raw/NB_Moncton_medical_clinics', 'raw/NB_Moncton_pharmacies', 'raw/NB_Moncton_senior_care_facilities']

# The helper is defined once above instead of being re-created on every iteration.
for file in files:

    filename = file + '.geojson'
    sc1 = gpd.read_file(filename)

    df = pd.DataFrame(sc1)

    # Parse each geometry once; first token -> longitude, second -> latitude.
    parsed = [strip_point(geom) for geom in df.geometry]
    LONGS = [tokens[0] for tokens in parsed]
    LATS = [tokens[1] for tokens in parsed]

    df["LONGITUDE"] = LONGS
    df["LATITUDE"] = LATS

    # Drop the leading 'raw/' by slicing. str.strip('raw/') removes any of the
    # characters r, a, w, / from BOTH ends, so it would also eat the tail of a
    # dataset name that happens to end in one of those letters.
    stem = file[len('raw/'):] if file.startswith('raw/') else file
    location = 'processed/' + stem + '.csv'
    df.to_csv(location)

In [115]:
# ON Durham, Toronto clinics, Toronto covid test sites and York hospitals

# read geojson, output csv
# no filtering is applied: every feature in each file is kept
# also turn the 'POINT' geometry into lat and lon


import geopandas as gpd
import pandas as pd

# read each GeoJSON file with geopandas into a GeoDataFrame

def strip_point(x):
    """Split the text form of a point geometry into its two coordinate tokens.

    Returns the whitespace-separated tokens between the parentheses as
    strings, or ``[None, None]`` when the geometry is missing (``None``,
    NaN or an empty string) so that one null feature does not abort the file.
    """
    if x is None or str(x) in ('nan', 'None', ''):
        return [None, None]
    text = str(x).strip()
    # Remove the literal prefix; str.strip('POINT (') would trim a set of
    # characters from both ends rather than a prefix.
    if text.startswith('POINT'):
        text = text[len('POINT'):]
    return text.strip().lstrip('(').rstrip(')').split()


files = ['raw/ON_Durham_healthcare_facilities', 'raw/ON_Toronto_covid-19-immunization-clinics', 'raw/ON_Toronto_covid-19-testing-sites', 'raw/ON_York_hospital']

# The helper is defined once above instead of being re-created on every iteration.
for file in files:

    filename = file + '.geojson'
    sc1 = gpd.read_file(filename)

    df = pd.DataFrame(sc1)

    # Parse each geometry once; first token -> longitude, second -> latitude.
    LONGS, LATS = [], []
    for geom in df.geometry:
        tokens = strip_point(geom)
        LONGS.append(tokens[0])
        LATS.append(tokens[1])

    df["LONGITUDE"] = LONGS
    df["LATITUDE"] = LATS

    # Drop the leading 'raw/' by slicing. str.strip("raw/") removes any of the
    # characters r, a, w, / from BOTH ends, so it would also eat the tail of a
    # dataset name that happens to end in one of those letters.
    stem = file[len('raw/'):] if file.startswith('raw/') else file
    location = 'processed/' + stem + '.csv'
    df.to_csv(location)

In [116]:
# QC Gatineau 
# convert POINT to lat lon

import pandas as pd

df=pd.read_csv('raw/QC_Gatineau_public_places_including_hospitals.csv')

def strip_point(x):
    """Split a ``POINT (a b)`` value into its two coordinate tokens.

    Parameters
    ----------
    x : object
        Cell value; converted with ``str()`` before parsing.

    Returns
    -------
    list
        The whitespace-separated tokens between the parentheses, as strings,
        or ``[None, None]`` for a missing value (``None``, NaN or an empty
        string). Missing values previously came back as a one-element list,
        which made the caller's ``[1]`` lookup raise IndexError.
    """
    # pandas loads an empty CSV cell as float NaN, whose str() is 'nan'.
    if x is None or str(x) in ('nan', 'None', ''):
        return [None, None]
    text = str(x).strip()
    # Remove the literal prefix; str.strip('POINT (') would trim a set of
    # characters from both ends rather than a prefix.
    if text.startswith('POINT'):
        text = text[len('POINT'):]
    return text.strip().lstrip('(').rstrip(')').split()

# Parse each GEOM value once; the second token is stored as longitude, the first as latitude.
parsed = [strip_point(value) for value in df["GEOM"]]
LONGS = [tokens[1] for tokens in parsed]
LATS = [tokens[0] for tokens in parsed]

df["LONGITUDE"] = LONGS
df["LATITUDE"] = LATS

# Keep only the rows whose TYPE is one of these values, then renumber the index.
kept_types = [
    'C.L.S.C.',
    'C.H.S.L.D.',
    'Résidence pour personnes âgées',
    'Centre hospitalier',
    'Autres services médicaux et de santé',
    'Résidence pour personnes âgées(Type "Familiale")',
]
df2 = df[df['TYPE'].isin(kept_types)]
df3 = df2.reset_index(drop=True)

df3.to_csv('processed/QC_Gatineau_public_places_including_hospitals.csv')

In [118]:
# QC Quebec city 
# convert POINT to lat lon

import pandas as pd

df=pd.read_csv('raw/QC_Quebec_City_public_places.csv')

def strip_point(x):
    """Split a ``POINT (a b)`` value into its two coordinate tokens.

    Parameters
    ----------
    x : object
        Cell value; converted with ``str()`` before parsing.

    Returns
    -------
    list
        The whitespace-separated tokens between the parentheses, as strings,
        or ``[None, None]`` for a missing value (``None``, NaN or an empty
        string). Missing values previously came back as a one-element list,
        which made the caller's ``[1]`` lookup raise IndexError.
    """
    # pandas loads an empty CSV cell as float NaN, whose str() is 'nan'.
    value_is_missing = x is None or str(x) in ('nan', 'None', '')
    if value_is_missing:
        return [None, None]
    text = str(x).strip()
    # Remove the literal prefix; str.strip('POINT (') would trim a set of
    # characters from both ends rather than a prefix.
    if text.startswith('POINT'):
        text = text[len('POINT'):]
    return text.strip().lstrip('(').rstrip(')').split()

# Parse each GEOMETRIE value once; the second token is stored as longitude,
# the first as latitude.
LONGS, LATS = [], []
for raw_value in df["GEOMETRIE"]:
    tokens = strip_point(raw_value)
    LONGS.append(tokens[1])
    LATS.append(tokens[0])

df["LONGITUDE"] = LONGS
df["LATITUDE"] = LATS

# Keep only the rows described as 'Hôpitaux', then renumber the index.
is_hospital = df['DESCRIPTION'] == 'Hôpitaux'
df2 = df[is_hospital]
df3 = df2.reset_index(drop=True)

df3.to_csv('processed/QC_Quebec_City_public_places.csv')

In [119]:
# transfer files directly from PreProcessing/processed to opentabulate/data/input

import os
from pathlib import Path
import shutil

# Mirror the processed CSVs into the OpenTabulate input folder.
src = '/home/jovyan/data-vol-1/ODHF/LODE-ECDO/scripts/HealthFacilities/V2/1-PreProcessing/processed'
dst = '/home/jovyan/data-vol-1/ODHF/LODE-ECDO/scripts/HealthFacilities/V2/2-OpenTabulate/opentabulate/data/input'

# copytree() needs a destination that does not exist yet, so clear any previous copy first.
if os.path.exists(dst):
    shutil.rmtree(dst)
# Copy unconditionally. Previously the copy only happened when dst already
# existed, so a first run silently transferred nothing.
shutil.copytree(src, dst)

In [123]:
# Check whether all folders contain the same number of files (datasets). The output folder would show a different count, since some files are dropped directly there.

import os

raw = '/home/jovyan/data-vol-1/ODHF/LODE-ECDO/scripts/HealthFacilities/V2/1-PreProcessing/raw'
pro = '/home/jovyan/data-vol-1/ODHF/LODE-ECDO/scripts/HealthFacilities/V2/1-PreProcessing/processed'
input_ = '/home/jovyan/data-vol-1/ODHF/LODE-ECDO/scripts/HealthFacilities/V2/2-OpenTabulate/opentabulate/data/input'

# Print the number of entries in each stage's folder, in pipeline order.
for folder in (raw, pro, input_):
    print(len(os.listdir(folder)))


39
39
39
