In [1]:
import pandas as pd
import re
import numpy as np

# Display options: show every row and column and never truncate cell contents,
# so wide frames can be inspected in full.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None is the supported "no limit" value (pandas >= 1.0); the old -1 sentinel
# was deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)


## Clean the Proximity Measures Database from Statistics Canada

In [2]:
# Load the Proximity Measures Database (PMD) CSV and preview the first rows
pmd_csv = 'PMD-en/PMD-en.csv'
df = pd.read_csv(pmd_csv)
df.head()

Unnamed: 0,DBUID,DBPOP,DAUID,DAPOP,CSDUID,CSDNAME,CSDTYPE,CSDPOP,CMAUID,CMAPUID,CMANAME,CMATYPE,CMAPOP,PRUID,PRNAME,PRPOP,lon,lat,in_db_emp,prox_idx_emp,in_db_pharma,prox_idx_pharma,in_db_childcare,prox_idx_childcare,in_db_health,prox_idx_health,in_db_grocery,prox_idx_grocery,in_db_educpri,prox_idx_educpri,in_db_educsec,prox_idx_educsec,in_db_lib,prox_idx_lib,in_db_parks,prox_idx_parks,in_db_transit,prox_idx_transit,transit_na,amenity_dense,suppressed
0,10010165001,160,10010165,506,1001519,St. John's,CY,108860,1.0,10001.0,St. John's,B,205955,10,Newfoundland and Labrador / Terre-Neuve-et-Labrador,519716,-52.7765,47.53,1,0.0202,0,0.0121,0,0.0402,0,0.0069,0,..,0,0.0384,0,0.0495,0,0.0486,0,0.0141,1,0.0058,0,0,0
1,10010165002,25,10010165,506,1001519,St. John's,CY,108860,1.0,10001.0,St. John's,B,205955,10,Newfoundland and Labrador / Terre-Neuve-et-Labrador,519716,-52.7793,47.529,1,0.0193,0,0.014,0,0.0257,0,0.0028,0,..,0,0.0562,0,0.0375,0,..,0,..,0,0.0046,0,0,0
2,10010165006,268,10010165,506,1001519,St. John's,CY,108860,1.0,10001.0,St. John's,B,205955,10,Newfoundland and Labrador / Terre-Neuve-et-Labrador,519716,-52.7768,47.5265,1,0.0199,1,0.0205,1,0.0395,1,0.007,0,..,0,0.0734,0,0.0436,0,0.0545,0,..,1,0.0101,0,0,0
3,10010165007,53,10010165,506,1001519,St. John's,CY,108860,1.0,10001.0,St. John's,B,205955,10,Newfoundland and Labrador / Terre-Neuve-et-Labrador,519716,-52.7726,47.5263,1,0.0204,0,0.0238,0,0.0425,0,0.0074,0,..,0,0.0733,0,0.0548,0,0.0796,0,0.013,0,0.0098,0,0,0
4,10010166001,71,10010166,327,1001519,St. John's,CY,108860,1.0,10001.0,St. John's,B,205955,10,Newfoundland and Labrador / Terre-Neuve-et-Labrador,519716,-52.773,47.5268,1,0.0204,0,0.0238,0,0.0425,0,0.0074,0,..,0,0.0735,0,0.0548,0,0.0796,0,0.013,0,0.0098,0,0,0


In [3]:
# Drop columns which are not needed
listofcols = ['DAUID', 'DAPOP', 'CSDUID', 'CSDNAME', 'CSDTYPE',
              'CSDPOP', 'CMAUID', 'CMAPUID', 'CMANAME', 'CMATYPE', 'CMAPOP',
              'PRUID', 'PRPOP', 'lon', 'lat', 'in_db_emp',
              'in_db_pharma', 'in_db_childcare', 'in_db_health', 'in_db_grocery', 'in_db_educpri',
              'in_db_educsec', 'in_db_lib', 'in_db_parks', 'in_db_transit', 'transit_na', 'suppressed']
# Reassign rather than drop in place, and ignore labels that are already gone,
# so re-running this cell does not raise a KeyError.
# Trade-off: a misspelled label in listofcols is silently skipped.
df = df.drop(columns=listofcols, errors='ignore')

In [4]:
# Keep only the rows whose province name is Ontario
mask = df['PRNAME'].eq('Ontario')
df_on = df.loc[mask]

In [6]:
# Count the Ontario rows whose employment proximity index is flagged 'F'
is_flagged = df_on['prox_idx_emp'] == 'F'
int(is_flagged.sum())

503

In [9]:
# See https://www150.statcan.gc.ca/n1/pub/71-607-x/71-607-x2020011-eng.htm for details
# Remove the rows flagged F in prox_idx_emp (503 rows in the Ontario subset)
# F = too unreliable to be published
# .copy() gives df_on its own data, so the column assignments below write to a
# real frame rather than a slice of df (avoids SettingWithCopyWarning).
df_on = df_on[df_on['prox_idx_emp'] != 'F'].copy()

# Proximity index columns
list_prox = ['prox_idx_emp', 'prox_idx_pharma', 'prox_idx_childcare',
             'prox_idx_health', 'prox_idx_grocery', 'prox_idx_educpri',
             'prox_idx_educsec', 'prox_idx_lib', 'prox_idx_parks',
             'prox_idx_transit']

# Replace .. entries (not available for a specific reference period) with NaN,
# then convert each proximity column to a numeric dtype.
# pd.to_numeric raises if any other non-numeric marker is still present.
for col in list_prox:
    df_on[col] = pd.to_numeric(df_on[col].replace('..', np.nan))

# Remove commas from population column.
# regex=False makes the literal match explicit instead of relying on the
# version-dependent default of str.replace.
# NOTE(review): DBPOP is left as a string column, as before; confirm whether
# it should also be converted to a numeric dtype.
list_pop = ['DBPOP']
for col in list_pop:
    df_on[col] = df_on[col].str.replace(',', '', regex=False)

# Convert UID column to string (necessary for use in QGIS)
df_on['DBUID'] = df_on['DBUID'].astype(str)

print(df_on.dtypes) # check datatypes

DBUID                 object 
DBPOP                 object 
PRNAME                object 
prox_idx_emp          float64
prox_idx_pharma       float64
prox_idx_childcare    float64
prox_idx_health       float64
prox_idx_grocery      float64
prox_idx_educpri      float64
prox_idx_educsec      float64
prox_idx_lib          float64
prox_idx_parks        float64
prox_idx_transit      float64
amenity_dense         object 
dtype: object


In [10]:
# Write the cleaned Ontario proximity table to CSV (index omitted) for use in QGIS
qgis_csv = r'proximity_data_prep_forQGIS.csv'
df_on.to_csv(qgis_csv, index=False)