In [1]:
!pip install geopandas

Collecting geopandas
[?25l  Downloading https://files.pythonhosted.org/packages/f7/a4/e66aafbefcbb717813bf3a355c8c4fc3ed04ea1dd7feb2920f2f4f868921/geopandas-0.8.1-py2.py3-none-any.whl (962kB)
[K     |████████████████████████████████| 972kB 8.3MB/s 
[?25hCollecting pyproj>=2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/e5/c3/071e080230ac4b6c64f1a2e2f9161c9737a2bc7b683d2c90b024825000c0/pyproj-2.6.1.post1-cp36-cp36m-manylinux2010_x86_64.whl (10.9MB)
[K     |████████████████████████████████| 10.9MB 17.3MB/s 
Collecting fiona
[?25l  Downloading https://files.pythonhosted.org/packages/ec/20/4e63bc5c6e62df889297b382c3ccd4a7a488b00946aaaf81a118158c6f09/Fiona-1.8.13.post1-cp36-cp36m-manylinux1_x86_64.whl (14.7MB)
[K     |████████████████████████████████| 14.7MB 322kB/s 
Collecting click-plugins>=1.0
  Downloading https://files.pythonhosted.org/packages/e9/da/824b92d9942f4e472702488857914bdd50f73021efea15b4cad9aca8ecef/click_plugins-1.1.1-py2.py3-none-any.whl
Collecting 

In [2]:
import io
import os
from google.colab import drive
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from shapely.geometry import Point, Polygon

  import pandas.util.testing as tm


### 1) Connect to the Drive

In [3]:
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### 2) Download and save the 2016 Census at the 3 digit zip code level  [(2016 census CANSIM)](https://www12.statcan.gc.ca/census-recensement/2016/dp-pd/prof/details/download-telecharger/comp/GetFile.cfm?Lang=E&FILETYPE=CSV&GEONO=046)

In [4]:
# List of variables that we want (age, income, total population)
vars = [1,9,13,24,696,697,698,699,700,701,702,703,704,705] 

appended_data = []
filename = '/content/drive/My Drive/Data/YCBS-299/98-401-X2016046_English_CSV_data.csv'
chunksize=10000

# We filter with "H" as a first letter of the zipcode to keep only codes for Montreal
for chunk in pd.read_csv(filename, chunksize=chunksize):
    filtered = chunk[chunk['GEO_CODE (POR)'].str.slice(stop=1)=='H']
    filtered = filtered[filtered['Member ID: Profile of Forward Sortation Areas (2247)'].isin(vars)]
    appended_data.append(filtered)
    
# See pd.concat documentation for more info
cansim = pd.concat(appended_data)
cansim = cansim[['GEO_CODE (POR)','DIM: Profile of Forward Sortation Areas (2247)','Dim: Sex (3): Member ID: [1]: Total - Sex']]
cansim['DIM: Profile of Forward Sortation Areas (2247)'] = cansim['DIM: Profile of Forward Sortation Areas (2247)'].str.replace("$","", regex=True).replace(",","", regex=True).replace(" ","", regex=True)
cansim_wide = cansim.pivot(index='GEO_CODE (POR)', columns='DIM: Profile of Forward Sortation Areas (2247)', values='Dim: Sex (3): Member ID: [1]: Total - Sex')

# Check data
cansim_wide = cansim_wide.reset_index()
cansim_wide.head(5)

DIM: Profile of Forward Sortation Areas (2247),0to14years,100000andover,10000to19999,15to64years,20000to29999,30000to39999,40000to49999,50000to59999,60000to69999,65yearsandover,70000to79999,80000to89999,90000to99999,Population2016
GEO_CODE (POR),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
H0M,375,5,115,715,90,65,35,45,25,110,10,10,5,1202
H1A,4665,950,4320,21785,3775,3610,3470,2520,1635,6065,1240,700,460,32516
H1B,2945,330,3215,13530,2670,2500,2040,1465,890,3690,560,340,190,20160
H1C,2930,545,1655,10450,1350,1350,1360,1085,730,1300,555,330,240,14678
H1E,6775,1160,7055,27410,6035,4710,3885,2630,1690,8235,1180,730,475,42420


### 3) Download and save the boundary file for 2016 Census  [(Forward Sortation Area)](https://www12.statcan.gc.ca/census-recensement/alternative_alternatif.cfm?l=eng&dispext=zip&teng=lfsa000b16a_e.zip&k=%20%20%20%2044221&loc=http://www12.statcan.gc.ca/census-recensement/2011/geo/bound-limit/files-fichiers/2016/lfsa000b16a_e.zip)

In [7]:
# Read shape file
gdf = gpd.read_file('/content/drive/My Drive/Data/YCBS-299/lfsa000b16a_e.shp')

In [8]:
# Let's take a copy of our layer
data_proj = gdf.copy()

# Reproject the geometries by replacing the values with projected ones (we want long-lat and the original file was in Lambert)
data_proj = data_proj.to_crs(epsg=4326)

In [9]:
# Create point in the middle of the centroid
demo_long_lat = data_proj.copy()
demo_long_lat['Longitude'] = data_proj.geometry.centroid.x
demo_long_lat['Latitude']  = data_proj.geometry.centroid.y


  This is separate from the ipykernel package so we can avoid doing imports until

  after removing the cwd from sys.path.


In [33]:
# Merge demo info with coordinates
census_regions = pd.merge(cansim_wide, demo_long_lat,  how='left', left_on=['GEO_CODE (POR)'], right_on = ['CFSAUID'])
census_regions = census_regions.drop(['CFSAUID', 'PRUID', 'PRNAME', 'Longitude', 'Latitude'], axis=1)
census_regions.rename({'GEO_CODE (POR)': 'GEO3_Code', '0to14years':'age_0_14', '100000andover':'Income_100k', '10000to19999':'Income_10k',
                       '15to64years':'age_15_64', '20000to29999':'Income_20k', '30000to39999':'Income_30k', '40000to49999':'Income_40k',
                       '50000to59999':'Income_50k', '60000to69999':'Income_60k', '65yearsandover':'age_65', '70000to79999':'Income_70k',
                       '80000to89999':'Income_80k', '90000to99999':'Income_90k', 'Population2016':'Population2016', 'geometry':'Geometry'})

Unnamed: 0,GEO_CODE (POR),0to14years,100000andover,10000to19999,15to64years,20000to29999,30000to39999,40000to49999,50000to59999,60000to69999,65yearsandover,70000to79999,80000to89999,90000to99999,Population2016,geometry
0,H0M,375,5,115,715,90,65,35,45,25,110,10,10,5,1202,"MULTIPOLYGON (((-74.51960 45.03463, -74.51939 ..."
1,H1A,4665,950,4320,21785,3775,3610,3470,2520,1635,6065,1240,700,460,32516,"MULTIPOLYGON (((-73.47668 45.70214, -73.47722 ..."
2,H1B,2945,330,3215,13530,2670,2500,2040,1465,890,3690,560,340,190,20160,"POLYGON ((-73.50219 45.65192, -73.50231 45.651..."
3,H1C,2930,545,1655,10450,1350,1350,1360,1085,730,1300,555,330,240,14678,"POLYGON ((-73.50900 45.69807, -73.50900 45.698..."
4,H1E,6775,1160,7055,27410,6035,4710,3885,2630,1690,8235,1180,730,475,42420,"MULTIPOLYGON (((-73.57704 45.62832, -73.57717 ..."


### 4) Save to csv. The rest is done in Alteryx

In [34]:
census_regions.to_csv('/content/drive/My Drive/Data/YCBS-299/demographics_code.csv', index=False)