# EDA pour identifier la structure des données Open-Source de MTL

In [1]:
# Install geopandas into the runtime; the import cell below needs it (`import geopandas as gpd`).
!pip install geopandas

Collecting geopandas
[?25l  Downloading https://files.pythonhosted.org/packages/f7/a4/e66aafbefcbb717813bf3a355c8c4fc3ed04ea1dd7feb2920f2f4f868921/geopandas-0.8.1-py2.py3-none-any.whl (962kB)
[K     |████████████████████████████████| 972kB 2.7MB/s 
[?25hCollecting fiona
[?25l  Downloading https://files.pythonhosted.org/packages/ec/20/4e63bc5c6e62df889297b382c3ccd4a7a488b00946aaaf81a118158c6f09/Fiona-1.8.13.post1-cp36-cp36m-manylinux1_x86_64.whl (14.7MB)
[K     |████████████████████████████████| 14.7MB 289kB/s 
Collecting pyproj>=2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/e5/c3/071e080230ac4b6c64f1a2e2f9161c9737a2bc7b683d2c90b024825000c0/pyproj-2.6.1.post1-cp36-cp36m-manylinux2010_x86_64.whl (10.9MB)
[K     |████████████████████████████████| 10.9MB 40.4MB/s 
Collecting munch
  Downloading https://files.pythonhosted.org/packages/cc/ab/85d8da5c9a45e072301beb37ad7f833cd344e04c817d97e0cc75681d248f/munch-2.5.0-py2.py3-none-any.whl
Collecting click-plugins>=1.0
  

In [2]:
import pandas as pd
import io
import os
from google.colab import drive
import geopandas as gpd
from shapely.geometry import Point, Polygon
import numpy as np

### 1) Connect to Google Drive and load the spatial layers

In [3]:
# Mount Google Drive at /content/drive so the files under "My Drive" can be read and written.
# This is interactive: it prompts for an authorization code (see the output below).
# force_remount=True presumably remounts even when the drive is already mounted — confirm in the Colab docs.
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
# Load the two GeoJSON polygon layers from the Montreal open-data files on Drive:
# the "quartiers-socio" layer and the fire-station ("casernes") administrative territories.
spatial_boroughs = gpd.read_file(
  '/content/drive/My Drive/Data/YCBS-299/quartiers-socio.geojson',
  encoding='utf-8',
)
spatial_casernes = gpd.read_file(
  '/content/drive/My Drive/Data/YCBS-299/territoires-administratifs-casernes.geojson',
  encoding='utf-8',
)

### 2) Create a table with all days, shifts and stations

In [5]:
# Read the interventions dataset and collect the distinct station identifiers
# found in its CASERNE column.
interventions_15 = pd.read_csv(
  '/content/drive/My Drive/Data/YCBS-299/donneesouvertes-interventions-sim.csv'
)
list_casernes = interventions_15.loc[:, 'CASERNE'].unique()

In [6]:
def create_date_table(start='2005-01-01', end='2020-07-10'):
  """Build a one-column table of consecutive calendar days.

  Args:
    start: First day of the range (inclusive), in any form accepted by
      ``pd.date_range``.
    end: Last day of the range (inclusive), in the same form.

  Returns:
    A DataFrame with a single ``Date`` column holding one timestamp per day
    from ``start`` to ``end``, on a default 0..n-1 index.
  """
  days = pd.date_range(start=start, end=end)
  return pd.DataFrame({"Date": days})

In [9]:
# Create date table: one row per (station, shift, day).
shifts = [1,2,3]

# The calendar is identical for every (station, shift) pair, so build it and
# format it as MM/DD/YYYY text once instead of regenerating it in the loop.
calendar = create_date_table(start='2005-01-01', end='2020-06-30')
calendar['Date'] = calendar['Date'].dt.strftime('%m/%d/%Y')

# One tagged copy of the calendar per pair. Stations vary slowest, then shifts,
# then days, and the columns come out as Date, Station, Shift.
appended_data = [
  calendar.assign(Station=station, Shift=shift)
  for station in list_casernes
  for shift in shifts
]

# Stack the per-pair frames. ignore_index is not set, so every chunk keeps its
# own 0..n-1 index and the index values repeat across chunks.
dates = pd.concat(appended_data)

In [10]:
# Preview the first rows of the station/shift/day frame.
dates.head()

Unnamed: 0,Date,Station,Shift
0,01/01/2005,77,1
1,01/02/2005,77,1
2,01/03/2005,77,1
3,01/04/2005,77,1
4,01/05/2005,77,1


In [11]:
# Save the station/shift/day frame to Drive as CSV, leaving the row index out.
dates.to_csv(
  '/content/drive/My Drive/Data/YCBS-299/time_space_frame.csv',
  index=False,
)

### 3) Table for Social Housing Data

In [None]:
# Load the social-housing dataset (2019-12-31 export, per the file name).
social_housing = pd.read_csv(
  '/content/drive/My Drive/Data/YCBS-299/logsoc_donneesouvertes_20191231.csv',
  encoding='utf-8',
)

In [None]:
# Keep only the columns used by the following cells: the `nb_log` count
# (presumably number of dwellings — TODO confirm), the project type and the coordinates.
social_housing = social_housing.loc[
  :, ['nb_log', 'projettype', 'longitude', 'latitude']
]
social_housing.head()

Unnamed: 0,nb_log,projettype,longitude,latitude
0,150,HLM,-73.645933,45.556394
1,32,HLM,-73.672437,45.445957
2,80,HLM,-73.560734,45.486068
3,168,HLM,-73.62353,45.623784
4,117,HLM,-73.580121,45.558369


In [None]:
# For every social-housing project, find the fire-station territory and the
# borough whose polygon contains the project's coordinates.
#
# NOTE: this rebinds `list_casernes`, which the date-table cell above reads as
# the array of station ids from the interventions file. Re-run the
# interventions cell before re-running the date-table cell.
list_casernes = [None] * len(social_housing)
list_boroughs = [None] * len(social_housing)

# Iterate over every row and every polygon. The previous `range(0, len(x)-1)`
# bounds stopped one short, so the last project was never located and the last
# station / borough polygon was never tested.
coordinates = zip(social_housing['longitude'], social_housing['latitude'])
for house, (longitude, latitude) in enumerate(coordinates):
  p = Point(longitude, latitude)  # shapely points are (x, y) = (longitude, latitude)

  # No early exit: if several polygons contain the point, the last one wins.
  # A project that falls in no polygon keeps None.
  for geometry, station_name in zip(spatial_casernes['geometry'], spatial_casernes['NOM_CAS_AD']):
    if p.within(geometry):
      list_casernes[house] = station_name

  for geometry, borough_name in zip(spatial_boroughs['geometry'], spatial_boroughs['Arrondissement']):
    if p.within(geometry):
      list_boroughs[house] = borough_name

In [None]:
# Wrap the two per-project label lists in single-column frames.
key_caserne = pd.DataFrame(list_casernes, columns=["Station"])
key_boroughs = pd.DataFrame(list_boroughs, columns=["Borough"])

# Both frames carry the default 0..n-1 index, so the index join pairs them row by row.
keys = key_caserne.join(key_boroughs, how="left")
keys.head()

Unnamed: 0,Station,Borough
0,Caserne 49,Ahuntsic-Cartierville
1,Caserne 64,Lachine
2,Caserne 15,Le Sud-Ouest
3,Caserne 18,Montréal-Nord
4,Caserne 50,Rosemont–La Petite-Patrie


In [None]:
# Attach the Station / Borough labels to each project by joining on the row index.
social_housing_keys = social_housing.join(keys, how='left')

In [None]:
# Sum `nb_log` per (Station, Borough) pair, with one column per project type.
social_housing_final = social_housing_keys.pivot_table(
  values='nb_log',
  index=['Station', 'Borough'],
  columns='projettype',
  aggfunc=np.sum,
)

In [None]:
# Flatten the pivot into a plain table keyed by numeric station id.
# NOTE: this cell rebinds `social_housing_final`; re-run the pivot cell before
# re-running this one.

# Move both index levels (Station, Borough) back into ordinary columns.
social_housing_final = social_housing_final.reset_index()

# Parse the number at the end of the station name (e.g. "Caserne 49" -> 49).
# Capturing all trailing digits, instead of slicing the last two characters,
# keeps station numbers with three or more digits intact. A name without a
# trailing number yields NaN and makes astype(int) raise.
social_housing_final['Station_'] = (
  social_housing_final['Station']
  .str.extract(r'(\d+)\s*$', expand=False)
  .astype(int)
)
social_housing_final = social_housing_final.drop(columns=['Station'])

# Positional rename: assumes the pivot produced exactly five project-type
# columns, in this order — TODO confirm against the values of `projettype`.
social_housing_final.columns = ['borough','COOP_sum','HLM_sum', 'OBNL_sum', 'OMHM_sum','SHDM_sum', 'Station']
social_housing_final.head()

Unnamed: 0,borough,COOP_sum,HLM_sum,OBNL_sum,OMHM_sum,SHDM_sum,Station
0,Le Sud-Ouest,634.0,876.0,283.0,26.0,29.0,3
1,Côte-des-Neiges–Notre-Dame-de-Grâce,298.0,454.0,1189.0,67.0,384.0,4
2,Ville-Marie,133.0,1112.0,186.0,,64.0,5
3,Villeray–Saint-Michel–Parc-Extension,327.0,881.0,305.0,,135.0,9
4,Ville-Marie,,286.0,,,62.0,10


In [None]:
# Save the per-station social-housing totals to Drive as CSV, leaving the row index out.
social_housing_final.to_csv(
  '/content/drive/My Drive/Data/YCBS-299/social_housing.csv',
  index=False,
)