In [1]:
import pandas
import geopandas

import os, sys
sys.path.append(os.path.abspath(''))

from functions import get_raw_path, get_data_path, store_context_data, open_geopandas_csv,\
                      add_neighbourhood_column, print_df_columns, open_geopandas_csv, geom_dict_to_shape,\
                      get_neighbourhoods, get_gdf_using_geom_dict, get_overlay
import functions

import re

import numpy as np

In [None]:
raise Exception("Do not run all cells! -> This will overwrite the pre-reformated data!")

# Traffic Volumes

In [None]:
FILE_NAME = 'raw-data-2010-2019.csv'

STORE_NAME = 'Traffic Volumes.csv'

df = pandas.read_csv(get_raw_path(FILE_NAME))

# df.head()

The coordinates are stored in columns 'lng' and 'lat'

In [None]:
gdf = geopandas.GeoDataFrame(df.drop(['lng', 'lat'], axis=1), geometry=geopandas.points_from_xy(df['lng'], df['lat']))
gdf.crs = functions.CRS

In [None]:
gdf = add_neighbourhood_column(gdf)

In [None]:
gdf.head()

In [None]:
gdf['Neighbourhood'].isna().sum()

In [None]:
store_context_data(gdf, STORE_NAME)

In [None]:
gdf = open_geopandas_csv(get_data_path(STORE_NAME))

gdf.head()

# gdf.loc[:100].explore()

# Centrelines
Because this consists of lines, we use 'intersects' predicate over 'within' to match it.

Therefore an additional columns are added:

- NeighbourhoodPortion (the portion of the MultiLineString's full length that lies within the neighbourhood)

In [None]:
FILE_NAME  = "Centreline - Version 2 - 4326.csv"
STORE_NAME = "Centrelines.csv"

df = pandas.read_csv(get_raw_path(FILE_NAME))

df.head()

In [None]:
gdf = get_gdf_using_geom_dict(df)

In [None]:
## Get the neighbourhood dataset
neighbourhood = get_neighbourhoods()

print("Joining...")
## Merge the gdf with the neighbourhood stuff
joined = geopandas.sjoin(gdf.to_crs(functions.CRS), neighbourhood.to_crs(functions.CRS), how='inner', predicate='within').to_crs(epsg = functions.EPSG)

print("Filtering rows...")
## Filter out any un-matched columns
gdf = gdf.loc[joined.index]

print("Calculating NeighbourhoodPortions")
## Add the 'NeighbourhoodPortion' column
geometry_right = neighbourhood.to_crs(epsg=functions.EPSG).loc[joined['index_right']].set_index(joined.index)
numerator = np.array(joined.geometry.intersection(geometry_right).length)
denomenator = np.array(joined.geometry.length)
gdf['NeighbourhoodPortion'] =  numerator / denomenator 

print("Assigning Neighbourhood")
## Add the Neighbourhood column
gdf['Neighbourhood'] = joined['index_right']

## Drop where they are right on the border, and not inside... so NeighbourhoodPortion is 0
gdf = gdf.loc[gdf['NeighbourhoodPortion'] != 0]

In [None]:
store_context_data(gdf, STORE_NAME)

# Intersections
Some intersections lie on the border between neighbourhoods, so an additional field is added:
- SingleNeighbourhood: 1 if point lies entirely within a single neighbourhood

In [31]:
FILE_NAME  = "Centreline Intersection - 4326.csv"
STORE_NAME = "Intersections.csv"

df = pandas.read_csv(get_raw_path(FILE_NAME))

df.head()

Unnamed: 0,_id,INTERSECTION_ID,DATE_EFFECTIVE,DATE_EXPIRY,ELEVATION_ID,INTERSECTION_DESC,CLASSIFICATION,CLASSIFICATION_DESC,NUMBER_OF_ELEVATIONS,ELEVATION_FEATURE_CODE,...,ELEVATION_LEVEL,ELEVATION,ELEVATION_UNIT,HEIGHT_RESTRICTION,HEIGHT_RESTRICTION_UNIT,STATE,TRANS_ID_CREATE,TRANS_ID_EXPIRE,OBJECTID,geometry
0,1,13470264,,,13,Robindale Ave / Rimilton Ave,MNRSL,Minor-Single Level,1,501300.0,...,0,,,,,8,200000,-1,1,"{'type': 'MultiPoint', 'coordinates': [[-79.53..."
1,2,13470193,,,4718,Bellman Ave / Valermo Dr,MNRSL,Minor-Single Level,1,501300.0,...,0,,,,,8,200000,-1,4,"{'type': 'MultiPoint', 'coordinates': [[-79.53..."
2,3,13470188,,,32728,Rimilton Ave / Valermo Dr,SEUSL,Pseudo Intersection-Single Level,1,509200.0,...,0,,,,,8,200000,-1,5,"{'type': 'MultiPoint', 'coordinates': [[-79.53..."
3,4,13470203,,,21669,Valermo Dr / Goa Crt,MNRSL,Minor-Single Level,1,501300.0,...,0,,,,,8,200000,-1,7,"{'type': 'MultiPoint', 'coordinates': [[-79.53..."
4,5,13470228,,,36820,Valermo Dr / Thirtieth St,MNRSL,Minor-Single Level,1,501300.0,...,0,,,,,8,200000,-1,9,"{'type': 'MultiPoint', 'coordinates': [[-79.53..."


In [32]:
gdf = get_gdf_using_geom_dict(df)

# gdf.head()

Processing geometry column...
Creating GeoDataFrame


In [33]:
gdf_fully_within = add_neighbourhood_column(gdf)
gdf_fully_within['SingleNeighbourhood'] = np.ones(len(gdf_fully_within))

print("Searching for missing...")
missing_indexes = gdf.index.difference(gdf_fully_within.index)
missing = gdf.loc[missing_indexes]
print(len(missing))

gdf_bordering = add_neighbourhood_column(missing, predicate='intersects')
gdf_bordering['SingleNeighbourhood'] = np.zeros(len(gdf_bordering))
print("Bordering: ", len(gdf_bordering))

print("Searching for final missing ones...")
missing_indexes = missing.index.difference(gdf_bordering.index)
missing = missing.loc[missing_indexes]
print(len(missing))

# missing.head()
print("Combining now...")
gdf = geopandas.GeoDataFrame(pandas.concat([
    gdf_fully_within,
    gdf_bordering
])).set_geometry('geometry').to_crs(functions.CRS)

Joining now using: 'how=inner', 'predicate=within'
Searching for missing...
5268
Joining now using: 'how=inner', 'predicate=intersects'
Bordering:  10175
Searching for final missing ones...
154
Combining now...


In [34]:
# missing.explore()

In [35]:
store_context_data(gdf, STORE_NAME)

# Weather data
Working with the latest thing uploaded by Curtis

In [37]:
FILE_NAME = 'climate-daily-2.csv'
STORE_NAME = 'Climate.csv'

df = pandas.read_csv(get_raw_path(FILE_NAME), low_memory=False)

df.head()

Unnamed: 0,x,y,LOCAL_DATE,PROVINCE_CODE,ID,LOCAL_YEAR,MAX_REL_HUMIDITY,TOTAL_RAIN,TOTAL_PRECIPITATION_FLAG,TOTAL_SNOW_FLAG,...,MIN_TEMPERATURE_FLAG,TOTAL_RAIN_FLAG,LOCAL_MONTH,HEATING_DEGREE_DAYS,COOLING_DEGREE_DAYS_FLAG,MEAN_TEMPERATURE,LOCAL_DAY,MAX_REL_HUMIDITY_FLAG,COOLING_DEGREE_DAYS,MAX_TEMPERATURE_FLAG
0,-79.467788,43.78002,2000-01-01 00:00:00,ON,615S001.2000.1.1,2000,,0.8,,,...,,,1,15.0,,3.0,1,,0.0,
1,-79.467788,43.78002,2000-01-02 00:00:00,ON,615S001.2000.1.2,2000,,2.0,,,...,,,1,13.2,,4.8,2,,0.0,
2,-79.467788,43.78002,2000-01-03 00:00:00,ON,615S001.2000.1.3,2000,,3.4,,T,...,,,1,17.0,,1.0,3,,0.0,
3,-79.467788,43.78002,2000-01-04 00:00:00,ON,615S001.2000.1.4,2000,,0.0,T,T,...,,T,1,13.7,,4.3,4,,0.0,
4,-79.467788,43.78002,2000-01-05 00:00:00,ON,615S001.2000.1.5,2000,,0.0,T,T,...,,,1,22.8,,-4.8,5,,0.0,


In [44]:
gdf = geopandas.GeoDataFrame(df.drop(['x', 'y'], axis=1), geometry=geopandas.points_from_xy(df['x'], df['y']))
gdf.crs = functions.CRS

In [45]:
gdf_2 = add_neighbourhood_column(gdf)

Joining now using: 'how=inner', 'predicate=within'


In [46]:
len(gdf) - len(gdf_2)

37025

In [73]:
CRS = functions.CRS

def get_overlay(df_1: geopandas.GeoDataFrame, name_1: str, df_2: geopandas.GeoDataFrame, name_2: str) -> geopandas.GeoDataFrame:
    names    = pandas.Series(([name_1] * len(df_1)) + ([name_2] * len(df_2)), name = 'Name')
    geometry = pandas.concat([df_1.to_crs(CRS).geometry, df_2.to_crs(CRS).geometry])
    geometry.index = names.index

    print(geometry)
    print(names)
    
    return geopandas.GeoDataFrame(names, geometry=geometry)


In [74]:
get_overlay(get_neighbourhoods(), 'nbhood', gdf, 'Weather Station')

0        MULTIPOLYGON (((-79.38635 43.69783, -79.38623 ...
1        MULTIPOLYGON (((-79.39744 43.70693, -79.39837 ...
2        MULTIPOLYGON (((-79.43411 43.66015, -79.43537 ...
3        MULTIPOLYGON (((-79.43870 43.66766, -79.43841 ...
4        MULTIPOLYGON (((-79.38404 43.64497, -79.38502 ...
                               ...                        
69823                           POINT (-79.36861 43.86083)
69824                           POINT (-79.36861 43.86083)
69825                           POINT (-79.36861 43.86083)
69826                           POINT (-79.36861 43.86083)
69827                           POINT (-79.36861 43.86083)
Name: geometry, Length: 69828, dtype: geometry
0                 nbhood
1                 nbhood
2                 nbhood
3                 nbhood
4                 nbhood
              ...       
69823    Weather Station
69824    Weather Station
69825    Weather Station
69826    Weather Station
69827    Weather Station
Name: Name, Length: 69828, dt

Unnamed: 0,Name,geometry
0,nbhood,"MULTIPOLYGON (((-79.38635 43.69783, -79.38623 ..."
1,nbhood,"MULTIPOLYGON (((-79.39744 43.70693, -79.39837 ..."
2,nbhood,"MULTIPOLYGON (((-79.43411 43.66015, -79.43537 ..."
3,nbhood,"MULTIPOLYGON (((-79.43870 43.66766, -79.43841 ..."
4,nbhood,"MULTIPOLYGON (((-79.38404 43.64497, -79.38502 ..."
...,...,...
69823,Weather Station,POINT (-79.36861 43.86083)
69824,Weather Station,POINT (-79.36861 43.86083)
69825,Weather Station,POINT (-79.36861 43.86083)
69826,Weather Station,POINT (-79.36861 43.86083)


In [61]:
df_1 = get_neighbourhoods()
name_1 = 'nbhood'
df_2 = gdf
name_2 = 'Weather Station'

In [69]:
names    = pandas.Series(([name_1] * len(df_1)) + ([name_2] * len(df_2)), name = 'Name')
geometry = pandas.concat([df_1.to_crs(CRS).geometry, df_2.to_crs(CRS).geometry])
geometry.index = names.index

In [70]:
names

0                 nbhood
1                 nbhood
2                 nbhood
3                 nbhood
4                 nbhood
              ...       
69823    Weather Station
69824    Weather Station
69825    Weather Station
69826    Weather Station
69827    Weather Station
Name: Name, Length: 69828, dtype: object

In [71]:
geometry

0        MULTIPOLYGON (((-79.38635 43.69783, -79.38623 ...
1        MULTIPOLYGON (((-79.39744 43.70693, -79.39837 ...
2        MULTIPOLYGON (((-79.43411 43.66015, -79.43537 ...
3        MULTIPOLYGON (((-79.43870 43.66766, -79.43841 ...
4        MULTIPOLYGON (((-79.38404 43.64497, -79.38502 ...
                               ...                        
69823                           POINT (-79.36861 43.86083)
69824                           POINT (-79.36861 43.86083)
69825                           POINT (-79.36861 43.86083)
69826                           POINT (-79.36861 43.86083)
69827                           POINT (-79.36861 43.86083)
Name: geometry, Length: 69828, dtype: geometry