# Split lines into grid

In [1]:
# %load first_cell.py
%reload_ext autoreload
%autoreload 2
from pathlib import Path
import os

# system paths
import sys
home = str(Path.home())
# Root of the conda environment whose packages this notebook borrows.
# Built once from the current user's home so that sys.path and sys.prefix
# always point at the same environment (sys.prefix used to be hard-coded
# to one specific user's home directory).
env_root = f'{home}/.conda/envs/norm_env'
sys.path = sys.path + [f'{env_root}/lib/python37.zip',
                       f'{env_root}/lib/python3.7',
                       f'{env_root}/lib/python3.7/lib-dynload',
                       f'{env_root}/lib/python3.7/site-packages']
sys.prefix = env_root
from paths import RAW_PATH, TREAT_PATH, OUTPUT_PATH, FIGURES_PATH

# aws connection*
import boto3
import awswrangler as wr

# local libraries
from src import utils

# analysis libraries
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import geopandas as gpd
import random
import itertools

import matplotlib.pyplot as plt

from shapely.geometry import box, Polygon, MultiPolygon, LineString, GeometryCollection, shape
from shapely import wkt
from shapely.ops import transform
from babelgrid import Babel

from multiprocessing.pool import Pool
from functools import partial

from loguru import logger

# Run timestamp (YYYYMMDDHHMM) used to tag exported files.
# %M is the minute; %m (used before) is the month and repeated it at the end.
cm = datetime.today().strftime("%Y%m%d%H%M")
cm

ModuleNotFoundError: No module named 'boto3'

## Sample dates

Sample 50 dates from the jams data set.

In [None]:
from src.runners.sample_jams import _sample_dates

In [None]:
df = _sample_dates(start_date = '2019-01-01', end_date = '2021-08-12')

In [None]:
print(df.date_filter[0])

## Sample lines counts

Run a query to download the unique lines and the number of times each one appears.

- Lines 21,904,128
- Jams 507,139,112
- 50 weekdays sampled

The data is split into 6 groups by the function `_split_groups()` so that the intersection with the grid can be run separately for each group. Each split contains 3,650,688 lines.

In [None]:
from src.runners.split_polygon import _get_lines

In [None]:
df_lines = _get_lines(update_data = False)

In [None]:
# Quick sanity check of the lines table: dimensions, total of count_lines, first rows.
print(df_lines.shape)
# Series.sum() adds the column in vectorized code; the builtin sum() walks
# every row in Python, which is slow on a table this large.
print(df_lines.count_lines.sum())
df_lines.head(3)

In [None]:
from siuba import group_by, summarize, _
# Count the number of lines (line_wkt entries) in each value of the `split` column.
(df_lines
  >> group_by(_.split)
  >> summarize(lines = _.line_wkt.count())
  )

## Coarse grid split

In [None]:
from src.runners import split_polygon

In [None]:
# Plotting H3 cells

# H3 cells grid
# NOTE(review): `geometry` is not defined in this cell; it must already exist in
# the session (the "Creating squares" section builds one) -- confirm run order.
tiles = Babel('h3').polyfill(geometry, resolution=1)
# S2 cells grid
#tiles = Babel('s2').polyfill(geometry, resolution=4)
len(tiles)

# Optional folium preview of the tiles; disabled by default.
if False:
    # The first cell does not import folium, so import it where it is used.
    import folium

    tiles_gdf = gpd.GeoDataFrame([t.to_dict() for t in tiles], geometry='shapely')
    df = tiles_gdf
    m = folium.Map(location=[40.70, -73.94], zoom_start=2, tiles='CartoDB positron')
    # Use `_idx` rather than `_` for the unused row label: rebinding `_` would
    # overwrite siuba's `_` and break the `>>` pipelines in other cells.
    for _idx, r in df.iterrows():
        # Simplify each tile outline; without it the map might not be displayed
        sim_geo = gpd.GeoSeries(r['shapely']).simplify(tolerance=0.001)
        geo_j = sim_geo.to_json()
        geo_j = folium.GeoJson(data=geo_j,
                               style_function=lambda x: {'fillColor': 'orange'})
        geo_j.add_to(m)
    m

In [None]:
tiles[0].geometry.wkt

In [None]:
# Run this in 6 separated instances 
logger.warning('Better to run it from terminal in different instances')
# df_coarse_split = _create_coarse_grid(df_lines, tiles, split = )

### Coarse grid union

Union of the results produced by the 6 parallel instances.

- Cells: 57
- Lines: 22,214,341
- Run time per instance: 3 hrs.

In [None]:
from src.runners.split_polygon import _coarse_union

In [None]:
# Build the full path of every entry in the coarse-grid output folder.
# os.listdir() returns all entries (no extension filter) in arbitrary order.
mypath = "/home/soniame/private/projects/corona_geo_id/coarse_grid/"
csv_files = [os.path.join(mypath, x) for x in os.listdir(mypath)]

In [None]:
# Sample: union of lines
# Explicit list of result files handed to _coarse_union(); it replaces any
# earlier value of `csv_files`.
csv_files = ['/home/soniame/private/projects/corona_geo_id/coarse_grid/coarse_id.csv',
 '/home/soniame/private/projects/corona_geo_id/coarse_grid/coarse_id_new_1.csv',
 '/home/soniame/private/projects/corona_geo_id/coarse_grid/coarse_id_new_2.csv',
 '/home/soniame/private/projects/corona_geo_id/coarse_grid/coarse_id_new_3.csv',
 '/home/soniame/private/projects/corona_geo_id/coarse_grid/coarse_id_new_4.csv',
 '/home/soniame/private/projects/corona_geo_id/coarse_grid/coarse_id_new_5.csv',
 '/home/soniame/private/projects/corona_geo_id/coarse_grid/coarse_id_new_6.csv']
# A bare expression in the middle of a cell is not displayed by the notebook.
csv_files
df_coarse = _coarse_union(csv_files)

In [None]:
# Export to csv 
coarse_path = '/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/coarse_id/coarse_grid_sample.csv'
df_coarse.to_csv(coarse_path, index = False)

### Coarse grid distribution

In [None]:
# Reload the exported resolution-1 coarse grid sample from disk.
coarse_path = '/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/coarse_id/coarse_grid_sample.csv'
df_coarse_R1 = pd.read_csv(coarse_path)
# Rename `line` to `line_wkt`, the column name used as merge key in later cells.
df_coarse_R1 = df_coarse_R1.rename(columns = {'line':'line_wkt'})

In [None]:
print(df_coarse_R1.shape)
print(df_coarse_R1.coarse_wkt.nunique())
print(df_coarse_R1.line_wkt.nunique())
# df_coarse_R1.head()

In [None]:
# Join the coarse-grid assignment with the lines table (which carries the
# per-line counts) using a left merge on line_wkt.
df_merge = df_coarse_R1.merge(df_lines, how = 'left', on = 'line_wkt')
print(df_merge.shape)
df_merge.head()

In [None]:
from siuba import group_by, ungroup, arrange, summarize, _

# Distribution table per H3 cell:
#   lines = number of line_wkt entries in the cell
#   jams  = sum of count_lines in the cell
# ordered from the cell with the most lines to the one with the fewest.
tab = (df_merge
  >> group_by(_.coarse_wkt)
  >> summarize(lines = _.line_wkt.count(),
               jams  = _.count_lines.sum())
  >> ungroup()
  >> arrange(-_.lines)
  )
tab.head()

In [None]:
# Export to csv 
tab.to_csv('/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/figures/coarse_grid_distribution.csv', index = False)

Plots

In [None]:
tab = pd.read_csv('/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/figures/coarse_grid_distribution.csv')

In [None]:
# Turn the WKT cells into geometries and draw a choropleth of the line counts.
tab['geometry'] = gpd.GeoSeries.from_wkt(tab['coarse_wkt'])
tab = gpd.GeoDataFrame(tab, geometry='geometry')

fig, ax = plt.subplots(nrows=1, ncols=1)
tab.plot(ax=ax, column='lines', cmap='OrRd', legend=True)
# Title and file are set through the axes/figure objects created above.
ax.set_title('Coarse Grid 1 - Number of lines')
fig.savefig('/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/figures/coarse_map_LinesR1.pdf')

In [None]:
# Turn the WKT cells into geometries and draw a choropleth of the jam counts.
tab['geometry'] = gpd.GeoSeries.from_wkt(tab['coarse_wkt'])
tab = gpd.GeoDataFrame(tab, geometry='geometry')

fig, ax = plt.subplots(nrows=1, ncols=1)
tab.plot(ax=ax, column='jams', cmap='OrRd', legend=True)
# Title and file are set through the axes/figure objects created above.
ax.set_title('Coarse Grid 1 - Number of Jams')
fig.savefig('/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/figures/coarse_map_JamsR1.pdf')

In [None]:
plt.scatter(tab.jams, tab.lines)

### Coarse grid union Big Polygons (R2)

Union of the results produced by the 6 parallel instances.

- Cells: 
- Lines: 22,214,341
- Run time per polygon: 2 hrs.

In [None]:
from src.runners.split_polygon import _new_res_coarse_grid
from siuba import group_by, ungroup, arrange, summarize, _

In [None]:
# Ran in terminal
logger.warning("Only run to update resolution 2")
# _new_res_coarse_grid()

In [None]:
from src.runners.split_polygon import _coarse_union
mypath = "/home/soniame/private/projects/corona_geo_id/coarse_grid/"
csv_files = [os.path.join(mypath, x) for x in os.listdir(mypath)]

In [None]:
# Sample: union of lines for resolution 2
# Explicit list of the six R2 result files handed to _coarse_union().
csv_files = [
 '/home/soniame/private/projects/corona_geo_id/coarse_grid/coarse_id_new_R2_1.csv',
 '/home/soniame/private/projects/corona_geo_id/coarse_grid/coarse_id_new_R2_2.csv',
 '/home/soniame/private/projects/corona_geo_id/coarse_grid/coarse_id_new_R2_3.csv',
 '/home/soniame/private/projects/corona_geo_id/coarse_grid/coarse_id_new_R2_4.csv',
 '/home/soniame/private/projects/corona_geo_id/coarse_grid/coarse_id_new_R2_5.csv',
 '/home/soniame/private/projects/corona_geo_id/coarse_grid/coarse_id_new_R2_6.csv']
# A bare expression in the middle of a cell is not displayed by the notebook.
csv_files
df_coarse_R2 = _coarse_union(csv_files)
# Rename so the R2 cell column does not clash with the R1 `coarse_wkt` column
# and the line column matches the `line_wkt` merge key used later.
df_coarse_R2 = df_coarse_R2.rename(columns={'coarse_wkt':'coarse_wkt_2', 'line':'line_wkt'})

In [None]:
print(len(df_coarse_R2))
print(len(df_coarse_R1))
print(len(df_lines))

In [None]:
df_coarse_R2.head()

In [None]:
# "Big polygons": the cells with the most lines in the coarse grid
# distribution table, numbered from 1 in descending order of lines.
tab = pd.read_csv('/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/figures/coarse_grid_distribution.csv')
# .copy() gives `bp` its own data, so adding a column below does not raise
# pandas' SettingWithCopyWarning.
bp = tab.sort_values('lines', ascending=False)[:6][['coarse_wkt']].copy()
# Number the rows that actually exist instead of assuming exactly six of them.
bp['bp'] = range(1, len(bp) + 1)

In [None]:
# Starting from the lines table, attach (all as left merges):
#   1. the R1 cell of each line,
#   2. the R2 cell of each line,
#   3. the big-polygon number of the line's R1 cell,
# and finally remove fully repeated rows.
df_coarse_R = (
    df_lines
    .merge(df_coarse_R1, on='line_wkt', how='left')
    .merge(df_coarse_R2, on='line_wkt', how='left')
    .merge(bp, on='coarse_wkt', how='left')
    .drop_duplicates()
)

print(len(df_coarse_R))
#df_coarse_R.head()

In [None]:
# Choose the final cell of each line: rows without a big-polygon number
# (bp is null) keep their R1 cell, the others take their R2 cell.
df_coarse_R['coarse_wkt_R'] = np.where(df_coarse_R['bp'].isna(),
                                       df_coarse_R['coarse_wkt'],
                                       df_coarse_R['coarse_wkt_2'])
# .isna().sum() counts nulls in vectorized code; the builtin sum() iterates
# over every row in Python.
print(df_coarse_R['coarse_wkt_R'].isna().sum())

# Label the lines that still have no cell, then confirm no nulls remain.
df_coarse_R['coarse_wkt_R'] = df_coarse_R['coarse_wkt_R'].fillna('(MISSING)')
print(df_coarse_R['coarse_wkt_R'].isna().sum())

In [None]:
# Table of distribution pero wkt union
#df_dist = (df_coarse_R
#  >> group_by(_.coarse_wkt_R)
#  >> summarize(lines = _.line_wkt.count(),
#               jams  = _.count_lines.sum(), 
#               bp = _.bp.max())
#  >> ungroup()
#  >> arrange(-_.lines)
#  )
#min_wkt = str(df_dist.sort_values('lines', ascending=True)[:1].coarse_wkt_R)

In [None]:
# Export to csv 
coarse_R_path = '/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/coarse_id/coarse_grid_sample_R2.csv'
df_coarse_R[['line_wkt', 'count_lines', 'bp', 'coarse_wkt_R']].to_csv(coarse_R_path, index = False)

### Coarse grid distribution Big Polygons

In [None]:
from siuba import group_by, ungroup, arrange, summarize, _

In [None]:
df_coarse_R = pd.read_csv(coarse_R_path)

In [None]:
print(df_coarse_R.shape)
print(df_coarse_R.coarse_wkt_R.nunique())
print(df_coarse_R.line_wkt.nunique())
df_coarse_R.head()

In [None]:
# Distribution table per combined cell (coarse_wkt_R):
#   lines = number of line_wkt entries in the cell
#   jams  = sum of count_lines in the cell
#   bp    = maximum bp value found in the cell
# ordered from the cell with the most lines to the one with the fewest.
tab = (df_coarse_R
  >> group_by(_.coarse_wkt_R)
  >> summarize(lines = _.line_wkt.count(),
               jams  = _.count_lines.sum(), 
               bp = _.bp.max())
  >> ungroup()
  >> arrange(-_.lines)
  )

In [None]:
# Export to csv 
tab.to_csv('/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/figures/coarse_grid_distribution_R.csv', index = False)

Plots

In [None]:
tab = pd.read_csv('/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/figures/coarse_grid_distribution_R.csv')

In [None]:
# Drop the placeholder row: '(MISSING)' is not valid WKT and cannot be parsed.
# .copy() makes the filtered frame independent, so adding the geometry column
# below does not raise pandas' SettingWithCopyWarning.
tab = tab[tab.coarse_wkt_R != '(MISSING)'].copy()
tab['geometry'] = gpd.GeoSeries.from_wkt(tab['coarse_wkt_R'])
tab = gpd.GeoDataFrame(tab, geometry='geometry')
fig, ax = plt.subplots(1, 1)
tab.plot(column='lines', ax=ax,
         legend=True, cmap='OrRd')
# This figure is saved as the R2 map; the title used to repeat the R1 title.
plt.title('Coarse Grid 2 - Number of lines')
plt.savefig('/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/figures/coarse_map_LinesR2.pdf')

In [None]:
# Choropleth of the jam counts per combined cell; `tab` must already be a
# GeoDataFrame with a geometry column.
fig, ax = plt.subplots(1, 1)
tab.plot(column='jams', ax=ax,
         legend=True, cmap='OrRd')
# This figure is saved as the R2 map; the title used to repeat the R1 title.
plt.title('Coarse Grid 2 - Number of Jams')
plt.savefig('/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/figures/coarse_map_JamsR2.pdf')

## Creating squares

In [3]:
from src.runners import split_polygon

In [4]:
from shapely.geometry import box, Polygon, MultiPolygon, GeometryCollection
from shapely.wkt import loads

In [5]:
# Study-area polygon, written as WKT and parsed into a geometry object.
# Alternative areas are kept below as commented-out options.
# - Example 
# polygon = 'POLYGON((2.0117187499999822 44.38657313925715,-19.433593750000018 19.207272119703983,19.414062499999982 6.904449621538131,64.94140624999999 -3.096801256840523,81.46484374999999 37.21269961002643,45.78124999999998 24.106495997107682,53.69140624999998 51.22054369437158,3.7695312499999822 37.07257833232809,2.0117187499999822 44.38657313925715))'
# - Latin America 
# polygon = 'POLYGON ((-127.265625 34.30714385628804, -128.671875 -56.94497418085159, -28.4765625 -57.70414723434192, -29.8828125 16.97274101999902, -84.72656249999999 25.48295117535531, -116.71874999999999 35.746512259918504, -127.265625 34.30714385628804))'
# - Triangle 
# polygon = Polygon([(0, 0), (5, 5), (5, 0)])
# - Peru
# polygon = 'POLYGON ((-77.10205078124999 -13.004557745339769, -72.158203125 -13.004557745339769, -72.158203125 -8.90678000752024, -77.10205078124999 -8.90678000752024, -77.10205078124999 -13.004557745339769))'
# - Latin america BID
polygon = 'POLYGON((-129.454 37.238,-90.781 27.311,-67.117 20.333,-68.721 17.506,-23.765 -9.114,-65.601 -60.714,-126.421 -23.479,-129.454 37.238))'
geometry = wkt.loads(polygon)
# Area is in the units of the coordinates as written in the WKT.
print(geometry.area)
#geometry

5970.269755


In [None]:
#df_lines = split_polygon._get_lines()

In [6]:
df_coarse = split_polygon._get_coarse_grid()

2021-08-26 11:55:44.992 | INFO     | src.runners.split_polygon:_get_coarse_grid:188 - Get coarse grid
2021-08-26 11:55:44.993 | DEBUG    | src.runners.split_polygon:_get_coarse_grid:190 - From /home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/coarse_id/coarse_grid_sample_R2.csv
2021-08-26 11:57:29.511 | DEBUG    | src.runners.split_polygon:_get_coarse_grid:194 - L: 21904128


In [7]:
df_dist   = pd.read_csv('/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/figures/coarse_grid_distribution_R.csv')

In [8]:
print(len(df_coarse))
print(len(df_dist))

21904128
91


In [9]:
df_coarse.head()

Unnamed: 0,line_wkt,count_lines,bp,coarse_wkt_R
0,"linestring (-35.232974 -5.731801,-35.233194 -5...",26,,POLYGON ((-32.5775356729615666 -12.75415506823...
1,"linestring (-103.34963 20.733682,-103.350022 2...",84,2.0,POLYGON ((-104.0069793612885434 17.72755605260...
2,"linestring (-103.270292 20.663531,-103.270276 ...",83,2.0,POLYGON ((-104.0069793612885434 17.72755605260...
3,"linestring (-99.14832 19.479112,-99.148334 19....",577,2.0,POLYGON ((-97.5919371226761996 18.289686316263...
4,"linestring (-47.221241 -22.858412,-47.221295 -...",71,1.0,POLYGON ((-46.3295683609798701 -25.28520383079...


In [None]:
# Lines 
# df_lines = _get_lines()
# if split != 0:
#     logger.debug(f"Split: {split}")
#     df_lines = df_lines[df_lines.split == split]
# if update_coarse_grid:
#    df_coarse = _create_coarse_grid(df_lines, tiles, split)
# else:

In [None]:
# NOTE(review): `_threshold_density_func` is not imported by name in the visible
# cells (only the `split_polygon` module is) -- presumably this should be
# `split_polygon._threshold_density_func(...)`; confirm. The return value is not stored.
_threshold_density_func(geometry, .01, df_dist, df_coarse)

In [12]:
print(df_dist.shape)
print(df_coarse.shape)

(91, 4)
(21904128, 4)


In [11]:
df_coarse[df_coarse.coarse_wkt_R == '(MISSING)'].to_csv("/home/soniame/shared/spd-sdv-omitnik-waze/corona/geo_partition/figures/missing_ids.csv")

In [15]:
df_coarse[df_coarse.coarse_wkt_R == '(MISSING)'].head()

Unnamed: 0,line_wkt,count_lines,bp,coarse_wkt_R
12,"linestring (-69.914955 18.491923,-69.914914 18...",158,,(MISSING)
24,"linestring (-106.120685 28.728984,-106.122408 ...",5694,,(MISSING)
58,"linestring (-70.687404 19.44413,-70.687312 19....",7,,(MISSING)
151,"linestring (-70.689043 19.415806,-70.690132 19...",17,,(MISSING)
161,"linestring (-48.767952 -27.500757,-48.768771 -...",11,1.0,(MISSING)


## Lines intersection per square



In [None]:
def intersection_line(line, geometry):
    """Intersect a WKT line with ``geometry``.

    ``line`` is converted to ``str`` and parsed as WKT; the value returned is
    whatever ``geometry.intersection`` gives for the parsed shape.
    """
    parsed_line = wkt.loads(str(line))
    return geometry.intersection(parsed_line)

In [None]:
df_lines['geo_id'] = ""
#df_lines.head()

In [None]:
# Assign each line to the first square (polygon) it intersects.
# NOTE(review): `outdf` is not defined in the visible cells -- confirm where it comes from.
for polygon in outdf.geometry:
    print(polygon)

    geometry = wkt.loads(polygon)
    # Only test lines that have not been assigned to a square yet, so an
    # earlier assignment is never overwritten by a later polygon.
    lines = df_lines.loc[df_lines['geo_id'] == '', 'line_wkt']
    if not lines.empty:
        # A line belongs to the square when its intersection is not empty.
        # (The previous code called an undefined `intersection_func` and
        # compared the resulting geometry with `> 0`.)
        hits = [not intersection_line(line, geometry).is_empty for line in lines]
        df_lines.loc[lines.index[hits], 'geo_id'] = polygon
    

In [None]:
# Export to csv
df_lines.to_csv(f"~/private/geo_id_polygon/line_wkt_area_geo_id_{cm}.csv", index = False)

### Distribution of lines/jams per square

In [None]:
from siuba import group_by, ungroup, arrange, summarize, _
import geopandas as gpd
import folium

In [None]:
# NOTE(review): the load below is commented out, so `df_geo_id` must already
# exist in the session for this cell to run -- confirm.
#df_geo_id = pd.read_csv(f"/home/soniame/private/projects/corona_geo_id/intersection/geo_grid_area_geo_id_2021073017071627679286.csv")
print(df_geo_id.shape)
df_geo_id.head()

In [None]:
# Number of lines (line_wkt entries) per geo_id, arranged by the `lines` column.
tab = (df_geo_id
  >> group_by(_.geo_id)
  >> summarize(lines = _.line_wkt.count())
  >> ungroup()
  >> arrange("lines")
  )
# geo_id is parsed as WKT here to obtain the geometry used for plotting.
tab['geometry'] = gpd.GeoSeries.from_wkt(tab['geo_id'])
tab = gpd.GeoDataFrame(tab, geometry='geometry')

tab.head()

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1)
tab.plot(column='lines', ax=ax, legend=True, cmap='OrRd')

In [None]:
# Interactive folium map: one polygon per row of `tab`, with the row's
# `lines` value as popup.
df = tab
# NOTE(review): the start location and zoom are fixed values that do not
# depend on `tab` -- confirm they suit the area being plotted.
m = folium.Map(location=[40.70, -73.94], zoom_start=10, tiles='CartoDB positron')
# Use `_idx` rather than `_` for the unused row label: rebinding `_` would
# overwrite siuba's `_` and break the `>>` pipelines when cells are re-run.
for _idx, r in df.iterrows():
    # Simplify each polygon outline; without it the map might not be displayed
    sim_geo = gpd.GeoSeries(r['geometry']).simplify(tolerance=0.001)
    geo_j = sim_geo.to_json()
    geo_j = folium.GeoJson(data=geo_j,
                           style_function=lambda x: {'fillColor': 'orange'})
    # Popup only renders str (or folium Element) content; a bare number
    # would produce an empty popup, so convert the count to text.
    folium.Popup(str(r['lines'])).add_to(geo_j)
    geo_j.add_to(m)
m

In [None]:
#centroid = tiles[0].geometry.centroid
#m = folium.Map(location=(centroid.latitude, centroid.longitude))


In [None]:
# Overlay every tile on the existing map `m` as a GeoJSON layer, then show the map.
for tile in tiles:
    folium.GeoJson(tile.geometry.geojson).add_to(m)
m