# Create geojson file from list of PUDO coords
See /home/test/Documents/PROJECTS/VFH/iPython_Notebooks/dv_pudo_profiles.ipynb for more info.  

ref: https://geoffboeing.com/2015/10/exporting-python-data-geojson/  
Note: in that post, `with open(output_filename, 'wb') as output_file:` should use mode `'w'`, not `'wb'`, because `json.dump` writes text.  

**Resources**  
geoJSON minifier: https://jsonformatter.org/json-minify  

In [1]:
# Show the path of the Python interpreter running this kernel.
import sys
sys.executable

'/home/test/anaconda3/envs/vfh/bin/python'

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import pandas.io.sql as pandasql
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker

from IPython.display import HTML, display
import datetime
import matplotlib.dates as mdates
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import os

from datetime import datetime

import json

# Functions

In [3]:
# https://geoffboeing.com/2015/10/exporting-python-data-geojson/
def df_to_geojson(df, properties, lat="latitude", lon="longitude"):
    """Convert a DataFrame of points into a GeoJSON FeatureCollection dict.

    Every row of ``df`` becomes one ``Point`` feature whose coordinates are
    ``[row[lon], row[lat]]`` (longitude first).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per point.
    properties : iterable of str
        Column names copied into each feature's ``properties``. Every value
        is cast with ``int()``, so fractional values are truncated.
    lat, lon : str
        Names of the latitude and longitude columns.

    Returns
    -------
    dict
        ``{"type": "FeatureCollection", "features": [...]}`` built only from
        native Python types, ready for ``json.dump``.

    Raises
    ------
    KeyError
        If a requested column is missing from a row.
    ValueError
        If a property value cannot be converted by ``int()`` (e.g. NaN).
    """
    features = []
    for _, row in df.iterrows():
        features.append({
            "type": "Feature",
            "properties": {prop: int(row[prop]) for prop in properties},
            # float() turns numpy scalars (int64, float32, ...) into plain
            # Python floats; json cannot serialise those numpy types as-is.
            "geometry": {"type": "Point",
                         "coordinates": [float(row[lon]), float(row[lat])]},
        })
    return {"type": "FeatureCollection", "features": features}

# GeoJSON properties for file

In [4]:
# Property names for the PUDO GeoJSON file.
prop = [
    "type",
    "counts",
    "overlap",
]

# Variables

In [9]:
# Selection for this run: ward number, day of week, and time window.
ward, this_day, this_tod = 1, 'Monday', 'amPeak'

# Read PUDO data from csv file

In [10]:
# CSV of trip records for `this_day`.
# Alternate location:
# fname = '~/Documents/PROJECTS/BDITTO/VFH/DATA_STORY/DATA/from_pgAdmin/tripdata_092018_' + this_day + '.csv'
fname = ''.join([
    '../../../Documents/PROJECTS/VFH/COT_PAGE/INTERIM_DATA/FROM_PGADMIN/tripdata_092018_',
    this_day,
    '.csv',
])

df = pd.read_csv(fname)

# Report the frame size, then preview the first rows.
print(df.shape)
df.head(5)

(494842, 17)


Unnamed: 0,dow,hh,timewindow,pickup_datetime,dropoff_datetime,pickup_ward2018,dropoff_ward2018,pickup_gc_intersection_id,dropoff_gc_intersection_id,pickup_neighbourhood,dropoff_neighbourhood,pickup_geom,dropoff_geom,lat_pickup,lon_pickup,lat_dropoff,lon_dropoff
0,Monday,0,nightI,2018-09-03 00:00:00,2018-09-03 00:34:01,18.0,10.0,13447475,13467654,51.0,77.0,0101000020E61000005BE2AF6670DA53C01D22DBEA93E2...,0101000020E610000013E6CE309BD953C0938FE7777DD2...,43.770139,-79.41311,43.644454,-79.400097
1,Monday,0,nightI,2018-09-03 00:00:00,2018-09-03 00:10:54,10.0,13.0,13467763,13464030,77.0,75.0,0101000020E610000051E519D30AD953C0C23B89A05BD2...,0101000020E610000079E6DB3E68D853C00F3AF08356D5...,43.643421,-79.391286,43.666703,-79.381363
2,Monday,0,nightI,2018-09-03 00:00:00,2018-09-03 00:12:34,13.0,10.0,13464551,13467763,71.0,77.0,0101000020E6100000A342F6F3C6D753C0E8EE4CFEEDD4...,0101000020E610000051E519D30AD953C0C23B89A05BD2...,43.663513,-79.371518,43.643421,-79.391286
3,Monday,0,nightI,2018-09-03 00:00:00,2018-09-03 00:24:18,13.0,3.0,13465140,13470547,75.0,19.0,0101000020E6100000DF5023ED39D853C020ED2E3976D4...,0101000020E610000004410BFFE5E153C041B447898CCC...,43.659858,-79.378536,43.598039,-79.529663
4,Monday,0,nightI,2018-09-03 00:00:00,2018-09-03 00:18:21,8.0,13.0,14044860,14673305,31.0,75.0,0101000020E6100000A37FC1EFB8DC53C070C432B4FCDB...,0101000020E61000008C84E11C68D853C062D98093A3D5...,43.718649,-79.448788,43.669054,-79.381355


In [11]:
# df['timewindow'].unique()
# There are 5 timewindows for weekdays (['nightI', 'nightII', 'amPeak', 'midday', 'pmPeak']) 
# and 4 for weekends (['nightI', 'nightII', 'amPeak', 'midday'])

# Housekeeping

## 1. Round lat, lon to 4 decimal places

In [12]:
# Number of decimals kept on every coordinate column.
n = 4

# Round pickup and dropoff coordinates in one pass.
coord_cols = ['lat_pickup', 'lon_pickup', 'lat_dropoff', 'lon_dropoff']
df[coord_cols] = df[coord_cols].round(decimals=n)

## 2. Filter out rows where neither the pickup nor the dropoff ward is the ward of interest
Set column `keep` = 1 where the pickup ward or the dropoff ward is the ward of interest.

In [13]:
# Create `keep` plus all-NaN `latitude`/`longitude` placeholder columns,
# then set `keep` to 1 on trips that start or end in the ward of interest.
df['keep'] = np.nan
df['latitude'] = np.nan
df['longitude'] = np.nan

df.loc[df['pickup_ward2018'].eq(ward) | df['dropoff_ward2018'].eq(ward), 'keep'] = 1

In [14]:
# Drop every row whose `keep` flag was never set.
df = df.loc[df['keep'].notna()]

In [15]:
# Preview the first five rows that survived the ward filter.
df.head(n=5)

Unnamed: 0,dow,hh,timewindow,pickup_datetime,dropoff_datetime,pickup_ward2018,dropoff_ward2018,pickup_gc_intersection_id,dropoff_gc_intersection_id,pickup_neighbourhood,dropoff_neighbourhood,pickup_geom,dropoff_geom,lat_pickup,lon_pickup,lat_dropoff,lon_dropoff,keep,latitude,longitude
6,Monday,0,nightI,2018-09-03 00:00:00,2018-09-03 00:13:26,1.0,1.0,30082386,13460542,1.0,1.0,0101000020E61000002E8E37EAE2E653C01D39868BD2DC...,0101000020E61000001223231E8BE653C012195BAA40D8...,43.7252,-79.6076,43.6895,-79.6022,1.0,,
65,Monday,0,nightI,2018-09-03 00:00:00,2018-09-03 00:13:36,6.0,1.0,30109966,13451158,26.0,2.0,0101000020E6100000F094730106DF53C0346577E63DDF...,0101000020E61000002595DD571BE553C04E5E8510A7DF...,43.7441,-79.4847,43.7473,-79.5798,1.0,,
77,Monday,0,nightI,2018-09-03 00:00:00,2018-09-03 00:22:29,1.0,9.0,13461025,14047180,1.0,94.0,0101000020E6100000A9ABCBC11DE653C075770AAAE7D7...,0101000020E6100000B94BF948BEDB53C0D4C5FE9626D6...,43.6868,-79.5956,43.6731,-79.4335,1.0,,
92,Monday,1,nightI,2018-09-03 01:00:00,2018-09-03 01:04:16,1.0,1.0,13451763,13450652,1.0,1.0,0101000020E6100000716B477BC0E653C0A2CEA29838DF...,0101000020E6100000DF7A0DF462E653C0EC776DDE14E0...,43.7439,-79.6055,43.7506,-79.5998,1.0,,
110,Monday,0,nightI,2018-09-03 00:00:00,2018-09-03 00:42:09,25.0,1.0,13441921,14075750,131.0,1.0,0101000020E61000008BC12B98A4CD53C04A9D99C3E2E8...,0101000020E6100000BBD25DD28BE653C029598BA581DC...,43.8194,-79.2132,43.7227,-79.6023,1.0,,


## 3. Mark if row is a pickup or a dropoff
pickup is where pickup ward == ward, dropoff is where dropoff_ward == ward  
Note that a row can contain **both** a pickup and a dropoff !!!  

In [16]:
# Start both marker columns as NaN; later cells set them to 1 where applicable.
for flag_col in ('pu', 'do'):
    df[flag_col] = np.nan

In [17]:
# Mark rows whose pickup happened in the ward of interest.
df.loc[df['pickup_ward2018'].eq(ward), 'pu'] = 1

In [18]:
# Mark rows whose dropoff happened in the ward of interest.
df.loc[df['dropoff_ward2018'].eq(ward), 'do'] = 1

In [19]:
# Preview the first five rows, now including the `pu` and `do` marker columns.
df.head(n=5)

Unnamed: 0,dow,hh,timewindow,pickup_datetime,dropoff_datetime,pickup_ward2018,dropoff_ward2018,pickup_gc_intersection_id,dropoff_gc_intersection_id,pickup_neighbourhood,...,dropoff_geom,lat_pickup,lon_pickup,lat_dropoff,lon_dropoff,keep,latitude,longitude,pu,do
6,Monday,0,nightI,2018-09-03 00:00:00,2018-09-03 00:13:26,1.0,1.0,30082386,13460542,1.0,...,0101000020E61000001223231E8BE653C012195BAA40D8...,43.7252,-79.6076,43.6895,-79.6022,1.0,,,1.0,1.0
65,Monday,0,nightI,2018-09-03 00:00:00,2018-09-03 00:13:36,6.0,1.0,30109966,13451158,26.0,...,0101000020E61000002595DD571BE553C04E5E8510A7DF...,43.7441,-79.4847,43.7473,-79.5798,1.0,,,,1.0
77,Monday,0,nightI,2018-09-03 00:00:00,2018-09-03 00:22:29,1.0,9.0,13461025,14047180,1.0,...,0101000020E6100000B94BF948BEDB53C0D4C5FE9626D6...,43.6868,-79.5956,43.6731,-79.4335,1.0,,,1.0,
92,Monday,1,nightI,2018-09-03 01:00:00,2018-09-03 01:04:16,1.0,1.0,13451763,13450652,1.0,...,0101000020E6100000DF7A0DF462E653C0EC776DDE14E0...,43.7439,-79.6055,43.7506,-79.5998,1.0,,,1.0,1.0
110,Monday,0,nightI,2018-09-03 00:00:00,2018-09-03 00:42:09,25.0,1.0,13441921,14075750,131.0,...,0101000020E6100000BBD25DD28BE653C029598BA581DC...,43.8194,-79.2132,43.7227,-79.6023,1.0,,,,1.0


## 4. Aggregate pickups and dropoffs for this_tod
Sums trips at same intersection.

In [20]:
def agg_counts(df, this_type, ward_id=None, tod=None):
    """Count trips per intersection for one trip end, one ward and one time window.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip table. Must contain ``dow``, ``timewindow`` and, for the chosen
        ``this_type``, the columns ``<this_type>_ward2018``,
        ``lat_<this_type>``, ``lon_<this_type>``,
        ``<this_type>_gc_intersection_id`` and ``<this_type>_neighbourhood``.
    this_type : str
        ``'pickup'`` counts into ``pcounts``; any other value counts into
        ``dcounts`` (the notebook passes ``'dropoff'``).
    ward_id : optional
        Ward to keep. Defaults to the notebook-level ``ward``.
    tod : optional
        Time window to keep. Defaults to the notebook-level ``this_tod``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct (dow, timewindow, latitude, longitude,
        intersection_id, nn) combination, followed by the count column for
        ``this_type`` and then the other count column filled with 0.
    """
    # Fall back to the notebook-level selection when no override is given.
    if ward_id is None:
        ward_id = ward
    if tod is None:
        tod = this_tod

    if this_type == 'pickup':
        count_col, other_col = 'pcounts', 'dcounts'
    else:
        count_col, other_col = 'dcounts', 'pcounts'

    group_cols = ['dow', 'timewindow', 'lat_' + this_type, 'lon_' + this_type,
                  this_type + '_gc_intersection_id', this_type + '_neighbourhood']
    in_scope = (df[this_type + '_ward2018'] == ward_id) & (df['timewindow'] == tod)

    df_agg = (
        df.loc[in_scope]
        .groupby(group_cols)
        .size()
        .reset_index(name=count_col)
        .rename(columns={
            'lat_' + this_type: 'latitude',
            'lon_' + this_type: 'longitude',
            this_type + '_gc_intersection_id': 'intersection_id',
            this_type + '_neighbourhood': 'nn',
        })
    )

    # The zero-filled opposite count column lets pickup and dropoff frames
    # be stacked with identical columns later on.
    return df_agg.assign(**{other_col: 0})

In [21]:
# Pickup counts per intersection for the selected ward and time window.
# No empty placeholder frame first: it would survive a failed call and hide the error.
df_agg_pu = agg_counts(df, 'pickup')

In [22]:
# Dropoff counts per intersection for the selected ward and time window.
# No empty placeholder frame first: it would survive a failed call and hide the error.
df_agg_do = agg_counts(df, 'dropoff')

In [24]:
# Preview the first five aggregated dropoff rows.
df_agg_do.head(n=5)

Unnamed: 0,dow,timewindow,latitude,longitude,intersection_id,nn,dcounts,pcounts
0,Monday,amPeak,43.6709,-79.5904,13463803,1.0,1,0
1,Monday,amPeak,43.6727,-79.5921,13463477,1.0,2,0
2,Monday,amPeak,43.6741,-79.5889,30029322,1.0,14,0
3,Monday,amPeak,43.6743,-79.5654,13463160,7.0,1,0
4,Monday,amPeak,43.6745,-79.5632,13463137,11.0,2,0


## 5. Stack pickups on top of dropoffs

In [25]:
# Stack the pickup aggregate on top of the dropoff aggregate.
print(df_agg_pu.shape)
print(df_agg_do.shape)

# ignore_index=True gives the stacked frame a fresh 0..n-1 index instead of
# repeating each input's own row labels.
df_merge = pd.concat([df_agg_pu, df_agg_do], sort=False, ignore_index=True)

print(df_merge.shape)
df_merge.head(11)

(642, 8)
(465, 8)
(1107, 8)


Unnamed: 0,dow,timewindow,latitude,longitude,intersection_id,nn,pcounts,dcounts
0,Monday,amPeak,43.6743,-79.5654,13463160,7.0,3,0
1,Monday,amPeak,43.6745,-79.5632,13463137,11.0,2,0
2,Monday,amPeak,43.6747,-79.5901,30029304,1.0,1,0
3,Monday,amPeak,43.6757,-79.5684,13462944,7.0,96,0
4,Monday,amPeak,43.6763,-79.5704,13462841,7.0,3,0
5,Monday,amPeak,43.6768,-79.5642,13462745,7.0,11,0
6,Monday,amPeak,43.6771,-79.5712,13462701,7.0,1,0
7,Monday,amPeak,43.6775,-79.568,13462642,7.0,3,0
8,Monday,amPeak,43.6781,-79.5688,13462541,7.0,1,0
9,Monday,amPeak,43.6782,-79.5648,13462514,7.0,4,0


In [None]:
# Spot-check: show the stacked rows for a single intersection.
this_id = 20232410  # alternative id: 13453946
df_merge[df_merge['intersection_id'].eq(this_id)]

# Compare with database
# SELECT * 
# FROM cnangini.trip_data_092018_timewin_monday
# WHERE timewindow = 'amPeak'
# AND dropoff_gc_intersection_id = 20232410
# 
# RESULT: 24 pickups and 212 dropoffs

## 6. Group on common `intersection_id`
For each intersection, collapse its pickup row and its dropoff row into a single row whose `pcounts` and `dcounts` hold the total counts of each.

In [26]:
# Collapse the stacked rows to one row per intersection.
# Only the count columns are summed. `nn`, `dow` and `timewindow` are labels,
# so the first value is kept; a blanket .sum() would add the neighbourhood
# number to itself wherever an intersection has both a pickup and a dropoff row.
df_merge_gr = (
    df_merge
    .groupby(['intersection_id', 'latitude', 'longitude'], as_index=False)
    .agg({'dow': 'first', 'timewindow': 'first', 'nn': 'first',
          'pcounts': 'sum', 'dcounts': 'sum'})
)

## 7. Make the dataframes for each layer!
Need 3 separate dataframes for PUDOS, PU only, DO only.

In [28]:
# Start the three layer frames (PUDO, pickup-only, dropoff-only) as empty frames.
df_pudo_layer, df_pu_layer, df_do_layer = (pd.DataFrame() for _ in range(3))

In [29]:
# PUDO: intersections with both pickups and dropoffs.
df_pudo_layer = df_merge_gr[df_merge_gr['pcounts'].gt(0) & df_merge_gr['dcounts'].gt(0)]
# PU only: pickups but no dropoffs.
df_pu_layer = df_merge_gr[df_merge_gr['pcounts'].gt(0) & df_merge_gr['dcounts'].eq(0)]
# DO only: dropoffs but no pickups.
df_do_layer = df_merge_gr[df_merge_gr['pcounts'].eq(0) & df_merge_gr['dcounts'].gt(0)]

# Save to geojson

In [None]:
# Echo the ward / day / time-window selection being exported.
print('w' + repr(ward), this_day, this_tod, sep=' ')

## Apply cut-off filter
Only counts >= `c` will be saved.  

In [30]:
# Cut-off: minimum count a layer row needs to be exported.
c = 15

In [31]:
# Pickup-only layer: keep intersections with at least `c` pickups.
geojson_pu_layer = df_to_geojson(
    df_pu_layer[df_pu_layer["pcounts"].ge(c)],
    properties=["pcounts", "nn"],
)

In [32]:
# Dropoff-only layer: keep intersections with at least `c` dropoffs.
geojson_do_layer = df_to_geojson(
    df_do_layer[df_do_layer["dcounts"].ge(c)],
    properties=["dcounts", "nn"],
)

In [33]:
# PUDO layer: the cut-off applies to pickups plus dropoffs combined.
geojson_pudo_layer = df_to_geojson(
    df_pudo_layer[df_pudo_layer["pcounts"].add(df_pudo_layer["dcounts"]).ge(c)],
    properties=["pcounts", "dcounts", "nn"],
)

# geoJSON format for this_day, this_tod
Format:  
```
this_day: {
		this_tod: {
			'pu': geojson_pu_layer,
			'pudo':geojson_pudo_layer,
			'do':geojson_do_layer
		}
}
```
 
 e.g.
 
```
'Monday': {
		'amPeak': {
			'pu':geojson_pu_layer,
			'pudo':geojson_pudo_layer,
			'do':geojson_do_layer
		}
}
```

In [None]:
# Nest the three layers under day -> time window, matching the format
# described in the markdown cell above this one.
megageo = {
    this_day: {
        this_tod: {
            'pu': geojson_pu_layer,
            'pudo': geojson_pudo_layer,
            'do': geojson_do_layer,
        }
    }
}

# Write geojson to file
Use `separators=(',', ':')` with `json.dump` for a minified file.  

ref: https://stackoverflow.com/questions/33233313/python-json-dumpsval-to-output-minified-json

In [None]:
# Output directory (alternate location kept for reference):
# mydir = '../../../Documents/PROJECTS/VFH/COT_PAGE/INTERIM_DATA/INDIVIDUAL_GEOJSONS/'
mydir = '/home/nangini/Documents/PROJECTS/BDITTO/VFH/DATA_STORY/DATA/INDIVIDUAL_GEOJSONS/'

# File name encodes ward, day and time window.
output_filename = os.path.join(
    mydir,
    'w' + repr(ward) + '_092018_' + this_day + '_' + this_tod + '_cutoff_nn.geojson',
)

# Opening in 'w' mode already creates/truncates the file; the compact
# separators produce minified JSON.
with open(output_filename, 'w') as output_file:
    json.dump(megageo, output_file, separators=(',', ':'))

In [None]:
# Print a fixed marker string.
print("200 OK")

```
$ ls -lh ~/Documents/PROJECTS/VFH/COT_PAGE/INTERIM_DATA/INDIVIDUAL_GEOJSONS/
total 216K
-rw-r--r-- 1 test test 109K Dec  9 11:23 w1_092018_Monday_pmPeak_do_agg.geojson
-rw-r--r-- 1 test test 104K Dec  9 11:23 w1_092018_Monday_pmPeak_pu_agg.geojson
```


```
$ ls -lh src/geojson/
-rw-r--r-- 1 test test 328K Dec  4 15:46 w1_092018_Monday_amPeak_agg.geojson
-rw-r--r-- 1 test test 753K Dec  4 13:51 w1_092018_Monday_amPeak.geojson
```


# Histograms of counts
https://mode.com/example-gallery/python_histogram/

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter

def counts_hist(df, col, nbins):
    """Draw a styled histogram of one DataFrame column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to plot.
    col : str
        Column passed to ``DataFrame.hist``; also used as the x-axis label.
    nbins : int
        Number of histogram bins.

    Returns
    -------
    None
        The figure is produced as a side effect of ``DataFrame.hist``.
    """
    axes_grid = df.hist(column=col, bins=nbins, grid=True, color='#86bf91', zorder=2, rwidth=0.9)

    # Style every Axes in the first row of the grid returned by DataFrame.hist.
    for x in axes_grid[0]:

        # Despine
        for side in ('right', 'top', 'left'):
            x.spines[side].set_visible(False)

        # Switch off ticks but keep the tick labels. These must be real
        # booleans: a string such as "off" is truthy and would leave ticks on.
        x.tick_params(axis="both", which="both", bottom=False, top=False,
                      labelbottom=True, left=False, right=False, labelleft=True)

        # Draw a faint dashed horizontal line at every y tick
        for tick in x.get_yticks():
            x.axhline(y=tick, linestyle='dashed', alpha=0.4, color='#eeeeee', zorder=1)

        # Remove title
        x.set_title("")

        # Axis labels
        x.set_xlabel(col, labelpad=20, weight='bold', size=12)
        x.set_ylabel("Bin counts", labelpad=20, weight='bold', size=12)

        # Thousands separator on the y tick labels
        x.yaxis.set_major_formatter(StrMethodFormatter('{x:,g}'))

In [None]:
# Total activity per PUDO intersection (dropoffs + pickups).
# .assign builds a new frame, so nothing is written into a slice of
# df_merge_gr (no SettingWithCopy warning).
df_pudo_layer = df_pudo_layer.assign(
    sum=df_pudo_layer['dcounts'] + df_pudo_layer['pcounts']
)
df_pudo_layer.head(10)

In [None]:
# Pickup-only and dropoff-only layers share the same bin count.
nbins = 10

pu_max = df_pu_layer['pcounts'].max()
print(pu_max)
print('pickups bin width: ' + repr(pu_max/nbins))
counts_hist(df_pu_layer, 'pcounts', nbins)

do_max = df_do_layer['dcounts'].max()
print(do_max)
print('dropoffs bin width: ' + repr(do_max/nbins))
counts_hist(df_do_layer, 'dcounts', nbins)

# The PUDO layer is plotted with more bins.
nbins = 28
pudo_max = df_pudo_layer['sum'].max()
print(pudo_max)
print('pudos bin width: ' + repr(pudo_max/nbins))
counts_hist(df_pudo_layer, 'sum', nbins)