# Create geojson file from list of PUDO coords
See /home/test/Documents/PROJECTS/VFH/iPython_Notebooks/dv_pudo_profiles.ipynb for more info.  

ref: https://geoffboeing.com/2015/10/exporting-python-data-geojson/  
Huom: `with open(output_filename, 'wb') as output_file:` should be `'w'` not `'wb'`  

In [1]:
import sys
sys.executable

'/home/test/anaconda3/envs/vfh/bin/python'

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import pandas.io.sql as pandasql
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker

from IPython.display import HTML, display
import datetime
import matplotlib.dates as mdates
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import os

from datetime import datetime

import json

# Functions

In [3]:
# https://geoffboeing.com/2015/10/exporting-python-data-geojson/
def df_to_geojson(df, properties, lat='latitude', lon='longitude'):
    geojson = {'type':'FeatureCollection', 'features':[]}
    for _, row in df.iterrows():
        feature = {'type':'Feature',
                   'properties':{},
                   'geometry':{'type':'Point',
                               'coordinates':[]}}
        feature['geometry']['coordinates'] = [row[lon],row[lat]]
        for prop in properties:
            feature['properties'][prop] = row[prop]
        geojson['features'].append(feature)
    return geojson

# GeoJSON properties for file

In [4]:
# Define the properties of the PUDO geojson file
cols = ['type','dow','timewindow','latitude','longitude','intersection_id','counts']

# Variables

In [5]:
ward=1
this_day = "Monday"
this_tod = "amPeak"

# Read PUDO data from csv file

In [6]:
fname = '../../../Documents/PROJECTS/VFH/COT_PAGE/test_data/all_pudo_mon_ampeak.csv'
# fname = '../../../Documents/PROJECTS/VFH/COT_PAGE/dummydata/test_data/w1_pudo_092018_latlon.csv'
# fname = '~/Documents/PROJECTS/BDITTO/VFH/DATA_STORY/DATA/from_pgAdmin//w1_pudo_092018_latlon.csv'

df = pd.read_csv(fname)

print(df.shape)
df.head(5)

(47487, 12)


Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_ward2018,dropoff_ward2018,pickup_gc_intersection_id,dropoff_gc_intersection_id,dow,timewindow,lat_pickup,lon_pickup,lat_dropoff,lon_dropoff
0,2018-09-03 07:00:00,2018-09-03 07:04:57,14.0,14,13463478,30104238,Monday,amPeak,43.669251,-79.335609,43.665462,-79.354768
1,2018-09-03 07:00:00,2018-09-03 07:06:18,15.0,15,13455929,13453408,Monday,amPeak,43.713786,-79.3922,43.730241,-79.381454
2,2018-09-03 07:00:00,2018-09-03 07:06:18,15.0,15,13455929,13453408,Monday,amPeak,43.713786,-79.3922,43.730241,-79.381454
3,2018-09-03 07:00:00,2018-09-03 07:06:38,4.0,4,13465701,13467084,Monday,amPeak,43.657544,-79.465249,43.648833,-79.450028
4,2018-09-03 07:00:00,2018-09-03 07:06:07,10.0,10,13467512,13466378,Monday,amPeak,43.645373,-79.392061,43.652152,-79.382336


In [7]:
list(df)

['pickup_datetime',
 'dropoff_datetime',
 'pickup_ward2018',
 'dropoff_ward2018',
 'pickup_gc_intersection_id',
 'dropoff_gc_intersection_id',
 'dow',
 'timewindow',
 'lat_pickup',
 'lon_pickup',
 'lat_dropoff',
 'lon_dropoff']

In [8]:
df.loc[df['pickup_gc_intersection_id']==13454935]

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_ward2018,dropoff_ward2018,pickup_gc_intersection_id,dropoff_gc_intersection_id,dow,timewindow,lat_pickup,lon_pickup,lat_dropoff,lon_dropoff
30245,2018-09-10 08:00:00,2018-09-10 08:08:53,1.0,1,13454935,20232410,Monday,amPeak,43.722744,-79.598217,43.728927,-79.60573
32321,2018-09-10 08:00:00,2018-09-10 08:05:53,1.0,1,13454935,13459809,Monday,amPeak,43.722744,-79.598217,43.693351,-79.58852


In [9]:
df.loc[df['dropoff_gc_intersection_id']==13454935]

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_ward2018,dropoff_ward2018,pickup_gc_intersection_id,dropoff_gc_intersection_id,dow,timewindow,lat_pickup,lon_pickup,lat_dropoff,lon_dropoff
40288,2018-09-03 08:00:00,2018-09-03 08:03:28,1.0,1,13455196,13454935,Monday,amPeak,43.72126,-79.615747,43.722744,-79.598217


# Housekeeping

## 1. Round lat, lon to 4 decimal places

In [10]:
n=4 # number of decimals to round to

df['lat_pickup'] = df['lat_pickup'].round(decimals=n)
df['lon_pickup'] = df['lon_pickup'].round(decimals=n)

df['lat_dropoff'] = df['lat_dropoff'].round(decimals=n)
df['lon_dropoff'] = df['lon_dropoff'].round(decimals=n)

## 2. Filter out rows where neither pickup or dropoff wards are the ward of interest
Make col `keep` = 1 for where pick-up or for drop-off wards are the ward of interest

In [11]:
# Assign a Pickup or Dropoff in column 'type'
df['keep'] = np.nan
df['latitude'] = np.nan
df['longitude'] = np.nan

df.loc[(df['pickup_ward2018']==ward) | (df['dropoff_ward2018']==ward), 'keep'] = 1

In [12]:
# Check
df['keep'].unique()

array([nan,  1.])

In [13]:
# Keep only rows where `keep` is not null
df = df[df['keep'].notnull()]

In [14]:
# Check
df['keep'].unique()

array([1.])

In [15]:
print(df.shape)

(2208, 15)


In [16]:
df.head(5)

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_ward2018,dropoff_ward2018,pickup_gc_intersection_id,dropoff_gc_intersection_id,dow,timewindow,lat_pickup,lon_pickup,lat_dropoff,lon_dropoff,keep,latitude,longitude
15,2018-09-03 07:00:00,2018-09-03 07:08:33,1.0,7,30033811,13450342,Monday,amPeak,43.7228,-79.5451,43.7521,-79.5424,1.0,,
31,2018-09-03 07:00:00,2018-09-03 07:22:24,1.0,17,30017577,13445287,Monday,amPeak,43.7167,-79.6017,43.7854,-79.3292,1.0,,
53,2018-09-03 08:00:00,2018-09-03 08:12:07,1.0,7,13453157,13450560,Monday,amPeak,43.7343,-79.5569,43.7501,-79.5114,1.0,,
113,2018-09-03 08:00:00,2018-09-03 08:06:46,1.0,1,13454852,13451809,Monday,amPeak,43.7235,-79.619,43.7437,-79.6074,1.0,,
122,2018-09-03 07:00:00,2018-09-03 07:24:54,1.0,11,13460801,13463257,Monday,amPeak,43.6879,-79.5868,43.6713,-79.3872,1.0,,


## 3a. Split df into two dfs for pickups and dropoffs separately
one where pickup ward == ward, the other where dropoff_ward == ward  

Assign `type` = 1 for pickup df, 5 for dropoff df.

In [17]:
df_pu = pd.DataFrame()
df_pu = df.loc[df['pickup_ward2018']==ward]
df_pu['type'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
df_do = pd.DataFrame()
df_do = df.loc[df['dropoff_ward2018']==ward]
df_do['type'] = 5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
df_pu.head(4)

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_ward2018,dropoff_ward2018,pickup_gc_intersection_id,dropoff_gc_intersection_id,dow,timewindow,lat_pickup,lon_pickup,lat_dropoff,lon_dropoff,keep,latitude,longitude,type
15,2018-09-03 07:00:00,2018-09-03 07:08:33,1.0,7,30033811,13450342,Monday,amPeak,43.7228,-79.5451,43.7521,-79.5424,1.0,,,1
31,2018-09-03 07:00:00,2018-09-03 07:22:24,1.0,17,30017577,13445287,Monday,amPeak,43.7167,-79.6017,43.7854,-79.3292,1.0,,,1
53,2018-09-03 08:00:00,2018-09-03 08:12:07,1.0,7,13453157,13450560,Monday,amPeak,43.7343,-79.5569,43.7501,-79.5114,1.0,,,1
113,2018-09-03 08:00:00,2018-09-03 08:06:46,1.0,1,13454852,13451809,Monday,amPeak,43.7235,-79.619,43.7437,-79.6074,1.0,,,1


In [20]:
# Fill `latitude` and `longitude` col with pickup or dropoff lat/lon
df_pu['latitude'] = df_pu['lat_pickup']
df_pu['longitude'] = df_pu['lon_pickup']

df_do['latitude'] = df_do['lat_dropoff']
df_do['longitude'] = df_do['lon_dropoff']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pa

In [21]:
df_pu.head(4)

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_ward2018,dropoff_ward2018,pickup_gc_intersection_id,dropoff_gc_intersection_id,dow,timewindow,lat_pickup,lon_pickup,lat_dropoff,lon_dropoff,keep,latitude,longitude,type
15,2018-09-03 07:00:00,2018-09-03 07:08:33,1.0,7,30033811,13450342,Monday,amPeak,43.7228,-79.5451,43.7521,-79.5424,1.0,43.7228,-79.5451,1
31,2018-09-03 07:00:00,2018-09-03 07:22:24,1.0,17,30017577,13445287,Monday,amPeak,43.7167,-79.6017,43.7854,-79.3292,1.0,43.7167,-79.6017,1
53,2018-09-03 08:00:00,2018-09-03 08:12:07,1.0,7,13453157,13450560,Monday,amPeak,43.7343,-79.5569,43.7501,-79.5114,1.0,43.7343,-79.5569,1
113,2018-09-03 08:00:00,2018-09-03 08:06:46,1.0,1,13454852,13451809,Monday,amPeak,43.7235,-79.619,43.7437,-79.6074,1.0,43.7235,-79.619,1


In [22]:
df_do.head(4)

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_ward2018,dropoff_ward2018,pickup_gc_intersection_id,dropoff_gc_intersection_id,dow,timewindow,lat_pickup,lon_pickup,lat_dropoff,lon_dropoff,keep,latitude,longitude,type
113,2018-09-03 08:00:00,2018-09-03 08:06:46,1.0,1,13454852,13451809,Monday,amPeak,43.7235,-79.619,43.7437,-79.6074,1.0,43.7437,-79.6074,5
174,2018-09-10 08:00:00,2018-09-10 08:18:41,2.0,1,13468496,20232410,Monday,amPeak,43.6397,-79.5629,43.7289,-79.6057,1.0,43.7289,-79.6057,5
175,2018-09-10 08:00:00,2018-09-10 08:18:41,2.0,1,13468496,20232410,Monday,amPeak,43.6397,-79.5629,43.7289,-79.6057,1.0,43.7289,-79.6057,5
202,2018-09-10 07:00:00,2018-09-10 07:12:37,1.0,1,13450427,13453918,Monday,amPeak,43.7521,-79.589,43.7301,-79.5886,1.0,43.7301,-79.5886,5


## 3b. Aggregate pickups and dropoffs

In [23]:
df_pu_agg=pd.DataFrame()

df_pu_agg = df_pu.groupby(
    ['type','dow','timewindow','latitude','longitude','pickup_gc_intersection_id']
    ).size().reset_index(name='counts')

df_pu_agg.head(3)

Unnamed: 0,type,dow,timewindow,latitude,longitude,pickup_gc_intersection_id,counts
0,1,Monday,amPeak,43.6743,-79.5654,13463160,3
1,1,Monday,amPeak,43.6757,-79.5684,13462944,64
2,1,Monday,amPeak,43.6763,-79.5704,13462841,2


In [24]:
# Check
df_pu_agg.loc[df_pu_agg['pickup_gc_intersection_id']==13454935]

Unnamed: 0,type,dow,timewindow,latitude,longitude,pickup_gc_intersection_id,counts
187,1,Monday,amPeak,43.7227,-79.5982,13454935,2


In [25]:
df_do_agg=pd.DataFrame()

df_do_agg = df_do.groupby(
    ['type','dow','timewindow','latitude','longitude','dropoff_gc_intersection_id']
    ).size().reset_index(name='counts')

df_do_agg.head(3)

Unnamed: 0,type,dow,timewindow,latitude,longitude,dropoff_gc_intersection_id,counts
0,5,Monday,amPeak,43.6709,-79.5904,13463803,1
1,5,Monday,amPeak,43.6727,-79.5921,13463477,2
2,5,Monday,amPeak,43.6741,-79.5889,30029322,11


In [26]:
# Check
df_do_agg.loc[df_do_agg['dropoff_gc_intersection_id']==13454935]

Unnamed: 0,type,dow,timewindow,latitude,longitude,dropoff_gc_intersection_id,counts
167,5,Monday,amPeak,43.7227,-79.5982,13454935,1


## 4. Make a common table on common columns

### a. All points
Concat `df_pu` and `df_do`.

In [27]:
# First rename intersection id to common name to allow concat
df_pu = df_pu.rename(columns={'pickup_gc_intersection_id': 'intersection_id'})
df_do = df_do.rename(columns={'dropoff_gc_intersection_id': 'intersection_id'})

In [28]:
df_pudo=pd.DataFrame()

df_pudo=pd.concat([
    df_pu[['type','dow','timewindow','latitude','longitude','intersection_id']], 
    df_do[['type','dow','timewindow','latitude','longitude','intersection_id']]
    ])

df_pudo.head(4)

Unnamed: 0,type,dow,timewindow,latitude,longitude,intersection_id
15,1,Monday,amPeak,43.7228,-79.5451,30033811
31,1,Monday,amPeak,43.7167,-79.6017,30017577
53,1,Monday,amPeak,43.7343,-79.5569,13453157
113,1,Monday,amPeak,43.7235,-79.619,13454852


In [29]:
# Check
df_pudo.loc[df_pudo['intersection_id']==13454935]

Unnamed: 0,type,dow,timewindow,latitude,longitude,intersection_id
30245,1,Monday,amPeak,43.7227,-79.5982,13454935
32321,1,Monday,amPeak,43.7227,-79.5982,13454935
40288,5,Monday,amPeak,43.7227,-79.5982,13454935


### a. Aggregates
Concat `df_pu_agg` and `df_do_agg`.

In [30]:
# First rename intersection id to common name to allow concat
df_pu_agg = df_pu_agg.rename(columns={'pickup_gc_intersection_id': 'intersection_id'})
df_do_agg = df_do_agg.rename(columns={'dropoff_gc_intersection_id': 'intersection_id'})

In [31]:
df_pudo_agg=pd.DataFrame()

df_pudo_agg=pd.concat([df_pu_agg[cols], df_do_agg[cols] ])

df_pudo_agg.head(4)

Unnamed: 0,type,dow,timewindow,latitude,longitude,intersection_id,counts
0,1,Monday,amPeak,43.6743,-79.5654,13463160,3
1,1,Monday,amPeak,43.6757,-79.5684,13462944,64
2,1,Monday,amPeak,43.6763,-79.5704,13462841,2
3,1,Monday,amPeak,43.6768,-79.5642,13462745,5


In [32]:
# Check
df_pudo_agg.loc[df_pudo_agg['intersection_id']==13454935]

Unnamed: 0,type,dow,timewindow,latitude,longitude,intersection_id,counts
187,1,Monday,amPeak,43.7227,-79.5982,13454935,2
167,5,Monday,amPeak,43.7227,-79.5982,13454935,1


In [33]:
df=

SyntaxError: invalid syntax (<ipython-input-33-02c2a1c0215a>, line 1)

# Save to geojson

## All points

In [None]:
geojson = df_to_geojson(df_pudo, ['dow', 'timewindow', 'type'])

## Aggregated points

In [None]:
geojson_agg = df_to_geojson(df_pudo_agg, cols)

# Write geojson to file

## All points

In [None]:
output_filename = '../src/geojson/w1_092018_' + this_day + '_' + this_tod + '.geojson'
# output_filename = 'w1_092018_' + this_day + '_' + this_tod + '.geojson'

with open(output_filename, 'w') as output_file:
    output_file.write('')
    json.dump(geojson, output_file, indent=2) 

## Aggregated points

In [None]:
output_filename = '../src/geojson/w1_092018_' + this_day + '_' + this_tod + '_agg.geojson'
# output_filename = 'w1_092018_' + this_day + '_' + this_tod + '_agg.geojson'

with open(output_filename, 'w') as output_file:
    output_file.write('')
    json.dump(geojson_agg, output_file, indent=2) 

`
$ ls -lh src/geojson/
-rw-r--r-- 1 test test 328K Dec  4 15:46 w1_092018_Monday_amPeak_agg.geojson
-rw-r--r-- 1 test test 753K Dec  4 13:51 w1_092018_Monday_amPeak.geojson
`
