In [12]:
# Inject a CSS override so the notebook's `.container` element uses the full
# page width. `display` and `HTML` are imported from the public
# `IPython.display` module; importing them from `IPython.core.display` is
# deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [1]:
import random
from datetime import datetime

import geopandas as gpd
import googlemaps
import keyring
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import mplleaflet
import pandas as pd
import polyline
import seaborn as sns
from IPython.display import Markdown
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from shapely.geometry import Point, LineString

# Module-wide default colours: 20 entries from seaborn's "colorblind" palette.
palette = sns.color_palette(palette="colorblind", n_colors=20)


def to_geo(df, x_field='longitude', y_field='latitude'):
    """Return a GeoDataFrame built from ``df`` with one Point per row.

    Parameters
    ----------
    df : DataFrame-like
        Table holding the coordinate columns.
    x_field : str
        Column supplying each point's x value (default ``'longitude'``).
    y_field : str
        Column supplying each point's y value (default ``'latitude'``).

    Returns
    -------
    geopandas.GeoDataFrame
        ``df``'s columns plus a geometry of ``Point(x, y)`` values. No CRS is
        assigned here.
    """
    coordinate_pairs = zip(df[x_field], df[y_field])
    point_geometries = list(map(Point, coordinate_pairs))
    return gpd.GeoDataFrame(df, geometry=point_geometries)


# Build the notebook's Spark session against a "local" master. The options
# are applied to the builder one at a time, in the order listed, before the
# session is obtained with getOrCreate().
spark_builder = SparkSession.builder.master("local").appName("TFL Notebook")
for option, value in (
    ('spark.executor.memory', '8G'),
    ('spark.driver.memory', '16G'),
    ('spark.driver.maxResultSize', '10G'),
    ("spark.sql.crossJoin.enabled", "true"),
):
    spark_builder = spark_builder.config(option, value)
spark = spark_builder.getOrCreate()

BIKE_POINTS_FILE = "../data/bike-points.csv"

# Explicit schema for the bike-point CSV. Every column is declared with
# nullable=False (the third argument to add()).
schema = (
    T.StructType()
        .add("idx",       T.IntegerType(), False)
        .add("id",        T.IntegerType(), False)
        .add("name",      T.StringType(),  False)
        .add("latitude",  T.DoubleType(),  False)
        .add("longitude", T.DoubleType(),  False)
        .add("osgb_x",    T.DoubleType(),  False)
        .add("osgb_y",    T.DoubleType(),  False)
        .add("numdocks",  T.LongType(),    False)
        .add("num_bikes", T.LongType(),    False)
        .add("num_empty", T.LongType(),    False)
)

# Read the CSV (first line is a header) in PERMISSIVE mode and expose it to
# Spark SQL under the name "bike_points".
bike_points = (
    spark.read
        .schema(schema)
        .option("header", 'true')
        .option("mode", "PERMISSIVE")
        .csv(BIKE_POINTS_FILE)
)
bike_points.createOrReplaceTempView("bike_points")

# Load the trip records from parquet and make them queryable as "trips".
trips = spark.read.parquet("../data/parquet_trip")
trips.createOrReplaceTempView("trips")

# One row per (bike, calendar day): the summed duration divided by 60 and the
# number of trips, sorted so the days with the most trips come first.
busy_bikes_query = """
    select (sum(duration) / 60) as duration, count(1) as trip_count, bike_id, start_year, start_month, start_day from trips
    group by bike_id, start_year, start_month, start_day
    order by trip_count desc
"""
df = spark.sql(busy_bikes_query)
df.createOrReplaceTempView("busy_bikes")

# Pull only the top ten rows back into pandas for display.
df.limit(10).toPandas()

Unnamed: 0,duration,trip_count,bike_id,start_year,start_month,start_day
0,334.0,28,11706,2017,9,1
1,609.0,27,2768,2015,7,9
2,496.0,25,12324,2017,6,21
3,442.0,24,7200,2016,7,30
4,523.0,24,10996,2015,7,9
5,486.0,24,2177,2015,7,9
6,428.0,24,6525,2015,8,6
7,635.0,24,1295,2015,8,7
8,468.0,23,9416,2015,7,9
9,454.55,23,9202,2017,8,15


In [2]:
# Bike and calendar day whose journeys are traced. These values correspond to
# one of the rows in the busy_bikes ranking; change them here rather than
# editing the SQL text.
TRACKED_BIKE_ID = 12324
TRACKED_YEAR = 2017
TRACKED_MONTH = 6
TRACKED_DAY = 21

# Every trip made by the tracked bike on the tracked day, in chronological
# order, with start/end station coordinates looked up from bike_points.
# Both joins are inner joins, so a trip whose start or end station id is
# missing from bike_points is dropped from the result.
# The `:d` format specs only accept integers, so nothing but a number can be
# spliced into the query.
journeys = spark.sql("""
    select
     t.start_ts,
     t.start_station_name,
     t.end_ts,
     t.end_station_name,
     t.duration,
     a.latitude as start_lat,
     a.longitude as start_lon,
     b.latitude as end_lat,
     b.longitude as end_lon
    from trips t
    join bike_points a on (t.start_station_id = a.id)
    join bike_points b on (t.end_station_id = b.id)
    where t.bike_id = {bike_id:d}
      and t.start_year = {year:d}
      and t.start_month = {month:d}
      and t.start_day = {day:d}
    order by t.start_ts asc
""".format(
    bike_id=TRACKED_BIKE_ID,
    year=TRACKED_YEAR,
    month=TRACKED_MONTH,
    day=TRACKED_DAY,
)).toPandas()

journeys

Unnamed: 0,start_ts,start_station_name,end_ts,end_station_name,duration,start_lat,start_lon,end_lat,end_lon
0,2017-06-21 08:50:00,"Ackroyd Drive, Bow",2017-06-21 09:07:00,"Nesham Street, Wapping",1020,51.520398,-0.026768,51.507131,-0.06691
1,2017-06-21 09:47:00,"Nesham Street, Wapping",2017-06-21 09:51:00,"Watney Street, Shadwell",240,51.507131,-0.06691,51.511542,-0.056667
2,2017-06-21 09:56:00,"Watney Street, Shadwell",2017-06-21 10:07:00,"Granby Street, Shoreditch",660,51.511542,-0.056667,51.525645,-0.069543
3,2017-06-21 10:11:00,"Granby Street, Shoreditch",2017-06-21 10:36:00,"Lightermans Road, Millwall",1500,51.525645,-0.069543,51.499041,-0.020157
4,2017-06-21 10:53:00,"Lightermans Road, Millwall",2017-06-21 11:30:00,"Baylis Road, Waterloo",2220,51.499041,-0.020157,51.501444,-0.110699
5,2017-06-21 12:05:00,"Baylis Road, Waterloo",2017-06-21 12:43:00,"South Wharf Road, Paddington",2280,51.501444,-0.110699,51.517335,-0.17581
6,2017-06-21 12:46:00,"South Wharf Road, Paddington",2017-06-21 12:58:00,"Exhibition Road, Knightsbridge",720,51.517335,-0.17581,51.499917,-0.174554
7,2017-06-21 13:16:00,"Exhibition Road, Knightsbridge",2017-06-21 13:19:00,"South Kensington Station, South Kensington",180,51.499917,-0.174554,51.494412,-0.173881
8,2017-06-21 13:24:00,"South Kensington Station, South Kensington",2017-06-21 13:33:00,"Wright's Lane, Kensington",540,51.494412,-0.173881,51.500397,-0.193068
9,2017-06-21 13:38:00,"Wright's Lane, Kensington",2017-06-21 13:43:00,"Emperor's Gate, South Kensington",300,51.500397,-0.193068,51.495362,-0.185296


In [3]:
# Google Maps client; the API key is read from the system keyring rather than
# being stored in the notebook.
gmaps = googlemaps.Client(key=keyring.get_password('logicalgenetics', 'google'))

# One LineString per journey for which a usable cycling route was returned.
# Coordinates are stored as (x, y) = (longitude, latitude).
route_data = []

for journey in journeys.itertuples(index=False):
    # The Directions API takes "latitude, longitude" strings.
    origin = "{}, {}".format(journey.start_lat, journey.start_lon)
    destination = "{}, {}".format(journey.end_lat, journey.end_lon)

    directions_result = gmaps.directions(origin,
                                         destination,
                                         mode="bicycling",
                                         departure_time=datetime.now())

    if not directions_result:
        # No route came back for this station pair; indexing [0] would raise
        # IndexError, so skip the journey instead.
        continue

    encoded = directions_result[0]['overview_polyline']['points']
    # polyline.decode yields (latitude, longitude) pairs; swap them so the
    # LineString is built from (longitude, latitude) like the station points.
    points = [(lon, lat) for (lat, lon) in polyline.decode(encoded)]

    # A LineString needs at least two vertices.
    if len(points) > 1:
        route_data.append(LineString(points))

In [11]:
fig, ax = plt.subplots(figsize=(14, 10))

# One colour per journey. len() gives the row count directly, whereas
# journeys.count()[0] took the non-null count of the first column via an
# integer lookup on a label-indexed Series.
palette = sns.color_palette("colorblind", len(journeys))

# Start station of every journey as point geometries (x = longitude, y = latitude).
df = to_geo(journeys[['start_lon', 'start_lat']], x_field='start_lon', y_field='start_lat')

# NOTE: route_data only contains journeys that produced a route with at least
# two points, so route colours line up with the start-point colours only when
# no journey was skipped.
routes = gpd.GeoDataFrame(pd.DataFrame(route_data, columns=['geometry']))
routes.plot(ax=ax, color=palette, linewidth=4.0)

df.plot(ax=ax, markersize=50, marker='o', color=palette)
mplleaflet.display(fig=fig, crs=df.crs, tiles='cartodb_positron')