In [1]:
# Stretch the notebook's page container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import random
from datetime import datetime

import geopandas as gpd
import googlemaps
import keyring
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import mplleaflet
import pandas as pd
import polyline
import seaborn as sns
from IPython.display import Markdown
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from shapely.geometry import Point, LineString

# Default plotting palette: 20 colours taken from seaborn's "colorblind" palette.
palette = sns.color_palette("colorblind", 20)


def to_geo(df, x_field='longitude', y_field='latitude'):
    """Wrap a DataFrame in a GeoDataFrame with one point geometry per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the coordinate columns.
    x_field : str
        Name of the column used as the x coordinate of each point.
    y_field : str
        Name of the column used as the y coordinate of each point.

    Returns
    -------
    geopandas.GeoDataFrame
        Built from ``df`` with the generated points as its geometry.
    """
    x_values = df[x_field]
    y_values = df[y_field]
    points = [Point(x, y) for x, y in zip(x_values, y_values)]
    return gpd.GeoDataFrame(df, geometry=points)


# Settings applied to the session builder, in the order they are listed.
SPARK_SETTINGS = {
    'spark.executor.memory': '8G',
    'spark.driver.memory': '16G',
    'spark.driver.maxResultSize': '10G',
    "spark.sql.crossJoin.enabled": "true",
}

# Local-master session named "TFL Notebook"; getOrCreate() reuses an existing
# session if one is already running in this kernel.
_session_builder = SparkSession.builder.master("local").appName("TFL Notebook")
for _setting, _value in SPARK_SETTINGS.items():
    _session_builder = _session_builder.config(_setting, _value)

spark = _session_builder.getOrCreate()

BIKE_POINTS_FILE = "../data/bike-points.csv"

# (column name, Spark type) for each column of the bike-points CSV, in file
# order. Every field is declared non-nullable.
_BIKE_POINT_COLUMNS = [
    ("idx",       T.IntegerType()),
    ("id",        T.IntegerType()),
    ("name",      T.StringType()),
    ("latitude",  T.DoubleType()),
    ("longitude", T.DoubleType()),
    ("osgb_x",    T.DoubleType()),
    ("osgb_y",    T.DoubleType()),
    ("numdocks",  T.LongType()),
    ("num_bikes", T.LongType()),
    ("num_empty", T.LongType()),
]
schema = T.StructType(
    [T.StructField(column, spark_type, False) for column, spark_type in _BIKE_POINT_COLUMNS]
)

# Read the docking-station table with the explicit schema and expose it to
# Spark SQL under the name "bike_points".
bike_points = spark.read.csv(
    BIKE_POINTS_FILE,
    schema=schema,
    header='true',
    mode="PERMISSIVE",
)
bike_points.createOrReplaceTempView("bike_points")

# Read the trip records from parquet and expose them to Spark SQL as "trips".
trips = spark.read.parquet("../data/parquet_trip")
trips.createOrReplaceTempView("trips")

# Per-bike, per-day activity: the summed trip duration divided by 60 and the
# number of trips, ordered with the busiest bike-days first.
# NOTE(review): the division by 60 presumably converts seconds to minutes -
# confirm the unit of trips.duration.
BUSY_BIKES_QUERY = """
    select (sum(duration) / 60) as duration, count(1) as trip_count, bike_id, start_year, start_month, start_day from trips
    group by bike_id, start_year, start_month, start_day
    order by trip_count desc
"""
TOP_BIKE_COUNT = 10

df = spark.sql(BUSY_BIKES_QUERY)
df.createOrReplaceTempView("busy_bikes")

# Only the leading rows are collected to the driver as a pandas frame.
top_bikes = df.limit(TOP_BIKE_COUNT).toPandas()

top_bikes

Unnamed: 0,duration,trip_count,bike_id,start_year,start_month,start_day
0,334.0,28,11706,2017,9,1
1,609.0,27,2768,2015,7,9
2,496.0,25,12324,2017,6,21
3,442.0,24,7200,2016,7,30
4,523.0,24,10996,2015,7,9
5,486.0,24,2177,2015,7,9
6,428.0,24,6525,2015,8,6
7,635.0,24,1295,2015,8,7
8,468.0,23,9416,2015,7,9
9,454.55,23,9202,2017,8,15


# Select Interesting Bike

In [5]:
# Take the first row of the top-bikes table as a plain {column: value} dict.
first_row_records = top_bikes.head(1).to_dict(orient='records')
my_bike = first_row_records[0]

my_bike

{'bike_id': 11706,
 'duration': 334.0,
 'start_day': 1,
 'start_month': 9,
 'start_year': 2017,
 'trip_count': 28}

In [None]:

# Identify the selected bike-day. The record's keys are the column names of the
# busy-bikes result (bike_id, start_year, start_month, start_day); reading
# 'year' / 'month' / 'day' would raise KeyError.
bike_id = my_bike['bike_id']
year = my_bike['start_year']
month = my_bike['start_month']
day = my_bike['start_day']

# Every trip made by that bike on that day, oldest first, with the coordinates
# of the start station (alias a) and end station (alias b) joined in from
# bike_points. The joins are inner joins, so a trip whose start or end station
# id has no bike_points row is not returned.
journeys = spark.sql("""
    select
     t.start_ts,
     t.start_station_name,
     t.end_ts,
     t.end_station_name,
     t.duration,
     a.latitude as start_lat,
     a.longitude as start_lon,
     b.latitude as end_lat,
     b.longitude as end_lon
    from trips t
    join bike_points a on (t.start_station_id = a.id)
    join bike_points b on (t.end_station_id = b.id)
    where bike_id = {bike_id}
      and start_year = {year}
      and start_month = {month}
      and start_day = {day}
    order by start_ts asc
""".format(bike_id=bike_id, year=year, month=month, day=day)).toPandas()

journeys

In [None]:
# Google Maps client; the API key is read from the system keyring rather than
# being stored in the notebook.
gmaps = googlemaps.Client(key=keyring.get_password('logicalgenetics', 'google'))

# One LineString per journey, in the same row order as `journeys`.
route_data = []

for _, row in journeys.iterrows():
    # Endpoints are held in (x, y) = (lon, lat) order.
    p1 = (row['start_lon'], row['start_lat'])
    p2 = (row['end_lon'], row['end_lat'])

    # The origin/destination strings are formatted as "lat, lon".
    directions_result = gmaps.directions("{1}, {0}".format(*p1),
                                         "{1}, {0}".format(*p2),
                                         mode="bicycling",
                                         departure_time=datetime.now())

    # The API can return an empty list when it finds no route; indexing [0]
    # unconditionally would raise IndexError and abort the whole loop.
    points = []
    if directions_result:
        encoded = directions_result[0]['overview_polyline']['points']
        # polyline.decode yields (lat, lon) pairs; flip them to (lon, lat).
        points = [(lon, lat) for (lat, lon) in polyline.decode(encoded)]

    if len(points) > 1:
        route_data.append(LineString(points))
    else:
        # No usable route: append a zero-length line at the start point so
        # route_data stays aligned one-to-one with the rows of `journeys`.
        route_data.append(LineString([p1, p1]))
        

In [None]:
fig, ax = plt.subplots(figsize=(14, 10))

# One colour per journey, so a route and its start/end markers share a colour.
# len(journeys) is the row count; journeys.count()[0] instead counted non-null
# values of the first column and relied on positional indexing of a
# label-indexed Series, which pandas has deprecated.
palette = sns.color_palette("colorblind", len(journeys))

# Start and end stations as point geometries with x = longitude, y = latitude.
starts = to_geo(journeys[['start_lon', 'start_lat']], x_field='start_lon', y_field='start_lat')
ends = to_geo(journeys[['end_lon', 'end_lat']], x_field='end_lon', y_field='end_lat')

routes = gpd.GeoDataFrame(pd.DataFrame(route_data, columns=['geometry']))
routes.plot(ax=ax, color=palette, linewidth=4.0)

# Squares mark journey ends, circles mark journey starts. Starts are drawn
# last, so they sit on top of the ends.
ends.plot(ax=ax, markersize=100, marker='s', color=palette)
starts.plot(ax=ax, markersize=75, marker='o', color=palette)

# Render the matplotlib figure on an interactive Leaflet map.
mplleaflet.display(fig=fig, crs=starts.crs, tiles='cartodb_positron')