In [1]:
import random
import pandas as pd
import geopandas as gpd
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import mplleaflet
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql import types as T
from shapely.geometry import Point, Polygon, LineString
from IPython.display import Markdown


palette = sns.color_palette("colorblind", 20)


def to_geo(df, x_field='longitude', y_field='latitude'):
    geometry = [Point(xy) for xy in zip(df[x_field], df[y_field])]
    return gpd.GeoDataFrame(df, geometry=geometry)


spark = (
    SparkSession.builder
        .master("local")
        .appName("TFL Notebook")
        .config('spark.executor.memory', '8G')
        .config('spark.driver.memory', '16G')
        .config('spark.driver.maxResultSize', '10G')
        .config("spark.sql.crossJoin.enabled", "true")
        .getOrCreate()
)

BIKE_POINTS_FILE = "../data/bike-points.csv"

schema = T.StructType([
    T.StructField("idx",       T.IntegerType(), False),
    T.StructField("id",        T.IntegerType(), False),
    T.StructField("name",      T.StringType(),  False),
    T.StructField("latitude",  T.DoubleType(),  False),
    T.StructField("longitude", T.DoubleType(),  False),
    T.StructField("osgb_x",    T.DoubleType(),  False),
    T.StructField("osgb_y",    T.DoubleType(),  False),
    T.StructField("numdocks",  T.LongType(),    False),
    T.StructField("num_bikes", T.LongType(),    False),
    T.StructField("num_empty", T.LongType(),    False)
])
bike_points = spark.read.csv(BIKE_POINTS_FILE, schema=schema, header='true', mode="PERMISSIVE")
bike_points.createOrReplaceTempView("bike_points")

trips = spark.read.parquet("../data/parquet_trip")
trips.createOrReplaceTempView("trips")

trip_counts = spark.sql("""
    select start_station_name, end_station_name, count(*) as trip_count, sum(duration) as duration
    from trips 
    group by start_station_name, end_station_name
""")
trip_counts.createOrReplaceTempView("trip_counts")

# Get the names only
spark.sql("""
    select name from bike_points
""").createOrReplaceTempView("bike_point_names")

# Self cross join to get all the name:name combos
spark.sql("""
    select 
        a.name as start_station_name, 
        b.name as end_station_name
    from bike_point_names a 
    cross join bike_point_names b
""").createOrReplaceTempView("possible_trips")


dead_trips = spark.sql("""
    select p.start_station_name, p.end_station_name
    from possible_trips p
    left outer join trip_counts c on (p.start_station_name = c.start_station_name and p.end_station_name = c.end_station_name)
    where c.end_station_name is null
""").sample(False, 0.01).collect()


In [2]:
c = bike_points.count()
expected = c * c

actual = trip_counts.count()

t = random.choice(dead_trips)

In [14]:
tweet =("""
Did you know, out of the {0:,} possible trips on @SantanderCycles (there are {1} stations) a whopping {2:,} have never been recorded?

Why not be the first to cycle from {3} to {4}?

#OpenData #DataViz #PySpark #Maps""".format(expected, c, expected - actual, t["start_station_name"], t["end_station_name"]))


In [4]:

df = to_geo(spark.sql('''
    select * from bike_points where name = "{0}" or name = "{1}"
'''
            .format(t["start_station_name"], t["end_station_name"]))
            .toPandas()
           )

p1 = tuple(df.loc[df['name'] == t["start_station_name"]][["longitude", "latitude"]].values[0])
p2 = tuple(df.loc[df['name'] == t["end_station_name"]][["longitude", "latitude"]].values[0])

line = LineString([p1, p2])
line_gdf = gpd.GeoDataFrame(geometry=[line])


In [5]:
import googlemaps
import keyring
from datetime import datetime
import polyline

gmaps = googlemaps.Client(key=keyring.get_password('logicalgenetics', 'google'))

directions_result = gmaps.directions("{1}, {0}".format(*p1),
                                     "{1}, {0}".format(*p2),
                                     mode="bicycling",
                                     departure_time=datetime.now())

encoded = directions_result[0]['overview_polyline']['points']
points = [(lat, lon) for (lon, lat) in polyline.decode(encoded)]

route = gpd.GeoDataFrame(pd.DataFrame([LineString(points)], columns=['geometry']))


# Results

In [15]:
Markdown(tweet)


Did you know, out of the 614,656 possible trips on @SantanderCycles (there are 784 stations) a whopping 62,229 have never been recorded?

Why not be the first to cycle from Pott Street, Bethnal Green to Bell Street , Marylebone?

#OpenData #DataViz #PySpark #Maps

In [11]:
fig,ax=plt.subplots(figsize=(10,10))
#line_gdf.plot(ax=ax, color='purple')

route.plot(ax=ax, color='purple', linewidth=6.0)

df.plot(ax=ax, markersize=2000, marker='*', color='purple')
mplleaflet.display(fig=fig, crs=df.crs, tiles='stamen_wc')
