In [1]:
from collections import namedtuple
from datetime import datetime, timedelta
from s2 import *


Pickup = namedtuple('Pickup', ['time', 'pos'])

time_fmt = '%Y-%m-%d %H:%M:%S'


# Only test one part of the whole data.
trip_data_file = 'data/trip_data_1.csv'

trip_data = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='true') \
    .load(trip_data_file)

S2_LEVEL = 13
    
    
def get_s2_id(lat, lon):
    # Sanity check for NYC.
    if 30 < lat < 50 and -80 < lon < -70:
        latlng = S2LatLng.FromDegrees(lat, lon)
        return S2CellId.FromLatLng(latlng).parent(S2_LEVEL).id()
    else:
        # Wrong lat & lon.
        return 0
    

# Use level-13 cell as an approximate 0.2 mile circle,
# without considering the actual position of pickup.
def key_by_region(row):
    lat = float(row['pickup_latitude'])
    lon = float(row['pickup_longitude'])
    tm = datetime.strptime(row['pickup_datetime'], time_fmt)
    # Position could be used later to verify the distance.
    pickup = Pickup(tm, (lat, lon))
    return (get_s2_id(lat, lon), pickup)
    
    
# Ignore pickups with error lat/lon.
regions = trip_data \
    .map(key_by_region) \
    .filter(lambda kv: kv[0] != 0) \
    .groupByKey()

Note: Rationale of using level 13 cell to represent 0.2 mile-radius circle:

![s2](http://i.imgur.com/AgAHlX0.png)

In [2]:
def calculate_matched_pickups(pickups):
    # Round time to nearest 5th min.
    matched = 0
    slots_5min = set()
    for p in pickups:
        tm = (p.time - 
              timedelta(minutes=p.time.minute % 5, 
                        seconds=p.time.second, 
                        microseconds=p.time.microsecond))
        if tm in slots_5min:
            matched += 1
        else:
            slots_5min.add(tm)
    return matched + 1
        

matched_pickups = regions \
    .mapValues(calculate_matched_pickups) \
    .values() \
    .sum()
print('Total matched pickups: {}'.format(matched_pickups))

total_pickups = trip_data.count()
print('Number of all pickups: {}'.format(total_pickups))

percent = float(matched_pickups) / total_pickups
print('Percent of matched pickups: {}'.format(percent))

Total matched pickups: 13749606
Number of all pickups: 14776615
Percent of matched pickups: 0.93049768164089


Such high percentage may be caused by:

1. bias of the data. (Is it possible the CSV files are segmented by the location?)
2. oversized S2 cells.
3. wrong algorithm.