<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#ADD-GPS-TO-STOPS" data-toc-modified-id="ADD-GPS-TO-STOPS-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>ADD GPS TO STOPS</a></span></li><li><span><a href="#CREATE-CLUSTERS" data-toc-modified-id="CREATE-CLUSTERS-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>CREATE CLUSTERS</a></span></li><li><span><a href="#MAP-STOPS-TO-CLUSTERS" data-toc-modified-id="MAP-STOPS-TO-CLUSTERS-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>MAP STOPS TO CLUSTERS</a></span></li><li><span><a href="#WRITE-DICTIONARY-TO-JSON-FILE" data-toc-modified-id="WRITE-DICTIONARY-TO-JSON-FILE-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>WRITE DICTIONARY TO JSON FILE</a></span></li><li><span><a href="#INSPECT-DICTIONARY" data-toc-modified-id="INSPECT-DICTIONARY-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>INSPECT DICTIONARY</a></span></li></ul></div>

In [1]:
from IPython.core.display import display, HTML
display(HTML('<style>.container {width:80% !important;}</style>'))

In [2]:
import pandas as pd
import numpy as np
from math import sin, cos
import matplotlib.pyplot as plt
from haversine import haversine, Unit
from sklearn import preprocessing
from sklearn.cluster import DBSCAN
import json

# ADD GPS TO STOPS

In [3]:
# load the latitude and longitude per busstop dictionary

json_path = '/Users/davidodwyer/Documents/studyCS/Semester_3/data/derived_data/db_stop_latlon.json'
json_obj = open(json_path)
json_obj = json_obj.read()
stops_latlon = json.loads(json_obj)
stops_latlon = {int(k): stops_latlon[k] for k in stops_latlon.keys()}

In [4]:
stops_latlon[10]

[53.35338694, -6.265383889]

In [5]:
type(stops_latlon[10])

list

In [6]:
stops_lat = {k:v[0] for k, v in stops_latlon.items()}
stops_lon = {k:v[1] for k, v in stops_latlon.items()}

In [8]:
def add_stop_lat_lon_features(df):
    
    """
    Function adds latitude and longitude featuers to 
    existing dataframe.
    """
    
    df['stop_lat'] = df.STOPPOINTID
    df['stop_lon'] = df.STOPPOINTID
    
    df = df.astype({
        'stop_lat':'int32',
        'stop_lon':'int32'
    })
    
    # for stops in dictionary and df, map; retain values
    # for stops not in dictionary
    
    df['stop_lat'] = df['stop_lat'].map(stops_lat).fillna(df['stop_lat'])
    df['stop_lon'] = df['stop_lon'].map(stops_lon).fillna(df['stop_lat'])
      
    # stops in the dublin but dataset and in the mapping dict
    stops_in_current_dataframe = list(set(df.STOPPOINTID.unique()).intersection(set(stops_latlon.keys())))
    
    # stops in dublin bus dataset but not in mapping dict
    stops_not_in_dict = list(set(df.STOPPOINTID.unique()).difference(set(stops_latlon.keys())))
    
    for stop in stops_not_in_dict:
        while stop not in stops_in_current_dataframe:
            stop -= 1
        df.loc[df.STOPPOINTID == stop, 'stop_lat'] = stops_latlon[stop][0]
        df.loc[df.STOPPOINTID == stop, 'stop_lon'] = stops_latlon[stop][1]
    
    return df

In [9]:
path = '/tmp/ssh_mount/data/dataframes/020719_postassessquality_leavetimes.feather'

df = pd.read_feather(path)

In [10]:
df.drop([
    'PROGRNUMBER',
    'VEHICLEID',
    'PLANNEDTIME_DEP',
    'ACTUALTIME_DEP'
], axis=1, inplace=True)

In [11]:
df.head(2)

Unnamed: 0,DAYOFSERVICE,TRIPID,STOPPOINTID,PLANNEDTIME_ARR,ACTUALTIME_ARR
0,2018-01-01,5972116,119,48030,48012
1,2018-01-01,5966674,119,54001,54023


In [12]:
# add latitude and longitude to the dataframe

df = add_stop_lat_lon_features(df)

In [13]:
df.head(2)

Unnamed: 0,DAYOFSERVICE,TRIPID,STOPPOINTID,PLANNEDTIME_ARR,ACTUALTIME_ARR,stop_lat,stop_lon
0,2018-01-01,5972116,119,48030,48012,53.375081,-6.250821
1,2018-01-01,5966674,119,54001,54023,53.375081,-6.250821


In [14]:
df.isna().sum()

DAYOFSERVICE       0
TRIPID             0
STOPPOINTID        0
PLANNEDTIME_ARR    0
ACTUALTIME_ARR     0
stop_lat           0
stop_lon           0
dtype: int64

In [15]:
df.drop_duplicates(subset=['STOPPOINTID', 'stop_lat', 'stop_lon'], inplace=True)

In [16]:
df.shape

(4774, 7)

# CREATE CLUSTERS

DBSCAN: "Density-based spatial clustering of applications with noise". We use sklearn's DBSCAN method to create bus clusters based on spacial proximity of stops. The ball-tree nearest neighbours algorithm is used rather than KD-tree, as it is superior for spaial data (https://osf.io/preprints/socarxiv/nzhdc/). Documentation on Ball-Tree: https://scikit-learn.org/stable/modules/neighbors.html#ball-tree. 

In [17]:
km_per_radian = 6371

In [18]:
coordinates = df.as_matrix(columns=['stop_lat', 'stop_lon'])

  """Entry point for launching an IPython kernel.


In [22]:
# The maximum distance between two samples for one to be 
# considered as in the neighborhood of the other.
# 400m

epsilon = .3 / km_per_radian

In [23]:
# compute DBSCAN

db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree',\
            metric='haversine', n_jobs = -1).fit(np.radians(coordinates))

In [24]:
# number of clusters

len(set(db.labels_))

780

In [25]:
# create clusters where each cluster is a collection of
# coordinates

clusters = pd.Series([coordinates[db.labels_ == n] for n in range(len(set(db.labels_)))])

In [26]:
clusters[0].tolist()[0]

[53.37508111, -6.250821111]

# MAP STOPS TO CLUSTERS

In [28]:
stop_to_cluster = {}

cluster = 1

for i in range(len(clusters)-1):
    key = cluster
    value = []
    try:
        for coordinate in clusters[i].tolist():
            value.append(list(stops_latlon.keys())[list(stops_latlon.values()).index(coordinate)])
    except:
        continue
    stop_to_cluster[key] = value
    cluster += 1
        

In [29]:
stop_to_cluster

{1: [119,
  44,
  7603,
  45,
  46,
  213,
  47,
  1981,
  1986,
  1987,
  118,
  48,
  49,
  52,
  261,
  270,
  4521,
  1935,
  1937,
  1938,
  1940,
  1941,
  1942,
  1943,
  1944,
  1946,
  1947,
  1951,
  1952,
  1953,
  1954,
  1955,
  1956,
  1957,
  1958,
  51,
  265,
  271,
  340,
  350,
  351,
  255,
  218,
  219,
  220,
  221,
  352,
  353,
  354,
  222,
  223,
  224,
  225,
  226,
  114,
  228,
  229,
  227,
  230,
  231,
  232,
  355,
  1982,
  4406,
  1984,
  1985,
  233,
  242,
  243,
  1988,
  1989,
  1990,
  1992,
  1744,
  1745,
  1746,
  6122,
  35,
  36,
  1993,
  1994,
  1995,
  1996,
  1997,
  1998,
  37,
  38,
  39,
  40,
  1999,
  2001,
  2002,
  1359,
  319,
  281,
  41,
  42,
  43,
  1980,
  7,
  11,
  14,
  15,
  17,
  18,
  19,
  21,
  7602,
  85,
  86,
  335,
  87,
  88,
  89,
  27,
  28,
  29,
  1939,
  1945,
  4680,
  90,
  91,
  92,
  93,
  1641,
  1642,
  1948,
  1949,
  1950,
  214,
  4432,
  395,
  396,
  397,
  398,
  399,
  400,
  404,
  747,
  842,

In [30]:
# check if every stop in the dataframe has been 
# assigned to a cluster

stops = set()

for k,v in stop_to_cluster.items():
    for x in v:
        stops.add(x)
        
stops.difference(set(df.STOPPOINTID.unique().tolist()))

set()

# WRITE DICTIONARY TO JSON FILE

In [31]:
path = '/Users/davidodwyer/Documents/studyCS/Semester_3/data/derived_data/stops_to_cluster_300m_radius.json'

with open(path, 'w') as json_file:
    json.dump(stop_to_cluster, json_file)

# INSPECT DICTIONARY

In [32]:
json_path = '/Users/davidodwyer/Documents/studyCS/Semester_3/data/derived_data/stops_to_cluster_300m_radius.json'
json_obj = open(json_path)
json_obj = json_obj.read()
clusters = json.loads(json_obj)

In [33]:
stops = set()

for k, v in clusters.items():
    for i in v:
        stops.add(i)

In [34]:
4320 in stops

True

In [35]:
stop_in_df = set(df.STOPPOINTID.unique().tolist())

In [36]:
len(stop_in_df)

4774

In [37]:
len(stop_in_df.difference(stops))

556

In [74]:
stop_in_df.difference(stops)

{124,
 128,
 132,
 139,
 140,
 313,
 347,
 349,
 401,
 403,
 430,
 431,
 432,
 433,
 434,
 462,
 464,
 465,
 466,
 467,
 468,
 486,
 661,
 662,
 663,
 674,
 765,
 788,
 805,
 806,
 815,
 816,
 863,
 864,
 865,
 866,
 867,
 869,
 870,
 873,
 874,
 935,
 936,
 951,
 952,
 953,
 963,
 964,
 965,
 966,
 967,
 968,
 973,
 974,
 975,
 976,
 979,
 988,
 989,
 990,
 991,
 997,
 998,
 999,
 1000,
 1001,
 1002,
 1003,
 1004,
 1006,
 1007,
 1008,
 1009,
 1010,
 1012,
 1039,
 1040,
 1041,
 1050,
 1131,
 1132,
 1133,
 1134,
 1138,
 1139,
 1252,
 1391,
 1392,
 1394,
 1395,
 1400,
 1421,
 1513,
 1530,
 1545,
 1601,
 1602,
 1603,
 1605,
 1644,
 1645,
 1646,
 1650,
 1651,
 1652,
 1766,
 1824,
 1825,
 1830,
 1831,
 1856,
 1857,
 1880,
 1881,
 2027,
 2028,
 2029,
 2030,
 2037,
 2038,
 2047,
 2048,
 2049,
 2050,
 2052,
 2079,
 2110,
 2111,
 2112,
 2113,
 2116,
 2117,
 2130,
 2199,
 2212,
 2226,
 2228,
 2237,
 2290,
 2294,
 2345,
 2363,
 2364,
 2449,
 2451,
 2464,
 2520,
 2521,
 2533,
 2534,
 2544,
 2545,


In [78]:
len(stop_in_df.difference(set(stops_latlon.keys())))

556

In [79]:
stop_in_df.difference(set(stops_latlon.keys()))

{124,
 128,
 132,
 139,
 140,
 313,
 347,
 349,
 401,
 403,
 430,
 431,
 432,
 433,
 434,
 462,
 464,
 465,
 466,
 467,
 468,
 486,
 661,
 662,
 663,
 674,
 765,
 788,
 805,
 806,
 815,
 816,
 863,
 864,
 865,
 866,
 867,
 869,
 870,
 873,
 874,
 935,
 936,
 951,
 952,
 953,
 963,
 964,
 965,
 966,
 967,
 968,
 973,
 974,
 975,
 976,
 979,
 988,
 989,
 990,
 991,
 997,
 998,
 999,
 1000,
 1001,
 1002,
 1003,
 1004,
 1006,
 1007,
 1008,
 1009,
 1010,
 1012,
 1039,
 1040,
 1041,
 1050,
 1131,
 1132,
 1133,
 1134,
 1138,
 1139,
 1252,
 1391,
 1392,
 1394,
 1395,
 1400,
 1421,
 1513,
 1530,
 1545,
 1601,
 1602,
 1603,
 1605,
 1644,
 1645,
 1646,
 1650,
 1651,
 1652,
 1766,
 1824,
 1825,
 1830,
 1831,
 1856,
 1857,
 1880,
 1881,
 2027,
 2028,
 2029,
 2030,
 2037,
 2038,
 2047,
 2048,
 2049,
 2050,
 2052,
 2079,
 2110,
 2111,
 2112,
 2113,
 2116,
 2117,
 2130,
 2199,
 2212,
 2226,
 2228,
 2237,
 2290,
 2294,
 2345,
 2363,
 2364,
 2449,
 2451,
 2464,
 2520,
 2521,
 2533,
 2534,
 2544,
 2545,


Result: A dictionary assigning stops to clusters. There are 556 stops in the dublin bus data that are not in the stops latitude & longitude dictionary. THis will need to be handled when assigning stops latitudes and longitudes and cluster labels going forward