In [1]:
import pandas as pd
import networkx as nx
from sodapy import Socrata
import math
import random
import matplotlib.pyplot as plt
import numpy as np
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import copy 

import seaborn as sns

sns.set_style('darkgrid')

# To suppress some annoying warnings, uncomment the next line.
# import warnings; warnings.simplefilter('ignore')

In [2]:
# Upper bound on the number of rows requested from the API. Later cells reuse
# this name, so it stays at module level.
num_trips = 10 ** 6

# Passing None as the application token gives an unauthenticated client, which
# can only read public datasets; no username or password is supplied either.
client = Socrata("data.cityofnewyork.us", None)

# Request up to `num_trips` rows; sodapy turns the JSON response into a Python
# list of dictionaries.
results = client.get("2yzn-sicd", limit=num_trips)

# One DataFrame row per returned record.
data = pd.DataFrame.from_records(results)



In [3]:
# Display the records that were just loaded into `data`.
data

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,extra,fare_amount,imp_surcharge,mta_tax,passenger_count,payment_type,pickup_datetime,pickup_latitude,pickup_longitude,rate_code,ratecodeid,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,vendor_id
0,2015-08-15T20:29:33.000,40.74542236328125,-73.9794921875,0.5,5,0.3,0.5,1,1,2015-08-15T20:25:53.000,40.75445556640625,-73.968765258789063,,1,N,1.26,0,7.56,1.01,2
1,2015-08-31T15:32:39.000,40.755142211914063,-73.976226806640625,0,13,0.3,0.5,1,2,2015-08-31T15:11:55.000,40.769306182861328,-73.977615356445313,,1,N,0,0,13.8,1.49,2
2,2015-08-24T09:21:00.000,40.763984680175781,-73.955787658691406,0,6.5,0.3,0.5,1,1,2015-08-24T09:14:31.000,40.757289886474609,-73.970016479492188,,1,N,3,0,10.3,1.1,1
3,2015-08-05T09:12:25.000,40.756401062011719,-73.972610473632812,0,10,0.3,0.5,1,1,2015-08-05T08:57:43.000,40.77178955078125,-73.961479187011719,,1,N,1,0,11.8,1.22,2
4,2015-08-23T17:46:30.000,40.759124755859375,-73.982231140136719,0,8.5,0.3,0.5,1,1,2015-08-23T17:35:22.000,40.760608673095703,-73.994712829589844,,1,N,2.32,0,11.62,0.9,1
5,2015-08-02T12:59:08.000,40.775840759277344,-73.950897216796875,0,11,0.3,0.5,2,2,2015-08-02T12:46:46.000,40.781772613525391,-73.982994079589844,,1,N,0,0,11.8,2.3,1
6,2015-08-23T00:00:29.000,40.750583648681641,-73.983512878417969,0.5,7.5,0.3,0.5,4,1,2015-08-22T23:52:00.000,40.760780334472656,-73.969261169433594,,1,N,1.75,0,10.55,1.2,1
7,2015-08-16T09:56:33.000,0,0,0,14.5,0.3,0.5,1,1,2015-08-16T09:44:34.000,0,0,,1,N,3.06,0,18.36,4.2,1
8,2015-08-15T21:39:53.000,40.730751037597656,-73.981124877929688,0.5,6,0.3,0.5,6,1,2015-08-15T21:33:56.000,40.719711303710937,-73.989944458007813,,1,N,1.46,0,8.76,1.04,2
9,2015-08-20T21:54:41.000,40.676830291748047,-73.981475830078125,0.5,22.5,0.3,0.5,1,1,2015-08-20T21:29:28.000,40.712268829345703,-73.958877563476562,,1,N,2,0,25.8,6.4,1


In [4]:
# Concatenate all drop-off and pick-up coordinates into one two-column frame to
# prepare for clustering. Drop-offs come first and pick-ups second, so row i of
# `data` corresponds to rows i and len(data) + i of `points`.
dropoff_points = data[['dropoff_latitude', 'dropoff_longitude']].rename(
    columns={'dropoff_latitude': 'latitude', 'dropoff_longitude': 'longitude'}
)
pickup_points = data[['pickup_latitude', 'pickup_longitude']].rename(
    columns={'pickup_latitude': 'latitude', 'pickup_longitude': 'longitude'}
)

# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0. ignore_index=True gives the stacked frame a fresh
# 0..2n-1 index.
points = pd.concat([dropoff_points, pickup_points], ignore_index=True)

# Convert both coordinate columns to floating-point numbers for the numeric
# clustering step.
points = points.astype('float')


In [5]:
# Display the stacked latitude/longitude frame built in the previous cell.
points

Unnamed: 0,latitude,longitude
0,40.745422,-73.979492
1,40.755142,-73.976227
2,40.763985,-73.955788
3,40.756401,-73.972610
4,40.759125,-73.982231
5,40.775841,-73.950897
6,40.750584,-73.983513
7,0.000000,0.000000
8,40.730751,-73.981125
9,40.676830,-73.981476


In [6]:
from scipy.cluster.vq import kmeans2, whiten

# Cluster every trip endpoint into zones with k-means, then attach the zone ID
# of each trip's pick-up and drop-off to `data`.
N_ZONES = 20            # number of clusters (k)
KMEANS_ITERATIONS = 20  # k-means iterations
KMEANS_SEED = 0         # fixed so zone IDs are reproducible between runs
                        # (the `seed` keyword needs SciPy >= 1.7)

# NOTE(review): the raw data contains rows whose coordinates are (0, 0); they
# are clustered together with real locations here -- consider filtering them.
centroids, labels = kmeans2(
    points.to_numpy(), N_ZONES, iter=KMEANS_ITERATIONS, seed=KMEANS_SEED
)

# `points` holds the drop-off rows first and the pick-up rows second, so the
# label vector is split at the actual number of trips. Using len(data) for both
# halves (rather than the requested `num_trips`) keeps the split correct even
# when the API returned fewer rows than were asked for.
n_rows = len(data)
assert len(labels) == 2 * n_rows
dropoff_labels = labels[:n_rows]
pickup_labels = labels[n_rows:]
assert(len(pickup_labels) == len(dropoff_labels))

# Plain column assignment appends the columns on the first run and overwrites
# them on later runs, so this cell can be re-executed without an
# "already exists" error.
data['pickupID'] = pickup_labels
data['dropoffID'] = dropoff_labels




In [7]:
# Preview the whitened coordinates.
# NOTE(review): the result is only displayed, never assigned, and kmeans2 was
# already run on the raw `points` in the previous cell -- so whitening currently
# has no effect on the clusters. Confirm whether that is intended.
whiten(points)

array([[ 8.28883424, -8.29053465],
       [ 8.29081155, -8.29016871],
       [ 8.29261037, -8.28787819],
       ...,
       [ 8.29692894, -8.28980962],
       [ 8.2944868 , -8.29085441],
       [ 8.28679563, -8.29087835]])

In [8]:
def makeDigraph(data):
    """Build a directed graph of trips between zones.

    Parameters
    ----------
    data : pandas.DataFrame
        Trip table with a 'pickupID' and a 'dropoffID' column.

    Returns
    -------
    networkx.DiGraph
        One edge pickupID -> dropoffID for every distinct pair that occurs in
        `data`. Repeated trips between the same pair of zones collapse into a
        single edge, because a DiGraph stores each edge once.
    """
    G = nx.DiGraph()
    # Pair the two columns positionally over every row of the frame that was
    # passed in. This does not depend on a module-level row count or on the
    # frame having a default 0..n-1 index, and avoids a per-row label lookup.
    sources = data['pickupID'].tolist()
    targets = data['dropoffID'].tolist()
    G.add_edges_from(zip(sources, targets))
    return G

In [9]:
def degree_distribution(graph, orientation):
    """Plot how many nodes of `graph` have each degree, and return the counts.

    Parameters
    ----------
    graph : networkx.DiGraph
        Graph whose node degrees are summarised. It must expose `in_degree`,
        `out_degree` and `degree` views that iterate as (node, degree) pairs.
    orientation : str
        Which degree to use: 'in' (incoming edges), 'out' (outgoing edges) or
        'all' (both combined).

    Returns
    -------
    numpy.ndarray
        Integer array where element k is the number of nodes whose degree is
        exactly k. Its length is the largest observed degree plus one, or zero
        for a graph without nodes.

    Raises
    ------
    ValueError
        If `orientation` is not one of 'in', 'out' or 'all'.
    """
    view_names = {'in': 'in_degree', 'out': 'out_degree', 'all': 'degree'}
    if orientation not in view_names:
        raise ValueError(
            "orientation must be 'in', 'out' or 'all', got %r" % (orientation,)
        )

    # Use the graph that was passed in (not a notebook-level global) and keep
    # only the degree from each (node, degree) pair.
    degree_view = getattr(graph, view_names[orientation])
    degrees = np.array([degree for _, degree in degree_view], dtype=int)

    # bincount sizes the result from the largest degree actually present.
    counts = np.bincount(degrees)

    fig, ax = plt.subplots()
    ax.bar(np.arange(len(counts)), counts)
    ax.set(
        title="Degree distribution (%s)" % orientation,
        xlabel="degree",
        ylabel="number of nodes",
    )
    return counts
        

In [10]:
# Build the directed pickupID -> dropoffID graph from the labelled trips.
G = makeDigraph(data)

In [11]:
# Quick look at the graph: the (node, degree) pairs, the edge list, and the
# total number of edges, each printed on its own line.
graph_summaries = (list(G.degree), G.edges(), G.number_of_edges())
for summary in graph_summaries:
    print(summary)

[(7, 21), (5, 7), (0, 7), (9, 1), (8, 2), (13, 4), (3, 4), (10, 2), (14, 1), (18, 1), (19, 1), (1, 3), (11, 1), (6, 2), (15, 1), (12, 2)]
[(7, 7), (7, 0), (7, 5), (7, 10), (7, 3), (7, 6), (7, 8), (7, 15), (5, 5), (5, 7), (5, 0), (5, 13), (0, 7), (0, 0), (0, 5), (9, 7), (8, 7), (13, 13), (13, 0), (3, 7), (3, 3), (10, 7), (14, 7), (18, 7), (19, 7), (1, 7), (1, 1), (11, 7), (6, 7), (12, 12)]
30


In [12]:
# Display `data` again; it now carries the pickupID and dropoffID columns
# added by the clustering cell.
data

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,extra,fare_amount,imp_surcharge,mta_tax,passenger_count,payment_type,pickup_datetime,...,rate_code,ratecodeid,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,vendor_id,pickupID,dropoffID
0,2015-08-15T20:29:33.000,40.74542236328125,-73.9794921875,0.5,5,0.3,0.5,1,1,2015-08-15T20:25:53.000,...,,1,N,1.26,0,7.56,1.01,2,7,7
1,2015-08-31T15:32:39.000,40.755142211914063,-73.976226806640625,0,13,0.3,0.5,1,2,2015-08-31T15:11:55.000,...,,1,N,0,0,13.8,1.49,2,7,7
2,2015-08-24T09:21:00.000,40.763984680175781,-73.955787658691406,0,6.5,0.3,0.5,1,1,2015-08-24T09:14:31.000,...,,1,N,3,0,10.3,1.1,1,7,7
3,2015-08-05T09:12:25.000,40.756401062011719,-73.972610473632812,0,10,0.3,0.5,1,1,2015-08-05T08:57:43.000,...,,1,N,1,0,11.8,1.22,2,7,7
4,2015-08-23T17:46:30.000,40.759124755859375,-73.982231140136719,0,8.5,0.3,0.5,1,1,2015-08-23T17:35:22.000,...,,1,N,2.32,0,11.62,0.9,1,7,7
5,2015-08-02T12:59:08.000,40.775840759277344,-73.950897216796875,0,11,0.3,0.5,2,2,2015-08-02T12:46:46.000,...,,1,N,0,0,11.8,2.3,1,7,7
6,2015-08-23T00:00:29.000,40.750583648681641,-73.983512878417969,0.5,7.5,0.3,0.5,4,1,2015-08-22T23:52:00.000,...,,1,N,1.75,0,10.55,1.2,1,7,7
7,2015-08-16T09:56:33.000,0,0,0,14.5,0.3,0.5,1,1,2015-08-16T09:44:34.000,...,,1,N,3.06,0,18.36,4.2,1,5,5
8,2015-08-15T21:39:53.000,40.730751037597656,-73.981124877929688,0.5,6,0.3,0.5,6,1,2015-08-15T21:33:56.000,...,,1,N,1.46,0,8.76,1.04,2,7,7
9,2015-08-20T21:54:41.000,40.676830291748047,-73.981475830078125,0.5,22.5,0.3,0.5,1,1,2015-08-20T21:29:28.000,...,,1,N,2,0,25.8,6.4,1,7,7
