In [1]:
import pandas as pd
import networkx as nx
from sodapy import Socrata
import math
import random
import matplotlib.pyplot as plt
import numpy as np
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import copy 

import seaborn as sns

# Apply seaborn's 'darkgrid' style to the plots drawn after this point.
sns.set_style('darkgrid')

# To suppress some annoying warnings, uncomment the following line:
# import warnings; warnings.simplefilter('ignore')

In [2]:
# Source of the trip records: a dataset on the NYC Open Data Socrata portal.
SOCRATA_DOMAIN = "data.cityofnewyork.us"
TRIP_DATASET_ID = "2yzn-sicd"
MAX_RECORDS = 10000  # upper bound on the number of rows requested from the API

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata(SOCRATA_DOMAIN, None)

# Up to MAX_RECORDS results, returned as JSON from the API and converted to a
# Python list of dictionaries by sodapy.
results = client.get(TRIP_DATASET_ID, limit=MAX_RECORDS)

# Convert to pandas DataFrame (one row per returned record).
data = pd.DataFrame.from_records(results)



In [3]:
# Number of rows (one per trip) in the fetched data; reused by later cells.
num_trips = len(data)

# Preview the first rows only instead of rendering the entire frame.
data.head(10)

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,extra,fare_amount,mta_tax,passenger_count,payment_type,pickup_datetime,pickup_latitude,pickup_longitude,rate_code,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,vendor_id
0,2015-03-14T06:47:02.000,40.710891723632812,-73.951156616210937,0,22.5,0.5,6,1,2015-03-14T06:23:01.000,40.758869171142578,-73.986717224121094,1,N,5.82,0,29.12,5.82,2
1,2015-02-22T21:55:33.000,40.746559143066406,-73.9151611328125,0.5,13.5,0.5,1,1,2015-02-22T21:42:18.000,40.773097991943359,-73.885330200195313,1,N,2.96,0,17.76,3.66,2
2,2015-06-06T02:13:15.000,40.801097869873047,-73.965339660644531,0.5,17,0.5,2,1,2015-06-06T01:55:06.000,40.749649047851562,-74.002716064453125,1,N,3.65,0,21.95,4.8,1
3,2015-03-28T22:20:01.000,40.736175537109375,-73.9864501953125,0.5,6,0.5,2,1,2015-03-28T22:12:44.000,40.736274719238281,-73.993080139160156,1,N,1,0,8.3,0.5,1
4,2015-01-22T13:56:52.000,40.761867523193359,-73.968414306640625,0,9.5,0.5,2,1,2015-01-22T13:43:26.000,40.752170562744141,-73.982177734375,1,N,2.05,0,12.35,1.2,1
5,2015-02-27T12:22:59.000,40.757522583007812,-73.99835205078125,0,7.5,0.5,1,2,2015-02-27T12:14:01.000,40.750019073486328,-73.995208740234375,1,N,0,0,8.3,1,1
6,2015-02-23T21:03:59.000,40.774662017822266,-73.982391357421875,0.5,8.5,0.5,2,2,2015-02-23T20:55:44.000,40.783336639404297,-73.959098815917969,1,N,0,0,9.8,1.98,2
7,2015-05-04T16:44:04.000,40.764469146728516,-73.978790283203125,1,9,0.5,4,1,2015-05-04T16:32:15.000,40.7510986328125,-73.99407958984375,1,N,2.16,0,12.96,1.47,2
8,2015-05-06T18:23:38.000,40.759925842285156,-73.969856262207031,1,9,0.5,1,1,2015-05-06T18:11:59.000,40.776126861572266,-73.95587158203125,1,N,2.16,0,12.96,1.52,2
9,2015-04-19T00:43:12.000,40.744293212890625,-73.983551025390625,0.5,5,0.5,2,1,2015-04-19T00:39:03.000,40.750030517578125,-73.991744995117188,1,Y,1.25,0,7.55,0.7,1


In [4]:
# Concatenate all drop-off and pickup coordinates into one frame to prepare
# for clustering. Drop-off rows come first, pickup rows second; later cells
# rely on that order to split the cluster labels back apart.
dropoff_points = data[['dropoff_latitude', 'dropoff_longitude']].rename(
    columns={'dropoff_latitude': 'latitude', 'dropoff_longitude': 'longitude'}
)
pickup_points = data[['pickup_latitude', 'pickup_longitude']].rename(
    columns={'pickup_latitude': 'latitude', 'pickup_longitude': 'longitude'}
)

# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
points = (
    pd.concat([dropoff_points, pickup_points], ignore_index=True)
    .rename(index=str)   # keep the string index labels this cell has always produced
    .astype('float')     # coordinates are converted to floats before clustering
)


In [5]:
from scipy.cluster.vq import kmeans2, whiten

N_CLUSTERS = 20         # number of location clusters requested from k-means
KMEANS_ITERATIONS = 20  # number of k-means iterations
KMEANS_SEED = 0         # fixed seed so the cluster IDs are reproducible

# kmeans2 picks its initial centroids at random; when no explicit seed is
# passed it draws from NumPy's global random state, so seed that first.
np.random.seed(KMEANS_SEED)
centroids, labels = kmeans2(whiten(points), N_CLUSTERS, iter=KMEANS_ITERATIONS)

# `points` holds the drop-off rows first and the pickup rows second, so the
# first len(data) labels belong to drop-offs and the remainder to pickups.
n_trips = len(data)
dropoff_labels = labels[:n_trips]
pickup_labels = labels[n_trips:]

# Plain column assignment (instead of DataFrame.insert) keeps this cell
# re-runnable: insert raises a ValueError when the column already exists.
data['pickupID'] = pickup_labels
data['dropoffID'] = dropoff_labels




In [6]:
# Display the per-point cluster labels returned by kmeans2.
labels

array([ 7,  7,  7, ..., 10,  7,  7], dtype=int32)

In [7]:
def makeDigraph(data):
    """Build a directed graph with one edge per (pickup, drop-off) cluster pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Trip table containing the columns ``'pickupID'`` and ``'dropoffID'``.

    Returns
    -------
    networkx.DiGraph
        Graph with an edge ``pickupID -> dropoffID`` for every row of ``data``.
        A DiGraph stores each distinct edge once, so repeated trips between
        the same pair of clusters collapse into a single edge.
    """
    G = nx.DiGraph()
    # Iterate over the rows of the argument itself (not a notebook-level row
    # count) and pair the columns positionally, so the function also works on
    # filtered or re-indexed frames.
    G.add_edges_from(zip(data['pickupID'], data['dropoffID']))
    return G

In [8]:
def degree_distribution(graph, orientation):
    """Plot and return the in- or out-degree distribution of a directed graph.

    Parameters
    ----------
    graph : networkx.DiGraph
        Directed graph whose nodes are inspected.
    orientation : str
        ``'in'`` to count incoming edges per node, ``'out'`` to count
        outgoing edges per node.

    Returns
    -------
    numpy.ndarray
        Integer array where ``counts[k]`` is the number of nodes whose degree
        (in the chosen orientation) equals ``k``. An empty graph yields
        ``array([0])``.

    Raises
    ------
    ValueError
        If ``orientation`` is neither ``'in'`` nor ``'out'``.
    """
    # Compare strings by value; `is` tests object identity and is unreliable.
    if orientation == 'in':
        degree_of = graph.in_degree
    elif orientation == 'out':
        degree_of = graph.out_degree
    else:
        raise ValueError(
            "orientation must be 'in' or 'out', got {!r}".format(orientation)
        )

    # Read the degrees from the `graph` argument rather than a notebook global.
    degrees = np.array([degree_of[node] for node in graph.nodes], dtype=int)

    # counts[k] = number of nodes with degree k; minlength keeps the result
    # non-empty even when the graph has no nodes.
    counts = np.bincount(degrees, minlength=1)

    fig, ax = plt.subplots()
    ax.bar(np.arange(counts.size), counts)
    ax.set(
        xlabel='{}-degree'.format(orientation),
        ylabel='number of nodes',
        title='{}-degree distribution'.format(orientation.capitalize()),
    )
    return counts
        

In [16]:
# Build the directed graph of pickup-cluster -> drop-off-cluster edges.
G = makeDigraph(data)
# List (node, degree) pairs for every node in the graph.
list(G.degree)

[(7, 4), (10, 4), (9, 2)]

In [10]:
# Preview the first rows (including the pickupID / dropoffID cluster columns)
# instead of rendering the entire frame.
data.head(10)

Unnamed: 0,dropoff_datetime,dropoff_latitude,dropoff_longitude,extra,fare_amount,mta_tax,passenger_count,payment_type,pickup_datetime,pickup_latitude,pickup_longitude,rate_code,store_and_fwd_flag,tip_amount,tolls_amount,total_amount,trip_distance,vendor_id,pickupID,dropoffID
0,2015-03-14T06:47:02.000,40.710891723632812,-73.951156616210937,0,22.5,0.5,6,1,2015-03-14T06:23:01.000,40.758869171142578,-73.986717224121094,1,N,5.82,0,29.12,5.82,2,7,7
1,2015-02-22T21:55:33.000,40.746559143066406,-73.9151611328125,0.5,13.5,0.5,1,1,2015-02-22T21:42:18.000,40.773097991943359,-73.885330200195313,1,N,2.96,0,17.76,3.66,2,7,7
2,2015-06-06T02:13:15.000,40.801097869873047,-73.965339660644531,0.5,17,0.5,2,1,2015-06-06T01:55:06.000,40.749649047851562,-74.002716064453125,1,N,3.65,0,21.95,4.8,1,7,7
3,2015-03-28T22:20:01.000,40.736175537109375,-73.9864501953125,0.5,6,0.5,2,1,2015-03-28T22:12:44.000,40.736274719238281,-73.993080139160156,1,N,1,0,8.3,0.5,1,7,7
4,2015-01-22T13:56:52.000,40.761867523193359,-73.968414306640625,0,9.5,0.5,2,1,2015-01-22T13:43:26.000,40.752170562744141,-73.982177734375,1,N,2.05,0,12.35,1.2,1,7,7
5,2015-02-27T12:22:59.000,40.757522583007812,-73.99835205078125,0,7.5,0.5,1,2,2015-02-27T12:14:01.000,40.750019073486328,-73.995208740234375,1,N,0,0,8.3,1,1,7,7
6,2015-02-23T21:03:59.000,40.774662017822266,-73.982391357421875,0.5,8.5,0.5,2,2,2015-02-23T20:55:44.000,40.783336639404297,-73.959098815917969,1,N,0,0,9.8,1.98,2,7,7
7,2015-05-04T16:44:04.000,40.764469146728516,-73.978790283203125,1,9,0.5,4,1,2015-05-04T16:32:15.000,40.7510986328125,-73.99407958984375,1,N,2.16,0,12.96,1.47,2,7,7
8,2015-05-06T18:23:38.000,40.759925842285156,-73.969856262207031,1,9,0.5,1,1,2015-05-06T18:11:59.000,40.776126861572266,-73.95587158203125,1,N,2.16,0,12.96,1.52,2,7,7
9,2015-04-19T00:43:12.000,40.744293212890625,-73.983551025390625,0.5,5,0.5,2,1,2015-04-19T00:39:03.000,40.750030517578125,-73.991744995117188,1,Y,1.25,0,7.55,0.7,1,7,7
