In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from os import listdir
import networkx as nx
import numpy as np
import collections
import statistics

In [3]:
# Generate Database
#
# Reads every CSV in "Indego Data/" and stacks them into a single `rides`
# frame indexed by trip_id. Some files name the station columns
# 'start_station'/'end_station' and others 'start_station_id'/'end_station_id';
# both spellings are folded into the un-suffixed column.

# Sorted so the row order of `rides` does not depend on directory listing order.
filepaths = [("Indego Data/" + f) for f in sorted(listdir("Indego Data/")) if f.endswith('.csv')]

frames = []
for filepath in filepaths:
    data = pd.read_csv(filepath, index_col='trip_id')
    for station_column in ('start_station', 'end_station'):
        id_column = station_column + '_id'
        if id_column not in data.columns:
            continue
        if station_column in data.columns:
            # fillna returns the filled Series; using inplace=True here would
            # return None and blank out the whole column on assignment.
            data[station_column] = data[station_column].fillna(data[id_column])
        else:
            data[station_column] = data[id_column]
        data = data.drop(id_column, axis=1)
    frames.append(data)

# Concatenate once instead of appending inside the loop (DataFrame.append is
# quadratic and no longer exists in pandas 2.x). pd.concat rejects an empty
# list, so fall back to an empty frame when no CSV files were found.
rides = pd.concat(frames) if frames else pd.DataFrame()

In [4]:
#Sanitize Data
#
# Builds `sanitized_rides` from `rides`: drops unused columns, normalizes
# passholder/bike type values, removes rides touching excluded stations,
# removes over-long and incomplete rides, then fixes the column dtypes.

print('Initial number of rows: ' + str(len(rides)))

#Drop unused columns (drop returns a new frame, so `rides` itself is not modified)
sanitized_rides = rides.drop(['start_lat', 'start_lon', 'end_lat', 'end_lon'], axis=1)

#Replaces Day Pass with One Day Pass from before there was Two Day Pass
sanitized_rides['passholder_type'] = sanitized_rides['passholder_type'].replace(to_replace={'^Day Pass': 'One Day Pass'}, regex=True)

#Replace nans of bike type with standard because the column was only added when electric bikes were added
sanitized_rides['bike_type'] = sanitized_rides['bike_type'].fillna('standard')

#Stations whose rides are removed, whether the ride starts or ends there
excluded_stations = [
    3000,  #Virtual Station: these trips are usually anomalous
    3226,  #Most recent station: not enough data collected on it
    3083,  #Station that is not known
]
sanitized_rides = sanitized_rides[~sanitized_rides['start_station'].isin(excluded_stations)]
sanitized_rides = sanitized_rides[~sanitized_rides['end_station'].isin(excluded_stations)]

#Duration should be capped at 24 hr per Indego documentation but is not always done
#NOTE(review): 24*60 assumes duration is recorded in minutes in every file - TODO confirm
sanitized_rides = sanitized_rides[sanitized_rides['duration'] < (24*60)]

#Drops incomplete ride data
sanitized_rides = sanitized_rides.dropna()

#Misc Cleaning
sanitized_rides = sanitized_rides[sanitized_rides['start_station'] < 4000]
sanitized_rides = sanitized_rides[sanitized_rides['end_station'] < 4000]

print('Final number of rows: ' + str(len(sanitized_rides)))

#Fixes dtypes
#The datetime unit is spelled out because pandas >= 2.0 refuses to cast to a
#unit-less 'datetime64'; 'datetime64[ns]' is accepted by old and new versions.
sanitized_rides = sanitized_rides.astype({'duration':'int32',
                                          'start_time':'datetime64[ns]',
                                          'end_time':'datetime64[ns]',
                                          'start_station':'int32',
                                          'end_station':'int32',
                                          'bike_id':'str',
                                          'plan_duration':'int32',
                                          'trip_route_category':'category',
                                          'passholder_type':'category',
                                          'bike_type':'category'})

Initial number of rows: 4121196
Final number of rows: 3834909


In [5]:
#Make Graph
#
# One directed edge per (start_station, end_station) pair; the 'percent'
# edge attribute is that pair's share of all sanitized rides.

rides_graph_data = (
    sanitized_rides
    .groupby(['start_station', 'end_station'])
    .size()
    .reset_index(name='counts')
    .assign(percent=lambda pairs: pairs['counts'] / len(sanitized_rides))
)
rides_graph = nx.from_pandas_edgelist(
    rides_graph_data,
    source='start_station',
    target='end_station',
    edge_attr='percent',
    create_using=nx.DiGraph(),
)

#Remove inactive nodes
inactive_stations = [3023, 3026, 3027, 3036, 3048, 3095, 3103, 3105, 3109, 3122, 3129, 3195]
rides_graph.remove_nodes_from(inactive_stations)

print(rides_graph.number_of_nodes())
print(rides_graph.number_of_edges())

144
19353


In [6]:
#Connected Components
print('Number of Strongly Connected Components: ' + str(nx.number_strongly_connected_components(rides_graph)))

#Degree Distribution
# For a DiGraph, degree() is in-degree + out-degree of each node.
degrees = sorted([d for n, d in rides_graph.degree()])
plt.hist(degrees, 50)
plt.title('Degree Distribution')
plt.ylabel("Count")
plt.xlabel("Degree")
plt.savefig('Degree Distribution.png')
plt.clf()
print('Mean Degree: ' + str(statistics.mean(degrees)))

#Average Path Length
# One value per node: the mean shortest-path length from that node to every
# other node it can reach.
avg_path_len = []
for node in rides_graph.nodes:
    path_length_dict = dict(nx.shortest_path_length(rides_graph, source=node))
    # The result includes the source itself at distance 0. Leaving it in drags
    # every per-node mean down and disagrees with average_shortest_path_length
    # below, which averages over distinct pairs only.
    path_length_dict.pop(node, None)
    if not path_length_dict:
        # Node reaches nothing else: there is no path length to average.
        continue
    avg_path_len.append(np.mean(list(path_length_dict.values())))
plt.hist(avg_path_len, 100)
plt.title('Path Length Distribution')
plt.ylabel("Count")
plt.xlabel("Path Length")
plt.savefig('Average Path Length.png')
plt.clf()
print('Mean Shortest Path Length: ' + str(nx.average_shortest_path_length(rides_graph)))

#Clustering Coefficient
# Materialize the dict view as a list so hist receives a plain sequence.
cc = list(nx.clustering(rides_graph).values())
plt.hist(cc, 50)
plt.title('Clustering Coefficient Distribution')
plt.ylabel("Count")
plt.xlabel("Clustering Coefficient")
plt.savefig('Clustering Coefficient.png')
plt.clf()
print('Mean Clustering Coefficient: ' + str(nx.average_clustering(rides_graph)))

Number of Strongly Connected Components: 1
Mean Degree: 268.7916666666667
Mean Shortest Path Length: 1.0671620046620047
Mean Clustering Coefficient: 0.9608753270412045


<Figure size 432x288 with 0 Axes>

In [7]:
# Invert graph
# Start from a complete directed graph on the same stations and remove every
# route that was actually ridden; what is left are the station pairs with no
# recorded ride. Edges of rides_graph that are absent from the complete graph
# are skipped by remove_edges_from.
inverted_graph = nx.complete_graph(rides_graph.nodes, create_using=nx.DiGraph())
inverted_graph.remove_edges_from(rides_graph.edges)

# Pass the computed layout to draw(); without pos= the k=5 layout was thrown
# away and draw() silently computed its own default layout instead.
pos = nx.spring_layout(inverted_graph, k=5)
nx.draw(inverted_graph, with_labels=True, pos=pos)
plt.clf()

#Nothing of interest. Missing routes are more likely between farther-apart locations. A lot are new or inactive

<Figure size 432x288 with 0 Axes>

In [8]:
# Difference between in degree and out degree
# Weighted degrees sum the 'percent' edge attribute, so each value is the share
# of rides arriving at (in) or leaving from (out) a station.

in_degrees_percent = [share for _, share in rides_graph.in_degree(weight='percent')]
out_degrees_percent = [share for _, share in rides_graph.out_degree(weight='percent')]

# Positive net value: the station sends out more rides than it receives.
net_out_degrees_percent = []
for arriving, departing in zip(in_degrees_percent, out_degrees_percent):
    net_out_degrees_percent.append(departing - arriving)

plt.hist(net_out_degrees_percent, bins=100)
plt.title('Net Out Degree Percent Distribution')
plt.xlabel("Percent Difference between Out and In Degree")
plt.ylabel("Count")
plt.savefig('Net Out Degree Distribution Percent.png')
plt.clf()

print(net_out_degrees_percent)

#Net difference is non-zero but typically very small (<0.1% of all rides). Anomalies for closed and new stations. TODO: check how this changes with time of day

[-0.0006120093071308938, -0.00048032430495742896, 0.001092072849707782, -0.0009298786490109755, 0.00024954959817821004, -0.0004610278887973423, 4.6676466116928744e-05, 0.0008615589053090969, 0.0007434335469238065, 0.00010378342745552289, -1.6949554735194022e-05, -0.0004302579278934659, -3.7549782797975315e-05, -8.161862510948684e-05, -0.00032021620330496305, -1.4341930929784828e-05, 0.0019330315269540897, -0.003976626303257798, -0.00013377110121778196, 2.060022806277869e-05, 3.389910947041233e-06, -0.0015384980451948107, 0.0006542528127786162, -0.0001316850021734449, 0.0003408164313677374, -0.00040731083840581106, -0.0004792812554352526, -0.00032256306472983574, 0.0013713493592677158, -0.0005481225238982111, 0.00041591599696369497, 0.0010414849478827311, 0.0018013465247806518, -0.0010341836012275644, -2.6076238054149505e-07, -6.753745656024306e-05, -0.00031395790617195704, -0.0006735492289386726, -0.0008313104691662927, -0.0007442158340654282, 6.519059513526021e-06, 0.00149495072764437

<Figure size 432x288 with 0 Axes>

In [9]:
#Most and least central nodes
# Degree centrality per station, computed separately for incoming and outgoing routes.
in_centrality, out_centrality = (
    nx.in_degree_centrality(rides_graph),
    nx.out_degree_centrality(rides_graph),
)

#Same results as other data

In [10]:
# Distribution of edge weights: each route's share of all rides.
edge_percents = [w for n1, n2, w in rides_graph.edges(data='percent')]
plt.hist(edge_percents, 50)
plt.title('Edge Weight Percent Distribution')
# y-axis label added to match every other histogram in the notebook
plt.ylabel("Count")
plt.xlabel("Weight")
plt.savefig('Edge Weight Percent Distribution.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [11]:
#Display Graph
# Draw only the most popular routes, first at the 0.001 cut-off and then at
# 0.0015. The same graph copy is pruned both times, so the second pass starts
# from the survivors of the first.

most_popular_routes = rides_graph.copy()

passes = (
    (0.001, 'General 001 Popular Routes.png'),
    (0.0015, 'General 0015 Popular Routes.png'),
)
for threshold, figure_name in passes:
    # Drop routes below the cut-off, then stations left with no routes at all.
    weak_routes = [
        (n1, n2)
        for n1, n2, w in most_popular_routes.edges(data='percent')
        if w < threshold
    ]
    most_popular_routes.remove_edges_from(weak_routes)
    most_popular_routes.remove_nodes_from(list(nx.isolates(most_popular_routes)))

    pos = nx.spring_layout(most_popular_routes, k=1)
    nx.draw(most_popular_routes, with_labels=True, pos=pos)

    print(most_popular_routes.edges(data='percent'))
    plt.savefig(figure_name)
    plt.clf()

[(3010, 3010, 0.00135622514119631), (3012, 3020, 0.0012847762489279406), (3020, 3012, 0.0012154134557039033), (3020, 3032, 0.0025838944287856634), (3028, 3028, 0.0010318367398026915), (3032, 3020, 0.0016172482841183453), (3038, 3032, 0.0011556988705599012), (3045, 3045, 0.0011301441572668348), (3049, 3049, 0.0012667836446705776), (3053, 3053, 0.0013622226759487644), (3054, 3054, 0.0011520481972323202), (3054, 3102, 0.0015859567984533663), (3057, 3057, 0.004772994613431505), (3066, 3020, 0.0010234923436253638), (3102, 3054, 0.0011901195047913783), (3102, 3102, 0.0015776124022760384), (3163, 3163, 0.0012811255756003598), (3212, 3212, 0.0011348378801165817)]
[(3020, 3032, 0.0025838944287856634), (3032, 3020, 0.0016172482841183453), (3054, 3102, 0.0015859567984533663), (3057, 3057, 0.004772994613431505), (3102, 3102, 0.0015776124022760384)]


<Figure size 432x288 with 0 Axes>

In [12]:
#trip_route_category by passholder_type
# Ride counts for every (passholder type, route category) combination.
x = (
    sanitized_rides
    .groupby(['passholder_type', 'trip_route_category'])
    .size()
    .reset_index(name='counts')
)
print(x)
# Round Trip share of rides: <10% for Indego30, <5% for Indego365, <10% for IndegoFlex, >20% for One Day Pass, <10% for Two Day Pass, ~15% for Walk-up

   passholder_type trip_route_category   counts
0         Indego30             One Way  2767358
1         Indego30          Round Trip   195269
2        Indego365             One Way   349949
3        Indego365          Round Trip    17752
4       IndegoFlex             One Way    21368
5       IndegoFlex          Round Trip     1693
6     One Day Pass             One Way   214952
7     One Day Pass          Round Trip    57073
8     Two Day Pass             One Way     1492
9     Two Day Pass          Round Trip      102
10         Walk-up             One Way   178540
11         Walk-up          Round Trip    29361


In [13]:
#compare by passholder_type to see different behaviors of different groups
#subtract graph from each other of percent total traffic
threshold = 0.001

inactive_stations = [3023,3026,3027,3036,3048,3095,3103,3105,3109,3122,3129,3195]

# (figure name prefix, passholder_type value), processed in this order
passholder_groups = [
    ('Walkup', 'Walk-up'),
    ('Day', 'One Day Pass'),
    ('Month', 'Indego30'),
    ('Year', 'Indego365'),
]

group_results = {}
for prefix, passholder in passholder_groups:
    # Rides of this passholder type only; 'percent' is relative to this subset.
    group_rides = sanitized_rides[sanitized_rides['passholder_type'] == passholder]

    group_edges = group_rides.groupby(['start_station','end_station']).size().reset_index(name='counts')
    group_edges['percent'] = group_edges['counts']/len(group_rides)

    group_graph = nx.from_pandas_edgelist(group_edges,'start_station','end_station', 'percent', create_using=nx.DiGraph())

    #Remove inactive nodes
    group_graph.remove_nodes_from(inactive_stations)

    # Keep only routes at or above the threshold, then drop stations with no routes left.
    group_graph.remove_edges_from([(n1, n2) for n1, n2, w in group_graph.edges(data='percent') if w < threshold])
    group_graph.remove_nodes_from(list(nx.isolates(group_graph)))

    pos = nx.spring_layout(group_graph, k=1)
    nx.draw(group_graph, with_labels=True, pos=pos)

    print(group_graph.edges(data='percent'))
    plt.savefig(prefix + ' 001 Popular Routes.png')
    plt.clf()

    group_results[prefix] = (group_rides, group_edges, group_graph)

# Expose each group's results under its own names.
walkup_pass_data, walkup_graph_data, walkup_graph = group_results['Walkup']
day_pass_data, day_graph_data, day_graph = group_results['Day']
month_pass_data, month_graph_data, month_graph = group_results['Month']
year_pass_data, year_graph_data, year_graph = group_results['Year']

[(3004, 3004, 0.0029581387294914407), (3004, 3056, 0.0012794551252759726), (3004, 3057, 0.0013852747221033087), (3005, 3005, 0.0015536240806922526), (3006, 3006, 0.0015584340623662225), (3007, 3007, 0.0014718543922347656), (3009, 3009, 0.0021981616250042087), (3010, 3010, 0.0020346222480892347), (3010, 3028, 0.0015824839707360716), (3012, 3012, 0.0011255357117089383), (3015, 3015, 0.001375654758755369), (3018, 3018, 0.0027945993525764668), (3018, 3047, 0.0011495856200787874), (3018, 3057, 0.00107743589496924), (3020, 3012, 0.0010245260965555721), (3020, 3020, 0.0013371749053636105), (3020, 3032, 0.0013035050336458217), (3021, 3021, 0.002578150177247825), (3022, 3022, 0.001866272889500291), (3024, 3024, 0.0016642536591935585), (3028, 3028, 0.0016402037508237094), (3029, 3029, 0.0019672825046536573), (3031, 3031, 0.0012505952352321538), (3032, 3032, 0.002082722064828933), (3037, 3037, 0.0020057623580454157), (3037, 3057, 0.001188065473470546), (3040, 3040, 0.0010630059499473308), (3041, 

<Figure size 432x288 with 0 Axes>

In [14]:
# Popular routes per passholder type at the stricter 0.0015 cut-off.
threshold = 0.0015

inactive_stations = [3023,3026,3027,3036,3048,3095,3103,3105,3109,3122,3129,3195]

# (figure name prefix, passholder_type value), processed in this order
passholder_groups = [
    ('Walkup', 'Walk-up'),
    ('Day', 'One Day Pass'),
    ('Month', 'Indego30'),
    ('Year', 'Indego365'),
]

group_results = {}
for prefix, passholder in passholder_groups:
    # Rides of this passholder type only; 'percent' is relative to this subset.
    group_rides = sanitized_rides[sanitized_rides['passholder_type'] == passholder]

    group_edges = group_rides.groupby(['start_station','end_station']).size().reset_index(name='counts')
    group_edges['percent'] = group_edges['counts']/len(group_rides)

    group_graph = nx.from_pandas_edgelist(group_edges,'start_station','end_station', 'percent', create_using=nx.DiGraph())

    #Remove inactive nodes
    group_graph.remove_nodes_from(inactive_stations)

    # Keep only routes at or above the threshold, then drop stations with no routes left.
    group_graph.remove_edges_from([(n1, n2) for n1, n2, w in group_graph.edges(data='percent') if w < threshold])
    group_graph.remove_nodes_from(list(nx.isolates(group_graph)))

    pos = nx.spring_layout(group_graph, k=1)
    nx.draw(group_graph, with_labels=True, pos=pos)

    print(group_graph.edges(data='percent'))
    plt.savefig(prefix + ' 0015 Popular Routes.png')
    plt.clf()

    group_results[prefix] = (group_rides, group_edges, group_graph)

# Expose each group's results under its own names.
walkup_pass_data, walkup_graph_data, walkup_graph = group_results['Walkup']
day_pass_data, day_graph_data, day_graph = group_results['Day']
month_pass_data, month_graph_data, month_graph = group_results['Month']
year_pass_data, year_graph_data, year_graph = group_results['Year']

[(3004, 3004, 0.0029581387294914407), (3005, 3005, 0.0015536240806922526), (3006, 3006, 0.0015584340623662225), (3009, 3009, 0.0021981616250042087), (3010, 3010, 0.0020346222480892347), (3010, 3028, 0.0015824839707360716), (3018, 3018, 0.0027945993525764668), (3021, 3021, 0.002578150177247825), (3022, 3022, 0.001866272889500291), (3024, 3024, 0.0016642536591935585), (3028, 3028, 0.0016402037508237094), (3029, 3029, 0.0019672825046536573), (3032, 3032, 0.002082722064828933), (3037, 3037, 0.0020057623580454157), (3045, 3045, 0.002385750910289032), (3046, 3046, 0.0020490521931111443), (3047, 3047, 0.0035305265486938494), (3049, 3049, 0.005714258228676149), (3051, 3051, 0.0023665109835931527), (3052, 3052, 0.0018614629078263212), (3054, 3054, 0.00243385072702873), (3056, 3056, 0.0019961423946974763), (3057, 3004, 0.0023136011851794844), (3057, 3057, 0.01566130033044574), (3057, 3108, 0.002299171240157575), (3060, 3060, 0.0015632440440401922), (3061, 3061, 0.0020634821381330538), (3101, 310

<Figure size 432x288 with 0 Axes>

In [15]:
#time differences (in day, in year)
#Net difference with threshold
#Network of average duration by edge