In [1]:
import pandas as pd
import geopandas as gp
import matplotlib as pl
from shapely.geometry import Point
import matplotlib.cm as cm

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
#Stations file
stations = pd.read_csv('../data/external/citibike_data/citi_bike_station_info.csv')
geometry = gp.GeoSeries([Point(xy) for xy in zip(stations.Longitude, stations.Latitude)])
geo_stations = gp.GeoDataFrame(stations, geometry=geometry)
geo_stations.crs = {'init' :'epsg:4326'}
geo_stations.drop(['Location', 'Latitude', 'Longitude'], axis = 1, inplace = True)
geo_stations.rename(columns={'Station_id' : 'station_id'}, inplace=True)
geo_stations.head()

Unnamed: 0,station_id,Station_Name,geometry
0,72,W 52 St & 11 Ave,POINT (-73.99392888 40.76727216)
1,79,Franklin St & W Broadway,POINT (-74.00666661 40.71911552)
2,82,St James Pl & Pearl St,POINT (-74.00016545 40.71117416)
3,83,Atlantic Ave & Fort Greene Pl,POINT (-73.97632328 40.68382604)
4,116,W 17 St & 8 Ave,POINT (-74.00149746 40.74177603)


In [3]:
#Complied master file
master = pd.read_csv('../data/processed/master_norm.csv')
master.head()

Unnamed: 0,station_id,ridership_0115,ridership_0215,ridership_0315,ridership_0415,ridership_0515,ridership_0615,ridership_0715,ridership_0815,ridership_0915,...,ridership_1215,avg_ridership_2015,bike_lane_score,street_quality_score,tree_score,traffic_volume,median_hh_income,pop_density,park,subway_entrance
0,72,-0.361416,-0.259163,-0.019305,-0.017391,0.013852,-0.044803,-0.053536,0.290089,0.380089,...,-0.037283,0.134845,-1.037727,0.790673,-0.554668,0.001493,0.1069154,0.791144,1.0,0.0
1,79,-0.485013,-0.599695,-0.423754,-0.190652,-0.278161,-0.223134,-0.296086,-0.030448,0.089412,...,-0.087781,-0.153108,0.722092,1.584334,-0.904716,-0.424358,2.130764e-15,0.218505,0.0,1.0
2,82,-0.924468,-0.738256,-0.85239,-0.930948,-0.938365,-0.991566,-0.960664,-0.681822,-0.728118,...,-0.721558,-0.924446,-1.037727,-0.135265,0.240854,0.155045,-0.3670914,-0.17081,1.0,0.0
3,83,-1.013732,-1.005985,-1.02035,-0.961663,-0.921605,-0.966611,-0.744548,-0.443273,-0.577905,...,-0.718631,-0.849354,-1.037727,0.09622,-1.33488,2.144695,-0.03877741,-1.079345,0.0,0.0
4,116,1.45477,1.415314,1.430531,-0.029204,-0.322344,1.012182,1.117474,1.390545,1.661106,...,1.454217,1.256217,1.308698,1.485126,0.8139,0.08669,0.5403333,0.581773,1.0,0.0


In [4]:
master.columns

Index([u'station_id', u'ridership_0115', u'ridership_0215', u'ridership_0315',
       u'ridership_0415', u'ridership_0515', u'ridership_0615',
       u'ridership_0715', u'ridership_0815', u'ridership_0915',
       u'ridership_1015', u'ridership_1115', u'ridership_1215',
       u'avg_ridership_2015', u'bike_lane_score', u'street_quality_score',
       u'tree_score', u'traffic_volume', u'median_hh_income', u'pop_density',
       u'park', u'subway_entrance'],
      dtype='object')

In [5]:
#Dataframe of ridership only
ridership = master.copy()
ridership.drop([u'bike_lane_score', u'park',
       u'street_quality_score', u'subway_entrance', u'tree_score',
       u'traffic_volume', u'median_hh_income', u'pop_density'], axis = 1, inplace = True)
ridership.head()

Unnamed: 0,station_id,ridership_0115,ridership_0215,ridership_0315,ridership_0415,ridership_0515,ridership_0615,ridership_0715,ridership_0815,ridership_0915,ridership_1015,ridership_1115,ridership_1215,avg_ridership_2015
0,72,-0.361416,-0.259163,-0.019305,-0.017391,0.013852,-0.044803,-0.053536,0.290089,0.380089,0.450563,0.130947,-0.037283,0.134845
1,79,-0.485013,-0.599695,-0.423754,-0.190652,-0.278161,-0.223134,-0.296086,-0.030448,0.089412,0.014532,-0.017566,-0.087781,-0.153108
2,82,-0.924468,-0.738256,-0.85239,-0.930948,-0.938365,-0.991566,-0.960664,-0.681822,-0.728118,-0.757243,-0.766095,-0.721558,-0.924446
3,83,-1.013732,-1.005985,-1.02035,-0.961663,-0.921605,-0.966611,-0.744548,-0.443273,-0.577905,-0.732535,-0.726134,-0.718631,-0.849354
4,116,1.45477,1.415314,1.430531,-0.029204,-0.322344,1.012182,1.117474,1.390545,1.661106,1.554691,1.402552,1.454217,1.256217


In [6]:
ridership = pd.merge(ridership, geo_stations, on = 'station_id', how = 'inner')
ridership.head()

Unnamed: 0,station_id,ridership_0115,ridership_0215,ridership_0315,ridership_0415,ridership_0515,ridership_0615,ridership_0715,ridership_0815,ridership_0915,ridership_1015,ridership_1115,ridership_1215,avg_ridership_2015,Station_Name,geometry
0,72,-0.361416,-0.259163,-0.019305,-0.017391,0.013852,-0.044803,-0.053536,0.290089,0.380089,0.450563,0.130947,-0.037283,0.134845,W 52 St & 11 Ave,POINT (-73.99392888 40.76727216)
1,79,-0.485013,-0.599695,-0.423754,-0.190652,-0.278161,-0.223134,-0.296086,-0.030448,0.089412,0.014532,-0.017566,-0.087781,-0.153108,Franklin St & W Broadway,POINT (-74.00666661 40.71911552)
2,82,-0.924468,-0.738256,-0.85239,-0.930948,-0.938365,-0.991566,-0.960664,-0.681822,-0.728118,-0.757243,-0.766095,-0.721558,-0.924446,St James Pl & Pearl St,POINT (-74.00016545 40.71117416)
3,83,-1.013732,-1.005985,-1.02035,-0.961663,-0.921605,-0.966611,-0.744548,-0.443273,-0.577905,-0.732535,-0.726134,-0.718631,-0.849354,Atlantic Ave & Fort Greene Pl,POINT (-73.97632328 40.68382604)
4,116,1.45477,1.415314,1.430531,-0.029204,-0.322344,1.012182,1.117474,1.390545,1.661106,1.554691,1.402552,1.454217,1.256217,W 17 St & 8 Ave,POINT (-74.00149746 40.74177603)


In [7]:
ridership = gp.GeoDataFrame(ridership)
ridership.to_file('../data/processed/ridership_norm')

In [8]:
#Dataframe of the matrics
regressors = master.copy()
regressors.drop([u'ridership_0115', u'ridership_0215', u'ridership_0315',
       u'ridership_0415', u'ridership_0515', u'ridership_0615',
       u'ridership_0715', u'ridership_0815', u'ridership_0915',
       u'ridership_1015', u'ridership_1115', u'ridership_1215',
       u'avg_ridership_2015'], axis = 1, inplace = True)
regressors.head()

Unnamed: 0,station_id,bike_lane_score,street_quality_score,tree_score,traffic_volume,median_hh_income,pop_density,park,subway_entrance
0,72,-1.037727,0.790673,-0.554668,0.001493,0.1069154,0.791144,1.0,0.0
1,79,0.722092,1.584334,-0.904716,-0.424358,2.130764e-15,0.218505,0.0,1.0
2,82,-1.037727,-0.135265,0.240854,0.155045,-0.3670914,-0.17081,1.0,0.0
3,83,-1.037727,0.09622,-1.33488,2.144695,-0.03877741,-1.079345,0.0,0.0
4,116,1.308698,1.485126,0.8139,0.08669,0.5403333,0.581773,1.0,0.0


In [9]:
regressors = pd.merge(regressors, geo_stations, on = 'station_id', how = 'inner')
regressors.head()

Unnamed: 0,station_id,bike_lane_score,street_quality_score,tree_score,traffic_volume,median_hh_income,pop_density,park,subway_entrance,Station_Name,geometry
0,72,-1.037727,0.790673,-0.554668,0.001493,0.1069154,0.791144,1.0,0.0,W 52 St & 11 Ave,POINT (-73.99392888 40.76727216)
1,79,0.722092,1.584334,-0.904716,-0.424358,2.130764e-15,0.218505,0.0,1.0,Franklin St & W Broadway,POINT (-74.00666661 40.71911552)
2,82,-1.037727,-0.135265,0.240854,0.155045,-0.3670914,-0.17081,1.0,0.0,St James Pl & Pearl St,POINT (-74.00016545 40.71117416)
3,83,-1.037727,0.09622,-1.33488,2.144695,-0.03877741,-1.079345,0.0,0.0,Atlantic Ave & Fort Greene Pl,POINT (-73.97632328 40.68382604)
4,116,1.308698,1.485126,0.8139,0.08669,0.5403333,0.581773,1.0,0.0,W 17 St & 8 Ave,POINT (-74.00149746 40.74177603)


In [10]:
regressors.shape

(452, 11)

In [11]:
regressors = gp.GeoDataFrame(ridership)
regressors.to_file('../data/processed/regressors_norm')

In [12]:
master_shape = pd.merge(master, geo_stations, on = 'station_id', how = 'inner')
master_shape = gp.GeoDataFrame(master_shape)
master_shape.head()

Unnamed: 0,station_id,ridership_0115,ridership_0215,ridership_0315,ridership_0415,ridership_0515,ridership_0615,ridership_0715,ridership_0815,ridership_0915,...,bike_lane_score,street_quality_score,tree_score,traffic_volume,median_hh_income,pop_density,park,subway_entrance,Station_Name,geometry
0,72,-0.361416,-0.259163,-0.019305,-0.017391,0.013852,-0.044803,-0.053536,0.290089,0.380089,...,-1.037727,0.790673,-0.554668,0.001493,0.1069154,0.791144,1.0,0.0,W 52 St & 11 Ave,POINT (-73.99392888 40.76727216)
1,79,-0.485013,-0.599695,-0.423754,-0.190652,-0.278161,-0.223134,-0.296086,-0.030448,0.089412,...,0.722092,1.584334,-0.904716,-0.424358,2.130764e-15,0.218505,0.0,1.0,Franklin St & W Broadway,POINT (-74.00666661 40.71911552)
2,82,-0.924468,-0.738256,-0.85239,-0.930948,-0.938365,-0.991566,-0.960664,-0.681822,-0.728118,...,-1.037727,-0.135265,0.240854,0.155045,-0.3670914,-0.17081,1.0,0.0,St James Pl & Pearl St,POINT (-74.00016545 40.71117416)
3,83,-1.013732,-1.005985,-1.02035,-0.961663,-0.921605,-0.966611,-0.744548,-0.443273,-0.577905,...,-1.037727,0.09622,-1.33488,2.144695,-0.03877741,-1.079345,0.0,0.0,Atlantic Ave & Fort Greene Pl,POINT (-73.97632328 40.68382604)
4,116,1.45477,1.415314,1.430531,-0.029204,-0.322344,1.012182,1.117474,1.390545,1.661106,...,1.308698,1.485126,0.8139,0.08669,0.5403333,0.581773,1.0,0.0,W 17 St & 8 Ave,POINT (-74.00149746 40.74177603)


In [13]:
master_shape.to_file('../data/processed/master_shape_norm')

In [14]:
#Checking silhouette score
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans, DBSCAN
import matplotlib as plt
X=regressors.iloc[:, range(1,9)]
X.head()

Unnamed: 0,ridership_0115,ridership_0215,ridership_0315,ridership_0415,ridership_0515,ridership_0615,ridership_0715,ridership_0815
0,-0.361416,-0.259163,-0.019305,-0.017391,0.013852,-0.044803,-0.053536,0.290089
1,-0.485013,-0.599695,-0.423754,-0.190652,-0.278161,-0.223134,-0.296086,-0.030448
2,-0.924468,-0.738256,-0.85239,-0.930948,-0.938365,-0.991566,-0.960664,-0.681822
3,-1.013732,-1.005985,-1.02035,-0.961663,-0.921605,-0.966611,-0.744548,-0.443273
4,1.45477,1.415314,1.430531,-0.029204,-0.322344,1.012182,1.117474,1.390545


In [18]:
# range_n_clusters = range(2,10)
# for n_clusters in range_n_clusters:
#     km = KMeans(n_clusters=n_clusters, random_state=324)
#     cluster_labels = km.fit_predict(X)
#     silhouette_avg = silhouette_score(X, cluster_labels)
#     print("For n_clusters ={},".format(n_clusters)+" the average silhouette_score is :{}".format(silhouette_avg))

In [19]:
# dbscan_cluster = KMeans(6).fit(X)
# regressors['dbscan_labels'] = dbscan_cluster.labels_

In [21]:
# regressors.dbscan_labels.unique()

In [20]:
# regressors = gp.GeoDataFrame(regressors)
# regressors.to_file('../data/processed/KMeans')

In [25]:
# master.head()

In [96]:
# master['cluster'] = regressors.dbscan_labels

In [22]:
# master = gp.GeoDataFrame(master, geometry = stations.geometry)
# master.to_file('../data/processed/master_cluster')

In [23]:
# master

In [24]:
# master.avg_ridership_2015.sort_values(ascending=False)