In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.basemap import Basemap as bm
import math

%matplotlib inline

In [15]:
trips = pd.read_csv('data/trip.csv',parse_dates=["start_date", "end_date"], infer_datetime_format=True)

In [36]:
chunked_trips = pd.read_csv('data/trip.csv',parse_dates=["start_date", "end_date"],\
                            infer_datetime_format=True, iterator=True, chunksize=1000)

In [3]:
stations = pd.read_csv('data/station.csv')
stations.head()

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
0,2,San Jose Diridon Caltrain Station,37.329732,-121.901782,27,San Jose,8/6/2013
1,3,San Jose Civic Center,37.330698,-121.888979,15,San Jose,8/5/2013
2,4,Santa Clara at Almaden,37.333988,-121.894902,11,San Jose,8/6/2013
3,5,Adobe on Almaden,37.331415,-121.8932,19,San Jose,8/5/2013
4,6,San Pedro Square,37.336721,-121.894074,15,San Jose,8/7/2013


In [53]:
chunked_status = pd.read_csv('data/status.csv',parse_dates=["time"],\
                             infer_datetime_format=True, iterator=True, chunksize=10000)

In [16]:
status = pd.read_csv('data/status.csv',parse_dates=["time"])

In [8]:
for status in chunked_status:
    status['daytime'] = status.time.apply(rango)

In [15]:
chunked_status.get_chunk(1)

Unnamed: 0,station_id,bikes_available,docks_available,time
0,2,2,25,2013-08-29 12:06:01


In [12]:
weather = pd.read_csv('data/weather.csv', parse_dates=['date'], infer_datetime_format=True)

In [13]:
weather.head()

Unnamed: 0,date,max_temperature_f,mean_temperature_f,min_temperature_f,max_dew_point_f,mean_dew_point_f,min_dew_point_f,max_humidity,mean_humidity,min_humidity,...,mean_visibility_miles,min_visibility_miles,max_wind_Speed_mph,mean_wind_speed_mph,max_gust_speed_mph,precipitation_inches,cloud_cover,events,wind_dir_degrees,zip_code
0,2013-08-29,74.0,68.0,61.0,61.0,58.0,56.0,93.0,75.0,57.0,...,10.0,10.0,23.0,11.0,28.0,0,4.0,,286.0,94107
1,2013-08-30,78.0,69.0,60.0,61.0,58.0,56.0,90.0,70.0,50.0,...,10.0,7.0,29.0,13.0,35.0,0,2.0,,291.0,94107
2,2013-08-31,71.0,64.0,57.0,57.0,56.0,54.0,93.0,75.0,57.0,...,10.0,10.0,26.0,15.0,31.0,0,4.0,,284.0,94107
3,2013-09-01,74.0,66.0,58.0,60.0,56.0,53.0,87.0,68.0,49.0,...,10.0,10.0,25.0,13.0,29.0,0,4.0,,284.0,94107
4,2013-09-02,75.0,69.0,62.0,61.0,60.0,58.0,93.0,77.0,61.0,...,10.0,6.0,23.0,12.0,30.0,0,6.0,,277.0,94107


In [6]:
##### Top 5 de estaciones que se quedan sin bicis
empty_stations = status.loc[status.bikes_available == 0].groupby('station_id')
empty_stations.size().sort_values(ascending=False).head()

station_id
45    29198
76    24725
48    24401
62    24077
60    23143
dtype: int64

In [None]:
# La estación donde comienzan más viajes:
trips.groupby('start_station_id').size().sort_values(ascending=False)
stations.loc[stations.id == 70]

In [None]:
# La estación donde terminan más viajes:
trips.groupby('end_station_id').size().sort_values(ascending=False)
stations.loc[stations.id == 70]

In [88]:
# Agrego una columna para distinguir franjas horarias dentro de trips y status
# A: 00-06, B: 07-12, C: 13-20, D:21-24
    
def rango(start_date):
    if (start_date.hour < 12):
        if (start_date.hour < 6):
            return 'A'
        else:
            return 'B'
    else:
        if (start_date.hour < 18):
            return 'C'
        else:
            return 'D'
    

In [91]:
trips['daytime'] = trips.start_date.apply(rango)

In [92]:
daytime_grouped = trips.groupby('daytime')
daytime_grouped.size()

daytime
A      8865
B    266259
C    268307
D    126528
dtype: int64

In [93]:
# Estación con más alquileres en la franja A
# A: 00-06, B: 07-12, C: 13-20, D:21-24
daytime_grouped.get_group('A').groupby('start_station_id').size().sort_values(ascending=False)
stations[stations.id == 73]

# La estación 73 está sobre una avenida principal dentro del distrito financiero

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
61,73,Grant Avenue at Columbus Avenue,37.798522,-122.407245,15,San Francisco,8/21/2013


In [15]:
# Estación con más devoluciones en la franja A
# A: 00-06, B: 07-12, C: 13-20, D:21-24
daytime_grouped.get_group('A').groupby('end_station_id').size().sort_values(ascending=False)
stations[stations.id == 77]

# La estación 77 está en un mercado cerca del distrito financiero

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
65,77,Market at Sansome,37.789625,-122.400811,27,San Francisco,8/25/2013


In [18]:
# Estación con más alquileres en la franja B
# A: 00-06, B: 07-12, C: 13-20, D:21-24
daytime_grouped.get_group('B').groupby('start_station_id').size().sort_values(ascending=False)
stations[stations.id == 70]

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
58,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.39526,19,San Francisco,8/23/2013


In [21]:
# Estación con más devoluciones en la franja A
# A: 00-06, B: 07-12, C: 13-20, D:21-24
daytime_grouped.get_group('A').groupby('end_station_id').size().sort_values(ascending=False)
stations[stations.id == 77]

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
65,77,Market at Sansome,37.789625,-122.400811,27,San Francisco,8/25/2013


In [23]:
# Estación con más alquileres en la franja C
daytime_grouped.get_group('C').groupby('start_station_id').size().sort_values(ascending=False)
stations[stations.id == 60]

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
48,60,Embarcadero at Sansome,37.80477,-122.403234,15,San Francisco,8/21/2013


In [26]:
# Estación con más devoluciones en la franja C
# A: 00-06, B: 07-12, C: 13-20, D:21-24
daytime_grouped.get_group('C').groupby('end_station_id').size().sort_values(ascending=False)
stations[stations.id == 70]

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
58,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.39526,19,San Francisco,8/23/2013


In [28]:
# Estación con más alquileres en la franja D 
daytime_grouped.get_group('D').groupby('start_station_id').size().sort_values(ascending=False)
stations[stations.id == 70]

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
58,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.39526,19,San Francisco,8/23/2013


In [31]:
# Estación con más devoluciones en la franja D
# A: 00-06, B: 07-12, C: 13-20, D:21-24
daytime_grouped.get_group('D').groupby('end_station_id').size().sort_values(ascending=False)
stations[stations.id == 70]

Unnamed: 0,id,name,lat,long,dock_count,city,installation_date
58,70,San Francisco Caltrain (Townsend at 4th),37.776617,-122.39526,19,San Francisco,8/23/2013


In [98]:
trips['weekday'] = trips.start_date.dt.weekday_name
trips.dtypes

id                             int64
duration                       int64
start_date            datetime64[ns]
start_station_name            object
start_station_id               int64
end_date              datetime64[ns]
end_station_name              object
end_station_id                 int64
bike_id                        int64
subscription_type             object
zip_code                      object
weekday                       object
daytime                       object
dtype: object

In [99]:
days = trips['start_date'].unique()
week_days=[0,0,0,0,0,0,0]
for date in days:
    week_days[date.dt.dayofweek] += 1
    
week_days

AttributeError: 'numpy.datetime64' object has no attribute 'dt'

In [87]:
trips.groupby('start_station_id').size().sort_values(ascending=False).head()


start_station_id
70    49092
69    33742
50    32934
60    27713
55    26089
dtype: int64