In [60]:
import numpy as np
import pandas as pd
from datetime import datetime

## Step 1: Helper Functions 

### Convert to datetime object and add new time features

In [74]:
def to_datetime(df):
    """Return a copy of ``df`` with parsed timestamps and derived time features.

    ``starttime`` and ``stoptime`` are parsed with ``pd.to_datetime``. For each
    of them four columns are appended, prefixed ``start`` / ``stop``:
    ``_date``, ``_time``, ``_hour`` and ``_dayofweek`` (Monday is 0, Sunday
    is 6). The frame passed in is left unmodified.
    """
    out = df.copy()
    prefixes = ('start', 'stop')

    # Parse both timestamp columns first, then derive the features from them.
    for prefix in prefixes:
        stamp_col = prefix + 'time'
        out[stamp_col] = pd.to_datetime(out[stamp_col])

    for prefix in prefixes:
        stamps = out[prefix + 'time'].dt
        out[prefix + '_date'] = stamps.date
        out[prefix + '_time'] = stamps.time
        out[prefix + '_hour'] = stamps.hour
        # Numeric weekday; use stamps.day_name() instead if the name of the
        # day (e.g. Monday) is wanted.
        out[prefix + '_dayofweek'] = stamps.weekday
    return out
# More datetime attributes can be found in the pandas documentation: https://pandas.pydata.org/docs/reference/series.html#datetimelike-properties

### Take subset of each dataset

In [82]:
# Only focus on bike usage that starts during hours 5 through 20 (05:00:00-20:59:59) by default
def ignore_offpeak(df, first_hour=5, last_hour=20):
    """Return the rows of ``df`` whose ``start_hour`` lies in the kept window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``start_hour`` column.
    first_hour, last_hour : int, optional
        Inclusive bounds of the window. The defaults (5 and 20) keep every
        trip that started between 05:00:00 and 20:59:59.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the matching rows, with the original index labels
        and all columns. ``df`` itself is not modified.
    """
    # Series.between is vectorised and always produces a boolean mask, so an
    # empty frame keeps its columns. An element-wise apply yields a
    # non-boolean empty Series there, which pandas reads as a column selection.
    in_window = df['start_hour'].between(first_hour, last_hour)
    # Copy only the filtered rows rather than the whole frame up front.
    return df[in_window].copy()

## Step 2: Load and Explore the Data

In [75]:
# Load the May 2016 Citi Bike trip records from the local Tripdata folder
May2016 = pd.read_csv('./Tripdata/201605-citibike-tripdata.csv')

In [76]:
# Check the column dtypes; starttime/stoptime are read in as plain strings (object)
May2016.dtypes

tripduration                 int64
starttime                   object
stoptime                    object
start station id             int64
start station name          object
start station latitude     float64
start station longitude    float64
end station id               int64
end station name            object
end station latitude       float64
end station longitude      float64
bikeid                       int64
usertype                    object
birth year                 float64
gender                       int64
dtype: object

In [77]:
# (rows, columns) of the raw monthly file
May2016.shape

(1212280, 15)

In [78]:
# Peek at five random rows; the fixed seed keeps the displayed rows the same on every run
May2016.sample(5, random_state=42)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
672750,104,5/19/2016 13:23:22,5/19/2016 13:25:06,295,Pike St & E Broadway,40.714067,-73.992939,361,Allen St & Hester St,40.716059,-73.991908,18190,Subscriber,1991.0,2
661426,403,5/19/2016 08:47:23,5/19/2016 08:54:06,402,Broadway & E 22 St,40.740343,-73.989551,379,W 31 St & 7 Ave,40.749156,-73.9916,25011,Subscriber,1967.0,1
892815,368,5/24/2016 17:43:04,5/24/2016 17:49:13,486,Broadway & W 29 St,40.746201,-73.988557,470,W 20 St & 8 Ave,40.743453,-74.00004,15091,Subscriber,1988.0,1
642730,468,5/18/2016 18:20:09,5/18/2016 18:27:57,2006,Central Park S & 6 Ave,40.765909,-73.976342,3159,W 67 St & Broadway,40.774925,-73.982666,21102,Subscriber,1985.0,1
598147,418,5/17/2016 18:28:44,5/17/2016 18:35:43,477,W 41 St & 8 Ave,40.756405,-73.990026,72,W 52 St & 11 Ave,40.767272,-73.993929,24854,Subscriber,1986.0,1


In [79]:
# Parse the timestamp columns and add the derived date/time/hour/weekday features
May2016 = to_datetime(May2016)
# Fixed seed so the preview shows the same rows on every run
May2016.sample(5, random_state=42)

In [83]:
# Keep only trips that started during hours 5 through 20 (05:00:00-20:59:59)
May2016 = ignore_offpeak(May2016)
May2016

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,...,birth year,gender,start_date,start_time,start_hour,start_dayofweek,stop_date,stop_time,stop_hour,stop_dayofweek
1039,344,2016-05-01 05:00:07,2016-05-01 05:05:52,496,E 16 St & 5 Ave,40.737262,-73.992390,470,W 20 St & 8 Ave,40.743453,...,1995.0,2,2016-05-01,05:00:07,5,6,2016-05-01,05:05:52,5,6
1040,554,2016-05-01 05:02:51,2016-05-01 05:12:05,484,W 44 St & 5 Ave,40.755003,-73.980144,478,11 Ave & W 41 St,40.760301,...,1991.0,1,2016-05-01,05:02:51,5,6,2016-05-01,05:12:05,5,6
1041,147,2016-05-01 05:04:09,2016-05-01 05:06:37,319,Fulton St & Broadway,40.711066,-74.009447,306,Cliff St & Fulton St,40.708235,...,1984.0,2,2016-05-01,05:04:09,5,6,2016-05-01,05:06:37,5,6
1042,466,2016-05-01 05:07:36,2016-05-01 05:15:22,515,W 43 St & 10 Ave,40.760094,-73.994618,388,W 26 St & 10 Ave,40.749718,...,1954.0,1,2016-05-01,05:07:36,5,6,2016-05-01,05:15:22,5,6
1043,518,2016-05-01 05:09:25,2016-05-01 05:18:03,3153,E 71 St & 2 Ave,40.768175,-73.959103,456,E 53 St & Madison Ave,40.759711,...,1977.0,1,2016-05-01,05:09:25,5,6,2016-05-01,05:18:03,5,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1208825,609,2016-05-31 20:59:49,2016-05-31 21:09:59,116,W 17 St & 8 Ave,40.741776,-74.001497,483,E 12 St & 3 Ave,40.732233,...,1992.0,1,2016-05-31,20:59:49,20,1,2016-05-31,21:09:59,21,1
1208826,687,2016-05-31 20:59:53,2016-05-31 21:11:20,459,W 20 St & 11 Ave,40.746745,-74.007756,2004,6 Ave & Broome St,40.724399,...,1992.0,1,2016-05-31,20:59:53,20,1,2016-05-31,21:11:20,21,1
1208827,245,2016-05-31 20:59:56,2016-05-31 21:04:01,312,Allen St & Stanton St,40.722055,-73.989111,326,E 11 St & 1 Ave,40.729538,...,,0,2016-05-31,20:59:56,20,1,2016-05-31,21:04:01,21,1
1208828,420,2016-05-31 20:59:57,2016-05-31 21:06:58,530,11 Ave & W 59 St,40.771522,-73.990541,514,12 Ave & W 40 St,40.760875,...,1985.0,1,2016-05-31,20:59:57,20,1,2016-05-31,21:06:58,21,1


In [84]:
# Count the missing values in each column
May2016.isna().sum()

tripduration                    0
starttime                       0
stoptime                        0
start station id                0
start station name              0
start station latitude          0
start station longitude         0
end station id                  0
end station name                0
end station latitude            0
end station longitude           0
bikeid                          0
usertype                        0
birth year                 162533
gender                          0
start_date                      0
start_time                      0
start_hour                      0
start_dayofweek                 0
stop_date                       0
stop_time                       0
stop_hour                       0
stop_dayofweek                  0
dtype: int64