## Preliminaries

In [1]:
import math
import numpy as np
import pandas as pd

## Loading Data

In [6]:
# The dataset is hosted on Google Drive. The direct-download URL below was derived
# from the share link
#   https://drive.google.com/file/d/1GHcfZdbJ2OdFJbv4kvTUWvs2iyOFokk5/view?usp=sharing
# by taking the file id (the second-to-last '/'-separated segment of the share link)
# and appending it to 'https://drive.google.com/uc?export=download&id='.
# NOTE(review): '&confirm=t' presumably skips Drive's confirmation page for large
# files -- verify if the download ever starts returning HTML instead of CSV.
path = 'https://drive.google.com/uc?export=download&id=1GHcfZdbJ2OdFJbv4kvTUWvs2iyOFokk5&confirm=t'
# Decode the CSV as Latin-1.
df = pd.read_csv(path, encoding='latin1')

# Preview the first rows.
df.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,Central Park West & W 76 St,40.778968,-73.973747,3283.0,W 89 St & Columbus Ave,40.788221,-73.970416,15839,Subscriber,1971,1
1,316,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,Pershing Square North,40.751873,-73.977706,518.0,E 39 St & 2 Ave,40.747804,-73.973442,32723,Subscriber,1964,1
2,591,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3154.0,E 77 St & 3 Ave,40.773142,-73.958562,27451,Subscriber,1987,1
3,2719,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,1 Ave & E 16 St,40.732219,-73.981656,3709.0,W 15 St & 6 Ave,40.738046,-73.99643,21579,Subscriber,1990,1
4,303,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,Great Jones St,40.727434,-73.99379,503.0,E 20 St & Park Ave,40.738274,-73.98752,35379,Subscriber,1979,1


In [19]:
# Dimensions of the frame as (rows, columns).
df.shape

(967287, 18)

In [8]:
# Inspect the inferred column dtypes; starttime and stoptime are read in as
# plain strings (object), so they need parsing before any date arithmetic.
df.dtypes

tripduration                 int64
starttime                   object
stoptime                    object
start station id           float64
start station name          object
start station latitude     float64
start station longitude    float64
end station id             float64
end station name            object
end station latitude       float64
end station longitude      float64
bikeid                       int64
usertype                    object
birth year                   int64
gender                       int64
dtype: object

## Extracting Time-related Features

In [12]:
# Parse both trip timestamp columns from strings into datetime64 values so the
# .dt accessor can be used on them.
for timestamp_col in ('starttime', 'stoptime'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

In [13]:
# Verify the conversion: starttime and stoptime should now be datetime64[ns].
df.dtypes

tripduration                        int64
starttime                  datetime64[ns]
stoptime                   datetime64[ns]
start station id                  float64
start station name                 object
start station latitude            float64
start station longitude           float64
end station id                    float64
end station name                   object
end station latitude              float64
end station longitude             float64
bikeid                              int64
usertype                           object
birth year                          int64
gender                              int64
dtype: object

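Once a column has a datetime dtype, its components can be extracted through the `.dt` accessor. See the pandas reference for the full list of <a href="https://pandas.pydata.org/pandas-docs/stable/reference/series.html#datetimelike-properties">datetimelike properties</a>.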

In [41]:
# Derive calendar features from the trip start timestamp. The .dt accessor is
# looked up once and reused for every derived column.
start_dt = df['starttime'].dt
df['dayofweek'] = start_dt.dayofweek
df['dayname'] = start_dt.day_name()
df['hourofday'] = start_dt.hour
df['year'] = start_dt.year
df['month'] = start_dt.month

# Number of trips per weekday name.
df['dayname'].value_counts()

Wednesday    189109
Tuesday      161683
Thursday     158315
Friday       149767
Monday       117331
Sunday        98473
Saturday      92609
Name: dayname, dtype: int64

In [42]:
# Verify that the derived time columns were added with the expected dtypes.
df.dtypes

tripduration                        int64
starttime                  datetime64[ns]
stoptime                   datetime64[ns]
start station id                  float64
start station name                 object
start station latitude            float64
start station longitude           float64
end station id                    float64
end station name                   object
end station latitude              float64
end station longitude             float64
bikeid                              int64
usertype                           object
birth year                          int64
gender                              int64
dayofweek                           int64
hourofday                           int64
dayname                            object
year                                int64
month                               int64
dtype: object

## Computing Distance

We use an existing function that calculates the great-circle distance between two points with the Haversine formula, given the starting and ending latitudes and longitudes (in decimal degrees): `calculate_distance(lat1, lon1, lat2, lon2)`. The result is in kilometers.

Credits to <a href="https://gist.github.com/rochacbruno/2883505">Wayne Dyck</a> for the function.

In [20]:
def calculate_distance(lat1, lon1, lat2, lon2, radius=6371):
    """
    Great-circle distance between two points using the Haversine formula.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere. The result is expressed in the same unit.
        Defaults to 6371, the mean Earth radius in kilometers.

    Returns
    -------
    float
        The distance along the sphere's surface; kilometers with the
        default radius. NaN inputs yield NaN.
    """
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)

    sin_half_dlat = math.sin(dlat / 2)
    sin_half_dlon = math.sin(dlon / 2)

    # Haversine term: the squared half-chord length between the two points.
    a = sin_half_dlat * sin_half_dlat + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * sin_half_dlon * sin_half_dlon

    # Mathematically a <= 1, but for (near-)antipodal points floating-point
    # rounding can yield 1.0000000000000002, which would make sqrt(1 - a)
    # raise "math domain error". The comparison is False for NaN, so missing
    # coordinates still propagate as NaN instead of being turned into a number.
    if a > 1.0:
        a = 1.0

    # Central angle between the points, in radians.
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return radius * c

In [38]:
%%time

# Row-wise apply over the first 100,000 trips: each row's four coordinates are
# unpacked, in column order, into calculate_distance(lat1, lon1, lat2, lon2).
(
    df[['start station latitude', 'start station longitude',
        'end station latitude', 'end station longitude']]
    .head(100000)
    .apply(lambda coords: calculate_distance(*coords), axis=1)
)

CPU times: user 1.77 s, sys: 7.76 ms, total: 1.78 s
Wall time: 1.91 s


0        1.066491
1        0.577722
2        2.034013
3        1.403367
4        1.316072
           ...   
99995    0.000000
99996    0.535680
99997    3.853541
99998    0.512259
99999    1.314006
Length: 100000, dtype: float64

In [39]:
%%time

# Same computation as a plain Python loop, for timing comparison: iterrows()
# yields (index label, row Series) pairs for the first 100,000 trips.
distances = []

for index, row in df[['start station latitude', 'start station longitude',
                      'end station latitude', 'end station longitude']].head(100000).iterrows():
    # The row Series is labelled by column name, so positional access has to go
    # through .iloc; plain row[0] relies on a deprecated integer-position
    # fallback (FutureWarning since pandas 2.1).
    distance = calculate_distance(row.iloc[0], row.iloc[1], row.iloc[2], row.iloc[3])
    distances.append(distance)

CPU times: user 5.97 s, sys: 6.14 ms, total: 5.98 s
Wall time: 6.17 s
