In [243]:
import pandas as pd #data manipulation and analysis
import numpy as np #data arrays
from datetime import datetime #date time
import glob #join csv's together
import os # makes file join OS independent
import plotly.express as px #data visualisation
from geopy import distance

In [244]:
# Load every downloaded trip CSV from one folder and stack them into a single dataframe.
# NOTE(review): `path` is an absolute path on the original author's machine; point it
# at your own copy of the CSV folder before running.
path = r'C:\Users\zaydm\Documents\Repos\Google_Data_Analytics_Capstone\CSV files'
file_type = 'csv'
seperator = ','

# Pass the folder and the file pattern to os.path.join as separate arguments so the
# correct path separator is inserted for the running OS. glob returns matches in
# arbitrary order, so sort them to make the concatenated row order repeatable.
csv_files = sorted(glob.glob(os.path.join(path, '*.' + file_type)))
if not csv_files:
    # pd.concat on an empty list fails with an unhelpful message; fail early and clearly.
    raise FileNotFoundError("No .{} files found in {!r}".format(file_type, path))

# Read each file, converting the two timestamp columns from strings to datetime64.
# 'infer_datetime_format' lets pandas guess the format once, which can speed up parsing
# on pandas 1.x.
# NOTE(review): the timestamps in the saved head() output are year-first
# (e.g. 2022-01-13 11:59:47); dayfirst=True is kept from the original - confirm it is intended.
bike_share = pd.concat(
    [
        pd.read_csv(
            f,
            sep=seperator,
            parse_dates=['started_at', 'ended_at'],
            infer_datetime_format=True,
            dayfirst=True,
        )
        for f in csv_files
    ],
    ignore_index=True,
)

# Rename the datetime columns so they're better defined
bike_share = bike_share.rename(columns={'started_at': 'start_time', 'ended_at': 'end_time'})

In [245]:
# Rows with rideable_type 'docked_bike' are entries where bikes were taken out of
# docks and checked for quality, not customer rides, so they are removed.
is_docked = bike_share['rideable_type'] == 'docked_bike'
bike_share.drop(index=bike_share.loc[is_docked].index, inplace=True)

# Show which bike types remain after the drop
bike_share['rideable_type'].unique()

array(['electric_bike', 'classic_bike'], dtype=object)

In [246]:
# Ride duration is end_time minus start_time (a timedelta), stored as a float number
# of seconds so it can be averaged and summarised later.
# NOTE: the original author noted a warning raised by this cell and judged it a
# false positive; none is recorded in the saved output.
ride_duration = bike_share['end_time'] - bike_share['start_time']
bike_share['ride_length_seconds'] = ride_duration.dt.total_seconds()

In [247]:
# Preview the first five rows to check the new ride_length_seconds column
bike_share.head()

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_length_seconds
0,C2F7DD78E82EC875,electric_bike,2022-01-13 11:59:47,2022-01-13 12:02:44,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.0128,-87.665906,42.01256,-87.674367,casual,177.0
1,A6CF8980A652D272,electric_bike,2022-01-10 08:41:56,2022-01-10 08:46:17,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.012763,-87.665967,42.01256,-87.674367,casual,261.0
2,BD0F91DFF741C66D,classic_bike,2022-01-25 04:53:40,2022-01-25 04:58:01,Sheffield Ave & Fullerton Ave,TA1306000016,Greenview Ave & Fullerton Ave,TA1307000001,41.925602,-87.653708,41.92533,-87.6658,member,261.0
3,CBB80ED419105406,classic_bike,2022-01-04 00:18:04,2022-01-04 00:33:00,Clark St & Bryn Mawr Ave,KA1504000151,Paulina St & Montrose Ave,TA1309000021,41.983593,-87.669154,41.961507,-87.671387,casual,896.0
4,DDC963BFDDA51EEA,classic_bike,2022-01-20 01:31:10,2022-01-20 01:37:12,Michigan Ave & Jackson Blvd,TA1309000002,State St & Randolph St,TA1305000029,41.87785,-87.62408,41.884621,-87.627834,member,362.0


In [248]:
# Day of week for each ride's start time. pandas numbers the days 0 (Monday) to
# 6 (Sunday); `days` shifts that to 1 (Monday) to 7 (Sunday).
days = {pandas_dow: pandas_dow + 1 for pandas_dow in range(7)}
bike_share['day_of_week'] = bike_share['start_time'].dt.dayofweek.replace(days)

In [249]:
# Preview the first five rows to check the new day_of_week column
bike_share.head(n=5)

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_length_seconds,day_of_week
0,C2F7DD78E82EC875,electric_bike,2022-01-13 11:59:47,2022-01-13 12:02:44,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.0128,-87.665906,42.01256,-87.674367,casual,177.0,4
1,A6CF8980A652D272,electric_bike,2022-01-10 08:41:56,2022-01-10 08:46:17,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.012763,-87.665967,42.01256,-87.674367,casual,261.0,1
2,BD0F91DFF741C66D,classic_bike,2022-01-25 04:53:40,2022-01-25 04:58:01,Sheffield Ave & Fullerton Ave,TA1306000016,Greenview Ave & Fullerton Ave,TA1307000001,41.925602,-87.653708,41.92533,-87.6658,member,261.0,2
3,CBB80ED419105406,classic_bike,2022-01-04 00:18:04,2022-01-04 00:33:00,Clark St & Bryn Mawr Ave,KA1504000151,Paulina St & Montrose Ave,TA1309000021,41.983593,-87.669154,41.961507,-87.671387,casual,896.0,2
4,DDC963BFDDA51EEA,classic_bike,2022-01-20 01:31:10,2022-01-20 01:37:12,Michigan Ave & Jackson Blvd,TA1309000002,State St & Randolph St,TA1305000029,41.87785,-87.62408,41.884621,-87.627834,member,362.0,4


In [250]:
# Count the missing values in every column
missing_values_count = bike_share.isna().sum()

# In the saved output, start_station_name/start_station_id are missing on 34840 rows,
# end_station_name/end_station_id on 38247 rows, and end_lat/end_lng on 128 rows.
missing_values_count

ride_id                    0
rideable_type              0
start_time                 0
end_time                   0
start_station_name     34840
start_station_id       34840
end_station_name       38247
end_station_id         38247
start_lat                  0
start_lng                  0
end_lat                  128
end_lng                  128
member_casual              0
ride_length_seconds        0
day_of_week                0
dtype: int64

In [251]:
# Missing start/end station names cannot be reconstructed, so every row that has
# any missing value is dropped.
bike_share = bike_share.dropna(axis='index', how='any')

# Second check: every column should now report zero missing values
missing_values_count_2 = bike_share.isna().sum()
missing_values_count_2

ride_id                0
rideable_type          0
start_time             0
end_time               0
start_station_name     0
start_station_id       0
end_station_name       0
end_station_id         0
start_lat              0
start_lng              0
end_lat                0
end_lng                0
member_casual          0
ride_length_seconds    0
day_of_week            0
dtype: int64

In [252]:
# Check that no ride_id is duplicated: the number of distinct ids must equal the
# number of rows. The assertion makes the check explicit instead of relying on the
# reader to compare the printed count with the row count by eye.
unique_id = bike_share["ride_id"].nunique(dropna=False)
assert unique_id == len(bike_share), "duplicate ride_id values found in bike_share"

print("# of unique Id: " + str(unique_id))

# of unique Id: 167019


In [253]:
#checking for any negative ride lengths as previous year data had some errors
#neg_time = bike_share[bike_share['ride_length_seconds'] < 0]
#print(neg_time)

After realising we have both lat and long start and finish points, we can work out the distance between the start and finish times. Its important to note that whilst these wont tell us the exact distance of the rides as it'll work out the path in a straight line and ofcourse won't include roads,obstructions etc. The purpose of this calculation is to see how much riders travel relative to each other and then compare volumes too. For e.g. member rides ride on average 7 miles on each trip, casual riders ride 10 miles on each trip. We then see volume on weekdays - this could indicate that member riders are commuting - faster speeds, same distance whereas the casual riders are riding more on weekends, slower speeds but longer distances. 

In [254]:
# Sanity check of geopy on one ride: the (latitude, longitude) pairs below match
# the start and end coordinates shown for row 0 in the head() output above.
index_0_start = (42.012800, -87.665906)
index_0_end = (42.012560, -87.674367)
index_0_distance = distance.distance(index_0_start, index_0_end)
print(index_0_distance.miles)

0.43580980480248127


In [255]:
# Make sure the latitude and longitude columns are floats before any distance maths.
# astype returns a new dataframe rather than modifying the existing one, so the
# result has to be assigned back for the conversion to take effect.
bike_share = bike_share.astype(
    {'start_lat': 'float', 'start_lng': 'float', 'end_lat': 'float', 'end_lng': 'float'}
)
bike_share.dtypes

ride_id                        object
rideable_type                  object
start_time             datetime64[ns]
end_time               datetime64[ns]
start_station_name             object
start_station_id               object
end_station_name               object
end_station_id                 object
start_lat                     float64
start_lng                     float64
end_lat                       float64
end_lng                       float64
member_casual                  object
ride_length_seconds           float64
day_of_week                     int64
dtype: object

In [256]:
# Combine latitude and longitude into one 'lat, lng' string per ride, giving a
# single start-location column and a single end-location column.
start_pairs = bike_share[['start_lat', 'start_lng']].to_numpy()
end_pairs = bike_share[['end_lat', 'end_lng']].to_numpy()

bike_share['start_lat_long'] = [', '.join(map(str, pair)) for pair in start_pairs]
bike_share['end_lat_long'] = [', '.join(map(str, pair)) for pair in end_pairs]

In [257]:
# Same sanity check, this time passing each point as a single 'lat, lng' string,
# which is the format stored in the start_lat_long / end_lat_long columns.
index_0_start = '41.89571428, -87.6722095'
index_0_end = '41.879255, -87.639904'
print(distance.distance(index_0_start, index_0_end).miles)
# NOTE(review): the original note here described Vincenty distance, but the comment
# in the next cell describes geopy's default as the Karney (2013) geodesic -
# confirm which applies to the installed geopy version.

2.016462858476188


In [258]:
# Geopy can calculate the geodesic distance between two points: the shortest distance
# on the surface of an ellipsoidal model of the earth.
# Its default algorithm is the method given by Karney (2013) (geodesic); this is
# accurate to round-off and always converges.
# distancer handles one row at a time so that apply can compute each ride's distance
# and place it in the journey_distance_miles column.
# apply does take some time to execute.
# UK units = miles
def distancer(row):
    """Return the distance in miles between a row's start and end locations.

    `row` must provide 'start_lat_long' and 'end_lat_long'; both values are handed
    to geopy's distance.distance unchanged.
    """
    start_point, end_point = row['start_lat_long'], row['end_lat_long']
    return distance.distance(start_point, end_point).miles


bike_share['journey_distance_miles'] = bike_share.apply(distancer, axis='columns')

In [259]:
# Preview the first five rows to check the lat/long strings and journey_distance_miles
bike_share.head(n=5)

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_length_seconds,day_of_week,start_lat_long,end_lat_long,journey_distance_miles
0,C2F7DD78E82EC875,electric_bike,2022-01-13 11:59:47,2022-01-13 12:02:44,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.0128,-87.665906,42.01256,-87.674367,casual,177.0,4,"42.0128005, -87.665906","42.01256011541, -87.6743671152",0.435817
1,A6CF8980A652D272,electric_bike,2022-01-10 08:41:56,2022-01-10 08:46:17,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.012763,-87.665967,42.01256,-87.674367,casual,261.0,1,"42.012763, -87.6659675","42.01256011541, -87.6743671152",0.432562
2,BD0F91DFF741C66D,classic_bike,2022-01-25 04:53:40,2022-01-25 04:58:01,Sheffield Ave & Fullerton Ave,TA1306000016,Greenview Ave & Fullerton Ave,TA1307000001,41.925602,-87.653708,41.92533,-87.6658,member,261.0,2,"41.9256018819, -87.6537080423","41.92533, -87.6658",0.623515
3,CBB80ED419105406,classic_bike,2022-01-04 00:18:04,2022-01-04 00:33:00,Clark St & Bryn Mawr Ave,KA1504000151,Paulina St & Montrose Ave,TA1309000021,41.983593,-87.669154,41.961507,-87.671387,casual,896.0,2,"41.983593, -87.669154","41.961507, -87.671387",1.528651
4,DDC963BFDDA51EEA,classic_bike,2022-01-20 01:31:10,2022-01-20 01:37:12,Michigan Ave & Jackson Blvd,TA1309000002,State St & Randolph St,TA1305000029,41.87785,-87.62408,41.884621,-87.627834,member,362.0,4,"41.87785, -87.62408","41.88462107257936, -87.62783423066139",0.505842


In [260]:
# The separate coordinate columns are no longer needed now that the combined
# start_lat_long / end_lat_long columns exist; drop them for a cleaner dataframe.
raw_coordinate_columns = ['start_lat', 'start_lng', 'end_lat', 'end_lng']
bike_share.drop(columns=raw_coordinate_columns, inplace=True)

In [261]:
# Preview the first five rows to confirm the separate coordinate columns are gone
bike_share.head(n=5)

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,member_casual,ride_length_seconds,day_of_week,start_lat_long,end_lat_long,journey_distance_miles
0,C2F7DD78E82EC875,electric_bike,2022-01-13 11:59:47,2022-01-13 12:02:44,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,casual,177.0,4,"42.0128005, -87.665906","42.01256011541, -87.6743671152",0.435817
1,A6CF8980A652D272,electric_bike,2022-01-10 08:41:56,2022-01-10 08:46:17,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,casual,261.0,1,"42.012763, -87.6659675","42.01256011541, -87.6743671152",0.432562
2,BD0F91DFF741C66D,classic_bike,2022-01-25 04:53:40,2022-01-25 04:58:01,Sheffield Ave & Fullerton Ave,TA1306000016,Greenview Ave & Fullerton Ave,TA1307000001,member,261.0,2,"41.9256018819, -87.6537080423","41.92533, -87.6658",0.623515
3,CBB80ED419105406,classic_bike,2022-01-04 00:18:04,2022-01-04 00:33:00,Clark St & Bryn Mawr Ave,KA1504000151,Paulina St & Montrose Ave,TA1309000021,casual,896.0,2,"41.983593, -87.669154","41.961507, -87.671387",1.528651
4,DDC963BFDDA51EEA,classic_bike,2022-01-20 01:31:10,2022-01-20 01:37:12,Michigan Ave & Jackson Blvd,TA1309000002,State St & Randolph St,TA1305000029,member,362.0,4,"41.87785, -87.62408","41.88462107257936, -87.62783423066139",0.505842


In [262]:
# Group rides by rider type (member_casual) and show the mean ride length in seconds
mc_type = bike_share.groupby(by='member_casual')
mc_type['ride_length_seconds'].agg('mean')

member_casual
casual    1083.350012
member     627.947491
Name: ride_length_seconds, dtype: float64

In [263]:
# Group rides by rider type and day of week, then take the mean ride length in
# seconds; reset_index turns the two group keys back into ordinary columns.
mc_day_type = bike_share.groupby(by=['member_casual', 'day_of_week'])

bike_share_mean = (
    mc_day_type['ride_length_seconds']
    .mean()
    .reset_index()
)

In [264]:
# Short labels for the 1 (Monday) to 7 (Sunday) day numbers
weekday = {
    number: label
    for number, label in enumerate(['Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun'], start=1)
}

# Swap the day numbers for their labels, then index the table by rider type and day
bike_share_mean['day_of_week'] = bike_share_mean['day_of_week'].replace(weekday)
bike_share_mean.set_index(keys=['member_casual', 'day_of_week'], inplace=True)

bike_share_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,ride_length_seconds
member_casual,day_of_week,Unnamed: 2_level_1
casual,Mon,1189.811076
casual,Tue,975.144537
casual,Wed,997.779924
casual,Thur,926.678722
casual,Fri,1003.43048
casual,Sat,1091.172324
casual,Sun,1274.848961
member,Mon,632.176459
member,Tue,607.971865
member,Wed,606.826445


In [277]:
# Mean straight-line distance and mean ride length for each rider type on each weekday
# (despite the name, volume_count holds means, not counts).
# Selecting several columns from a groupby needs a list (double brackets); a bare
# tuple of names raises a FutureWarning on older pandas and an error on pandas 2.x.
volume_count = mc_day_type[['journey_distance_miles', 'ride_length_seconds']].mean().reset_index()

volume_count['day_of_week'] = volume_count['day_of_week'].replace(weekday)

# ride_length_seconds stays a float number of seconds; wrap the column in
# pd.to_timedelta(..., unit='s') and assign the result if a duration display is wanted.
volume_count.set_index(['member_casual', 'day_of_week'], inplace=True)

volume_count

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,journey_distance_miles,ride_length_seconds
member_casual,day_of_week,Unnamed: 2_level_1,Unnamed: 3_level_1
casual,Mon,1.161229,1189.811076
casual,Tue,1.088365,975.144537
casual,Wed,1.122839,997.779924
casual,Thur,1.070813,926.678722
casual,Fri,1.304461,1003.43048
casual,Sat,1.126692,1091.172324
casual,Sun,1.177478,1274.848961
member,Mon,1.096914,632.176459
member,Tue,1.090527,607.971865
member,Wed,1.065407,606.826445


In [267]:
# Mean ride length and mean straight-line distance per rider type.
# The columns are selected with a list (double brackets); a bare tuple of names
# raises a FutureWarning on older pandas and an error on pandas 2.x.
mc_type[['ride_length_seconds', 'journey_distance_miles']].mean()

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,ride_length_seconds,journey_distance_miles
member_casual,Unnamed: 1_level_1,Unnamed: 2_level_1
casual,1083.350012,1.152687
member,627.947491,1.066377


total average ride length, median, max, min. TYPE OF BIKE

In [268]:
# Summary statistics (count, mean, std, min, quartiles, max) for ride length and
# journey distance across all rides, with every value formatted to one decimal place.
summary_stats = bike_share[['ride_length_seconds', 'journey_distance_miles']].describe()
global_desc = summary_stats.applymap(lambda value: '{:.1f}'.format(value))

global_desc

Unnamed: 0,ride_length_seconds,journey_distance_miles
count,167019.0,167019.0
mean,697.4,1.1
std,1300.8,2.0
min,0.0,0.0
25%,300.0,0.5
50%,480.0,0.8
75%,780.0,1.3
max,88628.0,740.8


In [269]:
# We have two bike types. Group by bike type, then find the mean ride length and distance.
bike_type = bike_share.groupby(['rideable_type'])

# The columns are selected with a list (double brackets); a bare tuple of names
# raises a FutureWarning on older pandas and an error on pandas 2.x.
# The grouped mean is already indexed by rideable_type, so no reset_index/set_index
# round trip is needed.
bike_type_avg = bike_type[['ride_length_seconds', 'journey_distance_miles']].mean()

bike_type_avg

  after removing the cwd from sys.path.


Unnamed: 0_level_0,ride_length_seconds,journey_distance_miles
rideable_type,Unnamed: 1_level_1,Unnamed: 2_level_1
classic_bike,736.553072,0.995423
electric_bike,613.316541,1.259992
