In [2]:
import pandas as pd #data manipulation and analysis
import numpy as np #data arrays
from datetime import datetime #datetime
from datetime import time
from datetime import timedelta
import datetime
import glob #join csv's together
import os # makes file join OS independent
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px #data visualisation
from geopy import distance # Vincenty formula for latitude and longitude distance calculations

In [3]:
#load downloaded csv in, change date strings to datetime64 (date/time format)
#define variables that we will use for concatenating
path =r'C:\Users\zaydm\Documents\Repos\Google_Data_Analytics_Capstone\CSV'
file_type = 'csv'
seperator =','

#define our dataframe. Change date strings to datetime64 
#'infer_datetime_format' infers the format of the datetime strings - in this case parsing speed = 10x faster
bike_share = pd.concat([pd.read_csv(f, sep=seperator,parse_dates=['started_at', 'ended_at'],infer_datetime_format=True, dayfirst=True) 

#using glob/for loop to find the csv files from the specified folder to join them together
for f in glob.glob(os.path.join(path + "/*."+file_type))],ignore_index=True)

#rename datetime columns so they're better defined
bike_share.rename(columns= {'started_at' : 'start_time','ended_at' : 'end_time' }, inplace=True)

In [4]:
bike_share.head()

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,6C992BD37A98A63F,classic_bike,2021-04-12 18:25:36,2021-04-12 18:56:55,State St & Pearson St,TA1307000061,Southport Ave & Waveland Ave,13235,41.897448,-87.628722,41.94815,-87.66394,member
1,1E0145613A209000,docked_bike,2021-04-27 17:27:11,2021-04-27 18:31:29,Dorchester Ave & 49th St,KA1503000069,Dorchester Ave & 49th St,KA1503000069,41.805772,-87.592464,41.805772,-87.592464,casual
2,E498E15508A80BAD,docked_bike,2021-04-03 12:42:45,2021-04-07 11:40:24,Loomis Blvd & 84th St,20121,Loomis Blvd & 84th St,20121,41.741487,-87.65841,41.741487,-87.65841,casual
3,1887262AD101C604,classic_bike,2021-04-17 09:17:42,2021-04-17 09:42:48,Honore St & Division St,TA1305000034,Southport Ave & Waveland Ave,13235,41.903119,-87.673935,41.94815,-87.66394,member
4,C123548CAB2A32A5,docked_bike,2021-04-03 12:42:25,2021-04-03 14:13:42,Loomis Blvd & 84th St,20121,Loomis Blvd & 84th St,20121,41.741487,-87.65841,41.741487,-87.65841,casual


In [5]:
#The dataframe includes a few hundred entries when bikes were taken out of docks and checked for quality. These are not needed
bike_share.drop(index=bike_share[bike_share['rideable_type'] =='docked_bike'].index, inplace=True)

bike_share['rideable_type'].unique()

array(['classic_bike', 'electric_bike'], dtype=object)

In [6]:
missing_values_count = bike_share.isnull().sum()

missing_values_count[:]
# We can see from start_station_id there are 745376 missing values and from end_station_id there are 795782 missing values. 
# However these still include ride start/end times so we can keep them. 
# The data with end_lat| end_lng (4251) missing values will need to be removed (as using lat/lng for distance calculations)

ride_id                    0
rideable_type              0
start_time                 0
end_time                   0
start_station_name    745376
start_station_id      745373
end_station_name      795782
end_station_id        795782
start_lat                  0
start_lng                  0
end_lat                 4251
end_lng                 4251
member_casual              0
dtype: int64

In [7]:
bike_share.dropna(subset=['end_lat','end_lng'], inplace=True)

In [8]:
#create ride length column in minutes - needed for analysis
bike_share['ride_length_mins'] = (bike_share['end_time'] - bike_share['start_time'])/pd.Timedelta(minutes = 1)

In [9]:
#checking for any negative ride lengths as previous year data had some errors
print(bike_share[bike_share['ride_length_mins'] < 0])

                  ride_id  rideable_type          start_time  \
22361    BC53ECCBC76278FD   classic_bike 2021-04-07 16:11:33   
31844    209C097828F9CD43  electric_bike 2021-04-27 17:13:44   
292566   6E81034B446FC2FD  electric_bike 2021-04-23 09:43:39   
292678   318DD838369AEA61   classic_bike 2021-04-30 10:56:32   
293033   8ADD13BD8F6A7567   classic_bike 2021-04-17 12:43:36   
...                   ...            ...                 ...   
4969725  5AA2BC364BC7A569  electric_bike 2021-11-07 01:59:53   
4971172  F4E4485BFB33D916  electric_bike 2021-11-07 01:57:53   
4972542  B506DCD44974C575  electric_bike 2021-11-07 01:53:34   
5623702  2D97E3C98E165D80   classic_bike 2022-03-05 11:00:57   
5626879  7407049C5D89A13D  electric_bike 2022-03-05 11:38:04   

                   end_time                      start_station_name  \
22361   2021-04-07 16:11:26                 Ashland Ave & Grand Ave   
31844   2021-04-27 17:11:32                                     NaN   
292566  2021-04-23

In [10]:
#drop rows where columns have negative ride lengths (some bikes were taken out of circulation/ QC issues thus need to be deleted)
bike_share.drop(index=bike_share[bike_share['ride_length_mins'] < 0].index, inplace=True)

In [11]:
print(bike_share[bike_share['ride_length_mins'] < 0])

Empty DataFrame
Columns: [ride_id, rideable_type, start_time, end_time, start_station_name, start_station_id, end_station_name, end_station_id, start_lat, start_lng, end_lat, end_lng, member_casual, ride_length_mins]
Index: []


In [12]:
#change specific columns to categorical data - easier to manipulate
bike_share = bike_share.astype({'rideable_type':'category','member_casual':'category'})

In [13]:
#Create new columns for day and month - will be used later on for groupby analysis
bike_share['day'] = bike_share['start_time'].dt.day_name().str.slice(stop=3)
bike_share['month'] = bike_share['start_time'].dt.month_name().str.slice(stop=3)
bike_share.head()

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,ride_length_mins,day,month
0,6C992BD37A98A63F,classic_bike,2021-04-12 18:25:36,2021-04-12 18:56:55,State St & Pearson St,TA1307000061,Southport Ave & Waveland Ave,13235,41.897448,-87.628722,41.94815,-87.66394,member,31.316667,Mon,Apr
3,1887262AD101C604,classic_bike,2021-04-17 09:17:42,2021-04-17 09:42:48,Honore St & Division St,TA1305000034,Southport Ave & Waveland Ave,13235,41.903119,-87.673935,41.94815,-87.66394,member,25.1,Sat,Apr
5,097E76F3651B1AC1,classic_bike,2021-04-25 18:43:18,2021-04-25 18:43:59,Clinton St & Polk St,15542,Clinton St & Polk St,15542,41.871467,-87.640949,41.871467,-87.640949,casual,0.683333,Sun,Apr
6,53C38EB01E6FA5C4,classic_bike,2021-04-03 16:28:21,2021-04-03 16:29:47,Ashland Ave & 63rd St,16948,Ashland Ave & 63rd St,16948,41.779374,-87.664843,41.779374,-87.664843,casual,1.433333,Sat,Apr
7,D53AC014EFD6E2BA,electric_bike,2021-04-06 16:35:06,2021-04-06 17:00:56,Dorchester Ave & 49th St,KA1503000069,Dorchester Ave & 49th St,KA1503000069,41.805832,-87.592478,41.805803,-87.592662,casual,25.833333,Tue,Apr


In [14]:
#rename days via list and change dtype to category
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

bike_share['day'] = pd.Categorical(bike_share['day'], categories = days)

In [15]:
#rename months via list and change dtype to category
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

bike_share['month'] = pd.Categorical(bike_share['month'], categories = months)

In [16]:
#checking to ensure no duplicate ride_id
unique_id = len(pd.unique(bike_share['ride_id']))

print("# of unique Id: " + str(unique_id))

# of unique Id: 5415152


In [17]:
#change the lat/long data type to float and then round to 4 d.p. (for coordinates 4 d.p is accurate to 11m which is sufficient in our case)
bike_share.astype({'start_lat':'float', 'start_lng':'float', 'end_lat':'float', 'end_lng':'float'})

rnd_col = ['start_lat','start_lng','end_lat','end_lng']
bike_share[rnd_col] = bike_share[rnd_col].round(4)

In [18]:
#Joining latitude and longitude in a single column to provide a start/end location
bike_share['start_lat_long'] = [', '.join(str(x) for x in y) for y in map(tuple, bike_share[['start_lat', 'start_lng']].values)]
bike_share['end_lat_long'] = [', '.join(str(x) for x in y) for y in map(tuple, bike_share[['end_lat', 'end_lng']].values)]

In [19]:
#drop columns we no longer need - cleaner DF
bike_share.drop(columns= ['start_lat', 'start_lng','end_lat', 'end_lng'], inplace=True)

In [20]:
bike_share.head()

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,member_casual,ride_length_mins,day,month,start_lat_long,end_lat_long
0,6C992BD37A98A63F,classic_bike,2021-04-12 18:25:36,2021-04-12 18:56:55,State St & Pearson St,TA1307000061,Southport Ave & Waveland Ave,13235,member,31.316667,Mon,Apr,"41.8974, -87.6287","41.9482, -87.6639"
3,1887262AD101C604,classic_bike,2021-04-17 09:17:42,2021-04-17 09:42:48,Honore St & Division St,TA1305000034,Southport Ave & Waveland Ave,13235,member,25.1,Sat,Apr,"41.9031, -87.6739","41.9482, -87.6639"
5,097E76F3651B1AC1,classic_bike,2021-04-25 18:43:18,2021-04-25 18:43:59,Clinton St & Polk St,15542,Clinton St & Polk St,15542,casual,0.683333,Sun,Apr,"41.8715, -87.6409","41.8715, -87.6409"
6,53C38EB01E6FA5C4,classic_bike,2021-04-03 16:28:21,2021-04-03 16:29:47,Ashland Ave & 63rd St,16948,Ashland Ave & 63rd St,16948,casual,1.433333,Sat,Apr,"41.7794, -87.6648","41.7794, -87.6648"
7,D53AC014EFD6E2BA,electric_bike,2021-04-06 16:35:06,2021-04-06 17:00:56,Dorchester Ave & 49th St,KA1503000069,Dorchester Ave & 49th St,KA1503000069,casual,25.833333,Tue,Apr,"41.8058, -87.5925","41.8058, -87.5927"


In [21]:
# Geopy can calculate geodesic distance between two points. The geodesic distance is the shortest distance on the surface of an ellipsoidal model of the earth. 
# The default algorithm uses the method is given by Karney (2013) (geodesic); this is accurate to round-off and always converges.
# To allow for it to work for columns, we define distancer and use row to allow each ID's distance to be calculated and then placed into the journey distance column
# Apply does take some time to execute
# Uk units = miles


def distancer(row):
    coords_1 = (row['start_lat_long'])
    coords_2 = (row['end_lat_long'])
    return distance.distance(coords_1, coords_2).miles

bike_share['journey_distance_miles'] = bike_share.apply(distancer, axis=1)

In [22]:
len(pd.unique(bike_share['start_lat_long']))

28420

In [23]:
len(pd.unique(bike_share['end_lat_long']))

71428

In [24]:
bike_share.head()

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,member_casual,ride_length_mins,day,month,start_lat_long,end_lat_long,journey_distance_miles
0,6C992BD37A98A63F,classic_bike,2021-04-12 18:25:36,2021-04-12 18:56:55,State St & Pearson St,TA1307000061,Southport Ave & Waveland Ave,13235,member,31.316667,Mon,Apr,"41.8974, -87.6287","41.9482, -87.6639",3.94768
3,1887262AD101C604,classic_bike,2021-04-17 09:17:42,2021-04-17 09:42:48,Honore St & Division St,TA1305000034,Southport Ave & Waveland Ave,13235,member,25.1,Sat,Apr,"41.9031, -87.6739","41.9482, -87.6639",3.155043
5,097E76F3651B1AC1,classic_bike,2021-04-25 18:43:18,2021-04-25 18:43:59,Clinton St & Polk St,15542,Clinton St & Polk St,15542,casual,0.683333,Sun,Apr,"41.8715, -87.6409","41.8715, -87.6409",0.0
6,53C38EB01E6FA5C4,classic_bike,2021-04-03 16:28:21,2021-04-03 16:29:47,Ashland Ave & 63rd St,16948,Ashland Ave & 63rd St,16948,casual,1.433333,Sat,Apr,"41.7794, -87.6648","41.7794, -87.6648",0.0
7,D53AC014EFD6E2BA,electric_bike,2021-04-06 16:35:06,2021-04-06 17:00:56,Dorchester Ave & 49th St,KA1503000069,Dorchester Ave & 49th St,KA1503000069,casual,25.833333,Tue,Apr,"41.8058, -87.5925","41.8058, -87.5927",0.010327


In [25]:
#groupby member/casual
mc_grp = bike_share.groupby('member_casual')

In [26]:
#apply groupby on the ride length and get some descriptive statistics
mc_grp['ride_length_mins'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
member_casual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
casual,2239313.0,23.256307,37.86765,0.0,8.45,14.583333,26.166667,1559.683333
member,3175839.0,13.116439,20.254973,0.0,5.433333,9.366667,16.216667,1499.933333


In [27]:
# average ride length of casual/member = 1.77 longer
23.256307/13.116439

1.77306561636127

In [28]:
#apply groupby on the ride distance and get some descriptive statistics
mc_grp['journey_distance_miles'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
member_casual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
casual,2239313.0,1.424146,1.359453,0.0,0.620952,1.096272,1.885854,740.825358
member,3175839.0,1.307537,1.174553,0.0,0.538432,0.949691,1.706932,19.881308


In [29]:
#On average, casual riders ride 10% longer distance than members, the max value seems to be an outlier 
1.42/1.31

1.0839694656488548

The below outlier proves why doing distance calculations proved to be great, because whilst with the time the ride may have looked genuine, upon comparing distance and time together it is clearly an outlier (e.g. test carried out for maintennace etc)

In [30]:
#upon searching for the max distance value we can see its an outlier since the journey distance was [~ 3 minutes yet distance was 740 miles, ~55 minutes yet distance was 71 miles)
outliers = (bike_share['journey_distance_miles'] > 70)
bike_share.loc[outliers]

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,member_casual,ride_length_mins,day,month,start_lat_long,end_lat_long,journey_distance_miles
4914898,9F438AD0AB380E3F,electric_bike,2021-11-09 06:47:52,2021-11-09 07:42:56,Narragansett & McLean,309,,,casual,55.066667,Tue,Nov,"41.92, -87.79","41.39, -88.97",71.188914
5231872,3327172413547F64,electric_bike,2022-01-14 11:13:15,2022-01-14 11:15:50,Pawel Bialowas - Test- PBSC charging station,Pawel Bialowas - Test- PBSC charging station,Pawel Bialowas - Test- PBSC charging station,Pawel Bialowas - Test- PBSC charging station,casual,2.583333,Fri,Jan,"45.635, -73.7965","41.8646, -87.681",740.825358


In [31]:
#dropping outlier
bike_share = bike_share.drop(index=bike_share[bike_share['journey_distance_miles'] > 70].index)

In [32]:
mc_grp = bike_share.groupby('member_casual')

mc_grp['ride_length_mins'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
member_casual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
casual,2239311.0,23.256302,37.867658,0.0,8.45,14.583333,26.166667,1559.683333
member,3175839.0,13.116439,20.254973,0.0,5.433333,9.366667,16.216667,1499.933333


In [33]:
#groupby member casual and bike type 
mc_ride_grp = bike_share.groupby(['member_casual', 'rideable_type'])

#remove scientific number format 
pd.set_option('display.float_format', lambda x: '%.2f' % x)

#descriptive stats of of rides by member casual and bike type 
mc_ride_grp[['ride_length_mins', 'journey_distance_miles']].agg([len,np.sum,np.mean,np.median,np.min,np.max])

Unnamed: 0_level_0,Unnamed: 1_level_0,ride_length_mins,ride_length_mins,ride_length_mins,ride_length_mins,ride_length_mins,ride_length_mins,journey_distance_miles,journey_distance_miles,journey_distance_miles,journey_distance_miles,journey_distance_miles,journey_distance_miles
Unnamed: 0_level_1,Unnamed: 1_level_1,len,sum,mean,median,amin,amax,len,sum,mean,median,amin,amax
member_casual,rideable_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
casual,classic_bike,1254430,32794848.07,26.14,15.95,0.0,1559.68,1254430,1661040.51,1.32,1.05,0.0,18.46
casual,electric_bike,984881,19283245.13,19.58,13.0,0.0,487.27,984881,1527256.33,1.55,1.2,0.0,19.99
member,classic_bike,1991926,26987183.75,13.55,9.78,0.0,1499.93,1991926,2450457.1,1.23,0.92,0.0,17.84
member,electric_bike,1183913,14668515.77,12.39,8.67,0.0,480.52,1183913,1702069.8,1.44,1.03,0.0,19.88


In [34]:
#descriptive stats of of rides by day
day_grp = bike_share.groupby(['member_casual','day'])

day_grp[['ride_length_mins', 'journey_distance_miles']].agg([len,np.sum,np.mean,np.median,np.min,np.max])

Unnamed: 0_level_0,Unnamed: 1_level_0,ride_length_mins,ride_length_mins,ride_length_mins,ride_length_mins,ride_length_mins,ride_length_mins,journey_distance_miles,journey_distance_miles,journey_distance_miles,journey_distance_miles,journey_distance_miles,journey_distance_miles
Unnamed: 0_level_1,Unnamed: 1_level_1,len,sum,mean,median,amin,amax,len,sum,mean,median,amin,amax
member_casual,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
casual,Mon,256837,6014218.37,23.42,14.43,0.0,1499.92,256837,351451.71,1.37,1.03,0.0,18.17
casual,Tue,247548,5150725.1,20.81,13.02,0.0,1499.93,247548,340578.35,1.38,1.05,0.0,18.02
casual,Wed,258644,5310680.28,20.53,13.02,0.0,1499.92,258644,359340.25,1.39,1.07,0.0,17.8
casual,Thu,265544,5401504.63,20.34,12.87,0.0,1499.93,265544,368951.47,1.39,1.07,0.0,18.52
casual,Fri,322539,7034140.77,21.81,13.83,0.0,1499.93,322539,454790.74,1.41,1.08,0.0,19.99
casual,Sat,474766,12060403.45,25.4,16.3,0.0,1559.68,474766,703124.05,1.48,1.17,0.0,18.62
casual,Sun,413433,11106420.6,26.86,16.98,0.0,1499.95,413433,610060.25,1.48,1.15,0.0,19.26
member,Mon,439294,5584428.63,12.71,8.97,0.0,1499.9,439294,561474.62,1.28,0.92,0.0,18.3
member,Tue,489950,6032641.58,12.31,8.87,0.0,1499.9,489950,628206.84,1.28,0.92,0.0,14.69
member,Wed,499777,6213972.33,12.43,9.02,0.0,1499.92,499777,647710.46,1.3,0.94,0.0,15.37


In [35]:
#descriptive stats of of rides by month
month_grp = bike_share.groupby(['member_casual','month'])

month_grp[['ride_length_mins', 'journey_distance_miles']].agg([len,np.sum,np.mean,np.median,np.min,np.max])

Unnamed: 0_level_0,Unnamed: 1_level_0,ride_length_mins,ride_length_mins,ride_length_mins,ride_length_mins,ride_length_mins,ride_length_mins,journey_distance_miles,journey_distance_miles,journey_distance_miles,journey_distance_miles,journey_distance_miles,journey_distance_miles
Unnamed: 0_level_1,Unnamed: 1_level_1,len,sum,mean,median,amin,amax,len,sum,mean,median,amin,amax
member_casual,month,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
casual,Jan,17519,289745.47,16.54,9.75,0.0,1499.95,17519,20329.09,1.16,0.88,0.0,11.49
casual,Feb,20017,359327.55,17.95,10.47,0.0,1499.87,20017,24553.42,1.23,0.93,0.0,15.13
casual,Mar,81420,1788884.32,21.97,13.32,0.0,1559.68,81420,111563.78,1.37,1.03,0.0,18.46
casual,Apr,111700,2929118.45,26.22,15.78,0.0,1499.92,111700,151063.54,1.35,1.04,0.0,18.32
casual,May,213224,5726708.05,26.86,16.83,0.0,1499.88,213224,296915.5,1.39,1.07,0.0,18.87
casual,Jun,318437,8059607.37,25.31,15.97,0.0,1499.93,318437,457120.42,1.44,1.13,0.0,18.62
casual,Jul,383767,9269699.35,24.15,15.43,0.0,1499.92,383767,557898.18,1.45,1.14,0.0,19.99
casual,Aug,367036,8629220.23,23.51,15.13,0.0,1499.92,367036,539523.29,1.47,1.14,0.0,17.8
casual,Sep,328123,7458773.67,22.73,14.5,0.0,1499.93,328123,485932.47,1.48,1.15,0.0,19.26
casual,Oct,234137,4802451.98,20.51,12.93,0.0,1499.92,234137,332605.08,1.42,1.08,0.0,17.95


In [36]:
#less casual rides, but the ride lengths are longer, nearly 2x avg time
mc_grp['ride_length_mins'].agg([len,np.sum,np.mean,np.median,np.min,np.max])

Unnamed: 0_level_0,len,sum,mean,median,amin,amax
member_casual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
casual,2239311,52078093.2,23.26,14.58,0.0,1559.68
member,3175839,41655699.52,13.12,9.37,0.0,1499.93


In [57]:
#top 10 most popular start stations for casual riders
start_grp = bike_share.groupby(['member_casual','start_station_name'])

start_grp['ride_id'].count().sort_values(ascending=False).loc['casual'][:5]

start_station_name
Streeter Dr & Grand Ave    49115
Millennium Park            23887
Michigan Ave & Oak St      23042
Wells St & Concord Ln      18440
Theater on the Lake        16586
Name: ride_id, dtype: int64

In [56]:
#top 10 most popular start stations for member riders
start_grp['ride_id'].count().sort_values(ascending=False).loc['member'][:5]

start_station_name
Kingsbury St & Kinzie St    25147
Clark St & Elm St           24880
Wells St & Concord Ln       24100
Wells St & Elm St           21268
Dearborn St & Erie St       19415
Name: ride_id, dtype: int64

In [55]:
end_grp = bike_share.groupby(['member_casual','end_station_name'])

end_grp['ride_id'].count().sort_values(ascending=False).loc['casual'][:5]

end_station_name
Streeter Dr & Grand Ave    49440
Millennium Park            24444
Michigan Ave & Oak St      24024
Wells St & Concord Ln      18056
Theater on the Lake        17638
Name: ride_id, dtype: int64

In [60]:
bike_share

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,member_casual,ride_length_mins,day,month,start_lat_long,end_lat_long,journey_distance_miles
0,6C992BD37A98A63F,classic_bike,2021-04-12 18:25:36,2021-04-12 18:56:55,State St & Pearson St,TA1307000061,Southport Ave & Waveland Ave,13235,member,31.32,Mon,Apr,"41.8974, -87.6287","41.9482, -87.6639",3.95
3,1887262AD101C604,classic_bike,2021-04-17 09:17:42,2021-04-17 09:42:48,Honore St & Division St,TA1305000034,Southport Ave & Waveland Ave,13235,member,25.10,Sat,Apr,"41.9031, -87.6739","41.9482, -87.6639",3.16
5,097E76F3651B1AC1,classic_bike,2021-04-25 18:43:18,2021-04-25 18:43:59,Clinton St & Polk St,15542,Clinton St & Polk St,15542,casual,0.68,Sun,Apr,"41.8715, -87.6409","41.8715, -87.6409",0.00
6,53C38EB01E6FA5C4,classic_bike,2021-04-03 16:28:21,2021-04-03 16:29:47,Ashland Ave & 63rd St,16948,Ashland Ave & 63rd St,16948,casual,1.43,Sat,Apr,"41.7794, -87.6648","41.7794, -87.6648",0.00
7,D53AC014EFD6E2BA,electric_bike,2021-04-06 16:35:06,2021-04-06 17:00:56,Dorchester Ave & 49th St,KA1503000069,Dorchester Ave & 49th St,KA1503000069,casual,25.83,Tue,Apr,"41.8058, -87.5925","41.8058, -87.5927",0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5723525,1CEE41690C73108B,classic_bike,2022-03-21 15:12:17,2022-03-21 15:23:01,Michigan Ave & Oak St,13042,Streeter Dr & Grand Ave,13022,member,10.73,Mon,Mar,"41.901, -87.6238","41.8923, -87.612",0.85
5723526,A2A6F07D3DF4D0D6,electric_bike,2022-03-09 20:29:48,2022-03-09 21:01:30,Sheridan Rd & Irving Park Rd,13063,Streeter Dr & Grand Ave,13022,casual,31.70,Wed,Mar,"41.9543, -87.6544","41.8923, -87.612",4.80
5723529,9C4CE6CC19F8225B,electric_bike,2022-03-09 15:55:26,2022-03-09 16:08:54,,,Albany Ave & Montrose Ave,15621,member,13.47,Wed,Mar,"41.94, -87.71","41.961, -87.7059",1.46
5723530,F4E136DEF696F3AE,electric_bike,2022-03-21 16:12:44,2022-03-21 16:18:24,,,Larrabee St & Division St,KA1504000079,member,5.67,Mon,Mar,"41.91, -87.65","41.9035, -87.6434",0.56


In [61]:
bike_share.to_csv('C:/Users/zaydm/Documents/Repos/Google_Data_Analytics_Capstone/bike_share_large.csv')