# Analysis on Lyft's Baywheels (Aug 2019 - Oct 2019)
##### Data found at https://www.lyft.com/bikes/bay-wheels/system-data

## Pricing information
##### Found at https://www.lyft.com/bikes/bay-wheels/pricing

#### Customer Pricing
* $2 for a single ride of up to 30 minutes
* After 30 minutes: an extra $3 per additional 15 minutes

#### Subscription Pricing
* unlimited 45-minute trips on any of our bikes
* If you keep a bike out longer than 45 minutes at a time, it’s an extra $3 per additional 15 minutes

#### Bike Share For All
* Membership includes first 60 minutes of each trip
* Rides longer than 60 minutes will result in additional fees of $3 for each additional 15 minutes or potential account suspension

## High Level

In [1]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Read the three monthly trip exports (Aug, Sep, Oct 2019), one frame per month.
august_df, september_df, october_df = map(pd.read_csv, [
    'data/201908-baywheels-tripdata.csv',
    'data/201909-baywheels-tripdata.csv',
    'data/201910-baywheels-tripdata.csv',
])

# Stack the months into a single trips frame. Each month keeps its own row
# labels here, so the combined index contains repeated values.
frames = [august_df, september_df, october_df]
df = pd.concat(frames)

In [2]:
# Summarise the combined frame: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668444 entries, 0 to 239894
Data columns (total 16 columns):
duration_sec               668444 non-null int64
start_time                 668444 non-null object
end_time                   668444 non-null object
start_station_id           668444 non-null int64
start_station_name         668444 non-null object
start_station_latitude     668444 non-null float64
start_station_longitude    668444 non-null float64
end_station_id             668444 non-null int64
end_station_name           668444 non-null object
end_station_latitude       668444 non-null float64
end_station_longitude      668444 non-null float64
bike_id                    668444 non-null int64
user_type                  668444 non-null object
member_birth_year          531451 non-null float64
member_gender              531455 non-null object
bike_share_for_all_trip    668444 non-null object
dtypes: float64(5), int64(4), object(7)
memory usage: 86.7+ MB


In [3]:
# Preview the first rows of the combined trips frame.
df.head()

Unnamed: 0,duration_sec,start_time,end_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,end_station_longitude,bike_id,user_type,member_birth_year,member_gender,bike_share_for_all_trip
0,68145,2019-08-31 21:27:42.2530,2019-09-01 16:23:27.4920,249,Russell St at College Ave,37.858473,-122.253253,247,Fulton St at Bancroft Way,37.867789,-122.265896,3112,Customer,,,No
1,53216,2019-08-31 22:34:17.5120,2019-09-01 13:21:13.9310,368,Myrtle St at Polk St,37.785434,-122.419622,78,Folsom St at 9th St,37.773717,-122.411647,2440,Customer,1993.0,Female,No
2,53182,2019-08-31 22:34:49.9420,2019-09-01 13:21:12.4570,368,Myrtle St at Polk St,37.785434,-122.419622,78,Folsom St at 9th St,37.773717,-122.411647,9743,Customer,1995.0,Male,No
3,75682,2019-08-31 14:22:02.2780,2019-09-01 11:23:24.5110,104,4th St at 16th St,37.767045,-122.390833,126,Esprit Park,37.761634,-122.390648,11418,Subscriber,1993.0,Female,No
4,30849,2019-08-31 18:47:08.0570,2019-09-01 03:21:17.6970,327,5th St at San Salvador St,37.332039,-121.881766,327,5th St at San Salvador St,37.332039,-121.881766,1553,Customer,,,No


## Preprocessing 

In [4]:
# Resetting index after concatenation of the monthly dataframes.
# Each month's CSV was read with its own 0..n-1 row labels, so the concatenated
# frame has repeated index values. Replace them with a fresh 0..N-1 range and
# discard the old labels; all columns (including start_time) stay in place.
df = df.reset_index(drop=True)

## Feature Engineering

In [5]:
# Ride time in minutes (duration_sec is recorded in seconds).
df['duration_min'] = df['duration_sec'].div(60)

# Rider age derived from birth year, relative to the year 2020.
# NOTE(review): the trips are from Aug-Oct 2019, so ages are as of 2020 rather
# than as of the ride; rows without a birth year get a missing age.
df['age'] = df['member_birth_year'].rsub(2020)

In [6]:
# Ride pricing (see the "Pricing information" section at the top of the notebook):
# every plan covers a number of minutes per trip, and each started 15-minute
# block beyond that is charged an overage fee.
OVERAGE_BLOCK_MIN = 15   # length of one overage block, in minutes
OVERAGE_BLOCK_FEE = 3    # dollars charged per started overage block


def trip_price(duration_min, included_min, base_fare=0):
    """Return the price of each trip under a single pricing plan.

    Parameters
    ----------
    duration_min : pandas.Series
        Trip durations in minutes.
    included_min : float
        Minutes covered by the plan before overage fees start.
    base_fare : float, default 0
        Flat fee charged for every trip on this plan.

    Returns
    -------
    pandas.Series
        ``base_fare`` plus ``OVERAGE_BLOCK_FEE`` for every started
        ``OVERAGE_BLOCK_MIN``-minute block past ``included_min``.
    """
    # Trips within the included time have zero overage minutes.
    overage_min = (duration_min - included_min).clip(lower=0)
    return base_fare + np.ceil(overage_min / OVERAGE_BLOCK_MIN) * OVERAGE_BLOCK_FEE


is_customer = df['user_type'] == 'Customer'
is_subscriber = df['user_type'] == 'Subscriber'
is_bike_share = df['bike_share_for_all_trip'] == 'Yes'
is_regular = df['bike_share_for_all_trip'] == 'No'

# adding all ride prices to the dataframe
# Customers: $2 per ride, 30 minutes included.
# Subscribers: 45 minutes included.
# Bike Share For All subscribers: 60 minutes included.
# Rows that match none of the plans are left as NaN.
df['ride_prices'] = np.select(
    [is_customer, is_subscriber & is_regular, is_subscriber & is_bike_share],
    [
        trip_price(df['duration_min'], included_min=30, base_fare=2),
        trip_price(df['duration_min'], included_min=45),
        trip_price(df['duration_min'], included_min=60),
    ],
    default=np.nan,
)

In [7]:
# Building Start Station Dataframe: one row per start station, indexed by
# start_station_id and sorted by it. The first trip seen for each station
# supplies its name and coordinates.
station_df = (
    df[['start_station_id', 'start_station_name', 'start_station_latitude',
        'start_station_longitude']]
    .drop_duplicates(subset=['start_station_id'])
    .sort_values(by='start_station_id')
    .set_index('start_station_id')
)

In [8]:
# Per-station aggregates. Every result is a Series keyed by start_station_id,
# so it lines up with station_df's index when assigned as a column.
trips_by_station = df.groupby('start_station_id')

station_df['total_revenue'] = trips_by_station['ride_prices'].sum()
station_df['total_users'] = trips_by_station['user_type'].count()
station_df['mean_duration'] = trips_by_station['duration_min'].mean()

# Number of trips per station whose source column equals a given label.
label_counts = [
    ('total_customers', 'user_type', 'Customer'),
    ('total_subscribers', 'user_type', 'Subscriber'),
    ('total_males', 'member_gender', 'Male'),
    ('total_females', 'member_gender', 'Female'),
]
for target_col, source_col, label in label_counts:
    station_df[target_col] = trips_by_station[source_col].apply(
        lambda values, label=label: (values == label).sum()
    )

# Age split: "under_35" includes riders aged exactly 35; trips with a missing
# age fall into neither bucket.
station_df['under_35'] = trips_by_station['age'].apply(lambda ages: (ages <= 35).sum())
station_df['over_35'] = trips_by_station['age'].apply(lambda ages: (ages > 35).sum())

In [9]:
# Preview the first stations with their aggregated metrics.
station_df.head()

Unnamed: 0_level_0,start_station_name,start_station_latitude,start_station_longitude,total_revenue,total_users,mean_duration,total_customers,total_subscribers,total_males,total_females,under_35,over_35
start_station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3,Powell St BART Station (Market St at 4th St),37.786375,-122.404904,8496.0,8575,14.527431,1959,6616,5457,1293,4269,2551
4,Cyril Magnin St at Ellis St,37.785881,-122.408915,3268.0,1571,20.150127,623,948,744,213,592,385
5,Powell St BART Station (Market St at 5th St),37.783899,-122.408445,8732.0,7534,14.120578,1963,5571,4494,1103,3633,2091
6,The Embarcadero at Sansome St,37.80477,-122.403234,11368.0,7902,17.559498,2642,5260,4242,1286,2966,2670
7,Frank H Ogawa Plaza,37.804562,-122.271738,1866.0,2479,12.551049,378,2101,1416,696,1105,1060
