# General Divvy Bike Share Analysis
This notebook is used for processing and cleaning Divvy Bike Sharing data for the September 2022 data set.

In [3]:
import pandas as pd
import plotly.graph_objects as go
import os

In [4]:
## Create file location
ROOT = os.getcwd()
SAVE_FILES = os.path.join(ROOT, "DATA")
SAVE_FILES

'C:\\Users\\Nicholas\\Desktop\\Masters - Classes\\MSDS436\\Final\\MSDS436-FINAL\\DATA'

In [5]:
df = pd.read_csv(os.path.join(SAVE_FILES, "202209_divvy_distance.csv"))
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,started_at_clean,duration (s),distance (m)
0,2FD3F90EDCE2ACD9,classic_bike,2022-09-01 19:39:15,2022-09-01 19:46:45,Southport Ave & Roscoe St,13071,Broadway & Cornelia Ave,13278,41.943739,-87.66402,41.945529,-87.646439,member,2022-09-01,481.9,1669.2
1,EE62794A94F80A83,classic_bike,2022-09-01 06:53:41,2022-09-01 07:02:54,LaSalle St & Washington St,13006,Wells St & Polk St,SL-011,41.882664,-87.63253,41.872732,-87.633516,casual,2022-09-01,395.0,1358.8
2,56FD4B364747F270,electric_bike,2022-09-01 11:25:21,2022-09-01 11:28:33,N Sheffield Ave & W Wellington Ave,20256.0,Southport Ave & Wellington Ave,TA1307000006,41.94,-87.65,41.935733,-87.663576,casual,2022-09-01,411.5,1595.7
3,BD4D6AC842CDF729,classic_bike,2022-09-01 07:46:03,2022-09-01 08:05:36,Racine Ave & Wrightwood Ave,TA1309000059,DuSable Lake Shore Dr & North Blvd,LF-005,41.928887,-87.658971,41.911722,-87.626804,casual,2022-09-01,948.8,3643.8
4,2E0E8C378865C01A,electric_bike,2022-09-01 09:55:31,2022-09-01 10:12:27,Wabash Ave & Adams St,KA1503000015,Wood St & Taylor St (Temp),13285,41.879373,-87.625492,41.869265,-87.673731,member,2022-09-01,1275.1,5104.7


## Type conversion

In [6]:
df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
started_at_clean       object
duration (s)          float64
distance (m)          float64
dtype: object

### Trip duration

In [7]:
df['started_at'] = pd.to_datetime(df['started_at']) # Ride start date and time
df['ended_at'] = pd.to_datetime(df['ended_at']) # Ride end data and time
df['ride_duration'] = (df['ended_at'] - df['started_at']).astype('timedelta64[s]') / 60 # Ride duration in minutes

### Time

In [8]:
df['day_of_week'] = df['started_at'].dt.day_name() # Name of day 
df['day_no'] = df['started_at'].dt.dayofweek # Day number, will be used for sorting in visualizations

df['month_of_year'] = df['started_at'].dt.month_name() # Name of month
df['month_no'] = df['started_at'].dt.month # Month number, will be used for sorting in visualizations

df['hour'] = df['started_at'].dt.hour # Hour bike ride starts at

### Round lat & long

In [9]:
df = df.round({'start_lat': 3, 'start_lng': 3, 'end_lat': 3, 'end_lng': 3})
display(len(df))
display(df.head())

150000

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,member_casual,started_at_clean,duration (s),distance (m),ride_duration,day_of_week,day_no,month_of_year,month_no,hour
0,2FD3F90EDCE2ACD9,classic_bike,2022-09-01 19:39:15,2022-09-01 19:46:45,Southport Ave & Roscoe St,13071,Broadway & Cornelia Ave,13278,41.944,-87.664,...,member,2022-09-01,481.9,1669.2,7.5,Thursday,3,September,9,19
1,EE62794A94F80A83,classic_bike,2022-09-01 06:53:41,2022-09-01 07:02:54,LaSalle St & Washington St,13006,Wells St & Polk St,SL-011,41.883,-87.633,...,casual,2022-09-01,395.0,1358.8,9.216667,Thursday,3,September,9,6
2,56FD4B364747F270,electric_bike,2022-09-01 11:25:21,2022-09-01 11:28:33,N Sheffield Ave & W Wellington Ave,20256.0,Southport Ave & Wellington Ave,TA1307000006,41.94,-87.65,...,casual,2022-09-01,411.5,1595.7,3.2,Thursday,3,September,9,11
3,BD4D6AC842CDF729,classic_bike,2022-09-01 07:46:03,2022-09-01 08:05:36,Racine Ave & Wrightwood Ave,TA1309000059,DuSable Lake Shore Dr & North Blvd,LF-005,41.929,-87.659,...,casual,2022-09-01,948.8,3643.8,19.55,Thursday,3,September,9,7
4,2E0E8C378865C01A,electric_bike,2022-09-01 09:55:31,2022-09-01 10:12:27,Wabash Ave & Adams St,KA1503000015,Wood St & Taylor St (Temp),13285,41.879,-87.625,...,member,2022-09-01,1275.1,5104.7,16.933333,Thursday,3,September,9,9


### General Data Cleaning
- Ride duation should be a positive number.
- Get rid of data with missing latitude and/or longitude information.

In [10]:
df = df[df['ride_duration'] > 0]
df = df[df['end_lat'].notna()]
df = df[df['end_lng'].notna()]

display(len(df))
display(df.head())

149995

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,member_casual,started_at_clean,duration (s),distance (m),ride_duration,day_of_week,day_no,month_of_year,month_no,hour
0,2FD3F90EDCE2ACD9,classic_bike,2022-09-01 19:39:15,2022-09-01 19:46:45,Southport Ave & Roscoe St,13071,Broadway & Cornelia Ave,13278,41.944,-87.664,...,member,2022-09-01,481.9,1669.2,7.5,Thursday,3,September,9,19
1,EE62794A94F80A83,classic_bike,2022-09-01 06:53:41,2022-09-01 07:02:54,LaSalle St & Washington St,13006,Wells St & Polk St,SL-011,41.883,-87.633,...,casual,2022-09-01,395.0,1358.8,9.216667,Thursday,3,September,9,6
2,56FD4B364747F270,electric_bike,2022-09-01 11:25:21,2022-09-01 11:28:33,N Sheffield Ave & W Wellington Ave,20256.0,Southport Ave & Wellington Ave,TA1307000006,41.94,-87.65,...,casual,2022-09-01,411.5,1595.7,3.2,Thursday,3,September,9,11
3,BD4D6AC842CDF729,classic_bike,2022-09-01 07:46:03,2022-09-01 08:05:36,Racine Ave & Wrightwood Ave,TA1309000059,DuSable Lake Shore Dr & North Blvd,LF-005,41.929,-87.659,...,casual,2022-09-01,948.8,3643.8,19.55,Thursday,3,September,9,7
4,2E0E8C378865C01A,electric_bike,2022-09-01 09:55:31,2022-09-01 10:12:27,Wabash Ave & Adams St,KA1503000015,Wood St & Taylor St (Temp),13285,41.879,-87.625,...,member,2022-09-01,1275.1,5104.7,16.933333,Thursday,3,September,9,9


### Drop outliers
Drop any rides shorter than ~0.5 minutes or longer than 540 minutes.

In [11]:
df = df[(df.ride_duration <= df.ride_duration.quantile(.999)) & (df.ride_duration >= df.ride_duration.quantile(.01))]

display(len(df))
display(df.head())

148360

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,member_casual,started_at_clean,duration (s),distance (m),ride_duration,day_of_week,day_no,month_of_year,month_no,hour
0,2FD3F90EDCE2ACD9,classic_bike,2022-09-01 19:39:15,2022-09-01 19:46:45,Southport Ave & Roscoe St,13071,Broadway & Cornelia Ave,13278,41.944,-87.664,...,member,2022-09-01,481.9,1669.2,7.5,Thursday,3,September,9,19
1,EE62794A94F80A83,classic_bike,2022-09-01 06:53:41,2022-09-01 07:02:54,LaSalle St & Washington St,13006,Wells St & Polk St,SL-011,41.883,-87.633,...,casual,2022-09-01,395.0,1358.8,9.216667,Thursday,3,September,9,6
2,56FD4B364747F270,electric_bike,2022-09-01 11:25:21,2022-09-01 11:28:33,N Sheffield Ave & W Wellington Ave,20256.0,Southport Ave & Wellington Ave,TA1307000006,41.94,-87.65,...,casual,2022-09-01,411.5,1595.7,3.2,Thursday,3,September,9,11
3,BD4D6AC842CDF729,classic_bike,2022-09-01 07:46:03,2022-09-01 08:05:36,Racine Ave & Wrightwood Ave,TA1309000059,DuSable Lake Shore Dr & North Blvd,LF-005,41.929,-87.659,...,casual,2022-09-01,948.8,3643.8,19.55,Thursday,3,September,9,7
4,2E0E8C378865C01A,electric_bike,2022-09-01 09:55:31,2022-09-01 10:12:27,Wabash Ave & Adams St,KA1503000015,Wood St & Taylor St (Temp),13285,41.879,-87.625,...,member,2022-09-01,1275.1,5104.7,16.933333,Thursday,3,September,9,9


In [12]:
# Drop columns that are not needed
df.drop(['ride_id', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id' ], axis=1, inplace=True)

In [13]:
# By Day
rd_by_day = df.groupby(['day_no', 'day_of_week', 'member_casual']).agg({'ride_duration' : ['min','max', 'mean', 'median']})
rd_by_day.columns = rd_by_day.columns.to_flat_index()
# rd_by_day.head()


#By Month
rd_by_month = df.groupby(['month_no', 'month_of_year', 'member_casual']).agg({'ride_duration' : ['min','max', 'mean', 'median']})
rd_by_month.columns = rd_by_month.columns.to_flat_index()
rd_by_month.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,"(ride_duration, min)","(ride_duration, max)","(ride_duration, mean)","(ride_duration, median)"
month_no,month_of_year,member_casual,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9,September,casual,0.45,325.75,20.603445,12.75
9,September,member,0.45,324.216667,12.298168,9.183333


In [14]:
num_rides = df[['month_no', 'month_of_year','day_no', 'day_of_week','hour','member_casual']].drop_duplicates().reset_index()
num_rides.rename(columns={num_rides.columns[0]: "cnt"}, inplace=True)

num_rides

Unnamed: 0,cnt,month_no,month_of_year,day_no,day_of_week,hour,member_casual
0,0,9,September,3,Thursday,19,member
1,1,9,September,3,Thursday,6,casual
2,2,9,September,3,Thursday,11,casual
3,3,9,September,3,Thursday,7,casual
4,4,9,September,3,Thursday,9,member
...,...,...,...,...,...,...,...
331,30320,9,September,2,Wednesday,23,member
332,30475,9,September,2,Wednesday,1,casual
333,30620,9,September,2,Wednesday,3,casual
334,30762,9,September,2,Wednesday,2,casual


In [15]:
df_paths = df[['start_lat','start_lng','end_lat', 'end_lng', 'member_casual']].drop_duplicates().reset_index()
df_paths.reset_index(inplace=True)
df_paths.rename(columns={df_paths.columns[0]: "cnt"}, inplace=True)

df_paths = df_paths[df_paths['cnt']>100] # Filter out the most popular paths

# Generate two data frames for popular paths: Casual riders and members
df_paths_c = df_paths[df_paths['member_casual'] == 'casual'].reset_index()
df_paths_m = df_paths[df_paths['member_casual'] == 'member'].reset_index()

In [24]:
df_paths.to_csv('webpage/DATA/202209_divvy_paths.csv', index=False)
df_paths_c[:10000].to_csv('webpage/DATA/202209_divvy_cas.csv', index=False)
df_paths_m[:10000].to_csv('webpage/DATA/202209_divvy_mem.csv', index=False)

In [21]:
fig = go.Figure()

#Plot casual riders' popular paths in red.
for i in range(len(df_paths_c[:1000])):
    fig.add_trace(
        go.Scattergeo(
            locationmode = 'USA-states',
            lon = [df_paths_c['start_lng'][i], df_paths_c['end_lng'][i]],
            lat = [df_paths_c['start_lat'][i], df_paths_c['end_lat'][i]],
            mode = 'lines',
            line = dict(width = 1,color = 'red'),
            opacity = float(df_paths_c['cnt'][i]) / float(df_paths['cnt'].max()),
        )
    )

#Plot members' popular paths in blue.
for i in range(len(df_paths_m[:1000])):
    fig.add_trace(
        go.Scattergeo(
            locationmode = 'USA-states',
            lon = [df_paths_m['start_lng'][i], df_paths_m['end_lng'][i]],
            lat = [df_paths_m['start_lat'][i], df_paths_m['end_lat'][i]],
            mode = 'lines',
            line = dict(width = 1,color = 'blue'),
            opacity = float(df_paths_m['cnt'][i]) / float(df_paths['cnt'].max()),
        )
    )   

fig.update_layout(
    title_text = 'Divvy Bike Sharing Popular Paths',
    showlegend = False,
    geo = dict(
        scope = 'north america',
        projection_type = 'azimuthal equal area',
        showland = True,
        showsubunits = True,
        landcolor = 'rgb(243, 243, 243)',
        countrycolor = 'rgb(204, 204, 204)',
        fitbounds = 'locations',
    ),
)

fig.show()

In [None]:
df_paths_m['start_lng']