# Loading Blue Bike Boston Dataset


The dataset is broken down as:


Bluebikes Comprehensive Trip Histories:

    Trip Duration (seconds)
    Start Time and Date
    Stop Time and Date
    Start Station Name & ID
    End Station Name & ID
    Bike ID
    User Type (Casual = Single Trip or Day Pass user; Member = Annual or Monthly Member)
    Birth Year
    Gender, self-reported by member (0=unknown; 1=male; 2=female)


Bluebikes Stations:

    Number
    Name
    Latitude/Longitude
    Municipality
    Total docks
    Deployment Year



In [1]:
import pandas as pd
import geopandas as gpd
import requests
from zipfile import ZipFile
import numpy as np
import os

## Loading Station Data

In [48]:

bike_station_url = 'https://s3.amazonaws.com/hubway-data/current_bluebikes_stations.csv'

# header=[0, 1] makes pandas treat the first two CSV rows as a two-level
# column header.
station_df = pd.read_csv(bike_station_url, header=[0, 1])

# The top-level label of the second column is what gets reported as the
# "last updated" date of the station list.
last_updated_station_date = station_df.columns.get_level_values(0)[1]
print('Station Last Updated: ', last_updated_station_date)

# Drop the top header level so only the real column names remain.
station_df.columns = station_df.columns.droplevel(0)

station_df.head()

Station Last Updated:  08/05/2022


Unnamed: 0,Number,Name,Latitude,Longitude,District,Public,Total docks,Deployment Year
0,K32015,1200 Beacon St,42.344149,-71.114674,Brookline,Yes,15,2021.0
1,W32006,160 Arsenal,42.364664,-71.175694,Watertown,Yes,11,2021.0
2,A32019,175 N Harvard St,42.363796,-71.129164,Boston,Yes,18,2014.0
3,S32035,191 Beacon St,42.380323,-71.108786,Somerville,Yes,19,2018.0
4,C32094,2 Hummingbird Lane at Olmsted Green,42.28887,-71.095003,Boston,Yes,17,2020.0


## Loading Comprehensive History Data

In [3]:
from requests_html import AsyncHTMLSession

u = "https://s3.amazonaws.com/hubway-data/index.html"


session = AsyncHTMLSession()


resp = await session.get(u)
await resp.html.arender(timeout=6000, sleep=3) 
html = resp.html.raw_html

df_url_list = pd.read_html(html)[0]

In [4]:
# Parse the modification timestamps, then order the listing newest-first and
# renumber the rows from 0. Note the order follows *modification* date, which
# is not necessarily the order of the months the files cover.
df_url_list = (
    df_url_list
    .assign(**{'Date Modified': pd.to_datetime(df_url_list['Date Modified'])})
    .sort_values(by='Date Modified', ascending=False)
    .reset_index(drop=True)
)
df_url_list.head(26)

Unnamed: 0,Name,Date Modified,Size,Type
0,current_bluebikes_stations.csv,2022-08-05 18:41:44,34 KB,CSV file
1,202207-bluebikes-tripdata.zip,2022-08-05 18:41:43,10.80 MB,ZIP file
2,202206-bluebikes-tripdata.zip,2022-07-15 11:19:44,16.67 MB,ZIP file
3,202205-bluebikes-tripdata.zip,2022-06-03 22:21:03,14.86 MB,ZIP file
4,202204-bluebikes-tripdata.zip,2022-05-03 12:40:48,11.33 MB,ZIP file
5,202203-bluebikes-tripdata.zip,2022-04-06 13:20:48,7.48 MB,ZIP file
6,202202-bluebikes-tripdata.zip,2022-03-02 16:27:48,4.42 MB,ZIP file
7,202201-bluebikes-tripdata.zip,2022-02-02 12:02:56,3.28 MB,ZIP file
8,202110-bluebikes-tripdata.zip,2022-01-13 11:29:14,16.41 MB,ZIP file
9,202112-bluebikes-tripdata.zip,2022-01-06 11:45:51,5.79 MB,ZIP file


In [5]:
# Selecting the n most recent files for download and estimating how much data
# that is. Sizes in the listing are text such as "34 KB" or "10.80 MB", so the
# unit has to be taken into account before summing.
n_recent_files = 12

# Conversion factors from each listed unit to megabytes.
# NOTE(review): assumes binary (1024-based) units — confirm against the listing.
_SIZE_UNIT_TO_MB = {
    'B': 1 / 1024 ** 2,
    'BYTES': 1 / 1024 ** 2,
    'KB': 1 / 1024,
    'MB': 1.0,
    'GB': 1024.0,
    'TB': 1024.0 ** 2,
}


def _size_to_mb(size_text):
    """Convert a '<number> <unit>' size string to a float number of megabytes.

    Raises:
        ValueError: if the unit is missing or not one of the known units, so
            an unexpected format is noticed instead of silently miscounted.
    """
    number, _, unit = str(size_text).strip().partition(' ')
    unit = unit.strip().upper()
    if unit not in _SIZE_UNIT_TO_MB:
        raise ValueError('Unrecognised size unit in %r' % (size_text,))
    return float(number) * _SIZE_UNIT_TO_MB[unit]


total_download_size = (
    df_url_list.loc[df_url_list.Type == 'ZIP file', 'Size']
    .head(n_recent_files)
    .map(_size_to_mb)
    .sum()
)
print('Total Download size of: ' + str(total_download_size) + 'MB' )

Total Download size of: 134.13MB


In [6]:
# Names of the first `n_recent_files` ZIP archives in the (already sorted)
# listing; the original row labels are kept as the index.
is_zip_archive = df_url_list.Type == 'ZIP file'
zip_files_to_download = df_url_list.loc[is_zip_archive, 'Name'].head(n_recent_files)
zip_files_to_download

1     202207-bluebikes-tripdata.zip
2     202206-bluebikes-tripdata.zip
3     202205-bluebikes-tripdata.zip
4     202204-bluebikes-tripdata.zip
5     202203-bluebikes-tripdata.zip
6     202202-bluebikes-tripdata.zip
7     202201-bluebikes-tripdata.zip
8     202110-bluebikes-tripdata.zip
9     202112-bluebikes-tripdata.zip
10    202111-bluebikes-tripdata.zip
11    202109-bluebikes-tripdata.zip
12    202108-bluebikes-tripdata.zip
Name: Name, dtype: object

In [8]:
# Folder the monthly trip archives are extracted into. The trailing slash
# matters: paths are later built by plain string concatenation.
bike_comp_folder = "data/comp_monthly/"

In [65]:
# Takes a while to download; there is likely a more efficient approach.

def download_bike_comprehensive(file, save_folder, timeout=60):
    """Download one zipped trip-history archive and unpack it into a folder.

    The archive is fetched from the hubway-data bucket, written to
    ``save_folder``, extracted there, and the downloaded zip is deleted
    afterwards (also when writing or extraction fails).

    Args:
        file: Name of the zip archive in the bucket,
            e.g. '202207-bluebikes-tripdata.zip'.
        save_folder: Destination folder; it is created if it does not exist.
        timeout: Seconds to wait for the server before giving up; passed to
            ``requests.get``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        zipfile.BadZipFile: if the downloaded payload is not a valid zip.
    """
    bike_comp_url = 'https://s3.amazonaws.com/hubway-data/%s' %(file)

    # A fresh checkout has no data folder yet; create it rather than failing
    # on open(). An empty save_folder means "current directory".
    if save_folder:
        os.makedirs(save_folder, exist_ok=True)
    # join() copes with save_folder given with or without a trailing slash.
    bike_comp_zipfile = os.path.join(save_folder, file)

    r = requests.get(bike_comp_url, timeout=timeout)
    # Stop here on 4xx/5xx instead of saving an error page as a ".zip".
    r.raise_for_status()

    try:
        with open(bike_comp_zipfile, 'wb') as f:
            f.write(r.content)

        with ZipFile(bike_comp_zipfile, 'r') as zObject:
            zObject.extractall(path = save_folder)
    finally:
        # The zip is only a temporary artefact; remove it even if writing or
        # extraction raised, so no partial archive is left behind.
        if os.path.exists(bike_comp_zipfile):
            os.remove(bike_comp_zipfile)

# Fetch and unpack every selected archive, one after another, in listing order.
for zip_name in zip_files_to_download:
    download_bike_comprehensive(zip_name, bike_comp_folder)

print('Done!')


[None, None, None, None, None, None, None, None, None, None, None, None]

In [73]:
# optional: if you don't like the __MACOSX folder

# import shutil
# if os.path.exists(bike_comp_folder + '/__MACOSX'):
#     shutil.rmtree(bike_comp_folder + '/__MACOSX')
# else:
#     print('Error: The file does not exist')



# I debated on how to sort the file_paths

# def sorted_ls(path):
#     mtime = lambda f: os.stat(os.path.join(path, f)).st_mtime
#     return list(sorted(os.listdir(path), key=mtime))

# bike_comp_file_paths = sorted_ls(bike_comp_folder)

In [9]:
# Expected location of each CSV: the archive name with its last four
# characters (the ".zip" suffix) swapped for ".csv", inside the data folder.
bike_comp_file_paths = [
    f'{bike_comp_folder}{zip_name[:-4]}.csv'
    for zip_name in zip_files_to_download
]
bike_comp_file_paths

['data/comp_monthly/202207-bluebikes-tripdata.csv',
 'data/comp_monthly/202206-bluebikes-tripdata.csv',
 'data/comp_monthly/202205-bluebikes-tripdata.csv',
 'data/comp_monthly/202204-bluebikes-tripdata.csv',
 'data/comp_monthly/202203-bluebikes-tripdata.csv',
 'data/comp_monthly/202202-bluebikes-tripdata.csv',
 'data/comp_monthly/202201-bluebikes-tripdata.csv',
 'data/comp_monthly/202110-bluebikes-tripdata.csv',
 'data/comp_monthly/202112-bluebikes-tripdata.csv',
 'data/comp_monthly/202111-bluebikes-tripdata.csv',
 'data/comp_monthly/202109-bluebikes-tripdata.csv',
 'data/comp_monthly/202108-bluebikes-tripdata.csv']

# Further edits and testing

In [10]:
# Spot-check: load the first CSV in bike_comp_file_paths and display it.
pd.read_csv(bike_comp_file_paths[0])

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,postal code
0,604,2020-07-01 00:00:02.6570,2020-07-01 00:10:07.5220,107,Ames St at Main St,42.362500,-71.088220,97,Harvard University River Houses at DeWolfe St ...,42.369206,-71.117106,3309,Subscriber,02142
1,269,2020-07-01 00:00:03.8580,2020-07-01 00:04:33.4070,46,Christian Science Plaza - Massachusetts Ave at...,42.343666,-71.085824,33,Kenmore Square,42.348706,-71.097009,5467,Subscriber,02135
2,794,2020-07-01 00:00:16.7790,2020-07-01 00:13:30.9210,6,Cambridge St at Joy St,42.361257,-71.065287,177,University Park,42.362648,-71.100061,5759,Customer,02139
3,736,2020-07-01 00:00:20.3000,2020-07-01 00:12:36.5470,14,HMS/HSPH - Avenue Louis Pasteur at Longwood Ave,42.337417,-71.102861,332,Harvard Ave at Brainerd Rd,42.349530,-71.130228,2914,Subscriber,02134
4,930,2020-07-01 00:00:23.1080,2020-07-01 00:15:53.3010,61,Boylston St at Fairfield St,42.349082,-71.081924,161,W Broadway at D St,42.339109,-71.051443,2398,Customer,02127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258582,245,2020-07-31 23:59:52.4850,2020-08-01 00:03:57.7810,363,Harrison Ave at Mullins Way,42.345216,-71.063840,26,Washington St at Waltham St,42.341575,-71.068904,4406,Subscriber,02118
258583,975,2020-07-31 23:59:54.7800,2020-08-01 00:16:10.3610,150,State Street at Channel Center,42.344137,-71.052608,47,Cross St at Hanover St,42.362811,-71.056067,5456,Customer,02127
258584,695,2020-07-31 23:59:55.2380,2020-08-01 00:11:31.0890,141,Kendall Street,42.363560,-71.082168,352,Ring Rd,42.348278,-71.080449,3525,Subscriber,02199
258585,978,2020-07-31 23:59:56.3810,2020-08-01 00:16:15.2230,150,State Street at Channel Center,42.344137,-71.052608,47,Cross St at Hanover St,42.362811,-71.056067,2149,Customer,02127
