Running the following script creates a pickle file called "bikedata.pickle", 
which is the preprocessed data that you would need for the project. 
This file is equivalent to "bikedata.RData" for those of you using R. 

In [9]:
import requests
from zipfile import ZipFile
import csv
import re
import numpy as np
import pickle 
import pandas as pd

In [10]:
# citations: 
# https://www.tutorialspoint.com/downloading-files-from-web-using-python
# https://www.analyticsvidhya.com/blog/2021/08/python-tutorial-working-with-csv-file-for-data-science/

header = []
data = []
for i in range(2010, 2012): 
    # the data goes up to 2017, but the files are extremely large from 2011 onwards - 
    # you can decide to just use a subset
    url = "https://s3.amazonaws.com/capitalbikeshare-data/" + str(i) + "-capitalbikeshare-tripdata.zip"
    r = requests.get(url, allow_redirects = True)
    zipfile_name = 'bikedata.zip'
    open(zipfile_name, 'wb').write(r.content)
    with ZipFile(zipfile_name, 'r') as zip:
        zip.extractall()
    csv_name = str(i) + "-capitalbikeshare-tripdata.csv"
    csvreader = csv.reader(open(csv_name))
    file = open(csv_name)
    csvreader = csv.reader(file)
    header = next(csvreader)
    for row in csvreader:
        data.append(row)
    file.close()

In [11]:
# the variables with names that end with "_tmp" will be further preprocessed 
n = len(data)
duration = []
# duration of the ride in seconds
starttime_tmp = []
# start time of ride #i
station_start = []
# station ID where the bike was checked out
station_end = []
# station ID where the bike was returned
member_tmp = []
# member (1) or nonmember (0)
station_start_name = []
station_end_name = []
bikenum_tmp = []
for i in range(n): 
    cur = data[i]
    duration.append(cur[0])
    starttime_tmp.append(cur[1])
    station_start.append(cur[3])
    station_end.append(cur[5])
    member_tmp.append(cur[8])
    station_start_name.append(cur[4])
    station_end_name.append(cur[6])
    bikenum_tmp.append(cur[7])
station_start = np.array(station_start).astype(np.intc)
station_end = np.array(station_end).astype(np.intc)

In [12]:
# preprocessing starttime_tmp
starttime = np.empty((n, 6))
# row i = year/month/date/hour/minute/second for ride #i
for i in range(n):
    starttime[i,] = np.array(re.split('-|:| ', starttime_tmp[i])).astype(np.intc)
starttime = starttime.astype(np.intc)

In [13]:
# preprocessing member_tmp
member = np.array(member_tmp)
member = member == "Member"
# member (1) or nonmember (0)

In [14]:
# preprocessing bikenum_tmp
bikenum = []
for i in range(n):
    tmp = re.split('w|W|,| ', bikenum_tmp[i])
    bikenum.append([item for item in tmp if item != ''][0])
for i in range(n):
    cur = bikenum[i]
    if "?" in cur[0]:
        bikenum[i] = np.NAN
bikenum = np.genfromtxt(np.array(bikenum))
# some are NA, the data is messy for this one

In [15]:
# creating stations 
# stations[i,0] = station ID for the i-th station, 
# stations[i,1] = station location for the i-th station
all_stations = np.unique(np.concatenate((station_start, station_end)))
stations = []
for item in all_stations:
    ind = np.argwhere(station_start == item)
    if(len(ind) != 0):
        location = station_start_name[ind[0][0]]
    else: 
        location = station_end_name[np.argwhere(station_end == item)[0][0]]
    stations.append([item, location])
stations = np.array(stations)
# note that stations get added to the program over time

In [16]:
# creating days_in_month
days_in_month = np.array([31,28,31,30,31,30,31,31,30,31,30,31]
                         + [31,28,31,30,31,30,31,31,30,31,30,31])
# Jan 2010, ..., Dec 2011

In [17]:
# creating days_since_Jan1_2010
term1 = (starttime[:,0] - 2010) * 365
term2 = np.cumsum(days_in_month)[(starttime[:,1] - 1)]
term3 = days_in_month[(starttime[:,1] - 1)] 
term4 = (starttime[:,2] - 1)
days_since_Jan1_2010 = term1 + term2 - term3 + term4

In [18]:
# creating day_of_week
ind = np.mod((days_since_Jan1_2010 + 4), 7) 
day_of_week = np.array(['Monday','Tuesday','Wednesday','Thursday',
               'Friday','Saturday','Sunday'])[ind]

In [19]:
# saves the data into a pickle file
filename = "bikedata.pickle"
with open(filename, 'wb') as f:
    pickle.dump([starttime, duration, bikenum, stations, station_start, 
                station_end, member, days_since_Jan1_2010, day_of_week], f)

In [20]:
# to read the file, run the following. 
# data_final is a list that contains the following (in order):
# [0] starttime , [1] duration, [2] bikenum, [3] stations, [4] station_start, 
# [5] station_end, [6]member, [7] days_since_Jan1_2010, [8] day_of_week
filename = "bikedata.pickle"

with open(filename, 'rb') as f:
    data_final = pickle.load(f)

In [35]:
start_loc = []
end_loc = []

for station_num in data_final[4]:
    for addy in data_final[3]:
        if int(addy[0]) == station_num:
            start_loc.append(addy[1])
            break
    
for station_num in data_final[5]:
    for addy in data_final[3]:
        if int(addy[0]) == station_num:
            end_loc.append(addy[1])
            break

In [37]:
# for i in data_final[3]:
#     if int(i[0]) == data_final[4][2]:
#         print(i[0])

In [38]:
starttime_list = [i for i in data_final[0]]

In [47]:
data = {'starttime': starttime_list, 'duration': data_final[1], 'bikenum': data_final[2], 'station_start': data_final[4], 'start_addy': start_loc,
 'addy_end': end_loc, 'station_end': data_final[5], 'member': data_final[6], 'days_since_Jan1_2010': data_final[7], 'day_of_week': data_final[8]} 
df = pd.DataFrame(data)

In [48]:
df.to_csv('df.csv')

In [49]:
df = pd.read_csv('df.csv')

In [51]:
df.head()

Unnamed: 0.1,Unnamed: 0,starttime,duration,bikenum,station_start,start_addy,addy_end,station_end,member,days_since_Jan1_2010,day_of_week
0,0,[2010 9 20 11 27 4],1012,742.0,31208,M St & New Jersey Ave SE,4th & M St SW,31108,True,262,Monday
1,1,[2010 9 20 11 41 22],61,32.0,31209,1st & N St SE,1st & N St SE,31209,True,262,Monday
2,2,[2010 9 20 12 5 37],2690,993.0,31600,5th & K St NW,19th St & Pennsylvania Ave NW,31100,True,262,Monday
3,3,[2010 9 20 12 6 5],1406,344.0,31600,5th & K St NW,Park Rd & Holmead Pl NW,31602,True,262,Monday
4,4,[2010 9 20 12 10 43],1413,883.0,31100,19th St & Pennsylvania Ave NW,15th & P St NW,31201,True,262,Monday
