# Pulling data from the Divvy

## Description
We will need a programatic way to gather Divvy's monthly bike/trip data found on [thier portal](https://divvy-tripdata.s3.amazonaws.com/index.html). The data is uploaded on a monthly bases and it excludes trips that are taken by staff as they service and inspect the system; and any trips that were below 60 seconds in length (potentially false starts or users trying to re-dock a bike to ensure it was secure).

## Import Libraries

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import requests
import zipfile
import io

## Create file location

In [2]:
ROOT = os.getcwd()
SAVE_FILES = os.path.join(ROOT, "DATA")
SAVE_FILES

'C:\\Users\\Nicholas\\Desktop\\Masters - Classes\\MSDS436\\Final\\DATA'

## Pull Divvy Bike Data

### Pull keys from website using BeautifulSoup

In [3]:
main_url = 'https://divvy-tripdata.s3.amazonaws.com'
page = requests.get(main_url)
soup = BeautifulSoup(page.content, 'html.parser')

print(soup.prettify())

<?xml version="1.0" encoding="UTF-8"?>
<listbucketresult xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
 <name>
  divvy-tripdata
 </name>
 <prefix>
 </prefix>
 <marker>
 </marker>
 <maxkeys>
  1000
 </maxkeys>
 <istruncated>
  false
 </istruncated>
 <contents>
  <key>
   202004-divvy-tripdata.zip
  </key>
  <lastmodified>
   2020-06-01T14:50:06.000Z
  </lastmodified>
  <etag>
   "e7a221ace4629d53dcd73a62b314b567"
  </etag>
  <size>
   3323572
  </size>
  <storageclass>
   STANDARD
  </storageclass>
 </contents>
 <contents>
  <key>
   202005-divvy-tripdata.zip
  </key>
  <lastmodified>
   2020-06-01T14:50:09.000Z
  </lastmodified>
  <etag>
   "606a191a00a58840ce8d3cf7d08556e4"
  </etag>
  <size>
   7988821
  </size>
  <storageclass>
   STANDARD
  </storageclass>
 </contents>
 <contents>
  <key>
   202006-divvy-tripdata.zip
  </key>
  <lastmodified>
   2020-07-06T00:31:49.000Z
  </lastmodified>
  <etag>
   "e397c3a64e4f8d4cefe90736cce330eb"
  </etag>
  <size>
   14732088
  </size>
  <s

In [4]:
zip_keys = soup.findAll('key')
len(zip_keys)

49

### Loop through Keys and only keep divvy-tripdata

In [5]:
key_ls = []

for i in range(len(zip_keys)):
    key_ls.append(zip_keys[i].text)

key_ls_clean = [ x for x in key_ls if "divvy-tripdata" in x ]

key_ls_clean

['202004-divvy-tripdata.zip',
 '202005-divvy-tripdata.zip',
 '202006-divvy-tripdata.zip',
 '202007-divvy-tripdata.zip',
 '202008-divvy-tripdata.zip',
 '202009-divvy-tripdata.zip',
 '202010-divvy-tripdata.zip',
 '202011-divvy-tripdata.zip',
 '202012-divvy-tripdata.zip',
 '202101-divvy-tripdata.zip',
 '202102-divvy-tripdata.zip',
 '202103-divvy-tripdata.zip',
 '202104-divvy-tripdata.zip',
 '202105-divvy-tripdata.zip',
 '202106-divvy-tripdata.zip',
 '202107-divvy-tripdata.zip',
 '202108-divvy-tripdata.zip',
 '202109-divvy-tripdata.zip',
 '202110-divvy-tripdata.zip',
 '202111-divvy-tripdata.zip',
 '202112-divvy-tripdata.zip',
 '202201-divvy-tripdata.zip',
 '202202-divvy-tripdata.zip',
 '202203-divvy-tripdata.zip',
 '202204-divvy-tripdata.zip',
 '202205-divvy-tripdata.zip',
 '202206-divvy-tripdata.zip',
 '202207-divvy-tripdata.zip',
 '202208-divvy-tripdata.zip',
 '202209-divvy-tripdata.zip']

In [13]:
key_ls_clean[27:]

['202207-divvy-tripdata.zip',
 '202208-divvy-tripdata.zip',
 '202209-divvy-tripdata.zip']

### Pull and save all files

In [14]:
for zip_f in key_ls_clean[27:]:
    r = requests.get(f"https://divvy-tripdata.s3.amazonaws.com/{zip_f}")
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall(SAVE_FILES)

In [23]:
import os

file_ls = []

for file in os.listdir(SAVE_FILES):
    if file.endswith(".csv"):
        file_ls.append(os.path.join(SAVE_FILES, file))
    
file_ls

['C:\\Users\\Nicholas\\Desktop\\Masters - Classes\\MSDS436\\Final\\DATA\\202207-divvy-tripdata.csv',
 'C:\\Users\\Nicholas\\Desktop\\Masters - Classes\\MSDS436\\Final\\DATA\\202208-divvy-tripdata.csv',
 'C:\\Users\\Nicholas\\Desktop\\Masters - Classes\\MSDS436\\Final\\DATA\\202209-divvy-publictripdata.csv']

In [33]:
df_dict = {}

for i in file_ls:
    for num in range(len(file_ls)):
        df = pd.read_csv(i)
        df_dict[f"df_{num}"] = df

In [45]:
df_dict.keys()

dict_keys(['df_0', 'df_1', 'df_2'])

In [56]:
dict_keys_ls = list(df_dict.keys())
df_dict[dict_keys_ls[1]]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,5156990AC19CA285,electric_bike,2022-09-01 08:36:22,2022-09-01 08:39:05,,,California Ave & Milwaukee Ave,13084,41.930000,-87.690000,41.922695,-87.697153,casual
1,E12D4A16BF51C274,electric_bike,2022-09-01 17:11:29,2022-09-01 17:14:45,,,,,41.870000,-87.620000,41.870000,-87.620000,casual
2,A02B53CD7DB72DD7,electric_bike,2022-09-01 17:15:50,2022-09-01 17:16:12,,,,,41.870000,-87.620000,41.870000,-87.620000,casual
3,C82E05FEE872DF11,electric_bike,2022-09-01 09:00:28,2022-09-01 09:10:32,,,,,41.930000,-87.690000,41.940000,-87.670000,casual
4,4DEEB4550A266AE1,electric_bike,2022-09-01 07:30:11,2022-09-01 07:32:36,,,,,41.920000,-87.730000,41.920000,-87.730000,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...
701334,32ECA2B32C4B6F85,classic_bike,2022-09-05 17:59:21,2022-09-05 18:19:07,Lincoln Ave & Winona St,KA1504000078,Broadway & Wilson - Truman College Vaccination...,13074,41.974911,-87.692503,41.965221,-87.658139,member
701335,14801F713026AEAE,classic_bike,2022-09-30 17:20:54,2022-09-30 17:34:40,Broadway & Ridge Ave,15578,Broadway & Wilson - Truman College Vaccination...,13074,41.984045,-87.660274,41.965221,-87.658139,member
701336,7CCAF5D6E88E45C0,electric_bike,2022-09-04 11:39:37,2022-09-04 11:50:55,Broadway & Ridge Ave,15578,Broadway & Wilson - Truman College Vaccination...,13074,41.984112,-87.660269,41.965221,-87.658139,member
701337,AF9A129D9AFAA40B,electric_bike,2022-09-28 13:42:45,2022-09-28 13:52:59,Lincoln Ave & Winona St,KA1504000078,Broadway & Wilson - Truman College Vaccination...,13074,41.974921,-87.692735,41.965221,-87.658139,member


In [50]:
# Stack data frames
Q3_df = pd.concat([df_dict[dict_keys_ls[0]], df_dict[dict_keys_ls[1]], df_dict[dict_keys_ls[2]]], ignore_index=True, axis=0)

In [51]:
Q3_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,5156990AC19CA285,electric_bike,2022-09-01 08:36:22,2022-09-01 08:39:05,,,California Ave & Milwaukee Ave,13084.0,41.93,-87.69,41.922695,-87.697153,casual
1,E12D4A16BF51C274,electric_bike,2022-09-01 17:11:29,2022-09-01 17:14:45,,,,,41.87,-87.62,41.87,-87.62,casual
2,A02B53CD7DB72DD7,electric_bike,2022-09-01 17:15:50,2022-09-01 17:16:12,,,,,41.87,-87.62,41.87,-87.62,casual
3,C82E05FEE872DF11,electric_bike,2022-09-01 09:00:28,2022-09-01 09:10:32,,,,,41.93,-87.69,41.94,-87.67,casual
4,4DEEB4550A266AE1,electric_bike,2022-09-01 07:30:11,2022-09-01 07:32:36,,,,,41.92,-87.73,41.92,-87.73,casual


In [55]:
Q3_df.to_csv("2022Q3_divvy-tripdata.csv")

### Save data in AWS S3 Bucket