# 01 - Data Gathering

## Description
This notebook gathers the necissary data from 3 different data sources:
 - 2022Q3 Divvy bike/trip data found on [Divvy's Data Portal](https://divvy-tripdata.s3.amazonaws.com/index.html)
 - Estimated travel time using  [Open Source Routing Machine (OSRM) API](http://project-osrm.org/docs/v5.10.0/api/#general-options)
 - Historic weather data from [OpenWeather API](https://openweathermap.org/api/one-call-3#data)

## Step 0: Setup Libraries and File Locations

### Import Libraries

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import requests
import zipfile
import io
import datetime
import json
from tqdm import tqdm
from scrapy.selector import Selector
import config

### Create file location

In [2]:
ROOT = os.getcwd()
SAVE_FILES = os.path.join(ROOT, "DATA")
SAVE_FILES

'C:\\Users\\Nicholas\\Desktop\\Masters - Classes\\MSDS436\\Final\\MSDS436-FINAL\\DATA'

## Step 1: Pull Divvy Bike Data

### Pull keys from website using BeautifulSoup

In [3]:
main_url = 'https://divvy-tripdata.s3.amazonaws.com'
page = requests.get(main_url)
soup = BeautifulSoup(page.content, 'html.parser')

# print(soup.prettify())

In [4]:
zip_keys = soup.findAll('key')
len(zip_keys)

50

### Loop through Keys and only keep divvy-tripdata

In [5]:
key_ls = []

for i in tqdm(range(len(zip_keys))):
    key_ls.append(zip_keys[i].text)

key_ls_clean = [ x for x in key_ls if "divvy-tripdata" in x ]

key_ls_clean[27:30]

100%|███████████████████████████████████████| 50/50 [00:00<00:00, 49979.79it/s]


['202207-divvy-tripdata.zip',
 '202208-divvy-tripdata.zip',
 '202209-divvy-tripdata.zip']

### Pull and save all files

In [6]:
for zip_f in key_ls_clean[27:30]:
    r = requests.get(f"https://divvy-tripdata.s3.amazonaws.com/{zip_f}")
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall(SAVE_FILES)

### For our data purpose, read in the 2022Q3 files and create 1 master file
- '202207-divvy-tripdata.csv',
- '202208-divvy-tripdata.csv',
- '202209-divvy-tripdata.csv'

In [7]:
# Get 202207, 202208, and 202209 files and save file path in list
file_ls = []

for file in os.listdir(SAVE_FILES):
    if file.endswith(".csv"):
        file_ls.append(os.path.join(SAVE_FILES, file))
    
file_ls

['C:\\Users\\Nicholas\\Desktop\\Masters - Classes\\MSDS436\\Final\\MSDS436-FINAL\\DATA\\202207-divvy-tripdata.csv',
 'C:\\Users\\Nicholas\\Desktop\\Masters - Classes\\MSDS436\\Final\\MSDS436-FINAL\\DATA\\202208-divvy-tripdata.csv',
 'C:\\Users\\Nicholas\\Desktop\\Masters - Classes\\MSDS436\\Final\\MSDS436-FINAL\\DATA\\202209-divvy-publictripdata.csv']

In [8]:
# Loop through files, create pandas data frame, and save in dictonary
df_dict = {}

for i in file_ls:
    for num in range(len(file_ls)):
        df = pd.read_csv(i)
        df_dict[f"df_{num}"] = df

In [9]:
# Grab dictonary keys and check dataframe
dict_keys_ls = list(df_dict.keys())
df_dict[dict_keys_ls[2]].head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,5156990AC19CA285,electric_bike,2022-09-01 08:36:22,2022-09-01 08:39:05,,,California Ave & Milwaukee Ave,13084.0,41.93,-87.69,41.922695,-87.697153,casual
1,E12D4A16BF51C274,electric_bike,2022-09-01 17:11:29,2022-09-01 17:14:45,,,,,41.87,-87.62,41.87,-87.62,casual
2,A02B53CD7DB72DD7,electric_bike,2022-09-01 17:15:50,2022-09-01 17:16:12,,,,,41.87,-87.62,41.87,-87.62,casual
3,C82E05FEE872DF11,electric_bike,2022-09-01 09:00:28,2022-09-01 09:10:32,,,,,41.93,-87.69,41.94,-87.67,casual
4,4DEEB4550A266AE1,electric_bike,2022-09-01 07:30:11,2022-09-01 07:32:36,,,,,41.92,-87.73,41.92,-87.73,casual


In [10]:
sep_df = df_dict[dict_keys_ls[2]]
sep_df.index.name = 'row'
len(sep_df)

701339

In [None]:
# # Concat all 3 data frames and generate Q3_df
# Q3_df = pd.concat([df_dict[dict_keys_ls[0]], df_dict[dict_keys_ls[1]], df_dict[dict_keys_ls[2]]], ignore_index=True, axis=0)
# display(len(Q3_df))
# display(Q3_df.head())

### Get random sample from data

In [30]:
df_sample = df.sample(n=883, random_state=0)
display(len(df_sample))
df_sample.head()

883

Unnamed: 0_level_0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
121565,868962DA932B21BB,electric_bike,2022-09-20 19:33:21,2022-09-20 19:44:48,N Green St & W Lake St,20246.0,Damen Ave & Thomas St (Augusta Blvd),TA1307000070,41.89,-87.65,41.901315,-87.677409,casual
514012,3A58F3B4C9B96CF8,classic_bike,2022-09-24 11:07:04,2022-09-24 11:11:16,DuSable Lake Shore Dr & Monroe St,13300,Dusable Harbor,KA1503000064,41.880958,-87.616743,41.886976,-87.612813,member
136359,D6C07BD56AD72DCB,electric_bike,2022-09-17 15:33:12,2022-09-17 15:42:55,Damen Ave & Wellington Ave,13268,Leavitt St & Armitage Ave,TA1309000029,41.935877,-87.678373,41.917805,-87.682437,casual
487566,833C093844EB3BBB,classic_bike,2022-09-07 14:57:46,2022-09-07 15:00:58,Morgan St & Polk St,TA1307000130,Halsted St & Roosevelt Rd,TA1305000017,41.871737,-87.65103,41.867324,-87.648625,casual
502565,7E90725C16E10BD7,classic_bike,2022-09-16 18:52:36,2022-09-16 19:08:08,Ashland Ave & Grace St,13319,Racine Ave & Fullerton Ave,TA1306000026,41.950687,-87.6687,41.925563,-87.658404,casual


## Step 2: Get estimated travel time

In [11]:
#new method per https://github.com/Project-OSRM/osrm-backend/issues/6258
def get_distance_bike(point1: dict, point2: dict) -> tuple:
    """Gets distance between two points en route using http://project-osrm.org/docs/v5.10.0/api/#nearest-service"""
    
    url = f"""https://routing.openstreetmap.de/routed-bike/route/v1/biking/{point1["start_lng"]},{point1["start_lat"]};{point2["end_lng"]},{point2["end_lat"]}?overview=false&alternatives=false"""
    r = requests.get(url)
    
    # get the distance from the returned values
    route = json.loads(r.content)["routes"][0]
    return (route["distance"], route["duration"])

In [12]:
# get the distances and durations
dist_array_bike = []
for i , r in tqdm(sep_df.iterrows()):
    try:
        point1 = {"start_lat": r["start_lat"], "start_lng": r["start_lng"]}
        point2 = {"end_lat": r["end_lat"], "end_lng": r["end_lng"]}
        dist, duration = get_distance_bike(point1, point2)
        #dist = geodesic((i_lat, i_lon), (o["CapitalLatitude"], o["CapitalLongitude"])).km
        dist_array_bike.append((i, duration, dist))
    except KeyError:
        dist_array_bike.append((i, 0, 0))
        continue

290672it [33:04:41,  2.44it/s]


ConnectionError: HTTPSConnectionPool(host='routing.openstreetmap.de', port=443): Max retries exceeded with url: /routed-bike/route/v1/biking/-87.63189366666668,41.911873666666665;-87.612043,41.892278000000005?overview=false&alternatives=false (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000024BBA78D340>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [13]:
dist_array_bike[-1]

(290671, 1271.3, 5061.1)

In [14]:
# ensure values are different
# print(dist_array_car) ---> [(0, 800.9, 3224.7), (1, 1289.7, 4141.1)]
print(len(dist_array_bike))

290672


In [15]:
distances_bike_df = pd.DataFrame(dist_array_bike,columns=["row","duration (s)","distance (m)"])
distances_bike_df.head()

Unnamed: 0,row,duration (s),distance (m)
0,0,493.6,1871.5
1,1,0.0,0.0
2,2,0.0,0.0
3,3,704.8,2821.1
4,4,0.0,0.0


In [16]:
# Matches key value for 'row' or any other unique identifier we want to assign later on
sep_dis_df = pd.merge(sep_df.iloc[:290673], distances_bike_df, on='row', how='right').drop('row', axis=1)
sep_dis_df

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration (s),distance (m)
0,5156990AC19CA285,electric_bike,2022-09-01 08:36:22,2022-09-01 08:39:05,,,California Ave & Milwaukee Ave,13084,41.930000,-87.690000,41.922695,-87.697153,casual,493.6,1871.5
1,E12D4A16BF51C274,electric_bike,2022-09-01 17:11:29,2022-09-01 17:14:45,,,,,41.870000,-87.620000,41.870000,-87.620000,casual,0.0,0.0
2,A02B53CD7DB72DD7,electric_bike,2022-09-01 17:15:50,2022-09-01 17:16:12,,,,,41.870000,-87.620000,41.870000,-87.620000,casual,0.0,0.0
3,C82E05FEE872DF11,electric_bike,2022-09-01 09:00:28,2022-09-01 09:10:32,,,,,41.930000,-87.690000,41.940000,-87.670000,casual,704.8,2821.1
4,4DEEB4550A266AE1,electric_bike,2022-09-01 07:30:11,2022-09-01 07:32:36,,,,,41.920000,-87.730000,41.920000,-87.730000,casual,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290667,366620D22D3DB730,electric_bike,2022-09-03 14:24:03,2022-09-03 14:40:17,Clark St & North Ave,13128,Streeter Dr & Grand Ave,13022,41.911947,-87.631998,41.892278,-87.612043,casual,910.2,3365.5
290668,3CC25101FC4703C1,classic_bike,2022-09-12 15:14:48,2022-09-12 15:38:33,Clark St & North Ave,13128,Streeter Dr & Grand Ave,13022,41.911974,-87.631942,41.892278,-87.612043,casual,924.5,3369.6
290669,F1B1A1F33DD39739,electric_bike,2022-09-16 15:42:06,2022-09-16 16:08:07,MLK Jr Dr & 29th St,TA1307000139,Streeter Dr & Grand Ave,13022,41.841947,-87.616975,41.892278,-87.612043,casual,1827.1,6777.3
290670,0ECB821AB87FA257,electric_bike,2022-09-16 11:12:58,2022-09-16 13:05:47,Ellis Ave & 58th St,TA1309000011,Streeter Dr & Grand Ave,13022,41.788742,-87.601232,41.892278,-87.612043,casual,3371.0,13248.6


In [17]:
# Save data frame
sep_dis_df.to_csv("202209_divvy_distance.csv")

In [None]:
# export master file
# Q3_df.to_csv("2022Q3_divvy-tripdata.csv")

## Step 3: Collect historic weather data
**NOTE:** the config.py file contains api_key

### Clean data for weatehr API

In [18]:
# round lat and long to 2 decimal places (needed for API)
sep_dis_df['start_lat_clean'] = sep_dis_df['start_lat'].round(2)
sep_dis_df['start_lng_clean'] = sep_dis_df['start_lng'].round(2)

sep_dis_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration (s),distance (m),start_lat_clean,start_lng_clean
0,5156990AC19CA285,electric_bike,2022-09-01 08:36:22,2022-09-01 08:39:05,,,California Ave & Milwaukee Ave,13084.0,41.93,-87.69,41.922695,-87.697153,casual,493.6,1871.5,41.93,-87.69
1,E12D4A16BF51C274,electric_bike,2022-09-01 17:11:29,2022-09-01 17:14:45,,,,,41.87,-87.62,41.87,-87.62,casual,0.0,0.0,41.87,-87.62
2,A02B53CD7DB72DD7,electric_bike,2022-09-01 17:15:50,2022-09-01 17:16:12,,,,,41.87,-87.62,41.87,-87.62,casual,0.0,0.0,41.87,-87.62
3,C82E05FEE872DF11,electric_bike,2022-09-01 09:00:28,2022-09-01 09:10:32,,,,,41.93,-87.69,41.94,-87.67,casual,704.8,2821.1,41.93,-87.69
4,4DEEB4550A266AE1,electric_bike,2022-09-01 07:30:11,2022-09-01 07:32:36,,,,,41.92,-87.73,41.92,-87.73,casual,0.0,0.0,41.92,-87.73


In [19]:
sep_dis_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 290672 entries, 0 to 290671
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             290672 non-null  object 
 1   rideable_type       290672 non-null  object 
 2   started_at          290672 non-null  object 
 3   ended_at            290672 non-null  object 
 4   start_station_name  251378 non-null  object 
 5   start_station_id    251378 non-null  object 
 6   end_station_name    251335 non-null  object 
 7   end_station_id      251335 non-null  object 
 8   start_lat           290672 non-null  float64
 9   start_lng           290672 non-null  float64
 10  end_lat             290661 non-null  float64
 11  end_lng             290661 non-null  float64
 12  member_casual       290672 non-null  object 
 13  duration (s)        290672 non-null  float64
 14  distance (m)        290672 non-null  float64
 15  start_lat_clean     290672 non-nul

In [20]:
sep_dis_df['unix_dt'] = pd.to_datetime(sep_dis_df['started_at'])
sep_dis_df['unix_dt'] = pd.to_datetime(sep_dis_df['unix_dt'])
sep_dis_df['unix_dt'] = pd.to_numeric(sep_dis_df['unix_dt'])
sep_dis_df['unix_dt'] = sep_dis_df['unix_dt'] // 10 ** 9

sep_dis_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,duration (s),distance (m),start_lat_clean,start_lng_clean,unix_dt
0,5156990AC19CA285,electric_bike,2022-09-01 08:36:22,2022-09-01 08:39:05,,,California Ave & Milwaukee Ave,13084.0,41.93,-87.69,41.922695,-87.697153,casual,493.6,1871.5,41.93,-87.69,1662021382
1,E12D4A16BF51C274,electric_bike,2022-09-01 17:11:29,2022-09-01 17:14:45,,,,,41.87,-87.62,41.87,-87.62,casual,0.0,0.0,41.87,-87.62,1662052289
2,A02B53CD7DB72DD7,electric_bike,2022-09-01 17:15:50,2022-09-01 17:16:12,,,,,41.87,-87.62,41.87,-87.62,casual,0.0,0.0,41.87,-87.62,1662052550
3,C82E05FEE872DF11,electric_bike,2022-09-01 09:00:28,2022-09-01 09:10:32,,,,,41.93,-87.69,41.94,-87.67,casual,704.8,2821.1,41.93,-87.69,1662022828
4,4DEEB4550A266AE1,electric_bike,2022-09-01 07:30:11,2022-09-01 07:32:36,,,,,41.92,-87.73,41.92,-87.73,casual,0.0,0.0,41.92,-87.73,1662017411


In [21]:
def weathermap(latnum, lngnum, dtnum):
    '''
    Pulls weather data using lat, long, and unix_dt
    '''
    api_url = f"http://api.openweathermap.org/data/3.0/onecall/timemachine?lat={latnum}&lon={lngnum}&dt={dtnum}&units=imperial&appid={config.api_key}"
    response = requests.get(api_url)
    resp = response.json()
    
    temp = resp['data'][0]['temp']
    hum = resp['data'][0]['humidity']
    windsp = resp['data'][0]['wind_speed']
    weather = resp['data'][0]['weather'][0]['main']
    try:
        rain = resp['data'][0]['rain']['1h']
    except KeyError as ke:
        rain = 0    
    try:
        snow = resp['data'][0]['snow']['1h']
    except KeyError as ke:
        snow = 0
    
    return temp, hum, windsp, weather, rain, snow

In [22]:
temp_ls = []
hum_ls = []
windsp_ls = []
weather_ls = []
rain_ls = []
snow_ls = []


for i in tqdm(range(len(sep_dis_df))):
    try:
        latnum = sep_dis_df['start_lat'].iloc[i]
        lngnum = sep_dis_df['start_lng'].iloc[i]
        dtnum = sep_dis_df['unix_dt'].iloc[i]
        temp_ls.append(weathermap(latnum, lngnum, dtnum)[0])
        hum_ls.append(weathermap(latnum, lngnum, dtnum)[1])
        windsp_ls.append(weathermap(latnum, lngnum, dtnum)[2])
        weather_ls.append(weathermap(latnum, lngnum, dtnum)[3])
        rain_ls.append(weathermap(latnum, lngnum, dtnum)[4])
        snow_ls.append(weathermap(latnum, lngnum, dtnum)[5])
    except NameError:
        temp_ls.append('Nan')
        hum_ls.append('Nan')
        windsp_ls.append('Nan')
        weather_ls.append('Nan')
        rain_ls.append('Nan')
        snow_ls.append('Nan')
    
sep_dis_df['temp'] = temp_ls
sep_dis_df['hum'] = hum_ls
sep_dis_df['windsp'] = windsp_ls
sep_dis_df['weather'] = weather_ls
sep_dis_df['rain'] = rain_ls
sep_dis_df['snow'] = snow_ls

  0%|                                  | 883/290672 [10:10<55:38:19,  1.45it/s]


KeyError: 'data'

In [26]:
len(temp_ls)

883

## Join historic weather data to sample

In [None]:
df['temp'] = temp_ls
df['hum'] = hum_ls
df['windsp'] = windsp_ls
df['weather'] = weather_ls
df['rain'] = rain_ls
df['snow'] = snow_ls

### Save data in AWS S3 Bucket

In [23]:
latnum = sep_dis_df['start_lat'].iloc[0]
latnum

41.93

In [None]:
lngnum = sep_dis_df['start_lng'].iloc[0]
lngnum

In [None]:
dtnum = sep_dis_df['unix_dt'].iloc[i]
dtnum

In [27]:
api_url = f"http://api.openweathermap.org/data/3.0/onecall/timemachine?lat={latnum}&lon={lngnum}&dt={dtnum}&units=imperial&appid={config.api_key}"
response = requests.get(api_url)
resp = response.json()

In [28]:
resp

{'cod': 429,
 'message': 'Your account is temporary blocked due to exceeding of requests limitation of your subscription type. Please choose the proper subscription https://openweathermap.org/price'}