# 01 - Data Gathering

## Description
This notebook gathers the necissary data from 3 different data sources:
 - 2022Q3 Divvy bike/trip data found on [Divvy's Data Portal](https://divvy-tripdata.s3.amazonaws.com/index.html)
 - Estimated travel time using  [Open Source Routing Machine (OSRM) API](http://project-osrm.org/docs/v5.10.0/api/#general-options)
 - Historic weather data from [OpenWeather API](https://openweathermap.org/api/one-call-3#data)

## Step 0: Setup Libraries and File Locations

### Import Libraries

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import requests
import zipfile
import io
import datetime
import json
from tqdm import tqdm
from scrapy.selector import Selector
import numpy as np
import config

### Create file location

In [2]:
ROOT = os.getcwd()
SAVE_FILES = os.path.join(ROOT, "DATA")
SAVE_FILES

'C:\\Users\\Nicholas\\Desktop\\Masters - Classes\\MSDS436\\Final\\MSDS436-FINAL\\DATA'

## Step 1: Pull the September Divvy Bike Data

In [None]:
r = requests.get(f"https://divvy-tripdata.s3.amazonaws.com/202209-divvy-tripdata.zip")
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall(SAVE_FILES)

In [None]:
sept_df = pd.read_csv('C:\\Users\\Nicholas\\Desktop\\Masters - Classes\\MSDS436\\Final\\MSDS436-FINAL\\DATA\\202209-divvy-publictripdata.csv')
sept_df.head()

## Step 2: Clean data
- Order start_at in decending order
- Drop all rows that do not have a start and end station
- Pull between 1,000-2,000 data points per day
- Pull weather data for 24h per day (720 rows total)

### Order start_at in decending order

In [None]:
sept_df.dtypes

In [None]:
sept_df['started_at'] = pd.to_datetime(sept_df['started_at'])
sept_df.dtypes

In [None]:
sept_df = sept_df.sort_values(by='started_at')

display(len(sept_df))
sept_df.head()

### Drop all rows that do not have a start and end station name

In [None]:
sept_df = sept_df[sept_df['start_station_name'].notnull()]
sept_df = sept_df[sept_df['end_station_name'].notnull()]

display(len(sept_df))
sept_df.head()

### Grab up to 2,000 rows per day

In [None]:
# get just date column
sept_df['started_at_clean'] = sept_df['started_at'].dt.date.astype(str)
sept_df.head()

In [None]:
# create unique date list
date_ls = sept_df['started_at_clean'].unique().tolist()
date_ls[:5]

In [None]:
date

In [None]:
cnt = 0
for date in date_ls:
    if cnt == 0:
        main_df = sept_df[sept_df['started_at_clean'] == date].sample(n=2000, random_state=0)
        cnt =+ 1
    else:
        filter_df = sept_df[sept_df['started_at_clean'] == date].sample(n=2000, random_state=0)
        main_df = pd.concat([main_df, filter_df], ignore_index=True, axis=0)

In [None]:
display(len(main_df))

main_df.index.name = 'row'
main_df.head()

In [None]:
# # check data
# main_df.to_csv("data_check.csv")

## Step 3: Get estimated travel time

In [None]:
# new method per https://github.com/Project-OSRM/osrm-backend/issues/6258
def get_distance_bike(point1: dict, point2: dict) -> tuple:
    """Gets distance between two points en route using http://project-osrm.org/docs/v5.10.0/api/#nearest-service"""
    
    url = f"""https://routing.openstreetmap.de/routed-bike/route/v1/biking/{point1["start_lng"]},{point1["start_lat"]};{point2["end_lng"]},{point2["end_lat"]}?overview=false&alternatives=false"""
    r = requests.get(url)
    
    # get the distance from the returned values
    route = json.loads(r.content)["routes"][0]
    return (route["distance"], route["duration"])

In [None]:
# get the distances and durations
dist_array_bike = []
for i , r in tqdm(main_df.iterrows()):
    try:
        point1 = {"start_lat": r["start_lat"], "start_lng": r["start_lng"]}
        point2 = {"end_lat": r["end_lat"], "end_lng": r["end_lng"]}
        dist, duration = get_distance_bike(point1, point2)
        #dist = geodesic((i_lat, i_lon), (o["CapitalLatitude"], o["CapitalLongitude"])).km
        dist_array_bike.append((i, duration, dist))
    except KeyError:
        dist_array_bike.append((i, 0, 0))
        continue

In [None]:
distances_bike_df = pd.DataFrame(dist_array_bike,columns=["row","duration (s)","distance (m)"])

display(len(distances_bike_df))
distances_bike_df.head()

In [None]:
# Matches key value for 'row' or any other unique identifier we want to assign later on
sep_dis_df = pd.merge(main_df, distances_bike_df, on='row', how='right').drop('row', axis=1)
sep_dis_df

In [None]:
# Save data frame
sep_dis_df.to_csv("DATA/202209_divvy_distance.csv", index=False)

## Step 4: Collect historic weather data
- Chicago lat: 41.87
- Chicago long: 87.62
- Pull hourly data for all 30 days (720 rows)

In [3]:
# Load save file
sep_dis_df = pd.read_csv("DATA/202209_divvy_distance.csv")

In [4]:
sep_dis_df['started_at_unix'] = pd.to_datetime(sep_dis_df['started_at'])
sep_dis_df['started_at_unix'] = pd.to_datetime(sep_dis_df['started_at_unix'].dt.strftime('%Y-%m-%d %H'))
sep_dis_df['started_at_unix'] = pd.to_numeric(sep_dis_df['started_at_unix'])
sep_dis_df['started_at_unix'] = sep_dis_df['started_at_unix'] // 10 ** 9


sep_dis_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,started_at_clean,duration (s),distance (m),started_at_unix
0,2FD3F90EDCE2ACD9,classic_bike,2022-09-01 19:39:15,2022-09-01 19:46:45,Southport Ave & Roscoe St,13071,Broadway & Cornelia Ave,13278,41.943739,-87.66402,41.945529,-87.646439,member,2022-09-01,481.9,1669.2,1662058800
1,EE62794A94F80A83,classic_bike,2022-09-01 06:53:41,2022-09-01 07:02:54,LaSalle St & Washington St,13006,Wells St & Polk St,SL-011,41.882664,-87.63253,41.872732,-87.633516,casual,2022-09-01,395.0,1358.8,1662012000
2,56FD4B364747F270,electric_bike,2022-09-01 11:25:21,2022-09-01 11:28:33,N Sheffield Ave & W Wellington Ave,20256.0,Southport Ave & Wellington Ave,TA1307000006,41.94,-87.65,41.935733,-87.663576,casual,2022-09-01,411.5,1595.7,1662030000
3,BD4D6AC842CDF729,classic_bike,2022-09-01 07:46:03,2022-09-01 08:05:36,Racine Ave & Wrightwood Ave,TA1309000059,DuSable Lake Shore Dr & North Blvd,LF-005,41.928887,-87.658971,41.911722,-87.626804,casual,2022-09-01,948.8,3643.8,1662015600
4,2E0E8C378865C01A,electric_bike,2022-09-01 09:55:31,2022-09-01 10:12:27,Wabash Ave & Adams St,KA1503000015,Wood St & Taylor St (Temp),13285,41.879373,-87.625492,41.869265,-87.673731,member,2022-09-01,1275.1,5104.7,1662022800


In [6]:
# Get unique list of unix times
unix_dt_ls = sep_dis_df['started_at_unix'].unique().tolist()
display(unix_dt_ls[:5])
display(len(unix_dt_ls))

[1662058800, 1662012000, 1662030000, 1662015600, 1662022800]

720

In [8]:
def weathermap(latnum, lngnum, dtnum):
    '''
    Pulls weather data using lat, long, and unix_dt
    '''
    api_url = f"http://api.openweathermap.org/data/3.0/onecall/timemachine?lat={latnum}&lon={lngnum}&dt={dtnum}&units=imperial&appid={config.api_key}"
    response = requests.get(api_url)
    resp = response.json()
    
    temp = resp['data'][0]['temp']
    hum = resp['data'][0]['humidity']
    windsp = resp['data'][0]['wind_speed']
    weather = resp['data'][0]['weather'][0]['main']
    try:
        rain = resp['data'][0]['rain']['1h']
    except KeyError as ke:
        rain = 0    
    try:
        snow = resp['data'][0]['snow']['1h']
    except KeyError as ke:
        snow = 0
    
    return temp, hum, windsp, weather, rain, snow

In [9]:
temp_ls = []
hum_ls = []
windsp_ls = []
weather_ls = []
rain_ls = []
snow_ls = []


for unix in tqdm(unix_dt_ls):
    try:
        latnum = 41.87
        lngnum = -87.62
        dtnum = unix
        data = weathermap(latnum, lngnum, dtnum)
        temp_ls.append(data[0])
        hum_ls.append(data[1])
        windsp_ls.append(data[2])
        weather_ls.append(data[3])
        rain_ls.append(data[4])
        snow_ls.append(data[5])
    except NameError:
        temp_ls.append('Nan')
        hum_ls.append('Nan')
        windsp_ls.append('Nan')
        weather_ls.append('Nan')
        rain_ls.append('Nan')
        snow_ls.append('Nan')

100%|████████████████████████████████████████| 696/696 [08:45<00:00,  1.32it/s]


In [7]:
# # API Check

# latnum = 41.87
# lngnum = -87.62

# api_url = f"http://api.openweathermap.org/data/3.0/onecall/timemachine?lat={latnum}&lon={lngnum}&dt=1664568000&units=imperial&appid={config.api_key}"
# response = requests.get(api_url)
# resp = response.json()

# resp

{'lat': 41.87,
 'lon': -87.62,
 'timezone': 'America/Chicago',
 'timezone_offset': -18000,
 'data': [{'dt': 1664568000,
   'sunrise': 1664538373,
   'sunset': 1664580873,
   'temp': 61.39,
   'feels_like': 59.86,
   'pressure': 1024,
   'humidity': 56,
   'dew_point': 45.57,
   'clouds': 0,
   'visibility': 10000,
   'wind_speed': 10.36,
   'wind_deg': 90,
   'wind_gust': 18.41,
   'weather': [{'id': 800,
     'main': 'Clear',
     'description': 'clear sky',
     'icon': '01d'}]}]}

In [15]:
weather_df = pd.DataFrame(list(zip(unix_dt_ls, temp_ls, hum_ls, windsp_ls, 
                                weather_ls, rain_ls, snow_ls)),
               columns =['started_at_unix', 'temp', 'hum', 'windsp', 
                         'weather', 'rain', 'snow'])

weather_df.head()

Unnamed: 0,started_at_unix,temp,hum,windsp,weather,rain,snow
0,1662008400,75.45,69,1.99,Clear,0.0,0
1,1662012000,73.87,71,1.01,Clear,0.0,0
2,1662015600,73.27,72,1.01,Clear,0.0,0
3,1662019200,72.34,74,1.99,Clear,0.0,0
4,1662022800,71.74,75,1.99,Clear,0.0,0


In [21]:
## Data check
weather_df[weather_df['started_at_unix']== 1662012000]

Unnamed: 0,started_at_unix,temp,hum,windsp,weather,rain,snow
1,1662012000,73.87,71,1.01,Clear,0.0,0


In [17]:
# Join data frames on started_at_unix
sep_dis_weath_df = sep_dis_df.merge(weather_df, on='started_at_unix', how='left')
sep_dis_weath_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,started_at_clean,duration (s),distance (m),started_at_unix,temp,hum,windsp,weather,rain,snow
0,2FD3F90EDCE2ACD9,classic_bike,2022-09-01 19:39:15,2022-09-01 19:46:45,Southport Ave & Roscoe St,13071,Broadway & Cornelia Ave,13278,41.943739,-87.66402,...,2022-09-01,481.9,1669.2,1662058800,90.05,39.0,4.0,Clouds,0.0,0.0
1,EE62794A94F80A83,classic_bike,2022-09-01 06:53:41,2022-09-01 07:02:54,LaSalle St & Washington St,13006,Wells St & Polk St,SL-011,41.882664,-87.63253,...,2022-09-01,395.0,1358.8,1662012000,73.87,71.0,1.01,Clear,0.0,0.0
2,56FD4B364747F270,electric_bike,2022-09-01 11:25:21,2022-09-01 11:28:33,N Sheffield Ave & W Wellington Ave,20256.0,Southport Ave & Wellington Ave,TA1307000006,41.94,-87.65,...,2022-09-01,411.5,1595.7,1662030000,73.06,75.0,8.1,Clear,0.0,0.0
3,BD4D6AC842CDF729,classic_bike,2022-09-01 07:46:03,2022-09-01 08:05:36,Racine Ave & Wrightwood Ave,TA1309000059,DuSable Lake Shore Dr & North Blvd,LF-005,41.928887,-87.658971,...,2022-09-01,948.8,3643.8,1662015600,73.27,72.0,1.01,Clear,0.0,0.0
4,2E0E8C378865C01A,electric_bike,2022-09-01 09:55:31,2022-09-01 10:12:27,Wabash Ave & Adams St,KA1503000015,Wood St & Taylor St (Temp),13285,41.879373,-87.625492,...,2022-09-01,1275.1,5104.7,1662022800,71.74,75.0,1.99,Clear,0.0,0.0


In [18]:
# export final data
sep_dis_weath_df.to_csv("DATA/202209_divvy_distance_weather.csv", index=False)

# Full Script
If we had more time, we would use the below code to pull a years worthof bike data and loop through the APIs.

## Step 1: Pull Divvy Bike Data

### Pull keys from website using BeautifulSoup

In [None]:
main_url = 'https://divvy-tripdata.s3.amazonaws.com'
page = requests.get(main_url)
soup = BeautifulSoup(page.content, 'html.parser')

# print(soup.prettify())

In [None]:
zip_keys = soup.findAll('key')
len(zip_keys)

### Loop through Keys and only keep divvy-tripdata

In [None]:
key_ls = []

for i in tqdm(range(len(zip_keys))):
    key_ls.append(zip_keys[i].text)

key_ls_clean = [ x for x in key_ls if "divvy-tripdata" in x ]

key_ls_clean[27:30]

### Pull and save all files

In [None]:
for zip_f in key_ls_clean[27:30]:
    r = requests.get(f"https://divvy-tripdata.s3.amazonaws.com/{zip_f}")
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall(SAVE_FILES)

### For our data purpose, read in the 2022Q3 files and create 1 master file
- '202207-divvy-tripdata.csv',
- '202208-divvy-tripdata.csv',
- '202209-divvy-tripdata.csv'

In [None]:
# Get 202207, 202208, and 202209 files and save file path in list
file_ls = []

for file in os.listdir(SAVE_FILES):
    if file.endswith(".csv"):
        file_ls.append(os.path.join(SAVE_FILES, file))
    
file_ls

In [None]:
# Loop through files, create pandas data frame, and save in dictonary
df_dict = {}

for i in file_ls:
    for num in range(len(file_ls)):
        df = pd.read_csv(i)
        df_dict[f"df_{num}"] = df

In [None]:
# Grab dictonary keys and check dataframe
dict_keys_ls = list(df_dict.keys())
df_dict[dict_keys_ls[2]].head()

In [None]:
sep_df = df_dict[dict_keys_ls[2]]
sep_df.index.name = 'row'
len(sep_df)

In [None]:
sep_df[:10000].tail()

In [None]:
# # Concat all 3 data frames and generate Q3_df
# Q3_df = pd.concat([df_dict[dict_keys_ls[0]], df_dict[dict_keys_ls[1]], df_dict[dict_keys_ls[2]]], ignore_index=True, axis=0)
# display(len(Q3_df))
# display(Q3_df.head())

### Get random sample from data

In [None]:
df_sample = df.sample(n=883, random_state=0)
display(len(df_sample))
df_sample.head()

## Step 2: Get estimated travel time

In [None]:
#new method per https://github.com/Project-OSRM/osrm-backend/issues/6258
def get_distance_bike(point1: dict, point2: dict) -> tuple:
    """Gets distance between two points en route using http://project-osrm.org/docs/v5.10.0/api/#nearest-service"""
    
    url = f"""https://routing.openstreetmap.de/routed-bike/route/v1/biking/{point1["start_lng"]},{point1["start_lat"]};{point2["end_lng"]},{point2["end_lat"]}?overview=false&alternatives=false"""
    r = requests.get(url)
    
    # get the distance from the returned values
    route = json.loads(r.content)["routes"][0]
    return (route["distance"], route["duration"])

In [None]:
# get the distances and durations
dist_array_bike = []
for i , r in tqdm(sep_df.iterrows()):
    try:
        point1 = {"start_lat": r["start_lat"], "start_lng": r["start_lng"]}
        point2 = {"end_lat": r["end_lat"], "end_lng": r["end_lng"]}
        dist, duration = get_distance_bike(point1, point2)
        #dist = geodesic((i_lat, i_lon), (o["CapitalLatitude"], o["CapitalLongitude"])).km
        dist_array_bike.append((i, duration, dist))
    except KeyError:
        dist_array_bike.append((i, 0, 0))
        continue

In [None]:
dist_array_bike[-1]

In [None]:
# ensure values are different
# print(dist_array_car) ---> [(0, 800.9, 3224.7), (1, 1289.7, 4141.1)]
print(len(dist_array_bike))

In [None]:
distances_bike_df = pd.DataFrame(dist_array_bike,columns=["row","duration (s)","distance (m)"])
distances_bike_df.head()

In [None]:
# Matches key value for 'row' or any other unique identifier we want to assign later on
sep_dis_df = pd.merge(sep_df.iloc[:290673], distances_bike_df, on='row', how='right').drop('row', axis=1)
sep_dis_df

In [None]:
# Save data frame
sep_dis_df.to_csv("202209_divvy_distance.csv")

In [None]:
# export master file
# Q3_df.to_csv("2022Q3_divvy-tripdata.csv")

## Step 3: Collect historic weather data
**NOTE:** the config.py file contains api_key

### Clean data for weatehr API

In [None]:
# round lat and long to 2 decimal places (needed for API)
sep_dis_df['start_lat_clean'] = sep_dis_df['start_lat'].round(2)
sep_dis_df['start_lng_clean'] = sep_dis_df['start_lng'].round(2)

sep_dis_df.head()

In [None]:
sep_dis_df.info()

In [None]:
sep_dis_df['unix_dt'] = pd.to_datetime(sep_dis_df['started_at'])
sep_dis_df['unix_dt'] = pd.to_datetime(sep_dis_df['unix_dt'])
sep_dis_df['unix_dt'] = pd.to_numeric(sep_dis_df['unix_dt'])
sep_dis_df['unix_dt'] = sep_dis_df['unix_dt'] // 10 ** 9

sep_dis_df.head()

In [None]:
def weathermap(latnum, lngnum, dtnum):
    '''
    Pulls weather data using lat, long, and unix_dt
    '''
    api_url = f"http://api.openweathermap.org/data/3.0/onecall/timemachine?lat={latnum}&lon={lngnum}&dt={dtnum}&units=imperial&appid={config.api_key}"
    response = requests.get(api_url)
    resp = response.json()
    
    temp = resp['data'][0]['temp']
    hum = resp['data'][0]['humidity']
    windsp = resp['data'][0]['wind_speed']
    weather = resp['data'][0]['weather'][0]['main']
    try:
        rain = resp['data'][0]['rain']['1h']
    except KeyError as ke:
        rain = 0    
    try:
        snow = resp['data'][0]['snow']['1h']
    except KeyError as ke:
        snow = 0
    
    return temp, hum, windsp, weather, rain, snow

In [None]:
temp_ls = []
hum_ls = []
windsp_ls = []
weather_ls = []
rain_ls = []
snow_ls = []


for i in tqdm(range(len(sep_dis_df))):
    try:
        latnum = sep_dis_df['start_lat'].iloc[i]
        lngnum = sep_dis_df['start_lng'].iloc[i]
        dtnum = sep_dis_df['unix_dt'].iloc[i]
        temp_ls.append(weathermap(latnum, lngnum, dtnum)[0])
        hum_ls.append(weathermap(latnum, lngnum, dtnum)[1])
        windsp_ls.append(weathermap(latnum, lngnum, dtnum)[2])
        weather_ls.append(weathermap(latnum, lngnum, dtnum)[3])
        rain_ls.append(weathermap(latnum, lngnum, dtnum)[4])
        snow_ls.append(weathermap(latnum, lngnum, dtnum)[5])
    except NameError:
        temp_ls.append('Nan')
        hum_ls.append('Nan')
        windsp_ls.append('Nan')
        weather_ls.append('Nan')
        rain_ls.append('Nan')
        snow_ls.append('Nan')
    
sep_dis_df['temp'] = temp_ls
sep_dis_df['hum'] = hum_ls
sep_dis_df['windsp'] = windsp_ls
sep_dis_df['weather'] = weather_ls
sep_dis_df['rain'] = rain_ls
sep_dis_df['snow'] = snow_ls

In [None]:
len(temp_ls)

## Join historic weather data to sample

In [None]:
df['temp'] = temp_ls
df['hum'] = hum_ls
df['windsp'] = windsp_ls
df['weather'] = weather_ls
df['rain'] = rain_ls
df['snow'] = snow_ls

### Save data in AWS S3 Bucket

In [None]:
latnum = sep_dis_df['start_lat'].iloc[0]
latnum

In [None]:
lngnum = sep_dis_df['start_lng'].iloc[0]
lngnum

In [None]:
dtnum = sep_dis_df['unix_dt'].iloc[i]
dtnum

In [None]:
api_url = f"http://api.openweathermap.org/data/3.0/onecall/timemachine?lat={latnum}&lon={lngnum}&dt={dtnum}&units=imperial&appid={config.api_key}"
response = requests.get(api_url)
resp = response.json()

In [None]:
resp