"""
File for Scraping Taxi Data.
Import Requirements
Specify date range to be scraped - post Covid (up to ~300 million rows)
Specify category of taxi for scraping
Scrape
"""

In [1]:
import os
import requests
import pandas as pd
from datetime import datetime

In [2]:
""" 
DatetimeIndex of month-end dates; each entry selects one month of files to scrape
"""

date_range = pd.date_range(start='2021-01', end='2024-04', freq='ME')
print(date_range)

DatetimeIndex(['2021-01-31', '2021-02-28', '2021-03-31', '2021-04-30',
               '2021-05-31', '2021-06-30', '2021-07-31', '2021-08-31',
               '2021-09-30', '2021-10-31', '2021-11-30', '2021-12-31',
               '2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
               '2022-05-31', '2022-06-30', '2022-07-31', '2022-08-31',
               '2022-09-30', '2022-10-31', '2022-11-30', '2022-12-31',
               '2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30',
               '2023-05-31', '2023-06-30', '2023-07-31', '2023-08-31',
               '2023-09-30', '2023-10-31', '2023-11-30', '2023-12-31',
               '2024-01-31', '2024-02-29', '2024-03-31'],
              dtype='datetime64[ns]', freq='ME')


In [3]:
"""
List of types of taxi-data to scrape
See taxi_cleaning.ipynb for more info
"""

# Each entry is used verbatim as the file-name prefix when scraping.
taxis = [
    "yellow",
    "green",
    "fhv",
    "fhvhv",
]

In [4]:
"""
Code for Scraping Taxi Data
Creates "taxi_parquets" folder if one not present already
Scrapes
(Runtime = ~17-49 minutes on Data Lead's machine)
"""

# Download one parquet file per (month, taxi type) pair into Datasets/taxi_parquets.
# A failed request is reported and skipped, so an HTTP error page is never saved
# under a .parquet name and never reported as "Downloaded".
directory = os.path.join("Datasets", "taxi_parquets")
# exist_ok replaces the separate exists() check and its check-then-create race.
os.makedirs(directory, exist_ok=True)

for date in date_range:
    # Both the remote and the local file names are keyed by YYYY-MM; format once.
    month = date.strftime("%Y-%m")
    for taxi_type in taxis:
        url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{taxi_type}_tripdata_{month}.parquet"
        file_path = os.path.join(directory, f"{taxi_type}_{month}.parquet")
        # Write to a temporary name first so an interrupted download never
        # leaves a truncated file under the final .parquet name.
        partial_path = file_path + ".part"

        try:
            # stream=True avoids holding the whole file in memory; the
            # (connect, read) timeout stops a stalled connection hanging the run.
            with requests.get(url, stream=True, timeout=(10, 120)) as response:
                # Treat 4xx/5xx as a failure instead of saving the error body.
                response.raise_for_status()
                with open(partial_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        file.write(chunk)
        except requests.RequestException as error:
            # Best effort: clean up, report, and carry on with the next file.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            print("Failed", taxi_type, month, error)
            continue

        os.replace(partial_path, file_path)
        print("Downloaded", taxi_type, month)
        

Downloaded yellow 2021-01
Downloaded green 2021-01
Downloaded fhv 2021-01
Downloaded fhvhv 2021-01
Downloaded yellow 2021-02
Downloaded green 2021-02
Downloaded fhv 2021-02
Downloaded fhvhv 2021-02
Downloaded yellow 2021-03
Downloaded green 2021-03
Downloaded fhv 2021-03
Downloaded fhvhv 2021-03
Downloaded yellow 2021-04
Downloaded green 2021-04
Downloaded fhv 2021-04
Downloaded fhvhv 2021-04
Downloaded yellow 2021-05
Downloaded green 2021-05
Downloaded fhv 2021-05
Downloaded fhvhv 2021-05
Downloaded yellow 2021-06
Downloaded green 2021-06
Downloaded fhv 2021-06
Downloaded fhvhv 2021-06
Downloaded yellow 2021-07
Downloaded green 2021-07
Downloaded fhv 2021-07
Downloaded fhvhv 2021-07
Downloaded yellow 2021-08
Downloaded green 2021-08
Downloaded fhv 2021-08
Downloaded fhvhv 2021-08
Downloaded yellow 2021-09
Downloaded green 2021-09
Downloaded fhv 2021-09
Downloaded fhvhv 2021-09
Downloaded yellow 2021-10
Downloaded green 2021-10
Downloaded fhv 2021-10
Downloaded fhvhv 2021-10
Downloaded