Collect Taxi Data from NYCTLC

In [1]:
from urllib.request import urlretrieve
from urllib.error import URLError
import os
import zipfile

In [2]:
# OPTIONS
month_range = 'min' #'full
output_relative_dir = '../data/raw/tlc_data/'
target_dir = 'tlc_data_yellow'

YEARS = {'2019',
         '2021',
         '2022'}

In [3]:
# used for development
months_lut = {
    'full' : range(1,13),
    'min' : range(1,2)
}

# data output directory is `data/tlc_data/`
tlc_output_dir = output_relative_dir + target_dir
# URL template as of 11-2022
URL_TEMPLATE = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_"

In [4]:
# check if paths exists, otherwise create
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)

if not os.path.exists(output_relative_dir + target_dir):
    os.makedirs(output_relative_dir + target_dir)

In [5]:
for year in YEARS:
    for month in months_lut[month_range]:
        #logging.info(f"Attempting to download: {month}:{year}...")
        print(f"Attempting to download: {month}:{year}...")

        # 0-fill i.e 1 -> 01, 2 -> 02, etc
        month = str(month).zfill(2)
        # generate url
        url = f'{URL_TEMPLATE}{year}-{month}.parquet'
        # generate output location and filename
        output_dir = f"{tlc_output_dir}/{year}-{month}.parquet"

        try: # download
            urlretrieve(url, output_dir)
        except URLError:
            print(f"Path does not exist (yet): {month}:{year}")
        except:
            print(f"Error occurred when trying to download {month}")
        else:
            print(f"Completed downloading: {month}:{year}")

Attempting to download: 1:2022...
Completed downloading: 01:2022
Attempting to download: 1:2019...
Completed downloading: 01:2019
Attempting to download: 1:2021...
Completed downloading: 01:2021


In [6]:
# shapefile download and unzip
shapefile_dir = "shapefiles/"
shapefile_path = output_relative_dir + shapefile_dir
if not os.path.exists(shapefile_path):
    os.makedirs(shapefile_path)

In [7]:
# download shapefile
shapefile_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"
shapefile_name = "taxi_zones.zip"
urlretrieve(shapefile_url, shapefile_path + shapefile_name)

('../data/raw/tlc_data/shapefiles/taxi_zones.zip',
 <http.client.HTTPMessage at 0x7fa7cc5b5220>)

In [8]:
# download lookup file
lookupfile_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv"
lookupfile_name = "taxi_zone_lookup.csv"
urlretrieve(lookupfile_url, shapefile_path + lookupfile_name)

('../data/raw/tlc_data/shapefiles/taxi_zone_lookup.csv',
 <http.client.HTTPMessage at 0x7fa7cc5b5e80>)

In [9]:
# unzip taxi zones file
with zipfile.ZipFile(shapefile_path + shapefile_name, 'r') as zip_ref:
    zip_ref.extractall(shapefile_path + "taxi_zones/")