In [2]:
from bs4 import BeautifulSoup
import requests
import pyarrow.parquet as pq
import os
import logging

In [3]:
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('../../logs/ingestion.log'),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)

FileNotFoundError: [Errno 2] No such file or directory: '/logs/ingestion.log'

In [2]:
def download_parquet(url, file_name, year, month):
    folder_path = f"../../raw_data/{year}/{month}"
    file_path = f"{folder_path}/{file_name}"
    os.makedirs(folder_path, exist_ok=True)
    logger.info(f'Download Start: {file_name}')
    
    parquet_response = requests.get(url, stream=True)
    parquet_response.raise_for_status()
    with open(file_path, 'wb') as file:
        for chunk in parquet_response.iter_content(chunk_size = 8192):
            file.write(chunk)
    logger.info(f'Download Complete: {file_name}')

In [3]:
def repartition_parquet(file_name, year, month, no_of_partitions):
    table = pq.read_table(f'../../raw_data/{year}/{month}/{file_name}')
    chunk_size = table.num_rows // no_of_partitions
    
    tables = []
    for i in range(no_of_partitions):
        if i < no_of_partitions-1:
            tables.append(table.slice(i * chunk_size, chunk_size))
        else:
            tables.append(table.slice(i * chunk_size, chunk_size + (table.num_rows % no_of_partitions)))

    logger.info(f'Repartition Start:{file_name}')
    for i, partition_table in enumerate(tables):
        output_folder_path = f'../repartition/{year}/{month}'
        output_file_name = f'repartition-{i}-yellow-taxi-data-{year}-{month}.parquet'
        os.makedirs(output_folder_path, exist_ok=True)
        
        output_file_path = f'{output_folder_path}/{output_file_name}'
        pq.write_table(partition_table, output_file_path)
    logger.info(f'Repartition Complete:{file_name}')

In [16]:
page_url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

In [17]:
html_response = requests.get(page_url)
html_response.raise_for_status()

In [18]:
soup = BeautifulSoup(html_response.text, 'html.parser')

In [19]:
headers = soup.find_all('a', attrs={'title': 'Yellow Taxi Trip Records'})

In [25]:
latest_5_links = [headers[i] for i in range(5)]  

In [8]:
for link in latest_5_links:
    url = link.get('href').strip()
    file_name = url.split('/')[-1]
    year, month = file_name.split('_')[-1].split('.')[0].split('-')

    download_parquet(url, file_name, year, month)

    no_of_partitions = 4
    repartition_parquet(file_name, year, month, no_of_partitions)

Download Start: yellow_tripdata_2024-01.parquet
Download Complete: yellow_tripdata_2024-01.parquet
Repartition Start:yellow_tripdata_2024-01.parquet
Repartition Complete:yellow_tripdata_2024-01.parquet
Download Start: yellow_tripdata_2024-02.parquet
Download Complete: yellow_tripdata_2024-02.parquet
Repartition Start:yellow_tripdata_2024-02.parquet
Repartition Complete:yellow_tripdata_2024-02.parquet
Download Start: yellow_tripdata_2024-03.parquet
Download Complete: yellow_tripdata_2024-03.parquet
Repartition Start:yellow_tripdata_2024-03.parquet
Repartition Complete:yellow_tripdata_2024-03.parquet
Download Start: yellow_tripdata_2024-04.parquet
Download Complete: yellow_tripdata_2024-04.parquet
Repartition Start:yellow_tripdata_2024-04.parquet
Repartition Complete:yellow_tripdata_2024-04.parquet
Download Start: yellow_tripdata_2024-05.parquet
Download Complete: yellow_tripdata_2024-05.parquet
Repartition Start:yellow_tripdata_2024-05.parquet
Repartition Complete:yellow_tripdata_2024-0

KeyboardInterrupt: 