In [80]:
from bs4 import BeautifulSoup
import requests
import io
import logging
import boto3
from botocore.client import Config

In [81]:
s3 = boto3.client(
    's3',
    endpoint_url = 'http://object-storage:9000',
    aws_access_key_id = 'root',
    aws_secret_access_key = 'password',
    config=Config(signature_version='s3v4'),
)

In [82]:
def create_minio_bucket(bucket_name):
    try:
        s3.head_bucket(Bucket=bucket_name)
        loggin.info(f'Bucket already exists: {bucket_name}')
    except:
        try:
            s3.create_bucket(Bucket=bucket_name)
            logger.info(f'Bucket created succesfully: {bucket_name}')
        except Exception as e:
            logger.error(f'Error creating a bucket: {e}')
            raise

In [83]:
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('../logs/ingestion.log'),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)

In [84]:
def download_parquet_to_memory(url):
    logger.info(f'Download Start: {file_name}')
    
    parquet_response = requests.get(url, stream=True)
    parquet_response.raise_for_status()
    buffer = io.BytesIO()
    
    for chunk in parquet_response.iter_content(chunk_size = 8192):
        buffer.write(chunk)

    logger.info(f'Download Complete: {file_name}')
    return buffer

In [85]:
def upload_to_minio(buffer, bucket_name, object_name):
    try:
        buffer.seek(0)
        s3.put_object(Bucket=bucket_name, Key=object_name, Body=buffer.getvalue())
        logger.info(f"Succesfully uploaded to bucket '{bucket_name}': {bucket_name}")
        buffer.close()
    except Exception as e:
        logger.error(f'Error uploading to MinIO: {e}')

In [86]:
def process_and_upload_to_minio(url, file_name, year, month, bucket_name):
    parquet_buffer = download_parquet_to_memory(url)

    logger.info(f'Uploading file {file_name} to Bucket: {bucket_name}')

    object_name = f'{year}/{month}/{file_name}'

    upload_to_minio(parquet_buffer, bucket_name, object_name)

In [87]:
page_url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

In [88]:
html_response = requests.get(page_url)
html_response.raise_for_status()

In [89]:
soup = BeautifulSoup(html_response.text, 'html.parser')

In [90]:
headers = soup.find_all('a', attrs={'title': 'Yellow Taxi Trip Records'})

In [91]:
bucket_name = 'data'
create_minio_bucket(bucket_name)

2024-09-10 03:06:22,138 - __main__ - INFO - Bucket created succesfully: data


In [None]:
data_till_year = 2019
for link in headers:
    url = link.get('href').strip()
    file_name = url.split('/')[-1]
    year, month = file_name.split('_')[-1].split('.')[0].split('-')
    if int(year) >= data_till_year:
        process_and_upload_to_minio(url, file_name, year, month, bucket_name)
    else:
        break

2024-09-10 03:06:24,778 - __main__ - INFO - Download Start: yellow_tripdata_2024-01.parquet
2024-09-10 03:06:49,856 - __main__ - INFO - Download Complete: yellow_tripdata_2024-01.parquet
2024-09-10 03:06:49,859 - __main__ - INFO - Uploading file yellow_tripdata_2024-01.parquet to Bucket: data
2024-09-10 03:06:50,521 - __main__ - INFO - Succesfully uploaded to bucket 'data': data
2024-09-10 03:06:50,525 - __main__ - INFO - Download Start: yellow_tripdata_2024-02.parquet
2024-09-10 03:07:04,252 - __main__ - INFO - Download Complete: yellow_tripdata_2024-02.parquet
2024-09-10 03:07:04,253 - __main__ - INFO - Uploading file yellow_tripdata_2024-02.parquet to Bucket: data
2024-09-10 03:07:04,936 - __main__ - INFO - Succesfully uploaded to bucket 'data': data
2024-09-10 03:07:04,941 - __main__ - INFO - Download Start: yellow_tripdata_2024-03.parquet
2024-09-10 03:07:21,345 - __main__ - INFO - Download Complete: yellow_tripdata_2024-03.parquet
2024-09-10 03:07:21,347 - __main__ - INFO - Uploa