In [8]:
from bs4 import BeautifulSoup
import requests
import io
import logging
import boto3
from botocore.client import Config
import os

In [15]:
# MinIO credentials are taken from the environment so that no secret is stored
# in the notebook. Each value is None when its variable is not set.
minio_access_key = os.environ.get('MINIO_ROOT_USER')
minio_secret_key = os.environ.get('MINIO_ROOT_PASSWORD')

In [16]:
# S3 client used by every bucket and upload call in this notebook.
# It talks to the endpoint at http://object-storage:9000, authenticates with
# minio_access_key / minio_secret_key and signs requests with Signature V4.
s3 = boto3.client(
    service_name='s3',
    config=Config(signature_version='s3v4'),
    endpoint_url='http://object-storage:9000',
    aws_access_key_id=minio_access_key,
    aws_secret_access_key=minio_secret_key,
)

In [17]:
# Log records go both to a file and to the notebook output.
log_file = '../logs/ingestion.log'

# FileHandler does not create missing parent directories and would raise
# FileNotFoundError on a fresh checkout, so make sure the folder exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler()
    ]
)

# Logger shared by all functions and cells below.
logger = logging.getLogger(__name__)

In [18]:
def create_minio_bucket(bucket_name):
    """Make sure ``bucket_name`` exists, creating it when the lookup fails.

    The bucket is probed with ``s3.head_bucket``. If that call succeeds the
    bucket is left alone; if it raises, the bucket is created.

    Args:
        bucket_name: Name of the bucket to look up or create.

    Raises:
        Exception: Whatever ``s3.create_bucket`` raised; the error is logged
            and then re-raised unchanged.
    """
    try:
        s3.head_bucket(Bucket=bucket_name)
    except Exception:
        # Any failure of the probe is treated as "bucket is missing".
        # `Exception` (not a bare except) lets KeyboardInterrupt propagate.
        try:
            s3.create_bucket(Bucket=bucket_name)
            logger.info(f'Bucket created succesfully: {bucket_name}')
        except Exception as e:
            logger.error(f'Error creating a bucket: {e}')
            raise
    else:
        # Logged outside the try body so a logging problem can never be
        # mistaken for a missing bucket.
        logger.info(f'Bucket already exists: {bucket_name}')

In [19]:
def download_parquet_to_memory(url, timeout=60, chunk_size=8192):
    """Stream the file at ``url`` into an in-memory buffer.

    Args:
        url: Address of the file to download. The text after the last ``/``
            is used as the file name in log messages.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the notebook forever.
        chunk_size: Number of bytes requested per ``iter_content`` chunk.

    Returns:
        io.BytesIO: Buffer holding the complete response body, rewound to
        position 0 so it can be read directly.

    Raises:
        Exception: Whatever ``requests.get`` or ``raise_for_status`` raise,
            e.g. for a connection problem or an error status code.
    """
    # Derive the name from the URL instead of reading a `file_name` global
    # that only exists when the notebook's main loop has already run.
    file_name = url.split('/')[-1]
    logger.info(f'Download Start: {file_name}')

    buffer = io.BytesIO()
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=timeout) as parquet_response:
        parquet_response.raise_for_status()
        for chunk in parquet_response.iter_content(chunk_size=chunk_size):
            if chunk:
                buffer.write(chunk)

    buffer.seek(0)
    logger.info(f'Download Complete: {file_name}')
    return buffer

In [20]:
def upload_to_minio(buffer, bucket_name, object_name):
    """Store the contents of ``buffer`` as ``object_name`` in ``bucket_name``.

    The buffer is always closed before returning, whether or not the upload
    worked, so its memory is released in both cases.

    Args:
        buffer: Seekable binary file-like object (e.g. ``io.BytesIO``) whose
            full contents are uploaded.
        bucket_name: Destination bucket.
        object_name: Key of the object inside the bucket.

    Returns:
        bool: True when the object was stored, False when the upload failed.
        Failures are logged, not raised.
    """
    try:
        buffer.seek(0)
        # Hand over the file-like object itself; buffer.getvalue() would
        # duplicate the whole payload in memory before sending it.
        s3.put_object(Bucket=bucket_name, Key=object_name, Body=buffer)
        logger.info(f"Succesfully uploaded to bucket '{bucket_name}': {object_name}")
        return True
    except Exception as e:
        logger.error(f'Error uploading to MinIO: {e}')
        return False
    finally:
        buffer.close()

In [21]:
def process_and_upload_to_minio(url, file_name, year, month, bucket_name):
    """Download one file and store it under ``raw_data/<year>/<month>/``.

    Args:
        url: Address the file is downloaded from.
        file_name: Name used for the log message and as the last key segment.
        year: Year segment of the object key.
        month: Month segment of the object key.
        bucket_name: Bucket that receives the object.
    """
    downloaded = download_parquet_to_memory(url)
    logger.info(f'Uploading file {file_name} to Bucket: {bucket_name}')
    upload_to_minio(
        downloaded,
        bucket_name,
        f'raw_data/{year}/{month}/{file_name}',
    )

In [22]:
# Web page that is fetched and searched for trip-record download links.
page_url = 'https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page'

In [23]:
# Fetch the listing page. requests waits indefinitely by default, so an
# explicit timeout keeps a stalled server from hanging the notebook.
html_response = requests.get(page_url, timeout=30)
# Stop here if the server answered with an error status.
html_response.raise_for_status()

In [24]:
# Parse the downloaded page text with the 'html.parser' backend.
soup = BeautifulSoup(markup=html_response.text, features='html.parser')

In [25]:
# Every anchor (a) element whose title attribute is 'Yellow Taxi Trip Records'.
headers = soup.find_all('a', title='Yellow Taxi Trip Records')

In [26]:
# Bucket that receives the uploads made by the cells below.
bucket_name = 'nyc-project'
# Probe for the bucket and create it if the probe fails.
create_minio_bucket(bucket_name)

2024-09-19 03:11:36,617 - __main__ - INFO - Bucket created succesfully: nyc-project


In [27]:
# Only files whose year is >= this value are ingested (a lower bound,
# despite the variable's name).
data_till_year = 2019
for link in headers:
    # Bind file_name before the try block so the except handler never sees an
    # unbound name, or a stale one from the previous iteration, when the
    # failure happens before the name has been parsed.
    href = link.get('href')
    file_name = href
    try:
        if not href:
            raise ValueError('link has no href attribute')
        url = href.strip()
        file_name = url.split('/')[-1]
        # e.g. 'yellow_tripdata_2024-01.parquet' -> ('2024', '01')
        year, month = file_name.split('_')[-1].split('.')[0].split('-')
        if int(year) >= data_till_year:
            process_and_upload_to_minio(url, file_name, year, month, bucket_name)
    except Exception as e:
        # Best effort: one bad link is logged and the loop moves on.
        logger.error(f'Error processing file {file_name}: {e}')

2024-09-19 03:11:40,059 - __main__ - INFO - Download Start: yellow_tripdata_2024-01.parquet
2024-09-19 03:11:51,265 - __main__ - INFO - Download Complete: yellow_tripdata_2024-01.parquet
2024-09-19 03:11:51,267 - __main__ - INFO - Uploading file yellow_tripdata_2024-01.parquet to Bucket: nyc-project
2024-09-19 03:11:51,916 - __main__ - INFO - Succesfully uploaded to bucket 'nyc-project': raw_data/2024/01/yellow_tripdata_2024-01.parquet
2024-09-19 03:11:51,919 - __main__ - INFO - Download Start: yellow_tripdata_2024-02.parquet
2024-09-19 03:12:13,530 - __main__ - INFO - Download Complete: yellow_tripdata_2024-02.parquet
2024-09-19 03:12:13,531 - __main__ - INFO - Uploading file yellow_tripdata_2024-02.parquet to Bucket: nyc-project
2024-09-19 03:12:14,145 - __main__ - INFO - Succesfully uploaded to bucket 'nyc-project': raw_data/2024/02/yellow_tripdata_2024-02.parquet
2024-09-19 03:12:14,150 - __main__ - INFO - Download Start: yellow_tripdata_2024-03.parquet
2024-09-19 03:12:29,306 - __

In [30]:
# Upload every CSV file found in the local folder to '<s3_folder>/<file>'.
directory = './dim_table_data/'
s3_folder = 'dim_table_data'
# os.listdir returns entries in arbitrary order; sorting makes the upload
# order (and therefore the log) reproducible between runs.
for file in sorted(os.listdir(directory)):
    if file.endswith('.csv'):
        local_path = os.path.join(directory, file)
        # Object keys always use '/', so build the key by hand; os.path.join
        # would insert a backslash on Windows.
        s3_path = f'{s3_folder}/{file}'
        s3.upload_file(local_path, bucket_name, s3_path)
        logger.info(f'Uploaded Dim Table csv: {file}')

2024-09-19 03:47:54,868 - __main__ - INFO - Uploaded Dim Table csv: payment_type_dim.csv
2024-09-19 03:47:54,987 - __main__ - INFO - Uploaded Dim Table csv: rate_code_dim.csv
2024-09-19 03:47:55,082 - __main__ - INFO - Uploaded Dim Table csv: taxi_zone_dim.csv
