In [118]:
import requests
from bs4 import BeautifulSoup

class WebScraper:
    """Collect anchor (``<a>``) links from a single web page.

    Parameters
    ----------
    url : str
        Address of the page to download and parse.
    timeout : float or tuple, optional
        Value forwarded to ``requests.get(timeout=...)`` so a stalled server
        cannot block the caller forever. Defaults to 30 seconds.
    """

    def __init__(self, url, timeout=30):
        self.url = url
        self.timeout = timeout

    def _fetch_soup(self):
        """Download ``self.url`` and return the parsed BeautifulSoup document.

        Raises
        ------
        requests.RequestException
            On connection problems, timeouts, or (via ``raise_for_status``)
            a 4xx/5xx response.
        """
        html_response = requests.get(self.url, timeout=self.timeout)
        html_response.raise_for_status()
        return BeautifulSoup(html_response.text, 'html.parser')

    @staticmethod
    def _filtered_hrefs(soup, title_filter):
        """Return the cleaned ``href`` of every ``<a>`` whose title matches."""
        tags = soup.find_all('a', attrs={'title': title_filter})
        # Anchors without an href are skipped instead of raising KeyError, and
        # surrounding whitespace in the attribute value is stripped so the
        # result is directly usable as a URL.
        hrefs = (tag.get('href') for tag in tags)
        return [href.strip() for href in hrefs if href]

    def get_all_links(self, title_filter=None):
        """Return the links found on the page.

        Parameters
        ----------
        title_filter : str, optional
            When given, only anchors whose ``title`` attribute equals this
            value are considered.

        Returns
        -------
        list
            With ``title_filter``: a list of href strings (whitespace
            stripped). Without it: the list of anchor tag objects returned by
            ``soup.find_all('a')``.
        """
        soup = self._fetch_soup()
        if not title_filter:
            return soup.find_all('a')
        return self._filtered_hrefs(soup, title_filter)

    def get_latest_link(self, title_filter=None):
        """Return the first matching link in document order.

        NOTE(review): "latest" presumably relies on the page listing the
        newest file first - confirm against the page layout.

        Parameters
        ----------
        title_filter : str, optional
            When given, only anchors whose ``title`` attribute equals this
            value are considered.

        Returns
        -------
        str or list
            With ``title_filter``: the href of the first matching anchor that
            has one. Without it: every anchor tag on the page (kept for
            backward compatibility with the previous behaviour).

        Raises
        ------
        ValueError
            If ``title_filter`` is given and no anchor with an href matches.
        """
        soup = self._fetch_soup()
        if not title_filter:
            return soup.find_all('a')
        hrefs = self._filtered_hrefs(soup, title_filter)
        if not hrefs:
            raise ValueError(
                f"No link with title {title_filter!r} found at {self.url}"
            )
        return hrefs[0]

    def get_date(self, link):
        """Extract ``(year, month)`` strings from a link to a data file.

        The date is read from the text after the last underscore of the file
        name and before its first dot, e.g.
        ``.../yellow_tripdata_2024-01.parquet`` -> ``('2024', '01')``.

        Raises
        ------
        ValueError
            If that part of the name is not exactly ``<year>-<month>``.
        """
        file_name = link.strip().rsplit('/', 1)[-1]
        date_part = file_name.rsplit('_', 1)[-1].split('.')[0]
        year, month = date_part.split('-')
        return year, month

In [119]:
# Page that lists the downloadable trip-record files.
page_url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
web_scraper = WebScraper(page_url)

# Every link titled "Yellow Taxi Trip Records", then the single link picked
# by get_latest_link for the same title.
links = web_scraper.get_all_links(title_filter="Yellow Taxi Trip Records")
latest_link = web_scraper.get_latest_link(title_filter="Yellow Taxi Trip Records")

In [120]:
# Display the link selected by get_latest_link in the previous cell.
latest_link

'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet '

In [140]:
import boto3
import os
from botocore.client import Config
from botocore.exceptions import ClientError

class MinioClient:
    """Small wrapper around a boto3 S3 client pointed at a MinIO endpoint.

    Connection settings live on the class so they can be changed for all
    future instances with :meth:`set_credentials`. Each instance copies the
    settings that are current at construction time and builds its own boto3
    client from them.
    """

    # Class-level defaults. The credentials are read from the environment
    # once, when the class body is executed; they are None if the variables
    # are not set at that moment.
    endpoint = 'http://object-storage:9000'
    access_key = os.getenv('MINIO_ROOT_USER')
    secret_key = os.getenv('MINIO_ROOT_PASSWORD')

    # Error codes treated as "bucket does not exist". A HEAD response has no
    # body, so botocore reports the HTTP status ('404') as the code;
    # 'NoSuchBucket' is accepted too in case the backend supplies the S3 name.
    _MISSING_BUCKET_CODES = ('404', 'NoSuchBucket')

    def __init__(self):
        """Create an S3 client from the current class-level settings."""
        self.endpoint = MinioClient.endpoint
        self.access_key = MinioClient.access_key
        self.secret_key = MinioClient.secret_key
        self.s3 = boto3.client(
            's3',
            endpoint_url=self.endpoint,
            aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key,
            config=Config(signature_version='s3v4')
        )

    @classmethod
    def set_credentials(cls, endpoint, access_key, secret_key):
        """Replace the class-level connection settings.

        Only instances created after this call pick up the new values;
        existing instances keep the client they were built with.
        """
        cls.endpoint = endpoint
        cls.access_key = access_key
        cls.secret_key = secret_key

    def create_bucket(self, bucket_name):
        """Create ``bucket_name`` unless it already exists.

        Raises
        ------
        botocore.exceptions.ClientError
            If the existence check fails for any reason other than the bucket
            being missing (for example access denied), or if creation fails.
        """
        try:
            self.s3.head_bucket(Bucket=bucket_name)
        except ClientError as e:
            error_code = str(e.response.get('Error', {}).get('Code'))
            if error_code not in self._MISSING_BUCKET_CODES:
                raise
            self.s3.create_bucket(Bucket=bucket_name)

    def upload_file(self, bucket_name, object_name, file):
        """Store ``file`` in ``bucket_name`` under the key ``object_name``.

        ``file`` is passed as the ``Body`` of ``put_object``. Any
        ``ClientError`` raised by the client propagates to the caller.
        """
        self.s3.put_object(Bucket=bucket_name, Key=object_name, Body=file)

In [141]:
import os
import io
import requests

class DataProcessor:
    """Download remote files into memory."""

    def download_file(self, url, timeout=60):
        """Download ``url`` and return its content as ``bytes``.

        Parameters
        ----------
        url : str
            Address of the file to fetch.
        timeout : float or tuple, optional
            Forwarded to ``requests.get(timeout=...)`` so a stalled server
            cannot block the caller forever. Defaults to 60 seconds.

        Returns
        -------
        bytes
            The complete response body.

        Raises
        ------
        requests.RequestException
            On connection problems, timeouts, or (via ``raise_for_status``)
            a 4xx/5xx response.
        """
        # The context manager closes the streamed response, releasing the
        # connection even when raise_for_status() or the read fails.
        with requests.get(url, stream=True, timeout=timeout) as file_response:
            file_response.raise_for_status()
            # Join the chunks once instead of writing them to a buffer and
            # then copying the buffer out again.
            return b''.join(file_response.iter_content(chunk_size=8192))

In [145]:
import argparse

def main(download_year, download_month):
    """Download one month of yellow-taxi trip data and store it in MinIO.

    Parameters
    ----------
    download_year : int
        Year of the file to fetch.
    download_month : int
        Month of the file to fetch; zero-padded to two digits in the source
        URL and in the stored file name.

    The object is written to the ``nyc-project`` bucket under
    ``raw-data/<year>/<month>/yellow_tripdata_<year>_<MM>.parquet``. The month
    folder in the key is not zero-padded, matching the existing key layout.
    """
    bucket_name = 'nyc-project'

    # Create MinIO bucket if it doesn't exist, before spending time on the
    # download.
    minio_client = MinioClient()
    minio_client.create_bucket(bucket_name)

    # Initiate Data Processor
    data_processor = DataProcessor()

    # Declare URL and file name
    url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{download_year}-{download_month:02}.parquet'
    file_name = f'yellow_tripdata_{download_year}_{download_month:02}.parquet'

    # Process and upload data. The key is built from this function's own
    # arguments rather than from module-level `year` / `month` variables, so
    # the result does not depend on which globals happen to exist.
    parquet_buffer = data_processor.download_file(url)
    parquet_path = f'raw-data/{download_year}/{download_month}/{file_name}'
    minio_client.upload_file(bucket_name, parquet_path, parquet_buffer)

if __name__ == "__main__":
    # Command-line parsing (argparse with required integer --year / --month
    # options) is disabled here; the period to load is fixed instead.
    # NOTE(review): presumably disabled because the cell runs inside a
    # notebook kernel - confirm before re-enabling.
    year, month = 2024, 10

    main(year, month)