In [1]:
import os
from typing import Iterator
import pandas as pd
from dlt.sources.helpers import requests
from dlt.common.libs.pyarrow import pyarrow as pa
import gzip
import dlt
from dlt.sources import TDataItems
from dlt.sources.filesystem import FileItemDict, filesystem, readers, read_csv
import io
import datetime

# Bucket name used as the first component of every S3/MinIO object path built in this notebook.
BUCKET_NAME = "zoomcamp"

In [52]:
def get_taxi_data_url(taxi_type, year, month):
    """Build the download URL of one monthly trip-data archive.

    Args:
        taxi_type: Dataset name; it is used both as the release path segment
            and as the file-name prefix.
        year: Calendar year, inserted into the file name as-is.
        month: Month number as an int or a numeric string; it is zero-padded
            to two digits in the file name.

    Returns:
        The URL of the ``<taxi_type>_tripdata_<year>-<MM>.csv.gz`` asset.

    Raises:
        ValueError: If ``month`` is a string that is not a valid integer.
    """
    base_url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download"
    # The ``02d`` format spec only accepts integers, so normalise the month
    # first; this lets callers pass "1" as well as 1.
    month = int(month)
    return f"{base_url}/{taxi_type}/{taxi_type}_tripdata_{year}-{month:02d}.csv.gz"


In [53]:
@dlt.source(name="ny_taxi")
def ny_taxi_source(
    taxi_type: str,
    year: int,
    month: int,
):
    """dlt source exposing one month of trip data as a single resource.

    Args:
        taxi_type: Dataset name; passed to ``get_taxi_data_url`` and used as
            the suffix of the resource name (``taxi_data_<taxi_type>``).
        year: Calendar year of the file to download.
        month: Month number of the file to download.

    Returns:
        The ``taxi_data_chunker`` resource, which yields one DataFrame
        holding the whole monthly file.
    """
    # ``datetime.date`` and the URL's ``02d`` format both need integers, so
    # normalise once here; numeric strings are accepted as well.
    year, month = int(year), int(month)

    @dlt.resource(
        name=f"taxi_data_{taxi_type}",
        file_format="csv",
        write_disposition="append",
    )
    def taxi_data_chunker():
        """Download the monthly .csv.gz file and yield it as one DataFrame.

        The frame gets an extra ``custom_date`` column set to the first day
        of the requested month.
        """
        url = get_taxi_data_url(taxi_type, year, month)

        # The whole body is buffered through ``.content`` below, so requesting
        # a streamed response would have no effect.
        response = requests.get(url)
        response.raise_for_status()

        # Explicit dtypes for the known columns.
        # NOTE(review): presumably float64 is chosen so integer-like columns
        # with missing values still parse — confirm.
        dtype = {
            "VendorID": "float64",
            "passenger_count": "float64",
            "trip_distance": "float64",
            "RatecodeID": "float64",
            "store_and_fwd_flag": "object",
            "PULocationID": "float64",
            "DOLocationID": "float64",
            "payment_type": "float64",
            "fare_amount": "float64",
            "extra": "float64",
            "mta_tax": "float64",
            "tip_amount": "float64",
            "tolls_amount": "float64",
            "improvement_surcharge": "float64",
            "total_amount": "float64",
            "congestion_surcharge": "float64",
        }

        # Decompress the .gz payload in memory and parse it as CSV
        with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as gz_file:
            df = pd.read_csv(gz_file, dtype=dtype, low_memory=False)

        # Tag every row with the first day of the requested month
        df["custom_date"] = datetime.date(year, month, 1)

        yield df    # table

    return taxi_data_chunker

In [54]:
# Load environment variables from the .env file
# NOTE(review): these imports sit mid-notebook; consider moving them to the first (imports) cell.
import s3fs
from dotenv import load_dotenv
load_dotenv()

True

In [55]:
# Connection settings for MinIO; credentials and endpoint are read from the
# environment, and SSL is switched off.
s3_config = dict(
    key=os.getenv("AWS_ACCESS_KEY_ID"),
    secret=os.getenv("AWS_SECRET_ACCESS_KEY"),
    endpoint_url=os.getenv("ENDPOINT_URL"),
    use_ssl=False,
)

In [56]:
# Create the s3fs filesystem object for S3/MinIO from the settings in s3_config
fs = s3fs.S3FileSystem(**s3_config)

In [57]:
# Parameters of the single-file load: dataset, target folder/table name and period
taxi_type = "green"
table_name = f"{taxi_type}_taxi_data"
year, month = 2019, 1

In [42]:
# Loading data
# Use the `month` variable (not a literal) so the downloaded month always
# matches the month encoded in the object name below.
data = ny_taxi_source(taxi_type=taxi_type, year=year, month=month)

# Custom file path and name
file_name = f"{table_name}_{year}-{month:02d}.csv"
s3_path = f"{BUCKET_NAME}/{table_name}/{file_name}"

# Saving data to S3/MinIO
with fs.open(s3_path, "w") as f:
    for chunk_index, chunk in enumerate(data):
        # Write the CSV header only with the first chunk; repeating it for
        # later chunks would put header rows in the middle of the file.
        chunk.to_csv(f, index=False, header=(chunk_index == 0))

print(f"File {s3_path} successfully saved to S3/MinIO.")

File zoomcamp/green_taxi_data/green_taxi_data_2019-01.csv successfully saved to S3/MinIO.


In [60]:
# Data pipeline for many files: every month of 2019-2020 for green and yellow taxis
for taxi_type in ['green', 'yellow']:
    for year in [2019, 2020]:
        for month in range(1, 13):
            data = ny_taxi_source(taxi_type=taxi_type, year=year, month=month)
            file_name = f"{taxi_type}_tripdata_{year}-{month:02d}.csv"
            s3_path = f"{BUCKET_NAME}/{taxi_type}_taxi_data/{file_name}"
            with fs.open(s3_path, "w") as f:
                for chunk_index, chunk in enumerate(data):
                    # Header only once per file, so a source that yields
                    # several frames still produces a single valid CSV.
                    chunk.to_csv(f, index=False, header=(chunk_index == 0))
            print(f"File {s3_path} successfully saved to S3/MinIO.")
print("Loading completed")

File zoomcamp/green_taxi_data/green_tripdata_2019-10.csv successfully saved to S3/MinIO.
File zoomcamp/green_taxi_data/green_tripdata_2019-11.csv successfully saved to S3/MinIO.
File zoomcamp/green_taxi_data/green_tripdata_2019-12.csv successfully saved to S3/MinIO.
File zoomcamp/green_taxi_data/green_tripdata_2020-10.csv successfully saved to S3/MinIO.
File zoomcamp/green_taxi_data/green_tripdata_2020-11.csv successfully saved to S3/MinIO.
File zoomcamp/green_taxi_data/green_tripdata_2020-12.csv successfully saved to S3/MinIO.
File zoomcamp/yellow_taxi_data/yellow_tripdata_2019-10.csv successfully saved to S3/MinIO.
File zoomcamp/yellow_taxi_data/yellow_tripdata_2019-11.csv successfully saved to S3/MinIO.
File zoomcamp/yellow_taxi_data/yellow_tripdata_2019-12.csv successfully saved to S3/MinIO.
File zoomcamp/yellow_taxi_data/yellow_tripdata_2020-10.csv successfully saved to S3/MinIO.
File zoomcamp/yellow_taxi_data/yellow_tripdata_2020-11.csv successfully saved to S3/MinIO.
File zoomca

In [62]:
# Same export for the FHV dataset, all twelve months of 2019
taxi_type = "fhv"
year = 2019
for month in range(1, 13):
    data = ny_taxi_source(taxi_type=taxi_type, year=year, month=month)
    file_name = f"{taxi_type}_tripdata_{year}-{month:02d}.csv"
    s3_path = f"{BUCKET_NAME}/{taxi_type}_taxi_data/{file_name}"
    with fs.open(s3_path, "w") as f:
        for chunk_index, chunk in enumerate(data):
            # Header only once per file, so a source that yields several
            # frames still produces a single valid CSV.
            chunk.to_csv(f, index=False, header=(chunk_index == 0))
    print(f"File {s3_path} successfully saved to S3/MinIO.")
print("Loading completed")

File zoomcamp/fhv_taxi_data/fhv_tripdata_2019-01.csv successfully saved to S3/MinIO.
File zoomcamp/fhv_taxi_data/fhv_tripdata_2019-02.csv successfully saved to S3/MinIO.
File zoomcamp/fhv_taxi_data/fhv_tripdata_2019-03.csv successfully saved to S3/MinIO.
File zoomcamp/fhv_taxi_data/fhv_tripdata_2019-04.csv successfully saved to S3/MinIO.
File zoomcamp/fhv_taxi_data/fhv_tripdata_2019-05.csv successfully saved to S3/MinIO.
File zoomcamp/fhv_taxi_data/fhv_tripdata_2019-06.csv successfully saved to S3/MinIO.
File zoomcamp/fhv_taxi_data/fhv_tripdata_2019-07.csv successfully saved to S3/MinIO.
File zoomcamp/fhv_taxi_data/fhv_tripdata_2019-08.csv successfully saved to S3/MinIO.
File zoomcamp/fhv_taxi_data/fhv_tripdata_2019-09.csv successfully saved to S3/MinIO.
File zoomcamp/fhv_taxi_data/fhv_tripdata_2019-10.csv successfully saved to S3/MinIO.
File zoomcamp/fhv_taxi_data/fhv_tripdata_2019-11.csv successfully saved to S3/MinIO.
File zoomcamp/fhv_taxi_data/fhv_tripdata_2019-12.csv successfully

In [2]:
# Parquet file to inspect. "~" is expanded at run time so the path is not
# tied to one user's home directory.
parquet_path = os.path.expanduser(
    "~/Downloads/00000-0-e37687bc-3731-41c3-accc-8de3667c0154.parquet"
)
dfgz = pd.read_parquet(parquet_path)

In [7]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
dfgz.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1475564 entries, 0 to 1475563
Data columns (total 8 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   dispatching_base_num    1475564 non-null  object
 1   pickup_datetime         1475564 non-null  object
 2   drop_off_datetime       1475564 non-null  object
 3   p_ulocation_id          1474460 non-null  object
 4   d_olocation_id          1474460 non-null  object
 5   sr_flag                 0 non-null        object
 6   affiliated_base_number  1468253 non-null  object
 7   custom_date             1475564 non-null  object
dtypes: object(8)
memory usage: 90.1+ MB
None


In [6]:
# Preview the first 15 rows of the loaded frame
dfgz.head(15)

Unnamed: 0,dispatching_base_num,pickup_datetime,drop_off_datetime,p_ulocation_id,d_olocation_id,sr_flag,affiliated_base_number,custom_date
0,B00013,2019-03-01 00:09:15,2019-03-01 00:26:20,264.0,264.0,,B00013,2019-03-01
1,B00013,2019-03-01 00:00:08,2019-03-02 00:30:33,264.0,264.0,,B00013,2019-03-01
2,B00013,2019-03-01 00:55:38,2019-03-01 01:03:13,264.0,264.0,,B00013,2019-03-01
3,B00013,2019-03-01 00:56:30,2019-03-01 01:00:43,264.0,264.0,,B00013,2019-03-01
4,B00014,2019-03-01 00:53:03,2019-03-01 00:59:46,264.0,264.0,,B00014,2019-03-01
5,B00014,2019-03-01 00:14:07,2019-03-01 00:21:40,264.0,264.0,,B00014,2019-03-01
6,B00014,2019-03-01 00:33:34,2019-03-01 01:00:09,264.0,264.0,,B00014,2019-03-01
7,B00014,2019-03-01 00:58:43,2019-03-01 01:26:38,264.0,264.0,,B00014,2019-03-01
8,B00014,2019-03-01 00:07:27,2019-03-02 00:09:05,264.0,264.0,,B00014,2019-03-01
9,B00014,2019-03-01 00:51:38,2019-03-01 01:35:07,264.0,264.0,,B00014,2019-03-01
