In [3]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator


@dlt.resource(name="rides", write_disposition="replace")
def ny_taxi():
    """Yield pages from the ``data_engineering_zoomcamp_api`` endpoint.

    Registered as the dlt resource ``rides`` with the ``replace`` write
    disposition. Each item yielded is one page as returned by
    ``RESTClient.paginate``.
    """
    # Page numbering starts at 1 and no total-count path is configured
    # (total_path=None).
    page_paginator = PageNumberPaginator(base_page=1, total_path=None)
    api_client = RESTClient(
        base_url="https://us-central1-dlthub-analytics.cloudfunctions.net",
        paginator=page_paginator,
    )

    yield from api_client.paginate("data_engineering_zoomcamp_api")

Why use a Data Lake?

- Cost-effective storage – Cheaper than traditional databases.
- Optimized for big data processing – Works seamlessly with Spark, Databricks, and Presto.
- Easy scalability – Store petabytes of data efficiently.

The filesystem destination enables you to load data into files stored locally or in cloud storage solutions, making it an excellent choice for lightweight testing, prototyping, or file-based workflows.

Below is an example demonstrating how to use the filesystem destination to load data in Parquet format:

- Step 1: Set up a local bucket or cloud directory for storing files

In [4]:
import os

# Location the filesystem destination writes to; a relative path like this one
# is resolved against the notebook's working directory.
os.environ.update(BUCKET_URL="workshop/content")

- Step 2: Define the data source (above)
- Step 3: Run the pipeline

In [5]:
import dlt


# Configure a pipeline whose destination is 'filesystem' (files on local disk
# or in cloud storage) instead of a database.
pipeline = dlt.pipeline(
    destination="filesystem",
    dataset_name="fs_data",
    pipeline_name="fs_pipeline",
)

# Pick the file format for the load: parquet, csv or jsonl.
load_info = pipeline.run(
    ny_taxi,
    loader_file_format="parquet",
)
print(load_info)

Pipeline fs_pipeline load step completed in 0.02 seconds
1 load package(s) were loaded to destination filesystem and into dataset fs_data
The filesystem destination used file:///workspaces/DEZoomCamp2025/workshop/workshop/content location to store data
Load package 1739949879.9430459 is LOADED and contains no failed jobs


Look at the files:

In [None]:
# Uncomment to list the files the pipeline wrote for the `rides` table.
# NOTE(review): BUCKET_URL is set to "workshop/content", so the files presumably
# live under workshop/content/fs_data/rides rather than fs_data/rides — confirm
# the path before running.
#! ls fs_data/rides

Look at the loaded data:

In [6]:
# Read the loaded `rides` table back from the destination as a DataFrame.
(
    pipeline
    .dataset(dataset_type="default")
    .rides
    .df()
)

Unnamed: 0,end_lat,end_lon,fare_amt,passenger_count,payment_type,start_lat,start_lon,tip_amt,tolls_amt,total_amt,trip_distance,trip_dropoff_date_time,trip_pickup_date_time,surcharge,vendor_name,_dlt_load_id,_dlt_id,store_and_forward
0,40.742963,-73.980072,45.0,1,Credit,40.641525,-73.787442,9.0,4.15,58.15,17.52,2009-06-14 23:48:00+00:00,2009-06-14 23:23:00+00:00,0.0,VTS,1739949879.9430459,2AvsGgmCDQf8BQ,
1,40.740187,-74.005698,6.5,1,Credit,40.722065,-74.009767,1.0,0.00,8.50,1.56,2009-06-18 17:43:00+00:00,2009-06-18 17:35:00+00:00,1.0,VTS,1739949879.9430459,xgelUnVlmtCwZA,
2,40.718043,-74.004745,12.5,5,Credit,40.761945,-73.983038,2.0,0.00,15.50,3.37,2009-06-10 18:27:00+00:00,2009-06-10 18:08:00+00:00,1.0,VTS,1739949879.9430459,wIAwyKcLOZRMZw,
3,40.739637,-73.985233,4.9,1,CASH,40.749802,-73.992247,0.0,0.00,5.40,1.11,2009-06-14 23:58:00+00:00,2009-06-14 23:54:00+00:00,0.5,VTS,1739949879.9430459,tsyq4WGBUF3l+Q,
4,40.730032,-73.852693,25.7,1,CASH,40.776825,-73.949233,0.0,4.15,29.85,11.09,2009-06-13 13:23:00+00:00,2009-06-13 13:01:00+00:00,0.0,VTS,1739949879.9430459,oJay+becNXjDZw,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,40.783522,-73.970690,5.7,1,CASH,40.778560,-73.953660,0.0,0.00,5.70,1.16,2009-06-19 11:28:00+00:00,2009-06-19 11:22:00+00:00,0.0,VTS,1739949879.9430459,zflEHVigqmCFMQ,
9996,40.777200,-73.964197,4.1,1,CASH,40.779800,-73.974297,0.0,0.00,4.10,0.89,2009-06-17 07:43:00+00:00,2009-06-17 07:41:00+00:00,0.0,VTS,1739949879.9430459,4nkV2sWtNEHX+g,
9997,40.780172,-73.957617,6.1,1,CASH,40.788388,-73.976758,0.0,0.00,6.10,1.30,2009-06-19 11:46:00+00:00,2009-06-19 11:39:00+00:00,0.0,VTS,1739949879.9430459,W7+THgIKPNhMHg,
9998,40.777342,-73.957242,5.7,1,CASH,40.773828,-73.956690,0.0,0.00,6.20,0.97,2009-06-17 04:19:00+00:00,2009-06-17 04:13:00+00:00,0.5,VTS,1739949879.9430459,ACMwlj1ZhjaQUA,


Table formats: Delta tables & Iceberg
dlt supports writing Delta and Iceberg tables when using the filesystem destination.

How it works:

dlt uses the deltalake and pyiceberg libraries to write Delta and Iceberg tables, respectively. One or multiple Parquet files are prepared during the extract and normalize steps. In the load step, these Parquet files are exposed as an Arrow data structure and fed into deltalake or pyiceberg.

In [None]:
# Uncomment to install dlt with the Iceberg extra. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
# %pip install "dlt[pyiceberg]"

In [7]:
# Same pipeline name and 'filesystem' destination as before, but writing into
# a separate dataset for the Iceberg table.
pipeline = dlt.pipeline(
    destination="filesystem",
    dataset_name="fs_iceberg_data",
    pipeline_name="fs_pipeline",
)

# Pick the table format for the load: delta or iceberg.
load_info = pipeline.run(
    ny_taxi,
    table_format="iceberg",
    loader_file_format="parquet",
)
print(load_info)

Pipeline fs_pipeline load step completed in 1.92 seconds
1 load package(s) were loaded to destination filesystem and into dataset fs_iceberg_data
The filesystem destination used file:///workspaces/DEZoomCamp2025/workshop/workshop/content location to store data
Load package 1739950112.8105237 is LOADED and contains no failed jobs




💡Note:

The open-source version of dlt supports basic Iceberg functionality, and the dltHub team is currently working on an extended, more powerful Iceberg integration.

Join the waiting list to learn more about dlt+ and Iceberg.