# Ingest Data

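This notebook simulates incremental data ingestion: each run takes the next window of NYC taxi trips from the Databricks sample dataset and appends it to the `warehouse.raw_data` table of the selected environment's catalog.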

In [None]:
# Create the "environment" text widget (second argument "dev" is its default
# value); a later cell reads it back to choose the catalog.
dbutils.widgets.text("environment", "dev")

In [None]:
from pyspark.sql import functions as F
from datetime import datetime, timedelta

# Read the selected environment from the notebook widget and derive the
# per-environment catalog name (brian_ml_<environment>).
curr_env = dbutils.widgets.get("environment")
curr_catalog = f'brian_ml_{curr_env}'

In [None]:
%sql
-- Make the environment's catalog the session default so unqualified
-- schema.table names resolve inside it.
-- NOTE(review): ${environment} is presumably substituted from the
-- "environment" notebook widget -- confirm on the target runtime.
USE CATALOG brian_ml_${environment};

In [None]:
# Work out the next ingestion window from what is already in the warehouse table.
#
# The table is addressed through curr_catalog -- the same fully qualified name
# the append step writes to -- so this cell does not depend on the session's
# current catalog having been switched beforehand.
loaded_table = f'{curr_catalog}.warehouse.raw_data'
loaded_data = spark.table(loaded_table)

# Single-row aggregate holding the newest pickup timestamp loaded so far.
max_date = loaded_data.agg(
    F.max(F.col('tpep_pickup_datetime')).alias('max_loaded')
)

current_max_date = max_date.collect()[0].max_loaded

# max() over an empty table (or an all-NULL column) yields NULL; fail with a
# clear message instead of a TypeError from `None + timedelta` below.
if current_max_date is None:
    raise ValueError(
        f'{loaded_table} has no tpep_pickup_datetime values; '
        'load an initial batch before running incremental ingestion'
    )

# Upper bound of the window: midnight at the start of the second day after the
# current maximum, i.e. the rest of the current day plus the whole next day.
# microsecond is reset as well; otherwise the cutoff is not exactly midnight
# whenever the stored maximum carries fractional seconds.
next_date_cutoff = current_max_date + timedelta(days=2)
next_date_midnight = next_date_cutoff.replace(
    hour=0, minute=0, second=0, microsecond=0
)

print(f'current max date {current_max_date}')
print(f'Next max date {next_date_midnight}')

In [None]:
# Load the sample NYC taxi trips (Delta format) that act as the upstream
# source for this simulated ingestion, and preview them in the notebook.
source_path = "/databricks-datasets/nyctaxi-with-zipcodes/subsampled"
raw_data = spark.read.load(source_path, format="delta")
display(raw_data)

In [None]:
# Select the rows to ingest: pickups strictly after the newest timestamp
# already loaded, up to and including the next cutoff.
pickup_ts = F.col('tpep_pickup_datetime')
is_new = pickup_ts > current_max_date
is_within_window = pickup_ts <= next_date_midnight

filtered_df = raw_data.filter(is_new & is_within_window)

# count() runs the query; the result decides whether anything gets written.
row_count = filtered_df.count()
print(f'collected {row_count} records')

In [None]:
# Append the selected window to the environment's warehouse table; nothing is
# written when the window turned out to be empty.
target_table = f'{curr_catalog}.warehouse.raw_data'
if row_count > 0:
    filtered_df.write.mode('append').saveAsTable(target_table)