### Customers

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
import os, sys

# Add the current working directory to the import path so modules located
# there can be imported from this notebook. The count check keeps re-runs of
# this cell from appending the same entry more than once.
repo_root = os.getcwd()
if sys.path.count(repo_root) == 0:
    sys.path.append(repo_root)

In [0]:
# Read the table "pysparkdbt.bronze.customers" into df_cust; the following
# cells reassign df_cust as columns are added, cleaned and dropped.
df_cust = spark.read.table("pysparkdbt.bronze.customers")

In [0]:
# Show the customers frame as loaded, before any transformation is applied.
display(df_cust)

In [0]:
# Derive a "domain" column from email: split the value on "@" and keep the
# element at index 1 of the resulting array.
email_parts = split(col("email"), "@")
df_cust = df_cust.withColumn("domain", email_parts.getItem(1))
display(df_cust)

In [0]:
# Remove every character that is not a digit (0-9) from phone_number,
# overwriting the column with the digits-only value.
digits_only_phone = regexp_replace(col("phone_number"), "[^0-9]", "")
df_cust = df_cust.withColumn("phone_number", digits_only_phone)
display(df_cust)

In [0]:
# Create full_name by concatenating first_name and last_name with a single
# space. The frame is displayed while the two source columns are still
# present; they are dropped only afterwards.
name_parts = [col("first_name"), col("last_name")]
df_cust = df_cust.withColumn("full_name", concat_ws(" ", *name_parts))
display(df_cust)
df_cust = df_cust.drop("first_name", "last_name")

In [0]:
# Shared transformation helpers from the project-local utilities package.
# The instance `t` is reused by the later cells in this notebook.
from utilities.custom_utils import Transformations

t = Transformations()

# Deduplicate the customers on customer_id, passing last_updated_timestamp
# and ingestion_timestamp as the remaining positional arguments.
# NOTE(review): presumably these are the recency column and its tie-breaker;
# confirm against Transformations.deduplicate.
df_transformed = t.deduplicate(
    df_cust,
    ["customer_id"],
    "last_updated_timestamp",
    "ingestion_timestamp",
)
display(df_transformed)

In [0]:
# Run the shared process_timestamp helper over the deduplicated customers and
# show the result.
# NOTE(review): presumably this adds or normalises a processing-timestamp
# column; confirm in utilities.custom_utils.
df_transformed = t.process_timestamp(df_transformed)
display(df_transformed)

In [0]:
from pyspark.sql.functions import col

# Hand the prepared customers to the shared upsert helper: rows are keyed on
# customer_id, the target is the silver customers table, and
# last_updated_timestamp is passed as the cdc column.
table = "pysparkdbt.silver.customers"
cdc_col = "last_updated_timestamp"
t.upsert(df=df_transformed, key_cols=["customer_id"], target_table=table, cdc=cdc_col)

In [0]:
# Re-running this cell should keep showing 200 rows as long as no new records
# have been inserted.
# NOTE(review): this displays df_transformed (the frame passed to the upsert),
# not the upsert's target table; confirm which of the two the row-count check
# is meant to cover.
display(df_transformed)

### Drivers Table

In [0]:
# Read the table "pysparkdbt.bronze.drivers" into df_driver and show it as
# loaded, before any transformation is applied.
df_driver = spark.read.table("pysparkdbt.bronze.drivers")
display(df_driver)

In [0]:
# Create full_name by concatenating first_name and last_name with a single
# space. The frame is displayed while the two source columns are still
# present; they are dropped only afterwards.
name_parts = [col("first_name"), col("last_name")]
df_driver = df_driver.withColumn("full_name", concat_ws(" ", *name_parts))
display(df_driver)
df_driver = df_driver.drop("first_name", "last_name")

In [0]:
# Remove every character that is not a digit (0-9) from phone_number,
# overwriting the column with the digits-only value.
digits_only_phone = regexp_replace(col("phone_number"), "[^0-9]", "")
df_driver = df_driver.withColumn("phone_number", digits_only_phone)
display(df_driver)

In [0]:
# Duplicate checks on the drivers: one report keyed on driver_id alone and one
# keyed on driver_id together with last_updated_timestamp.
dup_primary = t.duplicates_report(df_driver, ["driver_id"])
dup_temporal = t.duplicates_report(
    df_driver,
    ["driver_id", "last_updated_timestamp"],
)
display(dup_primary)
display(dup_temporal)

In [0]:
# Null report over the driver columns listed below.
# NOTE(review): the shape of the report is defined by
# Transformations.nulls_report, which is not visible from this notebook.
nulls = t.nulls_report(
    df_driver,
    [
        "driver_id",
        "last_updated_timestamp",
        "phone_number",
        "vehicle_id",
        "driver_rating",
        "city",
    ],
)
display(nulls)

In [0]:
# Split the drivers into valid and rejected rows by requiring non-null values
# in driver_id and last_updated_timestamp.
#
# The result is deliberately NOT stored in a variable named `split`: that name
# is pyspark.sql.functions.split (brought in by the wildcard import at the top
# of the notebook) and is called when the customer email domain is derived.
# Rebinding it here would leave `split` non-callable for the rest of the
# session, so re-running that cell (or any later split(...) call) would fail.
non_null_split = t.require_non_null(df_driver, ["driver_id", "last_updated_timestamp"])
valid = non_null_split["valid"]
rejected = non_null_split["rejected"]

In [0]:
# Apply a range check of 0.0 to 5.0 on driver_rating (inclusive=True) and keep
# only the "valid" part of the helper's result.
rating_bounds = {"driver_rating": (0.0, 5.0)}
valid = t.filter_by_ranges(valid, rating_bounds, inclusive=True)["valid"]

In [0]:
# Deduplicate the valid drivers per driver_id, passing last_updated_timestamp
# as the cdc column and ingestion_timestamp as the tie breaker.
df_silver = t.deduplicate_by_recency(
    df=valid,
    keys=["driver_id"],
    cdc="last_updated_timestamp",
    tie_breaker="ingestion_timestamp",
)

In [0]:
# Show the deduplicated drivers frame before it is passed to the upsert.
display(df_silver)

In [0]:
# Hand the deduplicated drivers to the shared upsert helper: rows are keyed on
# driver_id, the target is the silver drivers table, and
# last_updated_timestamp is passed as the cdc column.
t.upsert(
    df=df_silver,
    key_cols=["driver_id"],
    target_table="pysparkdbt.silver.drivers",
    cdc="last_updated_timestamp",
)

### Vehicles

In [0]:
# Read the table "pysparkdbt.bronze.vehicles" into df_vehicles and show it as
# loaded, before any transformation is applied.
df_vehicles = spark.read.table("pysparkdbt.bronze.vehicles")
display(df_vehicles)

In [0]:
# Clean the license_plate column with the shared sanitize_string helper.
# NOTE(review): the exact cleaning rules live in utilities.custom_utils.
df_vehicles = t.sanitize_string(df_vehicles, ["license_plate"])
display(df_vehicles)
