### Customers

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
import os, sys

# Put the notebook's working directory on the import path (once) so that
# project-local packages such as `utilities` can be imported further down.
repo_root = os.getcwd()
if sys.path.count(repo_root) == 0:
    sys.path.append(repo_root)

In [0]:
# Load the customer records from the bronze layer.
df_cust = (
    spark.read
    .table("pysparkdbt.bronze.customers")
)

In [0]:
# Preview the customer rows as loaded, before the cleaning steps in the following cells.
# NOTE(review): the frame carries e-mail and phone_number columns — clear the cell output
# before sharing the notebook if these are real customer values.
display(df_cust)

In [0]:
# The domain is the second piece of the e-mail address once it is split on "@".
df_cust = df_cust.withColumn(
    "domain",
    split(col("email"), "@")[1],
)
display(df_cust)

In [0]:
# Keep only the digits of each customer phone number; every other character is removed.
df_cust = df_cust.withColumn("phone_number", regexp_replace(col("phone_number"), "[^0-9]", ""))
display(df_cust)

In [0]:
# Build full_name by joining first_name and last_name with a single space.
df_cust = df_cust.withColumn(
    "full_name",
    concat_ws(" ", "first_name", "last_name"),
)
# Show the result while the source columns are still present, then drop them.
display(df_cust)
df_cust = df_cust.drop("first_name", "last_name")

In [0]:
from utilities.custom_utils import Transformations
# De-duplicate customers with the shared Transformations helper.
# NOTE(review): deduplicate is defined in utilities.custom_utils; judging by the argument
# names it takes the key columns followed by two timestamp columns — confirm their exact
# roles (ordering / tie-breaking) against the helper.
t = Transformations()
df_transformed = t.deduplicate(
    df_cust,
    ["customer_id"],
    "last_updated_timestamp",
    "ingestion_timestamp",
)
display(df_transformed)

In [0]:
# Run the shared timestamp step from Transformations on the de-duplicated customers.
# NOTE(review): process_timestamp lives in utilities.custom_utils, not in this notebook;
# presumably it adds or normalises a processing-time column — confirm against the helper.
df_transformed = t.process_timestamp(df_transformed)
display(df_transformed)

In [0]:
from pyspark.sql.functions import col

# Upsert the cleaned customers into the silver table, keyed on customer_id and using
# last_updated_timestamp as the change-data-capture column passed to the helper.
table = "pysparkdbt.silver.customers"
cdc_col = "last_updated_timestamp"
t.upsert(df=df_transformed, key_cols=["customer_id"], target_table=table, cdc=cdc_col)

In [0]:
# Idempotency check: inspect the upsert *target*, not the in-memory source frame —
# df_transformed is unaffected by the upsert, so displaying it cannot reveal duplicates.
# Re-running the upsert cell must leave the silver row count unchanged (200 at the time
# of writing) as long as no new customer records have arrived.
display(spark.read.table("pysparkdbt.silver.customers"))

### Drivers Table

In [0]:
# Load the driver records from the bronze layer and preview them before cleaning.
df_driver = (
    spark.read
    .table("pysparkdbt.bronze.drivers")
)
display(df_driver)

In [0]:
# Build full_name by joining first_name and last_name with a single space.
df_driver = df_driver.withColumn(
    "full_name",
    concat_ws(" ", "first_name", "last_name"),
)
# Show the result while the source columns are still present, then drop them.
display(df_driver)
df_driver = df_driver.drop("first_name", "last_name")

In [0]:
# Keep only the digits of each driver phone number; every other character is removed.
df_driver = df_driver.withColumn("phone_number", regexp_replace(col("phone_number"), "[^0-9]", ""))
display(df_driver)

In [0]:
%sql
-- Sanity check on bronze driver ratings: both extremes should fall within the range [0, 5].
SELECT
  max(driver_rating) AS max_rating,
  min(driver_rating) AS min_rating
FROM pysparkdbt.bronze.drivers