In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col
import os

In [14]:
# Build (or reuse) the notebook's SparkSession, named "Ingestion".
# The PostgreSQL JDBC jar is passed through spark.jars so that the
# org.postgresql.Driver class requested by the JDBC reads can be loaded.
spark = (
    SparkSession.builder
    .appName("Ingestion")
    .config("spark.jars", "/spark/jars/postgresql-jdbc.jar")
    .getOrCreate()
)

In [15]:
# Database credentials come from the process environment rather than the
# notebook source; each value is None when its variable is not set.
db_username, db_password = os.getenv("DB_USERNAME"), os.getenv("DB_PASSWORD")


In [16]:
# JDBC URL of the source PostgreSQL database: host "postgres", port 5432,
# database "retail".
jdbc_url = "jdbc:postgresql://postgres:5432/retail"

In [17]:
# Properties handed to every spark.read.jdbc() call: the login taken from the
# environment plus the JDBC driver class to use for PostgreSQL.
connection_params = dict(
    user=db_username,
    password=db_password,
    driver="org.postgresql.Driver",
)

In [18]:
# Read the "orders" table from PostgreSQL over JDBC and expose it to Spark SQL
# as the temp view "orders".
#
# createOrReplaceTempView() returns None, so it must not be chained into the
# assignment: doing so would bind orders_df to None instead of the DataFrame.
orders_df = spark.read.jdbc(
    url=jdbc_url,
    table="orders",
    properties=connection_params,
)
orders_df.createOrReplaceTempView("orders")

In [19]:
# Read the "ordershistory" table from PostgreSQL over JDBC and expose it to
# Spark SQL as the temp view "ordershistory".
#
# createOrReplaceTempView() returns None, so it must not be chained into the
# assignment: doing so would bind orders_history_df to None instead of the
# DataFrame.
orders_history_df = spark.read.jdbc(
    url=jdbc_url,
    table="ordershistory",
    properties=connection_params,
)
orders_history_df.createOrReplaceTempView("ordershistory")

In [20]:
# Inner-join the "orders" and "ordershistory" temp views on orderid, producing
# one row per order/history pair with the columns selected below.
orders_with_history_sql = """
    SELECT
        o.orderid,
        historyid,
        ordertime,
        branch,
        status,
        updatedat
    FROM orders AS o
    INNER JOIN ordershistory AS oh
    ON o.orderid = oh.orderid
    """
result_df = spark.sql(orders_with_history_sql)

In [21]:
# Add an "update" column holding the calendar date of "updatedat"; the write
# step further down partitions the output by this column.
result_df = result_df.withColumn(
    "update",
    to_date(result_df["updatedat"]),
)

In [22]:
# Persist the joined result as Parquet in the bronze layer, one directory per
# value of the "update" date column.
#
# The destination can be overridden with the BRONZE_OUTPUT_URI environment
# variable; when it is unset the original location is used, so default
# behaviour is unchanged.
# NOTE(review): the fallback URI embeds a user/password pair in source.
# Supply the URI through the environment (as is already done for the database
# login) and remove the fallback once deployments set the variable.
# NOTE(review): mode("overwrite") combined with partitionBy replaces everything
# under the target path when Spark runs with its default static
# partition-overwrite mode -- confirm that a full rewrite is intended.
bronze_output_uri = os.environ.get(
    "BRONZE_OUTPUT_URI",
    "hdfs://hadoop:pass@hadoop-namenode:8020/data/bronze/",
)
(
    result_df.write
    .mode("overwrite")
    .partitionBy("update")
    .parquet(bronze_output_uri)
)

                                                                                

In [23]:
# Register the joined, date-enriched DataFrame as the temp view "result" so it
# can be queried with Spark SQL.
result_df.createOrReplaceTempView(name="result")

In [24]:
# Query the "result" view ordered by historyid and print the leading rows as a
# text table for a quick visual check.
ordered_result_df = spark.sql(
    """SELECT * 
        FROM result
        ORDER BY historyid
    """
)
ordered_result_df.show()

+-------+---------+-------------------+--------------------+---------+-------------------+----------+
|orderid|historyid|          ordertime|              branch|   status|          updatedat|    update|
+-------+---------+-------------------+--------------------+---------+-------------------+----------+
|      1|        1|2024-06-15 17:14:56| Grand Avenue Center|      NEW|2024-06-15 17:14:56|2024-06-15|
|      1|        2|2024-06-15 17:14:56| Grand Avenue Center|INTRANSIT|2024-06-15 18:07:56|2024-06-15|
|      2|        3|2024-06-15 18:24:56|Springfield Town ...|      NEW|2024-06-15 18:24:56|2024-06-15|
|      3|        4|2024-06-15 18:56:56|         Metro Plaza|      NEW|2024-06-15 18:56:56|2024-06-15|
|      2|        5|2024-06-15 18:24:56|Springfield Town ...|INTRANSIT|2024-06-15 19:24:56|2024-06-15|
|      2|        6|2024-06-15 18:24:56|Springfield Town ...|DELIVERED|2024-06-15 20:22:56|2024-06-15|
|      4|        7|2024-06-15 21:04:56|    Riverfront Plaza|      NEW|2024-06-15 2