In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, TimestampType, DoubleType
import sys
# Obtain the Spark session for this notebook: getOrCreate() returns an
# existing session when there is one, otherwise it builds a new session
# with the application name "DataStoreObject".
spark = (
    SparkSession.builder
    .appName("DataStoreObject")
    .getOrCreate()
)

In [1]:
# Schemas used by this notebook: the input tables are read from the source
# schema and the merged result is written to the target schema.
source_schema_name, target_schema_name = "_silver", "_gold"

Reading the Customer, Orders, and Product tables from the `_silver` schema

In [None]:
# Load every column of the customer table from the source schema.
# spark.table() resolves the qualified table name through the catalog, so
# no SQL text has to be assembled by hand for a plain "SELECT *".
table_name = 'customer_table'
customer_df = spark.table(f"{source_schema_name}.{table_name}")

In [None]:
# Load every column of the product table from the source schema.
# spark.table() resolves the qualified table name through the catalog, so
# no SQL text has to be assembled by hand for a plain "SELECT *".
table_name = 'product_table'
product_df = spark.table(f"{source_schema_name}.{table_name}")

In [None]:
# Load every column of the orders table from the source schema and round
# the Profit column to two decimal places. Reading and rounding happen in
# one expression, so orders_df is assigned exactly once in this cell.
table_name = 'orders_table'
orders_df = (
    spark.table(f"{source_schema_name}.{table_name}")
    .withColumn("Profit", F.round(F.col("Profit"), 2))
)

Merging the Orders, Product, and Customer tables
![DSO.png](../img/DSO.png)

In [2]:

# Attach a qualifier ("customer", "orders", "product") to each DataFrame so
# that columns can be referenced by qualified name, e.g. "orders.Order_ID".
customer_alias, orders_alias, product_alias = (
    customer_df.alias("customer"),
    orders_df.alias("orders"),
    product_df.alias("product"),
)

In [None]:
# Inner-join customers to orders on Customer_ID. The keys are referenced by
# their alias-qualified names so each side of the condition is unambiguous.
intermediate_df = customer_alias.join(
    orders_alias,
    on=F.col("customer.Customer_ID") == F.col("orders.Customer_ID"),
    how="inner",
)

# Inner-join the result to products on Product_ID. Qualifying the left key
# avoids an unqualified lookup on the already-joined frame, which would be
# ambiguous if more than one input carried a Product_ID column.
# NOTE(review): assumes Product_ID on the left side comes from the orders
# table - confirm against the _silver schema.
master_dso_df = intermediate_df.join(
    product_alias,
    on=F.col("orders.Product_ID") == F.col("product.Product_ID"),
    how="inner",
)

In [None]:
# Reduce the joined frame to the output columns. Each name is qualified
# with the alias of the DataFrame it is taken from, because Customer_ID and
# Product_ID are present on both sides of the joins.
master_dso_df = master_dso_df.select(
    *(
        F.col(qualified_name)
        for qualified_name in (
            "orders.Order_ID",
            "orders.Order_Date",
            "customer.Customer_ID",
            "customer.Customer_Name",
            "customer.Country",
            "product.Product_ID",
            "product.Product_Name",
            "product.Category",
            "product.Sub-Category",
            "orders.Quantity",
            "orders.Discount",
            "product.Price_Per_Product",
            "orders.Profit",
        )
    )
)

In [None]:
# Persist the merged frame as a table in the target schema; "overwrite"
# mode replaces any data already stored under that table name.
table_name = 'master_dso'
master_dso_df.write.saveAsTable(
    f"{target_schema_name}.{table_name}",
    mode="overwrite",
)