In [77]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by the whole notebook.
#
# The PostgreSQL JDBC driver jar is registered here, on the first builder call,
# because "spark.jars" is a launch-time setting: a later getOrCreate() against
# an already-running session only applies runtime SQL configurations (Spark
# prints a warning saying so), so the jar would not be added for the JDBC
# writes at the end of the notebook.
# NOTE(review): the jar path is machine-specific — adjust it for your environment.
spark = (
    SparkSession.builder
    .appName("Data Modeling - Star Schema")
    .config("spark.jars", "/home/daman/Downloads/postgresql-42.6.0.jar")
    .getOrCreate()
)


In [78]:
# Load the processed sales dataset and print its schema.
# Parquet files carry their own schema, so the CSV-only reader options
# (header / inferSchema) that used to be passed here were removed: they have no
# effect on a Parquet read and only suggested the source was CSV.
df = spark.read.parquet("/home/daman/Downloads/notebooks/SalesTrac/data/processed")
df.printSchema()

root
 |-- row_id: integer (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_date: date (nullable = true)
 |-- ship_date: date (nullable = true)
 |-- ship_mode: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- segment: string (nullable = true)
 |-- country: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: integer (nullable = true)
 |-- region: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- sub-category: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- sales: double (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- discount: double (nullable = true)
 |-- profit: double (nullable = true)
 |-- order_month: integer (nullable = true)
 |-- order_year: integer (nullable = true)



In [79]:
# Customer dimension: the distinct (customer_id, customer_name, segment) rows.
# NOTE(review): de-duplication is over the whole row, not over customer_id alone,
# so an id that appears with two different names/segments yields two rows — confirm
# the key is unique before relying on it.
customer_columns = ["customer_id", "customer_name", "segment"]
dim_customer = df.select(*customer_columns).dropDuplicates()
(
    dim_customer.write
    .mode("overwrite")
    .parquet("/home/daman/Downloads/notebooks/SalesTrac/data/warehouse/dim_customer")
)

# Product dimension: the distinct (product_id, product_name, category, sub-category) rows.
product_columns = ["product_id", "product_name", "category", "sub-category"]
dim_product = df.select(*product_columns).dropDuplicates()
(
    dim_product.write
    .mode("overwrite")
    .parquet("/home/daman/Downloads/notebooks/SalesTrac/data/warehouse/dim_product")
)



In [80]:
from pyspark.sql.functions import col, monotonically_increasing_id, row_number
from pyspark.sql.window import Window
# Location dimension: one row per distinct (city, state, postal_code, country,
# region) combination, keyed by a generated integer location_id.
location_columns = ["city", "state", "postal_code", "country", "region"]
location_df = df.select(*location_columns).dropDuplicates()

# Ordering by every attribute with no partitioning numbers all locations in a
# single global sequence starting at 1; Spark evaluates such a window in one
# partition and logs a warning about it.
windowSpec = Window.orderBy(*location_columns)

# Emit the surrogate key first, followed by the descriptive attributes.
dim_location = location_df.select(
    row_number().over(windowSpec).alias("location_id"),
    *location_columns,
)

location_dim_path = "/home/daman/Downloads/notebooks/SalesTrac/data/warehouse/dim_location"
dim_location.write.mode("overwrite").parquet(location_dim_path)
dim_location.show(5)

25/06/09 20:54:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/09 20:54:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/09 20:54:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/09 20:54:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/09 20:54:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/09 20:54:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/09 2

+-----------+-----------+------------+-----------+-------------+-------+
|location_id|       city|       state|postal_code|      country| region|
+-----------+-----------+------------+-----------+-------------+-------+
|          1|   Aberdeen|South Dakota|      57401|United States|Central|
|          2|    Abilene|       Texas|      79605|United States|Central|
|          3|      Akron|        Ohio|      44312|United States|   East|
|          4|Albuquerque|  New Mexico|      87105|United States|   West|
|          5| Alexandria|    Virginia|      22304|United States|  South|
+-----------+-----------+------------+-----------+-------------+-------+
only showing top 5 rows



In [81]:
from pyspark.sql.functions import year, month

# Order dimension: the distinct (order_id, order_date, ship_date, ship_mode) rows,
# extended with the calendar year and month extracted from order_date.
order_attributes = df.select(
    "order_id", "order_date", "ship_date", "ship_mode"
).dropDuplicates()

dim_order = (
    order_attributes
    .withColumn("order_year", year("order_date"))
    .withColumn("order_month", month("order_date"))
)

order_dim_path = "/home/daman/Downloads/notebooks/SalesTrac/data/warehouse/dim_order"
dim_order.write.mode("overwrite").parquet(order_dim_path)


In [82]:
# Build the fact table by attaching the surrogate location_id to every sales row.
#
# The location attributes are compared with null-safe equality instead of a
# plain join on the column list. With ordinary equality a NULL in any key column
# (all of them are nullable in the schema) never matches, so such a row would
# silently receive a NULL location_id even though dim_location, which is derived
# from the same rows, contains an entry for it.
from pyspark.sql.functions import col

location_keys = ["city", "state", "postal_code", "country", "region"]

# Aliases keep the two sides distinguishable: dim_location is derived from df,
# so the key column names exist on both sides of the join.
sales_rows = df.alias("s")
locations = dim_location.alias("l")

# AND together one null-safe comparison per location attribute.
location_match = None
for key in location_keys:
    key_equal = col(f"s.{key}").eqNullSafe(col(f"l.{key}"))
    location_match = key_equal if location_match is None else location_match & key_equal

# Keep every sales column once and add only the surrogate key from the dimension.
df_with_location = (
    sales_rows
    .join(locations, on=location_match, how="left")
    .select("s.*", "l.location_id")
)

fact_sales = df_with_location.select(
    "order_id", "product_id", "customer_id", "location_id",
    "sales", "quantity", "discount", "profit"
)

fact_sales.write.mode("overwrite").parquet("/home/daman/Downloads/notebooks/SalesTrac/data/warehouse/fact_sales")
fact_sales.show(5)

25/06/09 20:54:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/09 20:54:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/09 20:54:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/09 20:54:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/09 20:54:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/09 20:54:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/09 2

+--------------+---------------+-----------+-----------+-------+--------+--------+--------+
|      order_id|     product_id|customer_id|location_id|  sales|quantity|discount|  profit|
+--------------+---------------+-----------+-----------+-------+--------+--------+--------+
|CA-2017-100013|OFF-AR-10001022|   ZC-21910|        374|   5.68|       2|     0.0|  1.7608|
|CA-2017-100055|OFF-AP-10001469|   MD-17860|        343| 125.13|       3|     0.0| 36.2877|
|CA-2017-100097|TEC-PH-10002310|   MN-17935|        453| 979.95|       5|     0.0|264.5865|
|CA-2017-100111|FUR-CH-10003061|   SV-20365|        456| 80.991|       1|     0.1|  8.0991|
|CA-2017-100111|FUR-CH-10003846|   SV-20365|        456|272.646|       3|     0.1| 18.1764|
+--------------+---------------+-----------+-----------+-------+--------+--------+--------+
only showing top 5 rows



In [83]:
# Report the row count of the fact table and of each dimension, one line per table.
count_report = [
    ("Fact Sales:", fact_sales),
    ("dim_customer:", dim_customer),
    ("dim_product:", dim_product),
    ("dim_order:", dim_order),
    ("dim_location:", dim_location),
]
for label, frame in count_report:
    print(label, frame.count())


Fact Sales: 10986
dim_customer: 1793
dim_product: 2894
dim_order: 6009
dim_location: 712


In [84]:
# Spot-check: read the persisted customer dimension back from disk and display
# a single row without truncating the column values.
dim_customer_path = '/home/daman/Downloads/notebooks/SalesTrac/data/warehouse/dim_customer'
spark.read.parquet(dim_customer_path).show(1, truncate=False)

+-----------+-------------+---------+
|customer_id|customer_name|segment  |
+-----------+-------------+---------+
|DP-13105   |Dave Poirier |Corporate|
+-----------+-------------+---------+
only showing top 1 row



## Load to PostgreSQL

In [45]:
from pyspark.sql import SparkSession

# Request a session that has the PostgreSQL JDBC driver jar attached.
# NOTE(review): getOrCreate() returns the already-running session when one
# exists; the warning printed under this cell says that in that case only
# runtime SQL configurations take effect, so "spark.jars" may not be applied
# here — confirm the driver is actually available before the JDBC writes.
spark = (
    SparkSession.builder
    .config("spark.jars", "/home/daman/Downloads/postgresql-42.6.0.jar")
    .getOrCreate()
)


25/06/09 18:55:56 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [50]:
import os

# JDBC target and credentials for the warehouse load.
# Each value can be overridden through an environment variable so real
# credentials never have to be committed with the notebook; the fallbacks are
# the local development values this notebook has always used.
#   SALESTRAC_JDBC_URL     - full JDBC URL of the target database
#   SALESTRAC_DB_USER      - database user
#   SALESTRAC_DB_PASSWORD  - database password
jdbc_url = os.environ.get(
    "SALESTRAC_JDBC_URL", "jdbc:postgresql://localhost:5432/salestrac_db"
)
connection_properties = {
    "user": os.environ.get("SALESTRAC_DB_USER", "admin"),
    "password": os.environ.get("SALESTRAC_DB_PASSWORD", "admin"),
    # Fully qualified class name of the PostgreSQL JDBC driver.
    "driver": "org.postgresql.Driver",
}

In [None]:
# Push every warehouse table to PostgreSQL over JDBC: the four dimensions
# first, then the fact table. Each table is written with mode "overwrite".
warehouse_tables = [
    ("dim_customer", dim_customer),
    ("dim_product", dim_product),
    ("dim_location", dim_location),
    ("dim_order", dim_order),
    ("fact_sales", fact_sales),
]

for table_name, frame in warehouse_tables:
    frame.write.jdbc(
        url=jdbc_url,
        table=table_name,
        mode="overwrite",
        properties=connection_properties,
    )
