In [4]:
# Exploratory read of every CSV file under raw/ using Spark's default CSV
# options (no header handling and no explicit schema are requested here).
df = spark.read.format('csv').load('abfss://datalakehouse@synapsedp203dl.dfs.core.windows.net/raw/*.csv')

In [5]:
# Render the DataFrame read in the previous cell (CSV loaded without header
# or schema options).
display(df)

In [6]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Explicit schema for the sales-order CSV files. Fields are added one at a
# time; each keeps the default nullability.
orderSchema = (
    StructType()
    .add("SalesOrderNumber", StringType())
    .add("SalesOrderLineNumber", IntegerType())
    .add("OrderDate", DateType())
    .add("CustomerName", StringType())
    .add("Email", StringType())
    .add("Item", StringType())
    .add("Quantity", IntegerType())
    .add("UnitPrice", FloatType())
    .add("Tax", FloatType())
)

# Re-read the raw CSV files, this time treating the first row as a header and
# applying the schema above.
df = (
    spark.read
    .format('csv')
    .schema(orderSchema)
    .option('header', True)
    .load('abfss://datalakehouse@synapsedp203dl.dfs.core.windows.net/raw/*.csv')
)
  

In [7]:
# Render the DataFrame re-read with the explicit order schema and header=True.
display(df)

In [8]:
# Project the order data down to the customer-related columns.
customers = df.select('CustomerName', 'Email', 'OrderDate')
display(customers)

In [9]:
from pyspark.sql.functions import split, col

# Split CustomerName on single spaces once and reuse the expression for both
# derived columns.
# NOTE(review): only tokens 0 and 1 are kept, so anything after the second
# space-separated token of a name is discarded.
name_parts = split(col("CustomerName"), " ")

customer_data_df = (
    customers
    .withColumn("FirstName", name_parts.getItem(0))
    .withColumn("LastName", name_parts.getItem(1))
    # The combined field is no longer needed once it has been split.
    .drop("CustomerName")
)

display(customer_data_df)

In [10]:
# Put the customer columns into their final order.
customer_data_df = customer_data_df.select(['FirstName', 'LastName', 'Email', 'OrderDate'])

# Show a 10-row sample rather than the whole frame.
display(customer_data_df.limit(10))

# Total row count followed by the distinct-row count, one per line; the
# difference between the two is the number of duplicate rows.
print(customer_data_df.count(), customer_data_df.distinct().count(), sep="\n")

In [11]:
%%sql
-- Create the SALESDB database only when it does not already exist.
CREATE DATABASE IF NOT EXISTS SALESDB;

In [12]:
# Write the customer data in Delta format under refined/Customers, partitioned
# by OrderDate; "overwrite" mode replaces data already at the target.
(
    customer_data_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("OrderDate")
    .save("abfss://datalakehouse@synapsedp203dl.dfs.core.windows.net/refined/Customers")
)
print("Transformed data saved!")

In [13]:
# Register the customer data as the table SALESDB.Customers, stored in Delta
# format at the curated/Customers path and partitioned by OrderDate.
customer_data_df.write.saveAsTable(
    "SALESDB.Customers",
    format="delta",
    mode="overwrite",
    partitionBy="OrderDate",
    path="abfss://datalakehouse@synapsedp203dl.dfs.core.windows.net/curated/Customers",
)
print("Transformed data saved!")

In [14]:
# Keep only the columns used by the product sales aggregation.
df_products = df.select(["OrderDate", "Item", "Quantity", "UnitPrice", "Tax"])

In [15]:
# Namespaced import so the aggregates below are unambiguously the Spark SQL
# functions. A bare `sum` only works while a wildcard import of
# pyspark.sql.functions is shadowing Python's builtin `sum`; with the builtin
# in scope, sum("Quantity") fails instead of building a column expression.
from pyspark.sql import functions as F

# Per-line sales figures, then totals per (OrderDate, Item):
#   GrossSales = Quantity * UnitPrice
#   NetSales   = GrossSales - Tax
productSales = (
    df_products
    .withColumn("GrossSales", F.col("Quantity") * F.col("UnitPrice"))
    # Reuse the GrossSales column instead of recomputing the product.
    .withColumn("NetSales", F.col("GrossSales") - F.col("Tax"))
    .groupBy("OrderDate", "Item")
    .agg(
        F.sum("Quantity").alias("TotalQuantity"),
        F.sum("GrossSales").alias("Gross_Sales"),
        F.sum("NetSales").alias("Net_Sales"),
    )
)
display(productSales)

In [16]:
# Namespaced import so `round` below is the Spark SQL column function rather
# than Python's builtin `round`, which cannot round a column name and would
# raise if the wildcard import were not in effect.
from pyspark.sql import functions as F

# Round both monetary totals to 2 decimal places, keeping the column names
# and column order unchanged.
productSales = productSales.select(
    "OrderDate",
    "Item",
    "TotalQuantity",
    F.round("Gross_Sales", 2).alias("Gross_Sales"),
    F.round("Net_Sales", 2).alias("Net_Sales"),
)

In [17]:
# Render the aggregated product sales (after the preceding cell rounds
# Gross_Sales and Net_Sales to 2 decimals).
display(productSales)

In [18]:
# Write the product sales in Delta format under refined/ProductSales,
# partitioned by OrderDate; "overwrite" mode replaces data already at the target.
(
    productSales.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("OrderDate")
    .save("abfss://datalakehouse@synapsedp203dl.dfs.core.windows.net/refined/ProductSales")
)
print("Transformed data saved!")

In [19]:
# Register the product sales as the table SALESDB.ProductSales, stored in
# Delta format at the curated/ProductSales path and partitioned by OrderDate.
productSales.write.saveAsTable(
    "SALESDB.ProductSales",
    format="delta",
    mode="overwrite",
    partitionBy="OrderDate",
    path="abfss://datalakehouse@synapsedp203dl.dfs.core.windows.net/curated/ProductSales",
)
print("Transformed data saved!")

In [21]:
# Load the Delta table from its root directory and select the partition with
# a predicate. The Delta transaction log lives at the table root, so pointing
# the reader at a partition sub-directory (.../OrderDate=2023-12-01) is the
# pattern Delta Lake flags as incorrect; filtering on the partition column
# reads the same data.
path_to_data = 'abfss://datalakehouse@synapsedp203dl.dfs.core.windows.net/refined/ProductSales'

product_data = (
    spark.read.format("delta")
    .load(path_to_data)
    # OrderDate is the partition column this table was written with.
    .where("OrderDate = DATE '2023-12-01'")
)
display(product_data)

In [22]:
# Load the Delta table from its root directory and select the partition with
# a predicate. The Delta transaction log lives at the table root, so pointing
# the reader at a partition sub-directory (.../OrderDate=2023-12-01) is the
# pattern Delta Lake flags as incorrect; filtering on the partition column
# reads the same data.
path_to_data = 'abfss://datalakehouse@synapsedp203dl.dfs.core.windows.net/refined/Customers'

customer_data = (
    spark.read.format("delta")
    .load(path_to_data)
    # OrderDate is the partition column this table was written with.
    .where("OrderDate = DATE '2023-12-01'")
)
display(customer_data)