#### Create Database

In [None]:
-- Create the Spark database used by every table in this notebook.
-- IF NOT EXISTS makes the statement a no-op when the database is already there.
CREATE DATABASE IF NOT EXISTS SalesDB_spark;

#### Create Tables (External and Managed)

In [None]:
-- SalesEx: Parquet table whose data files live at the explicit LOCATION below
-- (an external/unmanaged table - Spark only registers the schema in the metastore).
-- One row per sales order line.
CREATE TABLE IF NOT EXISTS SalesDB_spark.SalesEx (
    SalesOrderNumber     STRING,
    SalesOrderLineNumber INT,
    OrderDate            DATE,
    CustomerName         STRING,
    Email                STRING,
    Item                 STRING,
    Quantity             INT,
    UnitPrice            FLOAT,
    Tax                  FLOAT
)
USING PARQUET
LOCATION 'abfss://data@synapsedp203dl.dfs.core.windows.net/sales/transformed/SparkSQL/Sales';


In [None]:
-- SalesMn: same columns as SalesEx but declared without a LOCATION, so Spark
-- manages the storage itself under the database's default warehouse path.
CREATE TABLE IF NOT EXISTS SalesDB_spark.SalesMn (
    SalesOrderNumber     STRING,
    SalesOrderLineNumber INT,
    OrderDate            DATE,
    CustomerName         STRING,
    Email                STRING,
    Item                 STRING,
    Quantity             INT,
    UnitPrice            FLOAT,
    Tax                  FLOAT
)
USING PARQUET;

#### Insert Data into the Table

In [None]:
-- Append one sample order line to SalesEx.
-- Values are positional and must follow the table's column order:
-- SalesOrderNumber, SalesOrderLineNumber, OrderDate, CustomerName, Email,
-- Item, Quantity, UnitPrice, Tax.
-- Note: INSERT INTO appends, so re-running this cell adds the same row again.
INSERT INTO SalesDB_spark.SalesEx
VALUES ('SO12345', 1, DATE '2024-01-16', 'John Doe', 'john.doe@example.com', 'ProductA', 10, 19.99, 2.5);




In [None]:
-- Append one sample order line to SalesMn.
-- Values are positional and must follow the table's column order:
-- SalesOrderNumber, SalesOrderLineNumber, OrderDate, CustomerName, Email,
-- Item, Quantity, UnitPrice, Tax.
-- Note: INSERT INTO appends, so re-running this cell adds the same row again.
INSERT INTO SalesDB_spark.SalesMn
VALUES ('SO12345', 1, DATE '2024-01-16', 'John Doe', 'john.doe@example.com', 'ProductA', 10, 19.99, 2.5);

#### Read data from the Table

In [None]:
-- Show every row currently stored in SalesEx.
SELECT *
FROM SalesDB_spark.SalesEx;

In [None]:
-- Show every row currently stored in SalesMn.
SELECT *
FROM SalesDB_spark.SalesMn;

#### Read data from File

In [None]:
%%pyspark

# Read the Parquet files at the SalesEx table's storage location directly from
# the data lake, without going through the metastore table definition.
# Parquet files carry their own schema, so the CSV-only `header` option that was
# previously passed here had no effect and has been dropped.
read_df = spark.read.parquet(
    'abfss://data@synapsedp203dl.dfs.core.windows.net/sales/transformed/SparkSQL/Sales'
)
display(read_df)

### Data Transformations using SparkSQL

#### Create Orders Table

In [None]:
%%pyspark

from pyspark.sql.types import *
from pyspark.sql.functions import *

# Explicit schema for the raw order CSV files, as (column name, Spark type)
# pairs listed in the order the columns are expected to appear.
order_columns = [
    ("SalesOrderNumber", StringType()),
    ("SalesOrderLineNumber", IntegerType()),
    ("OrderDate", DateType()),
    ("CustomerName", StringType()),
    ("Email", StringType()),
    ("Item", StringType()),
    ("Quantity", IntegerType()),
    ("UnitPrice", FloatType()),
    ("Tax", FloatType()),
]
orderSchema = StructType([StructField(name, dtype) for name, dtype in order_columns])

# Load every CSV file directly under sales/ using the schema above; the first
# line of each file is treated as a header row rather than data.
df = (
    spark.read
    .format('csv')
    .schema(orderSchema)
    .option('header', True)
    .load('abfss://data@synapsedp203dl.dfs.core.windows.net/sales/*.csv')
)

In [None]:
%%pyspark

# Write the orders DataFrame as Parquet to an explicit data-lake path and
# register it in the metastore as SalesDB_spark.Orders.
# mode 'overwrite' replaces whatever the table held before.
(
    df.write
    .format('parquet')
    .mode('overwrite')
    .option('path', 'abfss://data@synapsedp203dl.dfs.core.windows.net/sales/transformed/SparkSQL/Orders')
    .saveAsTable('SalesDB_spark.Orders')
)
print("Transformed data saved!")

#### Create Product Table from the Orders Table

In [None]:
-- Build a per-item summary from the Orders table and store it as a Parquet
-- table at an explicit data-lake location:
--   Total_Quantity = units sold per item
--   Total_Sales    = sum of Quantity * UnitPrice per item
-- IF NOT EXISTS keeps this cell re-runnable (a plain CREATE TABLE fails once the
-- table exists) and matches the other CREATE TABLE cells in this notebook.
-- To rebuild from fresh Orders data, drop the table first; because the table has
-- its own LOCATION, the files there may also need to be removed.
CREATE TABLE IF NOT EXISTS SalesDB_spark.Product
USING PARQUET
LOCATION 'abfss://data@synapsedp203dl.dfs.core.windows.net/sales/transformed/SparkSQL/Product'
AS
SELECT
    Item,
    SUM(Quantity)             AS Total_Quantity,
    SUM(Quantity * UnitPrice) AS Total_Sales
FROM SalesDB_spark.Orders
GROUP BY Item;



In [None]:
-- Show the per-item totals stored in the Product table.
SELECT *
FROM SalesDB_spark.Product;

#### Create MonthlySales Table with Partitioning

In [None]:
-- Build sales totals per order, keyed by the year and month of OrderDate, and
-- store them as a Parquet table partitioned on disk by (Year, Month), i.e. one
-- Year=<y>/Month=<m> directory per partition under LOCATION.
-- IF NOT EXISTS keeps this cell re-runnable (a plain CREATE TABLE fails once the
-- table exists) and matches the other CREATE TABLE cells in this notebook.
-- To rebuild from fresh Orders data, drop the table first; because the table has
-- its own LOCATION, the files there may also need to be removed.
CREATE TABLE IF NOT EXISTS SalesDB_spark.MonthlySales
USING PARQUET
PARTITIONED BY (Year, Month)
LOCATION 'abfss://data@synapsedp203dl.dfs.core.windows.net/sales/transformed/SparkSQL/MonthlySales'
AS
SELECT
    YEAR(OrderDate)           AS Year,
    MONTH(OrderDate)          AS Month,
    SalesOrderNumber,
    SUM(Quantity * UnitPrice) AS Total_Sales
FROM SalesDB_spark.Orders
GROUP BY YEAR(OrderDate), MONTH(OrderDate), SalesOrderNumber
ORDER BY YEAR(OrderDate), MONTH(OrderDate), SalesOrderNumber;

In [None]:
-- Show the per-order sales totals stored in the MonthlySales table.
SELECT *
FROM SalesDB_spark.MonthlySales;

In [None]:
%%pyspark

# Read a single partition (Year=2020, Month=4) of the MonthlySales data straight
# from its directory in the data lake.
# Parquet files carry their own schema, so the CSV-only `header` option that was
# previously passed here had no effect and has been dropped.
# NOTE: because the path points at the leaf partition directory, Spark's partition
# discovery does not add Year/Month back as columns; read the table root (or set
# the `basePath` option) if those columns are needed.
monthly_df = spark.read.parquet(
    'abfss://data@synapsedp203dl.dfs.core.windows.net/sales/transformed/SparkSQL/MonthlySales/Year=2020/Month=4'
)
display(monthly_df)