In [39]:
%%pyspark
import random
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    current_timestamp, trim, udf, col, lit, row_number, date_format, dayofweek, dayofmonth, dayofyear,
    month, year, weekofyear, when, concat, quarter
)
from pyspark.sql.types import DateType, FloatType, IntegerType
from pyspark.sql.window import Window
from datetime import datetime, timedelta

# Source and destination point at the same abfss:// "Gold Layer" folder:
# the dimension tables are read from it and the fact table is written back to it.
SRC_PARQUET_DIR = DIST_PARQUET_DIR = 'abfss://depi3@depi3.dfs.core.windows.net/Gold Layer'

StatementMeta(Depi, 22, 14, Finished, Available, Finished)

In [28]:
def _load_dimension(folder_name):
    """Read all snappy parquet part files of one folder under SRC_PARQUET_DIR.

    folder_name: sub-folder of SRC_PARQUET_DIR that holds the part files.
    Returns the resulting Spark DataFrame.
    """
    part_files = f'{SRC_PARQUET_DIR}/{folder_name}/part-*.snappy.parquet'
    return spark.read.format('parquet').load(part_files)


# One DataFrame per dimension table stored in the source folder.
Dim_Accounts = _load_dimension('DimAccounts')
Dim_Products = _load_dimension('DimProducts')
Dim_Sales_Agent = _load_dimension('DimSalesAgent')
Dim_Sales_Pipeline = _load_dimension('DimSalesPipeLine')
Dim_time = _load_dimension('DimTime')

StatementMeta(Depi, 22, 3, Finished, Available, Finished)

In [29]:
# Expose each dimension DataFrame to Spark SQL under a short view name.
for view_name, dimension_df in (
    ("accounts", Dim_Accounts),
    ("products", Dim_Products),
    ("sales_agent", Dim_Sales_Agent),
    ("sales_pipeline", Dim_Sales_Pipeline),
    ("time", Dim_time),
):
    dimension_df.createOrReplaceTempView(view_name)

StatementMeta(Depi, 22, 4, Finished, Available, Finished)

In [30]:
# Print the column names of every dimension table, one list per line.
for dimension_df in (Dim_Accounts, Dim_Products, Dim_Sales_Agent, Dim_Sales_Pipeline, Dim_time):
    print(dimension_df.columns)

StatementMeta(Depi, 22, 5, Finished, Available, Finished)

['SrAccountID', 'AccountID', 'account', 'year_established', 'revenue', 'employees', 'Subsidary_of', 'SectorName', 'OfficeLocation']
['SrProductID', 'ProductID', 'product', 'sales_price', 'StartDate', 'Series', 'EndDate']
['SrAgentID', 'SalesagentID', 'Salesagent', 'Manager', 'RegionalOffice']
['SrSalesID', 'opportunity_ID', 'deal_stage', 'engage', 'close_date', 'close_value', 'CreatedAt']
['date', 'timekey', 'dayofweek', 'dayofmonth', 'dayofyear', 'Month', 'MonthName', 'quarter', 'Year', 'WeekofYear', 'ISWeekend', 'ISHoliday', 'HolidayName']


In [40]:
# Build the fact table from the registered temp views with one Spark SQL query.
# The query starts from `sales_agent` and LEFT JOINs accounts, products,
# sales_pipeline and time, each on SA.SalesagentID compared with that view's
# own key (AccountID, ProductID, SrSalesID, timekey).
# FactSalesID is a ROW_NUMBER() ordered by A.SrAccountID, and a NULL Series is
# replaced with 'GTX'.
# NOTE(review): every join compares the agent id with an id of a different
# entity -- presumably a placeholder to line rows up for synthetic data;
# confirm these are the intended join keys.
# NOTE(review): the window has no PARTITION BY and orders by a column that
# comes from a LEFT JOIN, so ties/NULLs can make the FactSalesID assignment
# order-dependent -- verify if a stable surrogate key is required.
# NOTE(review): `SA.SragentID` differs in case from the `SrAgentID` column of
# the sales agent table; presumably resolved by Spark's case-insensitive
# analysis -- confirm.
Fact_Sales = spark.sql("""
SELECT
    ROW_NUMBER() OVER (ORDER BY A.SrAccountID) FactSalesID,
    A.SrAccountID,
    P.SrProductID,
    SA.SragentID,
    T.timekey,
    SP.SrSalesID,
    P.sales_price,
    COALESCE(P.Series, 'GTX') Series,
    A.SectorName,
    A.revenue,
    SP.deal_stage,
    SP.engage,
    SP.close_date,
    SP.close_value
FROM
    sales_agent SA
LEFT JOIN
    accounts A
    ON SA.SalesagentID = A.AccountID
LEFT JOIN
    products P
    ON SA.SalesagentID = P.ProductID
LEFT JOIN
    sales_pipeline SP
    ON SP.SrSalesID = SA.SalesagentID
LEFT JOIN
    time T
    ON T.timekey = SA.SalesagentID
""")

def generate_random_price():
    """Return a pseudo-random price drawn uniformly from 10 to 500, rounded to 2 decimals."""
    raw_price = random.uniform(10, 500)
    return round(raw_price, 2)

# Wrap the price generator as a Spark UDF returning FloatType.
# The wrapped function draws from `random`, so the UDF is flagged as
# non-deterministic: Spark treats UDFs as deterministic by default and may then
# eliminate duplicate invocations or invoke the function more often than it
# appears in the query.
generate_price_udf = udf(generate_random_price, FloatType()).asNondeterministic()

def generate_random_id():
    """Return a pseudo-random integer id between 1 and 35, both ends included."""
    lowest_id, highest_id = 1, 35
    return random.randint(lowest_id, highest_id)

# Wrap the id generator as a Spark UDF returning IntegerType.
# It must be flagged non-deterministic: this UDF is applied with identical
# (empty) arguments to more than one column, and Spark is allowed to collapse
# duplicate invocations of a deterministic UDF into a single evaluation, which
# would give those columns the same value on every row.
generate_id_udf = udf(generate_random_id, IntegerType()).asNondeterministic()

# Overwrite three columns of the joined result with synthetic values:
# a random price plus random ids for the product key and the time key.
Fact_Sales = (
    Fact_Sales
    .withColumn('sales_price', generate_price_udf())
    .withColumn('SrProductID', generate_id_udf())
    .withColumn('timekey', generate_id_udf())
)

# Preview the first 100 rows in the notebook.
fact_sales_preview = Fact_Sales.limit(100)
display(fact_sales_preview)

StatementMeta(Depi, 22, 15, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 84fd7268-2ff6-4341-b78d-f7d665e73c58)

In [41]:
# Persist the fact table as parquet under the destination folder,
# replacing whatever an earlier run left in FactSales/.
fact_sales_target = f'{DIST_PARQUET_DIR}/FactSales/'
Fact_Sales.write.parquet(fact_sales_target, mode="overwrite")

StatementMeta(Depi, 22, 16, Finished, Available, Finished)