In [2]:
%%pyspark
import random
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    current_timestamp, trim, udf, col, lit, row_number, date_format, dayofweek, dayofmonth, dayofyear,
    month, year, weekofyear, when, concat, quarter
)
from pyspark.sql.types import DateType
from pyspark.sql.window import Window
from datetime import datetime, timedelta

# Root folder of the Silver-layer Parquet inputs read by this notebook.
SRC_PARQUET_DIR = 'abfss://depi3@depi3.dfs.core.windows.net/Silver Layer'
# Root folder under which this notebook writes its Parquet outputs (Gold layer).
DIST_PARQUET_DIR = 'abfss://depi3@depi3.dfs.core.windows.net/Gold Layer'

StatementMeta(Depi, 16, 2, Finished, Available, Finished)

In [3]:
# Load every Silver-layer table as a DataFrame. Each table lives in its own
# folder under SRC_PARQUET_DIR and is stored as snappy-compressed Parquet parts.
accounts_df = spark.read.load(f'{SRC_PARQUET_DIR}/Accounts/part-*.snappy.parquet', format='parquet')
major_df = spark.read.load(f'{SRC_PARQUET_DIR}/Major/part-*.snappy.parquet', format='parquet')
manager_df = spark.read.load(f'{SRC_PARQUET_DIR}/Manager/part-*.snappy.parquet', format='parquet')
# The keyword must be spelled `format`: load() accepts arbitrary **options, so a
# misspelled keyword is silently forwarded as a reader option instead of
# selecting the data source explicitly.
office_df = spark.read.load(f'{SRC_PARQUET_DIR}/Office/part-*.snappy.parquet', format='parquet')
sales_agent_df = spark.read.load(f'{SRC_PARQUET_DIR}/Sales Agent/part-*.snappy.parquet', format='parquet')
sales_pipeline_df = spark.read.load(f'{SRC_PARQUET_DIR}/Sales PipeLine/part-*.snappy.parquet', format='parquet')
product_df = spark.read.load(f'{SRC_PARQUET_DIR}/Products/part-*.snappy.parquet', format='parquet')
regional_office_df = spark.read.load(f'{SRC_PARQUET_DIR}/Regional Office/part-*.snappy.parquet', format='parquet')
sector_df = spark.read.load(f'{SRC_PARQUET_DIR}/Sector/part-*.snappy.parquet', format='parquet')
series_df = spark.read.load(f'{SRC_PARQUET_DIR}/Series/part-*.snappy.parquet', format='parquet')

StatementMeta(Depi, 16, 3, Finished, Available, Finished)

In [4]:
# Expose each Silver-layer DataFrame to Spark SQL under a short view name.
# Pairs are listed in registration order: (view name, source DataFrame).
silver_views = (
    ("accounts", accounts_df),
    ("major", major_df),
    ("manager", manager_df),
    ("office", office_df),
    ("sales_agent", sales_agent_df),
    ("sales_pipeline", sales_pipeline_df),
    ("product", product_df),
    ("regional_office", regional_office_df),
    ("sector", sector_df),
    ("series", series_df),
)
for view_name, source_df in silver_views:
    source_df.createOrReplaceTempView(view_name)

StatementMeta(Depi, 16, 4, Finished, Available, Finished)

In [5]:
def generate_random_date(start_date, end_date):
    """
    Pick a random day in the closed range [start_date, end_date].

    Parameters:
    start_date (str): First eligible day, formatted as 'YYYY-MM-DD'.
    end_date (str): Last eligible day, formatted as 'YYYY-MM-DD'.

    Returns:
    datetime: Midnight of the chosen day.
    """
    iso_day_pattern = "%Y-%m-%d"
    first_day = datetime.strptime(start_date, iso_day_pattern)
    last_day = datetime.strptime(end_date, iso_day_pattern)
    span_in_days = (last_day - first_day).days
    # randint is inclusive on both ends, so end_date itself can be drawn.
    return first_day + timedelta(days=random.randint(0, span_in_days))



StatementMeta(Depi, 16, 5, Finished, Available, Finished)

# Constructing tables
**Build the dimension (DIM) tables and the fact table**

In [6]:
# Account dimension: every row of `accounts`, LEFT JOINed to `major`, `sector`
# and `office` to swap their IDs for names. A missing parent company name is
# replaced with 'Unknown'. SrAccountID is a sequential number assigned in
# (AccountID, MajorID) order.
# NOTE(review): the output has one row per account only if MajorID, SectorID
# and OfficeID are unique in their lookup tables — confirm.
Dim_Accounts = spark.sql("""
SELECT 
    ROW_NUMBER() OVER (ORDER BY A.AccountID, M.MajorID) AS SrAccountID,
    A.AccountID,
    A.account,
    A.year_established,
    A.revenue,
    A.employees,
    COALESCE(M.MajorName, 'Unknown') Subsidary_of,
    S.SectorName,
    O.OfficeLocation
FROM 
    accounts A
LEFT JOIN 
    major M ON M.MajorID = A.subsidiary_of_ID
LEFT JOIN 
    sector S ON S.SectorID = A.sectorID
LEFT JOIN 
    office O ON O.OfficeID = A.OfficeID;
""")

# Product dimension: every row of `product` LEFT JOINed to `series` for the
# series name. SrProductID is a sequential number assigned in
# (ProductID, SeriesID) order.
Dim_Products = spark.sql("""
SELECT
	ROW_NUMBER() OVER (ORDER BY P.ProductID, S.SeriesID) SrProductID ,
	P.ProductID , 
	P.product, 
	P.sales_price,
    P.StartDate,
	S.SeriesName Series
FROM
	product P
LEFT JOIN 
	series S ON S.SeriesID = P.SeriesID;
""")

# EndDate is synthetic: a random day in 2023 drawn independently for each row.
# Spark treats UDFs as deterministic by default and may then eliminate or repeat
# invocations during optimization; flagging the UDF as non-deterministic tells
# the optimizer that every call yields a fresh value.
# NOTE(review): the DataFrame is lazy, so each action on Dim_Products redraws
# the dates; nothing here ties EndDate to StartDate — confirm that is acceptable.
generate_random_date_udf = udf(
    lambda: generate_random_date("2023-01-01", "2023-12-31"), DateType()
).asNondeterministic()
Dim_Products = Dim_Products.withColumn("EndDate", generate_random_date_udf())

# Sales-agent dimension: every row of `sales_agent`, LEFT JOINed to
# `regional_office` and `manager` to swap their IDs for names. SrAgentID is a
# sequential number assigned in (SalesID, RegionalOfficeID) order.
Dim_Sales_Agent = spark.sql("""
SELECT
	ROW_NUMBER() OVER (ORDER BY S.SalesID, R.RegionalOfficeID) SrAgentID ,
	S.SalesID SalesagentID, 
	S.sales_agent Salesagent,
	M.ManagerName Manager,
	R.RegionalOfficeName RegionalOffice
From
	sales_agent S
LEFT JOIN
	regional_office R
	ON S.regional_officeID = R.RegionalOfficeID
LEFT JOIN
	manager M
	ON S.ManagerID = M.ManagerID;
""")


# Sales-pipeline dimension: the deal attributes of every `sales_pipeline` row.
# SrSalesID is a sequential number assigned in (SalesID, productID, accountID)
# order; those three columns are used for ordering only and are not selected.
# NOTE(review): if several rows share the same (SalesID, productID, accountID),
# their relative SrSalesID values are not guaranteed to be stable between runs.
Dim_Sales_Pipeline = spark.sql("""
SELECT
    ROW_NUMBER() OVER (ORDER BY SalesID, productID, accountID) SrSalesID,
    opportunity_ID,
    deal_stage,
    engage_date engage,
    close_date,
    close_value
From
    sales_pipeline;
""")

# CreatedAt is synthetic: a random day between 2020-01-01 and 2023-12-31 drawn
# independently for each row. Spark treats UDFs as deterministic by default and
# may then eliminate or repeat invocations during optimization; flagging the UDF
# as non-deterministic tells the optimizer that every call yields a fresh value.
generate_random_date_udf = udf(
    lambda: generate_random_date("2020-01-01", "2023-12-31"), DateType()
).asNondeterministic()
Dim_Sales_Pipeline = Dim_Sales_Pipeline.withColumn('CreatedAt', generate_random_date_udf())

StatementMeta(Depi, 16, 6, Finished, Available, Finished)

In [7]:
# List the column names of the account dimension, then preview ten rows.
account_columns = Dim_Accounts.columns
print("DIM Accounts Table:\n", account_columns)

accounts_preview = Dim_Accounts.limit(10)
display(accounts_preview)

StatementMeta(Depi, 16, 7, Finished, Available, Finished)

DIM Accounts Table:
 ['SrAccountID', 'AccountID', 'account', 'year_established', 'revenue', 'employees', 'Subsidary_of', 'SectorName', 'OfficeLocation']


SynapseWidget(Synapse.DataFrame, 6f98ac2f-e927-4704-88a7-d219c384da34)

In [9]:
def random_date(start_date, end_date):
    """
    Return start_date shifted forward by a random whole number of days.

    A step between 12 and 34 days (inclusive) is drawn first; the offset is then
    a random multiple of that step taken from range(0, days_between, step), so
    it is always smaller than the number of days separating the two dates.

    Parameters:
    start_date (datetime): Lower bound; returned unchanged when the offset is 0.
    end_date (datetime): Upper bound; never reached.

    Returns:
    datetime: start_date plus the drawn offset.

    Raises:
    ValueError: If end_date is less than one full day after start_date, because
        random.randrange is then given an empty range.
    """
    span_days = (end_date - start_date).days
    # Draw the step before the offset so the random stream is consumed in a
    # fixed order.
    step = random.randint(12, 34)
    offset = random.randrange(0, span_days, step)
    return start_date + timedelta(days=offset)

# Generate a sample of dates for the time dimension.
start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 1, 10)

# random_date can return the same day more than once, which would put duplicate
# dates (each with a different timekey) into the dimension. Keep drawing until
# there are 20 distinct days. ISO 'YYYY-MM-DD' strings sort chronologically.
sample_dates = set()
while len(sample_dates) < 20:
    sample_dates.add(random_date(start_date, end_date).strftime("%Y-%m-%d"))
date_list = sorted(sample_dates)

# Create a DataFrame with just the `date` column (stored as a string) initially
df = spark.createDataFrame([(d,) for d in date_list], ["date"])

# timekey is a sequential number assigned in date order.
window_spec = Window.orderBy("date")

Dim_time = (
    df
    .withColumn("timekey", row_number().over(window_spec))
    .withColumn("dayofweek", dayofweek(col("date")))
    .withColumn("dayofmonth", dayofmonth(col("date")))
    .withColumn("dayofyear", dayofyear(col("date")))
    .withColumn("Month", month(col("date")))
    .withColumn("MonthName", date_format(col("date"), "MMMM"))
    .withColumn("quarter", quarter(col("date")))
    .withColumn("Year", year(col("date")))
    .withColumn("WeekofYear", weekofyear(col("date")))
    # Spark's dayofweek numbers Sunday as 1 and Saturday as 7.
    .withColumn("ISWeekend", when(col("dayofweek").isin(7, 1), lit(True)).otherwise(lit(False)))
    # Holiday information is not derived here; both columns hold placeholders.
    .withColumn("ISHoliday", lit(False))
    .withColumn("HolidayName", lit("UnKnown").cast("string"))
)

# Preview the time dimension (capped at 1000 rows).
display(Dim_time.limit(1000))


StatementMeta(Depi, 16, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 1c23e33d-a77f-4750-b294-1863d7fe2ecb)

In [10]:
def _overwrite_gold(dimension_df, folder_name):
    """Write dimension_df as Parquet to <DIST_PARQUET_DIR>/<folder_name>/, replacing existing output."""
    target_path = f'{DIST_PARQUET_DIR}/{folder_name}/'
    dimension_df.write.mode("overwrite").parquet(target_path)


# Persist each dimension, one folder per table, in the same order as before.
_overwrite_gold(Dim_Accounts, "DimAccounts")
_overwrite_gold(Dim_Products, "DimProducts")
_overwrite_gold(Dim_Sales_Agent, "DimSalesAgent")
_overwrite_gold(Dim_Sales_Pipeline, "DimSalesPipeLine")
_overwrite_gold(Dim_time, "DimTime")

StatementMeta(Depi, 16, 10, Finished, Available, Finished)