### Configuration

In [1]:

from pyspark.sql import SparkSession, functions
import ConnectionConfig as cc
from pyspark.sql.functions import *

In [2]:
from delta import configure_spark_with_delta_pip

# Build a local Spark session with Delta Lake support.
#   * spark.sql.extensions / spark_catalog enable Delta's SQL extension and catalog.
#   * spark.sql.shuffle.partitions is set to 4 for this local run.
#   * cc.jars is joined into the driver's extra classpath
#     (presumably a list of JAR paths from ConnectionConfig -- confirm).
# NOTE(review): ":" is the POSIX classpath separator; a Windows driver would
# need ";" instead -- confirm the target platform.
builder = (
    SparkSession.builder
    .appName("FactSales")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.driver.extraClassPath", ":".join(cc.jars))
    .master("local[*]")
)
builder = configure_spark_with_delta_pip(builder)
spark = builder.getOrCreate()

# Display the session as the cell output. The session already exists, so it is
# referenced directly instead of calling getOrCreate() a second time.
spark

In [6]:
# Only the two column functions used in this cell are imported, so the cell
# does not re-run a wildcard import in the middle of the notebook.
from pyspark.sql.functions import floor, rand

# Generate a synthetic weather history: one row per hour between beginDate and
# endDate (both inclusive), each with a random WeatherType in {0, 1, 2, 3}.
beginDate = '01-01-2009 00:00:00'
endDate = '31-12-2023 00:00:00'
# NOTE(review): the range stops at midnight on 31-12-2023, so hours 01:00-23:00
# of the final day are not generated -- confirm this is intended.
TIMESTAMP_FORMAT = 'dd-MM-yyyy HH:mm:ss'

# Fixed seed so the generated WeatherType column is reproducible across
# "Restart & Run All"; an unseeded rand() yields different data on every run.
WEATHER_SEED = 42

hourly_timeline_sql = f"""
    select explode(sequence(
        to_timestamp('{beginDate}', '{TIMESTAMP_FORMAT}'),
        to_timestamp('{endDate}', '{TIMESTAMP_FORMAT}'),
        interval 1 hour
    )) as Time
"""

# rand() is in [0, 1), so floor(rand() * 4) gives the integers 0 through 3.
df_weather_history = spark.sql(hourly_timeline_sql).withColumn(
    "WeatherType", floor(rand(seed=WEATHER_SEED) * 4)
)
df_weather_history.createOrReplaceTempView('hist')

# Sanity check: the last generated timestamp should equal endDate.
spark.sql('select max(Time) from hist').show()

+-------------------+
|          max(Time)|
+-------------------+
|2023-12-31 00:00:00|
+-------------------+



In [8]:
# Load the persisted "hisweather" Delta table and expose it to Spark SQL
# under the temp-view name `persHist`.
delta_reader = spark.read.format("delta")
df_hist = delta_reader.table("hisweather")
df_hist.createOrReplaceTempView('persHist')

# Show the most recent timestamp stored in the persisted table.
latest_persisted_time = spark.sql("select max(Time) from persHist")
latest_persisted_time.show()

+-------------------+
|          max(Time)|
+-------------------+
|2023-12-31 00:00:00|
+-------------------+



# MAKE TABLES AVAILABLE
This notebook is used to query the data warehouse. Make sure your data warehouse tables are available to query before running the cells below.

In [3]:
# Register the warehouse Delta folders (two dimensions and the sales fact)
# as temp views so they can be queried with spark.sql().
# Mapping: temp-view name -> Delta folder path, registered in this order.
warehouse_views = {
    "dimDate": "spark-warehouse/dimdate",
    "dimSalesRep": "spark-warehouse/dimsalesrep/",
    "factSales": "spark-warehouse/factsales/",
}

for view_name, delta_path in warehouse_views.items():
    spark.read.format("delta").load(delta_path).createOrReplaceTempView(view_name)

In [4]:
# Question 1: what is the total revenue per weekday?
# Joins the sales fact to the date dimension on dateSK and sums revenue_mv
# for each DayOfWeek value.
weekday_revenue_sql = (
    "select dd.DayOfWeek, sum(fs.revenue_mv) "
    "from factSales fs inner join dimDate dd on dd.dateSK = fs.dateSK "
    "group by DayOfWeek"
)
spark.sql(weekday_revenue_sql).show()

# Question 2: what is the revenue for each office?
# Joins the sales fact to the sales-rep dimension on salesrepSK and sums
# revenue_mv for each office.
office_revenue_sql = (
    "select ds.office, sum(fs.revenue_mv) "
    "from factSales fs inner join dimSalesrep ds on ds.salesrepSK = fs.salesrepsk "
    "group by Office"
)
spark.sql(office_revenue_sql).show()

+---------+---------------+
|DayOfWeek|sum(revenue_mv)|
+---------+---------------+
|        6|   162008172717|
|        2|   152123354526|
|        5|   151068900235|
|        4|   146901029926|
|        3|   155808767203|
|        7|   183395531045|
|        1|   123479476450|
+---------+---------------+

+--------+---------------+
|  office|sum(revenue_mv)|
+--------+---------------+
| Chicago|    63636097575|
|  Berlin|   701738853711|
|New York|   309410280816|
+--------+---------------+

