In [57]:
from pyspark.sql import SparkSession
import os

In [58]:
# Connection settings are taken from the environment; a missing variable
# raises KeyError right here, before any Spark work starts.
aws_access_key = os.environ["AWS_ACCESS_KEY"]
aws_secret_key = os.environ["AWS_SECRET_KEY"]
# NOTE(review): aws_region is read (so AWS_REGION must be set) but it is not
# passed to any of the settings below.
aws_region = os.environ["AWS_REGION"]
warehouse_location = os.environ["WAREHOUSE_LOCATION"]
metastore_uri = os.environ["METASTORE_URI"]

# Build (or reuse) the session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Warehouse")
    .master("spark://spark-master:7077")
    # S3A access with a static key pair.
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.access.key", aws_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", aws_secret_key)
    # Hive catalog: metastore URI and warehouse directory are set under both
    # the spark.sql.* and the hive.* key names.
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("spark.sql.hive.metastore.uris", metastore_uri)
    .config("hive.metastore.uris", metastore_uri)
    .config("hive.metastore.warehouse.dir", warehouse_location)
    # The same key pair is also set under the hive.hadoop.fs.s3a.* names.
    .config("hive.hadoop.fs.s3a.access.key", aws_access_key)
    .config("hive.hadoop.fs.s3a.secret.key", aws_secret_key)
    # Delta Lake: session catalog implementation and SQL extension.
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    # Memory: 5G for the driver and 16g of off-heap memory, with off-heap enabled.
    .config("spark.driver.memory", "5G")
    .config("spark.memory.offHeap.size","16g")
    .config("spark.memory.offHeap.enabled", True)
    .enableHiveSupport()
    .getOrCreate()
)

In [59]:
# Warehouse

In [60]:
# Reset step: remove the three namespaces this notebook manages, if they exist.
# CASCADE also drops every table inside them, so running this cell discards
# whatever facts, dimensions and craigslist_vehicles_bronze currently contain.
# Spark SQL accepts SCHEMA and DATABASE interchangeably; both spellings are
# used below.
spark.sql("drop schema if exists facts cascade")
spark.sql("drop database if exists dimensions cascade")
spark.sql("drop database if exists craigslist_vehicles_bronze cascade")

DataFrame[]

In [61]:
# Create the three namespaces used by the table definitions in this notebook:
# craigslist_vehicles_bronze, facts and dimensions.
# IF NOT EXISTS makes each statement a no-op when the namespace is already there.
spark.sql("create schema if not exists craigslist_vehicles_bronze")
spark.sql("create schema if not exists facts")
spark.sql("create database if not exists dimensions")

DataFrame[]

In [62]:
# Create Delta table for raw Craigslist vehicle data

In [63]:
# Bronze Delta table for the Craigslist vehicle data, created only if absent.
# price, lat and long are doubles, year is a DATE, and every other column
# (including id and odometer) is a string.
# NOTE(review): year is typed DATE and odometer is typed string - confirm this
# matches what the job loading this table actually writes.
spark.sql("""create table if not exists craigslist_vehicles_bronze.craigslist_vehicles (
    id string, 
    region string, 
    price double, 
    year DATE, 
    manufacturer string, 
    model string, 
    condition string, 
    cylinders string, 
    fuel string, 
    odometer string, 
    title_status string, 
    transmission string, 
    drive string, 
    size string, 
    type string, 
    paint_color string, 
    state string, 
    lat double, 
    long double
  ) using delta
""")

                                                                                

DataFrame[]

In [64]:
# Create Delta tables for aggregated facts data

In [65]:
# Fact table facts.year_id_price (Delta), created only if absent.
# Columns: year (DATE), id, price (double), state.
spark.sql("""
create table if not exists facts.year_id_price (
        year DATE,
        id string,
        price double,
        state string
    ) using delta
""")

                                                                                

DataFrame[]

In [66]:
# Fact table facts.year_id (Delta), created only if absent.
# Columns: id, year (DATE), state.
spark.sql("""
create table if not exists facts.year_id (
  id string,
  year DATE,
  state string
) using delta
""")

                                                                                

DataFrame[]

In [67]:
# Fact table facts.manufacturer_year (Delta), created only if absent.
# Columns: id, manufacturer, year (DATE), state.
spark.sql("""
create table if not exists facts.manufacturer_year (
    id string,
    manufacturer string,
    year DATE,
    state string
    ) using delta
""")

DataFrame[]

In [68]:
# Fact table facts.region_year (Delta), created only if absent.
# Columns: id, year (DATE), state.
# NOTE(review): despite the table name, no region column is declared here -
# confirm whether one is missing.
spark.sql("""
create table if not exists facts.region_year (
    id string,
    year DATE,
    state string
    ) using delta
""")

DataFrame[]

In [69]:
# NOTE(review): this statement reuses the name facts.region_year, which an
# earlier cell already creates with columns (id, year, state). Because of
# IF NOT EXISTS the statement is skipped whenever that table exists, so the
# schema below (with the extra manufacturer column) is never applied when the
# notebook is run top to bottom. The table name looks copy-pasted - confirm the
# intended name, or remove this cell.
spark.sql("""
create table if not exists facts.region_year (
    id string,
    year DATE,
    state string,
    manufacturer string
    ) using delta
""")

DataFrame[]

In [70]:
# NOTE(review): this statement also targets facts.region_year, which an earlier
# cell already creates with columns (id, year, state). Because of IF NOT EXISTS
# it is skipped whenever that table exists, so the region and model columns
# declared below never reach the catalog when the notebook is run top to
# bottom. The table name looks copy-pasted - confirm the intended name, or
# remove this cell.
spark.sql("""
create table if not exists facts.region_year (
    id string,
    year DATE,
    region string,
    state string,
    model string
    ) using delta
""")

DataFrame[]

In [71]:
# Create Delta tables for dimension data

In [72]:
# Dimension table dimensions.dates (Delta), created only if absent.
# Columns: date (DATE), id.
spark.sql("""
CREATE TABLE IF NOT EXISTS dimensions.dates (
    date DATE,
    id STRING
) USING DELTA 
""")

DataFrame[]

In [73]:
# Dimension table dimensions.regions (Delta), created only if absent.
# Columns: region, id.
spark.sql("""
CREATE TABLE IF NOT EXISTS dimensions.regions (
    region STRING,
    id STRING
) USING DELTA
""")

DataFrame[]

In [74]:
# Dimension table dimensions.manufacturers (Delta), created only if absent.
# Columns: manufacturer, id.
spark.sql("""
CREATE TABLE IF NOT EXISTS dimensions.manufacturers (
    manufacturer STRING,
    id STRING
) USING DELTA
""")


DataFrame[]

In [75]:
# Dimension table dimensions.model (Delta), created only if absent.
# Columns: model, id.
spark.sql("""
CREATE TABLE IF NOT EXISTS dimensions.model (
    model STRING,
    id STRING
) USING DELTA
""")

DataFrame[]

In [76]:
# Dimension table for states (Delta), created only if absent.
# Columns: state, id.
# This statement used to target dimensions.model, which an earlier cell already
# creates with columns (model, id); IF NOT EXISTS then skipped it, so no table
# with a state column was ever created. It now has its own table, named in the
# plural like dimensions.dates / regions / manufacturers.
# NOTE(review): confirm "states" is the name downstream jobs expect.
spark.sql("""
CREATE TABLE IF NOT EXISTS dimensions.states (
    state STRING,
    id STRING
) USING DELTA
""")

DataFrame[]

In [77]:
# NOTE(review): this statement reuses the name dimensions.model, which an
# earlier cell already creates with columns (model, id). Because of
# IF NOT EXISTS it is skipped whenever that table exists, so it has no effect
# when the notebook is run top to bottom. Its columns (manufacturer, id) are
# the same as dimensions.manufacturers, which is created earlier as well - this
# cell looks like a leftover copy; confirm and remove or rename it.
spark.sql("""
CREATE TABLE IF NOT EXISTS dimensions.model (
    manufacturer STRING,
    id STRING
) USING DELTA
""")

DataFrame[]

In [78]:
# Sanity check: print every namespace known to the catalog, without
# truncating long names.
namespaces = spark.sql("show databases")
namespaces.show(truncate=False)

+--------------------------+
|namespace                 |
+--------------------------+
|craigslist_vehicles_bronze|
|default                   |
|dimensions                |
|facts                     |
+--------------------------+



In [79]:
# Shut down the Spark session now that all DDL statements have been issued.
spark.stop()