# Notebook Databricks

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.types as types

In [2]:
# Obtain the Spark session for this notebook (created if none is active yet),
# registered under the application name 'SalesDataPrep'.
spark = (
    SparkSession.builder
    .appName('SalesDataPrep')
    .getOrCreate()
)

## Read CSV Files into DataFrame

In [0]:
# Read the raw sales CSV data: the first line of each file is treated as the
# header and column types are inferred from the values.
sales_df = spark.read.csv(
    'dbfs:/FileStore/salesdata/raw',
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first 10 rows of the raw DataFrame.
sales_df.show(n=10)

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  295665|  Macbook Pro Laptop|               1|    1700.0|12/30/19 00:01|136 Church St, Ne...|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...|
|  295667|USB-C Charging Cable|               1|     11.95|12/12/19 18:21|277 Main St, New ...|
|  295668|    27in FHD Monitor|               1|    149.99|12/22/19 15:13|410 6th St, San F...|
|  295669|USB-C Charging Cable|               1|     11.95|12/18/19 12:38|43 Hill St, Atlan...|
|  295670|AA Batteries (4-p...|               1|      3.84|12/31/19 22:58|200 Jefferson St,...|
|  295671|USB-C Charging Cable|               1|     11.95|12/16/19 15:10|928 12th St, Port...|
|  295672|USB-C Charging Cable|         

In [0]:
# Print the inferred column names, types and nullability of the raw DataFrame.
sales_df.printSchema()

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)



## Creating a Database on Databricks

In [0]:
# Create the sales_db database; IF NOT EXISTS makes this cell safe to re-run.
spark.sql('CREATE DATABASE IF NOT EXISTS sales_db')

Out[7]: DataFrame[]

In [0]:
# Make sales_db the current database so the unqualified table names used in
# the following cells resolve inside it.
spark.sql('USE sales_db')

Out[8]: DataFrame[]

In [0]:
# DDL for the raw landing table: every column is kept as STRING here; typing
# happens later when the data is transformed for the `sales` table.
create_sales_raw_ddl = """CREATE TABLE IF NOT EXISTS sales_raw (
            OrderID STRING,
            Product STRING,
            QuantityOrdered STRING,
            PriceEach STRING,
            OrderDate STRING,
            PurchaseAddress STRING
            )
          """
spark.sql(create_sales_raw_ddl)

Out[10]: DataFrame[]

## Insert DataFrame Data into the Sales Raw Table

In [0]:
# Expose the raw DataFrame to Spark SQL as the temporary view 'tmp_sales_raw'.
sales_df.createOrReplaceTempView(name='tmp_sales_raw')

In [0]:
# Sanity check: the temp view returns the same rows as the DataFrame.
tmp_sales_raw_preview = spark.sql('SELECT * FROM tmp_sales_raw')
tmp_sales_raw_preview.show(n=5)

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  295665|  Macbook Pro Laptop|               1|    1700.0|12/30/19 00:01|136 Church St, Ne...|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...|
|  295667|USB-C Charging Cable|               1|     11.95|12/12/19 18:21|277 Main St, New ...|
|  295668|    27in FHD Monitor|               1|    149.99|12/22/19 15:13|410 6th St, San F...|
|  295669|USB-C Charging Cable|               1|     11.95|12/18/19 12:38|43 Hill St, Atlan...|
+--------+--------------------+----------------+----------+--------------+--------------------+
only showing top 5 rows



In [0]:
# List the columns and inferred types of the temp view (source side of the load).
tmp_sales_raw_columns = spark.sql('DESCRIBE tmp_sales_raw')
tmp_sales_raw_columns.show()

+----------------+---------+-------+
|        col_name|data_type|comment|
+----------------+---------+-------+
|        Order ID|      int|   null|
|         Product|   string|   null|
|Quantity Ordered|      int|   null|
|      Price Each|   double|   null|
|      Order Date|   string|   null|
|Purchase Address|   string|   null|
+----------------+---------+-------+



In [0]:
# List the columns of the sales_raw table (target side of the load) so they can
# be compared, in order, with the temp view's columns.
sales_raw_columns = spark.sql('DESCRIBE sales_raw')
sales_raw_columns.show()

+---------------+---------+-------+
|       col_name|data_type|comment|
+---------------+---------+-------+
|        OrderID|   string|       |
|        Product|   string|       |
|QuantityOrdered|   string|       |
|      PriceEach|   string|       |
|      OrderDate|   string|       |
|PurchaseAddress|   string|       |
|               |         |       |
| # Partitioning|         |       |
|Not partitioned|         |       |
+---------------+---------+-------+



In [0]:
# Replace the contents of sales_raw with the rows of the temp view.
# SELECT * is matched to the table's columns by position, so the view's column
# order must line up with the table definition.
load_sales_raw_sql = """INSERT OVERWRITE sales_raw
             SELECT * FROM tmp_sales_raw
          """
spark.sql(load_sales_raw_sql)

Out[20]: DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Verify the load by previewing the rows now stored in sales_raw.
sales_raw_preview = spark.sql('SELECT * FROM sales_raw')
sales_raw_preview.show()

+-------+--------------------+---------------+---------+--------------+--------------------+
|OrderID|             Product|QuantityOrdered|PriceEach|     OrderDate|     PurchaseAddress|
+-------+--------------------+---------------+---------+--------------+--------------------+
| 295665|  Macbook Pro Laptop|              1|   1700.0|12/30/19 00:01|136 Church St, Ne...|
| 295666|  LG Washing Machine|              1|    600.0|12/29/19 07:03|562 2nd St, New Y...|
| 295667|USB-C Charging Cable|              1|    11.95|12/12/19 18:21|277 Main St, New ...|
| 295668|    27in FHD Monitor|              1|   149.99|12/22/19 15:13|410 6th St, San F...|
| 295669|USB-C Charging Cable|              1|    11.95|12/18/19 12:38|43 Hill St, Atlan...|
| 295670|AA Batteries (4-p...|              1|     3.84|12/31/19 22:58|200 Jefferson St,...|
| 295671|USB-C Charging Cable|              1|    11.95|12/16/19 15:10|928 12th St, Port...|
| 295672|USB-C Charging Cable|              2|    11.95|12/13/19 09:29

In [0]:
# Build the typed, enriched version of the raw sales rows:
#   * cast the all-STRING raw columns to their real types,
#   * parse OrderDate (format 'MM/dd/yy HH:mm') into a timestamp,
#   * derive City and State from the comma-separated PurchaseAddress,
#   * derive ReportYear and Month, the partition columns of the `sales` table.
# Rows whose OrderID is NULL are dropped.
#
# The address parts are separated by ', ' (see "136 Church St, Ne..." in the
# preview above), so splitting on ',' leaves a leading space on each part.
# trim() removes it so City/State hold clean values (e.g. 'NY', not ' NY').
tmp_sales_df = spark.sql("""
                        SELECT
                        cast(OrderID as int) as OrderID,
                        Product,
                        cast(QuantityOrdered as int) as Quantity,
                        cast(PriceEach as float) as PriceEach,
                        to_timestamp(OrderDate, 'MM/dd/yy HH:mm') as OrderDate,
                        PurchaseAddress,
                        trim(split(PurchaseAddress, ',')[1]) as City,
                        trim(substr(split(PurchaseAddress, ',')[2], 1, 3)) as State,
                        year(to_timestamp(OrderDate, 'MM/dd/yy HH:mm')) as ReportYear,
                        month(to_timestamp(OrderDate, 'MM/dd/yy HH:mm')) as Month
                        FROM sales_raw
                        WHERE OrderID is not null
                        """)

In [0]:
# Expose the transformed DataFrame to Spark SQL as the temporary view 'tmp_sales'.
tmp_sales_df.createOrReplaceTempView(name='tmp_sales')

In [0]:
# Confirm the transformed view has the expected column names and types.
tmp_sales_columns = spark.sql('describe tmp_sales')
tmp_sales_columns.show()

+---------------+---------+-------+
|       col_name|data_type|comment|
+---------------+---------+-------+
|        OrderID|      int|   null|
|        Product|   string|   null|
|       Quantity|      int|   null|
|      PriceEach|    float|   null|
|      OrderDate|timestamp|   null|
|PurchaseAddress|   string|   null|
|           City|   string|   null|
|          State|   string|   null|
|     ReportYear|      int|   null|
|          Month|      int|   null|
+---------------+---------+-------+



In [0]:
# DDL for the published table: snappy-compressed Parquet files stored at the
# 'published' location and partitioned by ReportYear and Month.
# IF NOT EXISTS makes this cell safe to re-run.
create_sales_ddl = """
CREATE TABLE IF NOT EXISTS sales (
  OrderID INT,
  Product STRING,
  Quantity INT,
  PriceEach FLOAT,
  OrderDate TIMESTAMP,
  StoreAddress STRING,
  City STRING,
  State STRING,
  ReportYear INT,
  Month INT
)
USING PARQUET
PARTITIONED BY (ReportYear, Month)
OPTIONS('compression' = 'snappy')
LOCATION 'dbfs:/FileStore/salesdata/published'
"""
spark.sql(create_sales_ddl)

Out[71]: DataFrame[]

In [0]:
# Publish the transformed rows into the partitioned `sales` table.
# The SELECT list is matched to the table's columns by position, so it follows
# the order of the CREATE TABLE statement; PurchaseAddress fills StoreAddress.
# OrderDate is selected from tmp_sales (a literal NULL here would discard the
# parsed timestamp and leave the column empty for every published row).
# NOTE: INSERT INTO appends, so re-running this cell adds the same rows again.
spark.sql("""
INSERT INTO sales
SELECT
  OrderID,
  Product,
  Quantity,
  PriceEach,
  OrderDate,
  PurchaseAddress,
  City,
  State,
  ReportYear,
  Month
FROM tmp_sales
""")

Out[72]: DataFrame[]