# Import Libraries

In [27]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import os


In [28]:
# Spark settings applied to the session builder below.
spark_settings = {
    "spark.driver.memory": "6g",
    "spark.executor.memory": "6g",
    "spark.sql.shuffle.partitions": "32",
}

# Local session with two worker threads; getOrCreate() reuses an already
# running session if the kernel has one.
session_builder = SparkSession.builder.appName("M5-Dataset").master("local[2]")
for setting_key, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_key, setting_value)

spark = session_builder.getOrCreate()

# Load Dataset CSV

In [31]:
# Calendar table: first row is the header, column types are inferred by Spark.
calendar_df = spark.read.csv(
    "../dataset/raw/calendar.csv",
    header=True,
    inferSchema=True,
)

In [32]:
# Wide sales table (one d_* column per day): header row present, types inferred.
sales_df = spark.read.csv(
    "../dataset/raw/sales_train_validation.csv",
    header=True,
    inferSchema=True,
)

In [33]:
# Sell-price table: header row present, column types inferred by Spark.
prices_df = spark.read.csv(
    "../dataset/raw/sell_prices.csv",
    header=True,
    inferSchema=True,
)

In [34]:
# Show the inferred schema of each raw table, in load order.
for raw_frame in (calendar_df, sales_df, prices_df):
    raw_frame.printSchema()

root
 |-- date: date (nullable = true)
 |-- wm_yr_wk: integer (nullable = true)
 |-- weekday: string (nullable = true)
 |-- wday: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- d: string (nullable = true)
 |-- event_name_1: string (nullable = true)
 |-- event_type_1: string (nullable = true)
 |-- event_name_2: string (nullable = true)
 |-- event_type_2: string (nullable = true)
 |-- snap_CA: integer (nullable = true)
 |-- snap_TX: integer (nullable = true)
 |-- snap_WI: integer (nullable = true)

root
 |-- id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- dept_id: string (nullable = true)
 |-- cat_id: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- state_id: string (nullable = true)
 |-- d_1: integer (nullable = true)
 |-- d_2: integer (nullable = true)
 |-- d_3: integer (nullable = true)
 |-- d_4: integer (nullable = true)
 |-- d_5: integer (nullable = true)
 |-- d_6: integer (nulla

# Unpivot data Penjualan

### Ambil Kolom d_*

In [35]:
# Names of the per-day sales columns (d_1, d_2, ...), in their original order.
d_cols = [
    column_name
    for column_name in sales_df.columns
    if column_name.startswith("d_")
]

### Unpivot dengan stack()

In [36]:
# Arguments for stack(): for every day column, a string literal holding its
# name followed by a back-quoted reference to the column itself.
stack_args = ",".join(f"'{day_col}', `{day_col}`" for day_col in d_cols)

# Number of (label, value) pairs handed to stack().
n = len(d_cols)

# Wide -> long: one output row per (item, store, day) with columns d and sales.
sales_long_df = sales_df.selectExpr(
    "item_id",
    "store_id",
    f"stack({n}, {stack_args}) as (d, sales)",
)

item_id | store_id | d | sales


In [37]:
# Sanity check of the unpivoted table: schema first, then the first five rows.
sales_long_df.printSchema()
sales_long_df.show(n=5)


root
 |-- item_id: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- d: string (nullable = true)
 |-- sales: integer (nullable = true)

+-------------+--------+---+-----+
|      item_id|store_id|  d|sales|
+-------------+--------+---+-----+
|HOBBIES_1_001|    CA_1|d_1|    0|
|HOBBIES_1_001|    CA_1|d_2|    0|
|HOBBIES_1_001|    CA_1|d_3|    0|
|HOBBIES_1_001|    CA_1|d_4|    0|
|HOBBIES_1_001|    CA_1|d_5|    0|
+-------------+--------+---+-----+
only showing top 5 rows



### Join Kalender

In [38]:
# Keep only the calendar columns needed downstream; "d" is the join key.
calendar_selected = calendar_df.select(["d", "date", "weekday", "event_name_1"])


### sales x Calendar

In [39]:
# Attach date, weekday and event to every sales row; the left join keeps
# sales rows even when the calendar has no matching day label.
sales_calendar_df = sales_long_df.join(calendar_selected, "d", "left")

sales_calendar_df.show(n=5)


+---+-------------+--------+-----+----------+---------+------------+
|  d|      item_id|store_id|sales|      date|  weekday|event_name_1|
+---+-------------+--------+-----+----------+---------+------------+
|d_1|HOBBIES_1_001|    CA_1|    0|2011-01-29| Saturday|        NULL|
|d_2|HOBBIES_1_001|    CA_1|    0|2011-01-30|   Sunday|        NULL|
|d_3|HOBBIES_1_001|    CA_1|    0|2011-01-31|   Monday|        NULL|
|d_4|HOBBIES_1_001|    CA_1|    0|2011-02-01|  Tuesday|        NULL|
|d_5|HOBBIES_1_001|    CA_1|    0|2011-02-02|Wednesday|        NULL|
+---+-------------+--------+-----+----------+---------+------------+
only showing top 5 rows



### Join dengan Sell Prices

In [40]:
# Day label -> week id lookup; wm_yr_wk is one of the keys of the price join.
calendar_week = calendar_df.select(
    [
        "d",
        "wm_yr_wk",
    ]
)


In [41]:
# Add the week id to each sales row (left join on the day label "d").
sales_calendar_week_df = sales_calendar_df.join(calendar_week, "d", "left")


In [42]:
# Bring in sell_price per (item, store, week). The left join keeps sales rows
# that have no price record; their price columns come back as NULL.
sales_full_df = sales_calendar_week_df.join(
    prices_df,
    ["item_id", "store_id", "wm_yr_wk"],
    "left",
)


In [43]:
# Final modelling table: fixed column order, sales cast to integer, and
# sell_price / event_name_1 renamed to price / event.
base_table_df = sales_full_df.select(
    "date",
    "item_id",
    "store_id",
    col("sales").cast(IntegerType()),
    col("sell_price").alias("price"),
    col("event_name_1").alias("event"),
    "weekday",
)


In [44]:
# Check the final table: schema first, then the first five rows.
base_table_df.printSchema()
base_table_df.show(n=5)

root
 |-- date: date (nullable = true)
 |-- item_id: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- sales: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- event: string (nullable = true)
 |-- weekday: string (nullable = true)

+----------+-------------+--------+-----+-----+-----+---------+
|      date|      item_id|store_id|sales|price|event|  weekday|
+----------+-------------+--------+-----+-----+-----+---------+
|2011-01-29|HOBBIES_2_112|    TX_2|    0| NULL| NULL| Saturday|
|2011-01-30|HOBBIES_2_112|    TX_2|    0| NULL| NULL|   Sunday|
|2011-01-31|HOBBIES_2_112|    TX_2|    0| NULL| NULL|   Monday|
|2011-02-01|HOBBIES_2_112|    TX_2|    0| NULL| NULL|  Tuesday|
|2011-02-02|HOBBIES_2_112|    TX_2|    0| NULL| NULL|Wednesday|
+----------+-------------+--------+-----+-----+-----+---------+
only showing top 5 rows



In [50]:
# Persist the base table as CSV part files with a header row.
# mode="overwrite" replaces a previous export instead of failing: the default
# save mode raises an error when the target directory already exists, which
# would break any re-run of this notebook (Restart Kernel -> Run All).
base_table_df.write.csv(
    "../dataset/cooked/1/",
    header=True,
    mode="overwrite",
)