In [62]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions
import os
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, col
from pyspark.ml.feature import StringIndexer

In [43]:
# Build (or reuse) the Spark session for this notebook: app name "feature",
# running locally on all available cores.
session_builder = SparkSession.builder.appName("feature").master("local[*]")

# Memory settings are applied one by one so they are easy to tune in one place.
# NOTE(review): with a local[*] master the executor memory setting may have no
# practical effect -- confirm whether only the driver setting matters here.
for conf_key, conf_value in (
    ("spark.driver.memory", "4g"),
    ("spark.executor.memory", "4g"),
):
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()

In [44]:
# Load the stage-1 ("cooked") dataset. The first line of each CSV file is the
# header, and column types are inferred from the data instead of being declared.
path = "../dataset/cooked/1 Dataset"
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv(path)

In [45]:
# Calendar flags derived from day_of_week: is_weekend is 1 when the value is
# 6 or 7, and is_workday is its logical negation (both stored as ints).
# NOTE(review): this assumes 6/7 encode Saturday/Sunday. Spark's own
# dayofweek() numbers days 1=Sunday..7=Saturday, so confirm how the upstream
# day_of_week column was produced.
weekend_flag = col("day_of_week").isin(6, 7)

df = (
    df.withColumn("is_weekend", weekend_flag.cast("int"))
      .withColumn("is_workday", (~weekend_flag).cast("int"))
)


In [46]:
# Price window: one partition per (store, item) series, ordered in time.
#
# Ordering by `wm_yr_wk` alone is not a total order when several rows share
# the same week (the frame appears to hold one row per day -- the sales window
# further down orders the same partitions by `date`). lag() and row-based
# frames over tied rows pick an arbitrary neighbour, so results could differ
# between runs. `date` is added as a tiebreaker to make the row order, and
# every feature built on this window, deterministic.
# NOTE(review): assumes `date` is unique within a (store_id, item_id) series
# -- confirm.
window_price = (
    Window.partitionBy("store_id", "item_id")
          .orderBy("wm_yr_wk", "date")
)

# sell_price of the immediately preceding row in the same series; null on the
# first row of each partition.
df = df.withColumn(
    "lag_price_1",
    lag("sell_price", 1).over(window_price)
)


In [47]:
# Relative price change versus the previous row of the series:
# (current price - previous price) / previous price.
previous_price = col("lag_price_1")
price_delta = col("sell_price") - previous_price

df = df.withColumn("price_change", price_delta / previous_price)


In [48]:
# Trailing mean of sell_price over the current row plus the four rows before
# it (up to five rows in total), using the price window's row order.
# NOTE(review): the column is called "price_roll_4" but the frame spans five
# rows -- confirm whether rowsBetween(-3, 0), or a frame measured in weeks,
# was intended.
price_frame = window_price.rowsBetween(-4, 0)

df = df.withColumn("price_roll_4", avg("sell_price").over(price_frame))


In [49]:
# Sales window: one partition per (store, item) series, ordered by date.
window_sales = Window.partitionBy("store_id", "item_id").orderBy("date")

# Lagged sales features lag_1, lag_7 and lag_28: the sales value 1, 7 and 28
# rows earlier in the same series (null where no such row exists).
for offset in (1, 7, 28):
    df = df.withColumn(f"lag_{offset}", lag("sales", offset).over(window_sales))


In [50]:
# Mean of sales over the seven rows preceding the current one. The current row
# is excluded from the frame, so the feature only looks backwards.
prev_7_rows = window_sales.rowsBetween(-7, -1)

df = df.withColumn("rolling_mean_7", avg("sales").over(prev_7_rows))


In [51]:
# Mean of sales over the 28 rows preceding the current one (current row
# excluded from the frame).
prev_28_rows = window_sales.rowsBetween(-28, -1)

df = df.withColumn("rolling_mean_28", avg("sales").over(prev_28_rows))


In [52]:
# Standard deviation of sales over the seven rows preceding the current one
# (current row excluded from the frame).
std_frame_7 = window_sales.rowsBetween(-7, -1)

df = df.withColumn("rolling_std_7", stddev("sales").over(std_frame_7))


In [53]:
from pyspark.ml import Pipeline

# Categorical string columns to encode, and the name of the numeric index
# column produced for each ("<column>_idx").
cols_to_index = ["store_id", "item_id", "dept_id", "cat_id", "state_id"]
indexed_output = [f"{name}_idx" for name in cols_to_index]

# One StringIndexer stage per categorical column.
# NOTE(review): handleInvalid="skip" asks the indexer to drop rows it treats
# as invalid instead of failing -- confirm no rows are silently lost here.
indexers = []
for source_col, target_col in zip(cols_to_index, indexed_output):
    indexers.append(
        StringIndexer(
            inputCol=source_col,
            outputCol=target_col,
            handleInvalid="skip",
        )
    )

# Fit all indexers on the full frame, then append the index columns to it.
pipeline = Pipeline(stages=indexers)
fitted_pipeline = pipeline.fit(df)
df = fitted_pipeline.transform(df)


In [69]:
# Exploratory check: select rows dated on or after 2016-04-25. The filtered
# frame is not assigned or counted, so this cell only echoes the DataFrame's
# schema repr.
after_cutoff = col("date") >= "2016-04-25"
df.filter(after_cutoff)


DataFrame[store_id: string, item_id: string, wm_yr_wk: int, id: string, dept_id: string, cat_id: string, state_id: string, d_code: string, sales: int, date: date, sell_price: double, day_of_week: int, week: int, month: int, year: int, is_weekend: int, is_workday: int, lag_price_1: double, price_change: double, price_roll_4: double, lag_1: int, lag_7: int, lag_28: int, rolling_mean_7: double, rolling_mean_28: double, rolling_std_7: double, store_id_idx: double, item_id_idx: double, dept_id_idx: double, cat_id_idx: double, state_id_idx: double]

In [68]:
# Show the earliest and latest date in the frame. The aggregates are taken
# from the `functions` module explicitly rather than as bare min/max.
date_bounds = df.select(functions.min("date"), functions.max("date"))
date_bounds.show()


+----------+----------+
| min(date)| max(date)|
+----------+----------+
|2011-01-29|2016-04-24|
+----------+----------+



In [54]:
# df.select("date").show(5)

In [66]:
# df.printSchema()

In [72]:
# Chronological split boundaries (inclusive upper bound of each split).
TRAIN_END = "2015-03-27"
VAL_END = "2016-02-29"
TEST_END = "2016-04-24"

date_col = col("date")

# train: date <= TRAIN_END
train_df = df.filter(date_col <= TRAIN_END)

# validation: TRAIN_END < date <= VAL_END
val_df = df.filter((date_col > TRAIN_END) & (date_col <= VAL_END))

# test: VAL_END < date <= TEST_END
test_df = df.filter((date_col > VAL_END) & (date_col <= TEST_END))

In [73]:
# Report the number of rows in each split. Every count() is a Spark action
# that evaluates the corresponding filtered frame.
for label, split in (
    ("Train:", train_df),
    ("Validation:", val_df),
    ("Test:", test_df),
):
    print(label, split.count())

Train: 46314310
Validation: 10336110
Test: 1676950


In [74]:
# Persist each split as Parquet under the stage-2 dataset directory.
#
# The writer's default save mode raises an error when the target path already
# exists, so this cell failed on every re-run of the notebook (for example
# Restart & Run All after a first successful run). mode("overwrite") makes
# the cell re-runnable by replacing the previous output of the same split.
output_root = "../dataset/cooked/2 Dataset"

for split_name, split_df in (
    ("train", train_df),
    ("val", val_df),
    ("test", test_df),
):
    split_df.write.mode("overwrite").parquet(f"{output_root}/{split_name}")
