In [2]:
import gdown
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
from pyspark.sql.functions import col


In [3]:
# Create the Spark session for this notebook; getOrCreate() hands back an
# already-running session instead of building a second one.
spark = (
    SparkSession.builder
    .appName("M5-Preprocessing")
    .config("spark.driver.memory", "8g")
    .config("spark.sql.shuffle.partitions", "200")
    .getOrCreate()
)

In [3]:
# Directory that receives the raw CSV files fetched from Google Drive.
RAW_DIR = '../dataset/raw'
os.makedirs(RAW_DIR, exist_ok=True)

# Local filename -> Google Drive file id.
files = {
    "calendar.csv": "1P22V8M8EY28dmigNW0gS_TqBH8Y3Rwba",
    "sales_train_evaluation.csv": "1mLK9h7aMsM3lgco9UIs7wGXYeLXQ9fmP",
    "sales_train_validation.csv": "1sftR8_l0QVmeZmT1mkdRYoBGVp6vJtSC",
    "sample_submission.csv": "1-IW1kAVLK2b_gZ9CzKaxB7RBSie5T2FN",
    "sell_prices.csv": "1aPt4zbe_2UGJmK2cKNmzu7bkyECWHKRb"
}
for filename, file_id in files.items():
    url = f"https://drive.google.com/uc?id={file_id}"
    save_path = f"{RAW_DIR}/{filename}"

    # Re-running the notebook should not pull hundreds of megabytes again:
    # a non-empty file that is already on disk is reused as-is.
    # Delete the file manually to force a fresh download.
    if os.path.isfile(save_path) and os.path.getsize(save_path) > 0:
        print(f"Skipping {filename}: already present at {save_path}")
        continue

    # Some gdown releases signal failure by returning None instead of raising;
    # fail loudly here rather than letting a later read hit a missing file.
    downloaded = gdown.download(url, save_path, quiet=False)
    if downloaded is None:
        raise RuntimeError(f"Download failed for {filename} ({url})")

Downloading...
From: https://drive.google.com/uc?id=1P22V8M8EY28dmigNW0gS_TqBH8Y3Rwba
To: e:\Riwayat Kuliah\Semester 7\Big Data dan AI\UAS\dataset\raw\calendar.csv
100%|██████████| 103k/103k [00:00<00:00, 136kB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1mLK9h7aMsM3lgco9UIs7wGXYeLXQ9fmP
From (redirected): https://drive.google.com/uc?id=1mLK9h7aMsM3lgco9UIs7wGXYeLXQ9fmP&confirm=t&uuid=e11ac76d-55a7-420e-8908-8f4e95527d77
To: e:\Riwayat Kuliah\Semester 7\Big Data dan AI\UAS\dataset\raw\sales_train_evaluation.csv
100%|██████████| 122M/122M [03:27<00:00, 587kB/s] 
Downloading...
From (original): https://drive.google.com/uc?id=1sftR8_l0QVmeZmT1mkdRYoBGVp6vJtSC
From (redirected): https://drive.google.com/uc?id=1sftR8_l0QVmeZmT1mkdRYoBGVp6vJtSC&confirm=t&uuid=21b87acc-4457-4c93-bf31-80103fa59776
To: e:\Riwayat Kuliah\Semester 7\Big Data dan AI\UAS\dataset\raw\sales_train_validation.csv
100%|██████████| 120M/120M [04:06<00:00, 488kB/s] 
Downloading...
From: https://drive

In [14]:
# Every raw file is read the same way: first row is the header and Spark
# infers the column types from the data.
csv_options = {"header": True, "inferSchema": True}

# Daily unit sales, one column per day (wide format).
sales = spark.read.csv("../dataset/raw/sales_train_validation.csv", **csv_options)

# Calendar table.
calendar = spark.read.csv("../dataset/raw/calendar.csv", **csv_options)

# Sell prices table.
sell_prices = spark.read.csv("../dataset/raw/sell_prices.csv", **csv_options)


In [15]:
# Identifier columns kept alongside the daily sales values.
meta_cols = [
    "id",
    "item_id",
    "dept_id",
    "cat_id",
    "store_id",
    "state_id",
]

# Cut-off for the day columns; the value reflects the author's latest choice
# (original note: "cut according to the latest mood").
last_day = 1913

# Column names d_1 .. d_1913.
day_cols = [f"d_{i}" for i in range(1, last_day + 1)]

# Keep only the identifier columns and the selected day columns, in that order.
sales = sales.select(*meta_cols, *day_cols)

In [16]:
# Row count of the wide-format sales table, taken before it is unpivoted.
sales.count()

30490

In [17]:
# Unpivot wide -> long. stack(n, 'name1', col1, 'name2', col2, ...) emits one
# (day, sales) row per listed day column for every input row.
# NOTE: this cell rebinds `sales`; running it a second time on its own fails
# because the d_* columns no longer exist in the long-format frame.
name_value_pairs = (f"'{c}', {c}" for c in day_cols)
stack_expr = ", ".join(name_value_pairs)

unpivoted = expr(f"stack({len(day_cols)}, {stack_expr}) as (day, sales)")
sales = sales.select(*meta_cols, unpivoted)

In [18]:
# Discard rows that have no sales value.
has_sales_value = col("sales").isNotNull()
sales = sales.where(has_sales_value)

In [19]:
# Force the sales column to an integer type, then show the resulting schema.
sales_as_int = col("sales").cast("int")
sales = sales.withColumn("sales", sales_as_int)
sales.printSchema()

root
 |-- id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- dept_id: string (nullable = true)
 |-- cat_id: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- state_id: string (nullable = true)
 |-- day: string (nullable = true)
 |-- sales: integer (nullable = true)



In [20]:
# Output location for the long-format sales table.
path = "../dataset/cooked/preprocessed_sales"

# Persist as Parquet; any output already at `path` is replaced.
sales.write.parquet(path, mode="overwrite")