# Data Cleaning

This notebook walks through the steps needed to clean the raw order and product data (loaded from CSV) and to save the cleaned result as Parquet.

## Imports

In [1]:
import os

# ETL and Data Manipulation
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, to_date, expr

## Loading Data

In [3]:
# Start (or reuse) a Spark session that runs locally on two threads.
spark = (
    SparkSession.builder
    .appName("hello")
    .master("local[2]")
    .getOrCreate()
)

In [None]:
# Root directory that holds the raw CSV files.
DATA_PATH = "/sparkdata/wholesale-recommender"

# Both files are read with a header row and with Spark-inferred column types.
order_lines = spark.read.csv(
    f"{DATA_PATH}/orders.csv",
    header=True,
    inferSchema=True,
)
products = spark.read.csv(
    f"{DATA_PATH}/product-supplier.csv",
    header=True,
    inferSchema=True,
)

## Data Cleaning

#### Datetime columns

In [7]:
# Parse both date columns in place using the "dd-MMM-yy" pattern.
for date_column in ("Date Order was placed", "Delivery Date"):
    order_lines = order_lines.withColumn(
        date_column,
        to_date(col(date_column), "dd-MMM-yy"),
    )

#### Customer Status column

In [8]:
# Cast "Customer Status" to string and lower-case it, replacing the column.
status_lowercase = lower(col("Customer Status").cast("string"))
order_lines = order_lines.withColumn("Customer Status", status_lowercase)

#### Add missing (derived) columns

In [9]:
# Per-unit retail price: order total divided by the quantity ordered.
retail_price_per_unit = (
    col("Total Retail Price for This Order") / col("Quantity Ordered")
)
# Total cost: quantity ordered multiplied by the per-unit cost price.
total_cost_price = col("Quantity Ordered") * col("Cost Price Per Unit")

order_lines = (
    order_lines
    .withColumn("Retail price Per Unit", retail_price_per_unit)
    .withColumn("Total Cost price", total_cost_price)
)

## Save as parquet

In [12]:
# Absolute output directory: a "cleaned" folder under the data directory.
OUTPUT_PATH = os.path.abspath(os.path.join(DATA_PATH, 'cleaned'))

# One Parquet dataset per DataFrame, each in its own sub-folder.
order_lines_target = os.path.join(OUTPUT_PATH, "order_lines")
products_target = os.path.join(OUTPUT_PATH, "products")

# Existing output at either location is overwritten.
order_lines.write.mode("overwrite").parquet(order_lines_target)
products.write.mode("overwrite").parquet(products_target)