# Preprocess for model training

This notebook performs the preprocessing needed to train the individual models.

Note: The preprocessing for some of the models is the same, so it makes sense to do it in one flow.

## Imports

In [13]:
import os

# ETL and Data Manipulation
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, when, udf, log1p
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

from pyspark.sql.types import DoubleType

## Load data

In [14]:
# Build (or reuse) a Spark session that runs locally with a single worker thread
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("LocalSparkForTesting")
    .getOrCreate()
)

In [15]:
# Root folder of the wholesale-recommender data
DATA_PATH = '/sparkdata/wholesale-recommender'

# Stage sub-folders the inputs are read from
processed_dir = os.path.join(DATA_PATH, 'processed')
cleaned_dir = os.path.join(DATA_PATH, 'cleaned')

# Customer-level inputs (reads are lazy: no rows are materialised until an action runs)
customer_features = spark.read.parquet(os.path.join(processed_dir, "customers_features"))
customer_cluster = spark.read.parquet(os.path.join(processed_dir, "customer_cluster"))

# Order lines and the product table
order_lines = spark.read.parquet(os.path.join(cleaned_dir, "order_lines"))
products = spark.read.parquet(os.path.join(cleaned_dir, "products"))

## Preprocess Interactions

### Clean column names

In [16]:
# Keep only the order-line columns used downstream, renamed to snake_case
interactions = order_lines.select(
    *[
        col(source_name).alias(target_name)
        for source_name, target_name in (
            ("Customer ID", "customer_id"),
            ("Product ID", "product_id"),
            ("Quantity Ordered", "purchase_count"),
        )
    ]
)

### Join product category to the order lines

In [17]:
# Product -> category lookup, with the key renamed to match the interactions frame
product_categories = products.select(
    col("`Product ID`").alias('product_id'),
    col("`Product Category`"),
)

# Left join keeps every order line, even when its product has no match in the lookup
# NOTE(review): assumes one row per product in `products`; repeated product IDs
# would multiply the matching order lines -- TODO confirm
interactions_with_category = interactions.join(product_categories, on="product_id", how="left")

### Add rating column

We use the purchase count as the 'rating' for ALS and apply a log-transform to scale it.

In [18]:
# rating = log(1 + purchase_count)
log_scaled_quantity = log1p(col("purchase_count"))

# NOTE(review): this rebinds `interactions` to the category-joined frame, so
# re-running the join cell after this one would join the category a second time
interactions = interactions_with_category.withColumn("rating", log_scaled_quantity)

### Map IDs to integer indices

To accommodate PySpark ALS's requirement that IDs be within the integer range, we map each ID to an integer index.

In [19]:
# Declare one StringIndexer per ID column ...
customer_index_stage = StringIndexer(inputCol="customer_id", outputCol="customer_index")
product_index_stage = StringIndexer(inputCol="product_id", outputCol="product_index")

# ... and fit each on the interactions frame to obtain the fitted indexer models
customer_indexer = customer_index_stage.fit(interactions)
product_indexer = product_index_stage.fit(interactions)

In [20]:
# Folder that receives the fitted indexer models
MODEL_PATH = os.path.abspath(os.path.join('/sparkdata/wholesale-recommender', 'models'))

customer_model_path = os.path.join(MODEL_PATH, "customer_indexer_model")
product_model_path = os.path.join(MODEL_PATH, "product_indexer_model")

# Persist both fitted indexers, replacing anything already saved at the target path
for fitted_indexer, target_path in (
    (customer_indexer, customer_model_path),
    (product_indexer, product_model_path),
):
    fitted_indexer.write().overwrite().save(target_path)

In [21]:
# Apply the customer indexer first, then the product indexer, to add both index columns
interactions_indexed = product_indexer.transform(customer_indexer.transform(interactions))

# Keep the columns to persist; the two index columns are cast to int
als_input = interactions_indexed.select(
    "customer_id",
    col("`Product Category`"),
    col("customer_index").cast("int"),
    col("product_index").cast("int"),
    "rating",
)

### Save processed model input

In [23]:
# Folder for processed outputs
# NOTE(review): this rebinds DATA_PATH (the data root in the load cell) to the
# 'processed' sub-folder; a dedicated name would avoid the reuse -- confirm no
# other cell relies on the rebound value before changing it
DATA_PATH = os.path.abspath(os.path.join('/sparkdata/wholesale-recommender', 'processed'))

# Write the ALS input as parquet, replacing any existing output at that path
interactions_output_path = os.path.join(DATA_PATH, "interactions")
als_input.write.parquet(interactions_output_path, mode="overwrite")