# Feature Engineering

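This notebook builds one feature row per customer from the cleaned order lines and product data: activity, monetary, product-diversity, seasonality, and category-spend features. The result is saved as a parquet dataset.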

## Imports

In [3]:
import os

# ETL and Data Manipulation
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, expr, lower, to_date
from pyspark.sql import functions as F
import numpy as np

# Visualizations
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

## Load data

In [4]:
# Start (or reuse) a local Spark session limited to a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("LocalSparkForTesting")
    .getOrCreate()
)

In [6]:
# Directory holding the cleaned input datasets.
DATA_PATH = os.path.abspath(
    os.path.join('/sparkdata/wholesale-recommender', 'cleaned')
)

# Each input is a parquet dataset stored in a sub-directory of DATA_PATH.
order_lines, products = (
    spark.read.parquet(os.path.join(DATA_PATH, dataset_name))
    for dataset_name in ("order_lines", "products")
)

## Feature Engineering

In [7]:
# Base frame: one row per distinct "Customer ID" found in the order lines.
customers = (
    order_lines
    .select(F.col("Customer ID"))
    .distinct()
)

### Activity-based Features

#### Order Count

In [8]:
# Number of distinct orders placed by each customer.
order_counts = (
    order_lines
    .groupBy("Customer ID")
    .agg(F.countDistinct("Order ID").alias("order_count"))
)

# Left join so every customer in the base frame is kept.
customers = customers.join(order_counts, "Customer ID", "left")

#### Months Active

In [9]:
# Per customer: first and most recent order date, the number of days between
# them, and that span expressed in 30-day months.
activity_extremes = (
    order_lines
    .groupBy("Customer ID")
    .agg(
        F.min("Date Order was placed").alias("first_order"),
        F.max("Date Order was placed").alias("most_recent_order"),
    )
    .withColumn("lifetime_days", F.datediff("most_recent_order", "first_order"))
    .withColumn("active_months", F.col("lifetime_days") / 30)
)

# Only the month span is carried onto the customer frame.
customers = customers.join(
    activity_extremes.select("Customer ID", "active_months"),
    on="Customer ID",
    how="left",
)


#### Avg days between orders

In [10]:
# Longest first-to-last order span (in days) across all customers. Used as the
# fallback for customers with at most one order, who have no gap to measure.
max_lifetime_days = activity_extremes.agg(F.max("lifetime_days")).first()[0]

# n orders are separated by n - 1 gaps, so the average gap is the lifetime
# divided by (order_count - 1), not by order_count.
# active_months * 30 recovers lifetime_days (active_months = lifetime_days / 30).
customers = customers.withColumn(
    "avg_days_between_orders",
    F.when(
        F.col("order_count") > 1,
        (F.col("active_months") * 30) / (F.col("order_count") - 1),
    ).otherwise(F.lit(max_lifetime_days)),
)


### Monetary Features

#### Average order value

In [11]:
# An order spans several order lines, so averaging "Total Cost price" directly
# gives the mean *line* value. Total each order first, then average those
# per-order totals for every customer.
avg_order_value = (
    order_lines
    .groupBy("Customer ID", "Order ID")
    .agg(F.sum("Total Cost price").alias("order_total"))
    .groupBy("Customer ID")
    .agg(F.mean("order_total").alias("avg_order_value"))
)

customers = customers.join(avg_order_value, on="Customer ID", how="left")


### Product Diversity

#### Number of unique categories and groups bought from

In [12]:
# Attach product attributes to every order line; the left join keeps lines
# whose "Product ID" has no match in `products`.
order_lines_products = order_lines.join(products, "Product ID", "left")

# Distinct product categories and product groups each customer bought from.
n_unique = (
    order_lines_products
    .groupBy("Customer ID")
    .agg(
        F.countDistinct("Product Category").alias("n_unique_categories"),
        F.countDistinct("Product Group").alias("n_unique_groups"),
    )
)

customers = customers.join(n_unique, "Customer ID", "left")


### Seasonality Profile

#### Quarterly Distribution of orders

In [13]:
# Calendar quarter (1-4) of each order line, derived from its delivery date.
# NOTE(review): seasonality is keyed on "Delivery Date" while the activity
# features use "Date Order was placed" — confirm this is intended.
order_lines_with_quarter = order_lines.withColumn("Quarter", F.quarter("Delivery Date"))

# Total cost per customer per quarter
quarterly_orders = (
    order_lines_with_quarter
    .groupBy("Customer ID", "Quarter")
    .agg(F.sum("Total Cost price").alias("sum_quarter_orders"))
)

# Pivot to wide format; listing the values pins the output columns to
# "1", "2", "3", "4" even when a quarter never occurs in the data.
quarterly_pivot = (
    quarterly_orders
    .groupBy("Customer ID")
    .pivot("Quarter", [1, 2, 3, 4])
    .agg(F.sum("sum_quarter_orders"))
)

# Quarters without any spend are null after the pivot; treat them as 0.
for q in [1, 2, 3, 4]:
    quarterly_pivot = quarterly_pivot.withColumn(str(q), F.coalesce(F.col(str(q)), F.lit(0)))

# Share of the customer's total spend that falls in each quarter.
total = sum([F.col(str(q)) for q in [1, 2, 3, 4]])
# A zero total is replaced by null so the rate becomes null instead of
# dividing by zero (which raises an error when ANSI SQL mode is enabled).
safe_total = F.when(total != 0, total)
for q in [1, 2, 3, 4]:
    quarterly_pivot = quarterly_pivot.withColumn(f"Q{q}_rate", F.col(str(q)) / safe_total)

# Keep only the normalized columns
quarterly_rates = quarterly_pivot.select(
    "Customer ID", "Q1_rate", "Q2_rate", "Q3_rate", "Q4_rate"
)

# Join to customers
customers = customers.join(quarterly_rates, on="Customer ID", how="left")


### Purchases from categories

In [14]:
def _quoted_col(name):
    """Return a Column for `name`, backtick-quoted so that characters such as
    '.' in pivot-generated column names are not parsed as struct field access."""
    return F.col("`" + name.replace("`", "``") + "`")


# 1. Join product metadata
order_lines_products = order_lines.join(products, on="Product ID", how="left")

# 2. Combined "<Product Line>_<Product Category>" label for each order line
order_lines_products = order_lines_products.withColumn(
    "product_group_cat_name",
    F.concat_ws("_", F.col("Product Line"), F.col("Product Category")),
)

# 3. Sum total spend by customer + category label
customer_orders_sum_prod_group = (
    order_lines_products
    .groupBy("Customer ID", "product_group_cat_name")
    .agg(F.sum("Total Cost price").alias("group_sum"))
)

# 4. Pivot to wide format: 1 column per label. Step 3 leaves a single row per
#    (customer, label), so first() simply picks that value.
order_lines_product_groups_agg = (
    customer_orders_sum_prod_group
    .groupBy("Customer ID")
    .pivot("product_group_cat_name")
    .agg(F.first("group_sum"))
)

# 5. Fill nulls with 0 in the pivoted columns only, so a null "Customer ID"
#    is never rewritten to 0.
category_cols = [c for c in order_lines_product_groups_agg.columns if c != "Customer ID"]
order_lines_product_groups_agg = order_lines_product_groups_agg.select(
    "Customer ID",
    *[F.coalesce(_quoted_col(c), F.lit(0)).alias(c) for c in category_cols],
)

# 6. Normalize row-wise: value / row sum. The sum starts from lit(0) so it is a
#    Column even when there are no category columns. A zero row sum is replaced
#    by null so the share is null instead of dividing by zero (an error when
#    ANSI SQL mode is enabled).
row_sum_expr = sum((_quoted_col(c) for c in category_cols), F.lit(0))
safe_row_sum = F.when(row_sum_expr != 0, row_sum_expr)
normalized = order_lines_product_groups_agg.select(
    "Customer ID",
    *[(_quoted_col(c) / safe_row_sum).alias(f"category_{c}") for c in category_cols],
)

# 7. Join back to customer
customers = customers.join(normalized, on="Customer ID", how="left")

## Save

In [21]:
# Directory that receives the engineered feature table.
OUTPUT_PATH = os.path.abspath(
    os.path.join('/sparkdata/wholesale-recommender', 'processed')
)

# Write the customer features as parquet, replacing any earlier export.
customers.write.parquet(
    os.path.join(OUTPUT_PATH, "customers_features"),
    mode="overwrite",
)