In [61]:
# Part 2: Prototype Implementation - Shipping Price Estimation MVP (Local Spark with Parquet)
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, rand, round, expr, udf, concat_ws
from pyspark.sql.types import DoubleType, StringType
import random
import os
import re
import builtins

In [62]:
# Build the SparkSession used by every later cell; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("ShippingPriceEstimationMVP")
    .getOrCreate()
)

print("SparkSession initialized locally.")


SparkSession initialized locally.


In [63]:
# Input CSV and Parquet output locations, relative to the notebook's working directory.
DATA_INPUT_PATH = "data/Amazon Sale Report.csv"
PARQUET_OUTPUT_PATH = "output/shipping_estimates.parquet"

# Create the directory that will hold the Parquet output if it is missing.
# It is derived from PARQUET_OUTPUT_PATH so the two cannot drift apart when the
# path is changed; "." covers a path that has no directory component.
OUTPUT_DIR = os.path.dirname(PARQUET_OUTPUT_PATH) or "."
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory '{OUTPUT_DIR}' ensured at: {os.path.abspath(OUTPUT_DIR)}")

Output directory 'output' ensured at: /home/jovyan/output


In [64]:
try:
    # Read the raw sales report: the first row supplies the column names and
    # Spark infers each column's type from the data.
    sales_df = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(DATA_INPUT_PATH)
    )
    print(f"Loaded data from {DATA_INPUT_PATH}")
    sales_df.printSchema()
except Exception as e:
    print(f"Error loading CSV: {e}")
    print("Please ensure 'Amazon Sale Report.csv' is in a 'data' folder next to your notebook.")
    spark.stop()
    # Re-raise instead of calling exit(): exit() terminates the interpreter
    # (in a notebook, the kernel) and hides the traceback, whereas re-raising
    # stops execution at this cell with the original error visible.
    raise

# --- NEW: Clean Column Names ---
# This function will clean column names by removing special characters and spaces,
# and converting them to lowercase (snake_case like).
def clean_column_names(df):
    """Return ``df`` with every column renamed to a lowercase, underscore-separated form.

    Each name is stripped of surrounding whitespace, has spaces and hyphens
    turned into underscores, is lowercased, and finally loses every character
    that is not ``a-z``, ``0-9`` or ``_``. Column order is unchanged.
    """
    def _normalize(raw_name):
        # Separators become underscores first; the regex then drops whatever
        # punctuation is still left in the lowercased name.
        name = raw_name.strip().replace(" ", "_").replace("-", "_").lower()
        return re.sub(r'[^a-z0-9_]', '', name)

    return df.toDF(*[_normalize(raw_name) for raw_name in df.columns])

# Normalize the column names, then display the resulting schema and a five-row sample.
sales_df = clean_column_names(sales_df)
print("\nSchema after cleaning column names:")
sales_df.printSchema()
sales_df.show(n=5)

Loaded data from data/Amazon Sale Report.csv
root
 |-- index: integer (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Fulfilment: string (nullable = true)
 |-- Sales Channel : string (nullable = true)
 |-- ship-service-level: string (nullable = true)
 |-- Style: string (nullable = true)
 |-- SKU: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- ASIN: string (nullable = true)
 |-- Courier Status: string (nullable = true)
 |-- Qty: integer (nullable = true)
 |-- currency: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- ship-city: string (nullable = true)
 |-- ship-state: string (nullable = true)
 |-- ship-postal-code: double (nullable = true)
 |-- ship-country: string (nullable = true)
 |-- promotion-ids: string (nullable = true)
 |-- B2B: boolean (nullable = true)
 |-- fulfilled-by: string (nullable = true)
 |-- Unnamed: 22

In [65]:
# --- Data Cleaning and Preparation ---
# (cleaned source column, name used for the rest of the notebook), in output order.
column_aliases = [
    ("sku", "product_sku"),
    ("category", "product_category"),
    ("amount", "sale_amount"),
    ("qty", "quantity_sold"),
    ("ship_city", "customer_city"),
    ("ship_state", "customer_state"),
    ("ship_postal_code", "customer_postal_code"),
    ("ship_country", "customer_country"),
    ("ship_service_level", "shipping_service_level"),
    ("fulfilment", "fulfillment_method"),
]

# Fields a row must have for the cost calculation to be possible.
required_fields = ["product_sku", "sale_amount", "quantity_sold", "customer_postal_code", "customer_state"]

sales_df = (
    sales_df
    # Keep only the columns needed downstream, under their clearer names.
    .select([col(source).alias(target) for source, target in column_aliases])
    # Discard rows missing any required field.
    .dropna(subset=required_fields)
    # Both measures are used in floating-point arithmetic later on.
    .withColumn("sale_amount", col("sale_amount").cast(DoubleType()))
    .withColumn("quantity_sold", col("quantity_sold").cast(DoubleType()))
    # Keep only lines with a positive quantity and a positive amount.
    # NOTE(review): the order status column is not used here, so cancelled or
    # returned orders are excluded only if they fail these two checks.
    .filter(col("quantity_sold") > 0)
    .filter(col("sale_amount") > 0)
)
print("Data cleaning completed")

Data cleaning completed


In [66]:
# Simulated per-unit weight and package dimensions for each product category.
# The source dataset has no physical measurements, so these are stand-in values.
category_dimensions_weights = {
    "Set": {"weight_kg_per_unit": 0.5, "length_cm": 20, "width_cm": 15, "height_cm": 5},
    "kurta": {"weight_kg_per_unit": 0.3, "length_cm": 25, "width_cm": 18, "height_cm": 3},
    "Western Dress": {"weight_kg_per_unit": 0.4, "length_cm": 30, "width_cm": 20, "height_cm": 4},
    "Top": {"weight_kg_per_unit": 0.2, "length_cm": 20, "width_cm": 15, "height_cm": 2},
    "Blouse": {"weight_kg_per_unit": 0.15, "length_cm": 18, "width_cm": 12, "height_cm": 2},
    "Bottom": {"weight_kg_per_unit": 0.25, "length_cm": 22, "width_cm": 16, "height_cm": 3},
    "Saree": {"weight_kg_per_unit": 0.8, "length_cm": 35, "width_cm": 25, "height_cm": 8},
    "Ethnic Dress": {"weight_kg_per_unit": 0.6, "length_cm": 30, "width_cm": 20, "height_cm": 6},
    "T-Shirt": {"weight_kg_per_unit": 0.1, "length_cm": 18, "width_cm": 12, "height_cm": 1.5},
}

# Fallback measurements for categories that are absent from the lookup above.
default_weight = 0.2
default_length = 20
default_width = 15
default_height = 3

# Turn the lookup into a small DataFrame so it can be joined onto the sales rows.
# The three dimensions are converted to float so each column has a single type.
dimensions_df = spark.createDataFrame(
    [
        (
            category,
            dims["weight_kg_per_unit"],
            float(dims["length_cm"]),
            float(dims["width_cm"]),
            float(dims["height_cm"]),
        )
        for category, dims in category_dimensions_weights.items()
    ],
    ["product_category", "weight_kg_per_unit", "length_cm", "width_cm", "height_cm"],
)

sales_with_dims_df = (
    sales_df
    # Left join keeps every sales row, including those whose category is not in the lookup.
    .join(dimensions_df, on="product_category", how="left")
    # Unmatched categories come back null; give them the default measurements.
    .fillna({
        "weight_kg_per_unit": default_weight,
        "length_cm": default_length,
        "width_cm": default_width,
        "height_cm": default_height
    })
    # Scale the per-unit weight and box volume by the quantity on the order line.
    .withColumn("total_weight_kg", col("weight_kg_per_unit") * col("quantity_sold"))
    .withColumn("total_volume_cm3", col("length_cm") * col("width_cm") * col("height_cm") * col("quantity_sold"))
)

print("\nSales data with simulated dimensions and calculated total weight/volume:")
sales_with_dims_df.show(5)



Sales data with simulated dimensions and calculated total weight/volume:
+----------------+------------+-----------+-------------+-------------+--------------+--------------------+----------------+----------------------+------------------+------------------+---------+--------+---------+---------------+----------------+
|product_category| product_sku|sale_amount|quantity_sold|customer_city|customer_state|customer_postal_code|customer_country|shipping_service_level|fulfillment_method|weight_kg_per_unit|length_cm|width_cm|height_cm|total_weight_kg|total_volume_cm3|
+----------------+------------+-----------+-------------+-------------+--------------+--------------------+----------------+----------------------+------------------+------------------+---------+--------+---------+---------------+----------------+
|    Ethnic Dress|J0211-DR-XXL|      699.0|          1.0|GREATER NOIDA| UTTAR PRADESH|            201306.0|              IN|             Expedited|            Amazon|               0

In [67]:
# Define simple zone logic: map each customer postal code to a shipping zone by its prefix (no warehouse location is modeled)
# AI-Assisted: Simplified zone mapping function
def get_shipping_zone(postal_code):
    """Map a postal code to a shipping zone label based on its leading digits.

    Args:
        postal_code: The postal code as a number (e.g. ``600073.0``) or a
            numeric string. ``None`` is allowed.

    Returns:
        "ZONE_A" to "ZONE_D" when the integer part of the code starts with one
        of the listed prefixes, "ZONE_E" for any other valid code, and
        "UNKNOWN" when the code is ``None``, NaN, infinite or not numeric.
    """
    if postal_code is None:
        return "UNKNOWN"
    try:
        # The column is read as a double, so drop the fractional part before
        # inspecting digits. NaN raises ValueError and infinity OverflowError
        # here; treat both, and non-numeric text, as an unknown zone instead
        # of failing the whole Spark job.
        s_pc = str(int(float(postal_code)))
    except (TypeError, ValueError, OverflowError):
        return "UNKNOWN"

    if s_pc.startswith(("400", "410")):
        return "ZONE_A"
    if s_pc.startswith("560"):
        return "ZONE_B"
    if s_pc.startswith(("600", "605")):
        return "ZONE_C"
    if s_pc.startswith("110"):
        return "ZONE_D"
    return "ZONE_E"

# Wrap the Python zone lookup as a Spark UDF that returns a string.
get_shipping_zone_udf = udf(get_shipping_zone, StringType())

# Add a shipping_zone column computed from each row's postal code.
zone_column = get_shipping_zone_udf(col("customer_postal_code"))
sales_with_zones_df = sales_with_dims_df.withColumn("shipping_zone", zone_column)

print("\nSales data with assigned shipping zones:")
sales_with_zones_df.show(5)




Sales data with assigned shipping zones:
+----------------+---------------+-----------+-------------+-------------+--------------+--------------------+----------------+----------------------+------------------+------------------+---------+--------+---------+---------------+----------------+-------------+
|product_category|    product_sku|sale_amount|quantity_sold|customer_city|customer_state|customer_postal_code|customer_country|shipping_service_level|fulfillment_method|weight_kg_per_unit|length_cm|width_cm|height_cm|total_weight_kg|total_volume_cm3|shipping_zone|
+----------------+---------------+-----------+-------------+-------------+--------------+--------------------+----------------+----------------------+------------------+------------------+---------+--------+---------+---------------+----------------+-------------+
|             Top|JNE3671-TU-XXXL|      574.0|          1.0|      CHENNAI|    TAMIL NADU|            600073.0|              IN|             Expedited|            A

In [68]:
# Implementation of Shipping Cost Estimation Logic
# AI-Assisted: Rule-based shipping cost function
# This function calculates a base shipping cost based on weight, volume, zone, and service level.
def estimate_base_shipping_cost(total_weight_kg, total_volume_cm3, shipping_zone, shipping_service_level):
    """Estimate the base shipping cost of one order line from fixed rules.

    The cost is the sum of three parts:
      * the larger of a weight charge (Rs. 50 per kg) and a volume charge
        (Rs. 0.005 per cm3),
      * a flat surcharge selected by ``shipping_zone`` (250 for any zone that
        is not listed),
      * a flat surcharge selected by ``shipping_service_level`` (50 for any
        level other than "Expedited" or "Standard").

    ``None`` inputs fall back to 0.0 for weight and volume, "UNKNOWN" for the
    zone and "Standard" for the service level.

    Returns:
        The total cost rounded to two decimal places.
    """
    zone_surcharges = {"ZONE_A": 50, "ZONE_B": 100, "ZONE_C": 120, "ZONE_D": 180}
    service_surcharges = {"Expedited": 100, "Standard": 0}

    # Replace missing inputs with neutral defaults before doing any arithmetic.
    if total_weight_kg is None:
        total_weight_kg = 0.0
    if total_volume_cm3 is None:
        total_volume_cm3 = 0.0
    if shipping_zone is None:
        shipping_zone = "UNKNOWN"
    if shipping_service_level is None:
        shipping_service_level = "Standard"

    # Charge for whichever of weight or volume is more expensive.
    size_charge = max(total_weight_kg * 50, total_volume_cm3 * 0.005)

    total = 0.0
    total += size_charge
    total += zone_surcharges.get(shipping_zone, 250)
    total += service_surcharges.get(shipping_service_level, 50)

    # Use the builtin explicitly: the notebook imports pyspark's `round`, which
    # shadows the builtin name at module level.
    return builtins.round(total, 2)

# Wrap the rule-based cost function as a Spark UDF that returns a double.
estimate_base_shipping_cost_udf = udf(estimate_base_shipping_cost, DoubleType())

# Column arguments are passed in the same order as the function's parameters.
base_cost_column = estimate_base_shipping_cost_udf(
    col("total_weight_kg"),
    col("total_volume_cm3"),
    col("shipping_zone"),
    col("shipping_service_level"),
)
sales_with_base_shipping_df = sales_with_zones_df.withColumn("base_shipping_cost", base_cost_column)

print("\nSales data with estimated base shipping costs:")
sales_with_base_shipping_df.show(5)


Sales data with estimated base shipping costs:
+----------------+---------------+-----------+-------------+-------------+--------------+--------------------+----------------+----------------------+------------------+------------------+---------+--------+---------+---------------+----------------+-------------+------------------+
|product_category|    product_sku|sale_amount|quantity_sold|customer_city|customer_state|customer_postal_code|customer_country|shipping_service_level|fulfillment_method|weight_kg_per_unit|length_cm|width_cm|height_cm|total_weight_kg|total_volume_cm3|shipping_zone|base_shipping_cost|
+----------------+---------------+-----------+-------------+-------------+--------------+--------------------+----------------+----------------------+------------------+------------------+---------+--------+---------+---------------+----------------+-------------+------------------+
|             Top|JNE3671-TU-XXXL|      574.0|          1.0|      CHENNAI|    TAMIL NADU|           

In [69]:
# Apply Margin-Aware Logic
# The dataset has no cost data, so each order line is given a simulated gross
# margin: a base share of the sale amount plus a random extra share.
SIMULATED_MARGIN_BASE = 0.2     # lowest simulated margin, as a fraction of sale_amount
SIMULATED_MARGIN_SPREAD = 0.2   # width of the random range added on top of the base
# Seeding rand() keeps the simulated margins, and therefore the free-shipping
# decisions, repeatable between runs instead of changing on every execution.
# NOTE(review): results can still differ if the input partitioning changes.
MARGIN_RANDOM_SEED = 42

# If gross profit margin is at or above this percentage, offer free shipping.
FREE_SHIPPING_MARGIN_THRESHOLD = 35.0

sales_with_margin_df = sales_with_base_shipping_df.withColumn(
    "product_cost",
    # Cost is the sale amount minus the simulated margin (20-40% with the values above).
    col("sale_amount") * (1 - (SIMULATED_MARGIN_BASE + rand(seed=MARGIN_RANDOM_SEED) * SIMULATED_MARGIN_SPREAD))
).withColumn(
    "gross_profit",
    col("sale_amount") - col("product_cost")
).withColumn(
    "gross_profit_margin_pct",
    (col("gross_profit") / col("sale_amount")) * 100
)

# Margin-aware rule: high-margin lines ship free, all others pay the base cost.
final_shipping_estimates_df = sales_with_margin_df.withColumn(
    "estimated_shipping_cost",
    when(col("gross_profit_margin_pct") >= FREE_SHIPPING_MARGIN_THRESHOLD, lit(0.0))
    .otherwise(col("base_shipping_cost"))
)

print("\nFinal shipping estimates with margin-aware logic (e.g., free shipping):")
final_shipping_estimates_df.select(
    "product_sku",
    "product_category",
    "sale_amount",
    "quantity_sold",
    "total_weight_kg",
    "total_volume_cm3",
    "customer_postal_code",
    "shipping_zone",
    "shipping_service_level",
    "base_shipping_cost",
    "product_cost",
    "gross_profit",
    "gross_profit_margin_pct",
    "estimated_shipping_cost"
).show(5, truncate=False)


Final shipping estimates with margin-aware logic (e.g., free shipping):
+------------+----------------+-----------+-------------+---------------+----------------+--------------------+-------------+----------------------+------------------+-----------------+------------------+-----------------------+-----------------------+
|product_sku |product_category|sale_amount|quantity_sold|total_weight_kg|total_volume_cm3|customer_postal_code|shipping_zone|shipping_service_level|base_shipping_cost|product_cost     |gross_profit      |gross_profit_margin_pct|estimated_shipping_cost|
+------------+----------------+-----------+-------------+---------------+----------------+--------------------+-------------+----------------------+------------------+-----------------+------------------+-----------------------+-----------------------+
|J0211-DR-XXL|Ethnic Dress    |699.0      |1.0          |0.6            |3600.0          |201306.0            |ZONE_E       |Expedited             |380.0             |5

In [70]:
# Write Results to Parquet Format
# Columns kept in the persisted output, in the order they are written.
output_columns = [
    "product_sku",
    "product_category",
    "sale_amount",
    "quantity_sold",
    "total_weight_kg",
    "total_volume_cm3",
    "customer_city",
    "customer_state",
    "customer_postal_code",
    "shipping_zone",
    "shipping_service_level",
    "base_shipping_cost",
    "gross_profit_margin_pct",
    "estimated_shipping_cost",
]
output_df = final_shipping_estimates_df.select(*output_columns)

# "overwrite" replaces any output left at this path by a previous run.
output_df.write.mode("overwrite").parquet(PARQUET_OUTPUT_PATH)
print(f"\nShipping estimation results written to Parquet at: {PARQUET_OUTPUT_PATH}")

# Read the files back and show a few rows to confirm the write succeeded.
print("\nVerifying data by reading from Parquet:")
read_parquet_df = spark.read.parquet(PARQUET_OUTPUT_PATH)
read_parquet_df.show(5, truncate=False)

# Release the Spark resources now that the pipeline is finished.
spark.stop()
print("\nSparkSession stopped.")


Shipping estimation results written to Parquet at: output/shipping_estimates.parquet

Verifying data by reading from Parquet:
+------------+----------------+-----------+-------------+---------------+----------------+-------------+--------------+--------------------+-------------+----------------------+------------------+-----------------------+-----------------------+
|product_sku |product_category|sale_amount|quantity_sold|total_weight_kg|total_volume_cm3|customer_city|customer_state|customer_postal_code|shipping_zone|shipping_service_level|base_shipping_cost|gross_profit_margin_pct|estimated_shipping_cost|
+------------+----------------+-----------+-------------+---------------+----------------+-------------+--------------+--------------------+-------------+----------------------+------------------+-----------------------+-----------------------+
|J0211-DR-XXL|Ethnic Dress    |699.0      |1.0          |0.6            |3600.0          |GREATER NOIDA|UTTAR PRADESH |201306.0           