In [0]:
%sql
-- Create the catalog that holds the logistics data and make it the session's
-- current catalog. IF NOT EXISTS keeps this cell safe to re-run: without it a
-- second execution fails because the catalog already exists.
CREATE CATALOG IF NOT EXISTS log_data;
USE CATALOG log_data;

In [0]:
%sql
-- Create the volume that stores the raw CSV files. The name is unqualified, so
-- it is created in the session's current catalog/schema; the Python cells read
-- from /Volumes/log_data/default/raw_vol, i.e. they assume that is
-- log_data.default -- TODO confirm the current schema is `default`.
-- IF NOT EXISTS keeps this cell safe to re-run.
CREATE VOLUME IF NOT EXISTS raw_vol;

In [0]:
# Load the three dimension files from the raw volume. The first line of each
# file is treated as the header; no schema inference is requested.
warehouses = spark.read.option("header", True).csv(
    "/Volumes/log_data/default/raw_vol/warehouses.csv"
)
carriers = spark.read.option("header", True).csv(
    "/Volumes/log_data/default/raw_vol/carriers.csv"
)
regions = spark.read.option("header", True).csv(
    "/Volumes/log_data/default/raw_vol/regions.csv"
)

# Pull each dimension's key column back to the driver as a plain Python list;
# the data-generation cell indexes into these lists to assign foreign keys.
warehouse_ids = [row["warehouse_id"] for row in warehouses.select("warehouse_id").collect()]
carrier_ids = [row["carrier_id"] for row in carriers.select("carrier_id").collect()]
region_ids = [row["region_id"] for row in regions.select("region_id").collect()]


In [0]:
# Show the row count of each dimension table: warehouses, carriers, regions.
for dimension_df in (warehouses, carriers, regions):
    display(dimension_df.count())

In [0]:
# Inspect the region IDs collected from regions.csv.
region_ids

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Load the existing daily shipments file, letting Spark infer column types
# from the data.
existing_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Volumes/log_data/default/raw_vol/shipments_daily.csv")
)

# Row count of the existing data, shown as the cell output.
existing_count = existing_df.count()
existing_count


In [0]:
# Number of synthetic shipment rows to generate and append to the existing data.
NEW_ROWS = 2_500_000



In [0]:
# Print how many existing shipments fall on each delivery_days value,
# ordered by delivery_days.
(
    existing_df
    .groupBy("delivery_days")
    .count()
    .orderBy("delivery_days")
    .show()
)


#### Data generation

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Build NEW_ROWS synthetic shipment rows on top of spark.range(), whose `id`
# column (0 .. NEW_ROWS-1) drives every deterministic column below.
# NOTE: every rand() call is unseeded, so the random columns differ on each run.
new_df = (
    spark.range(NEW_ROWS)

    # shipment_id: "SHP" followed by id + 10,000,000
    .withColumn(
        "shipment_id",
        concat(lit("SHP"), (col("id") + lit(10_000_000)))
    )

    # warehouse_id: cycle through the known warehouse IDs
    # (element_at is 1-based, hence the +1)
    .withColumn(
        "warehouse_id",
        element_at(
            array(*[lit(x) for x in warehouse_ids]),
            ((col("id") % len(warehouse_ids)) + 1).cast("int")
        )
    )

    # region_id: cycle through the known region IDs
    .withColumn(
        "region_id",
        element_at(
            array(*[lit(x) for x in region_ids]),
            ((col("id") % len(region_ids)) + 1).cast("int")
        )
    )

    # carrier_id: NULL for every 20th row (5%), otherwise cycle through the
    # known carrier IDs
    .withColumn(
        "carrier_id",
        when(col("id") % 20 == 0, lit(None))
        .otherwise(
            element_at(
                array(*[lit(x) for x in carrier_ids]),
                ((col("id") % len(carrier_ids)) + 1).cast("int")
            )
        )
    )

    # shipment_cost: uniform in [50, 5050), rounded to 2 decimals
    .withColumn("shipment_cost", round(rand() * 5000 + 50, 2))

    # delivery_status: constant for all generated rows
    .withColumn("delivery_status", lit("Delivered"))

    # is_fragile: "Y" for roughly 25% of rows
    .withColumn("is_fragile", when(rand() < 0.25, "Y").otherwise("N"))

    # shipment_date: 2023-01-01 plus (id % 365) days, i.e. spread across 2023
    .withColumn(
        "shipment_date",
        date_add(lit("2023-01-01"), ((col("id") % 365).cast("int")))
    )

    # delivery_days: random integer 1-7
    .withColumn(
        "delivery_days",
        (floor(rand() * 7) + 1).cast("int")
    )

    # delivery_date: shipment_date + delivery_days, so the two stay consistent
    .withColumn(
        "delivery_date",
        date_add(col("shipment_date"), col("delivery_days"))
    )

    # package_weight_kg: uniform in [0.5, 50.5), rounded to 2 decimals
    .withColumn("package_weight_kg", round(rand() * 50 + 0.5, 2))

    # declared_value_inr: uniform in [500, 500500), rounded to 2 decimals
    .withColumn("declared_value_inr", round(rand() * 500000 + 500, 2))

    # payment_type: roughly 65% "Prepaid", the rest "COD"
    .withColumn("payment_type", when(rand() < 0.65, "Prepaid").otherwise("COD"))

    # priority_level: draw ONE uniform value per row and bucket it with
    # cumulative thresholds -> 50% Low, 35% Medium, 15% High. Calling rand()
    # separately in each when() makes the draws independent, which skews the
    # split to 50% / 42.5% / 7.5%.
    .withColumn("_priority_draw", rand())
    .withColumn(
        "priority_level",
        when(col("_priority_draw") < 0.5, "Low")
        .when(col("_priority_draw") < 0.85, "Medium")
        .otherwise("High")
    )

    # created_ts: time at which the query is evaluated
    .withColumn("created_ts", current_timestamp())

    # Drop the helper columns: the spark.range() id and the priority draw
    .drop("id", "_priority_draw")
)


In [0]:
# Count the generated rows (expected to equal NEW_ROWS, since the frame is
# built from spark.range(NEW_ROWS) with no filtering).
new_df.count()

In [0]:
# Preview the generated rows.
display(new_df)

In [0]:
# Show how many columns were generated, then list their names.
generated_columns = new_df.columns
display(len(generated_columns))
display(generated_columns)

In [0]:
# Project the generated rows onto exactly the columns of the existing file,
# in the same order.
new_df = new_df.select(*existing_df.columns)

# Append the generated rows to the existing data, matching columns by name.
final_df = existing_df.unionByName(new_df)


###### Save it as a new CSV file


In [0]:
# Write the combined data as CSV with a header row. coalesce(1) reduces the
# frame to one partition before writing; "overwrite" replaces any previous
# output at this path.
(
    final_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv("/Volumes/log_data/default/raw_vol/shipments_daily_final")
)


In [0]:
# Read back the combined data set written by the previous cell. Point at the
# output directory rather than a specific part file: the part file's name
# contains a per-write identifier, so it changes every time the write cell is
# re-run and a hard-coded file name stops existing after the next overwrite.
df_clean1 = spark.read.csv(
    "/Volumes/log_data/default/raw_vol/shipments_daily_final",
    header=True,
)

In [0]:
# Show the number of rows read back from the combined CSV output.
display(df_clean1.count())

In [0]:
def _distinct_non_null(df, key_column):
    """Return a one-column DataFrame of the distinct non-null values of `key_column` in `df`."""
    return (
        df
        .select(key_column)
        .filter(col(key_column).isNotNull())
        .distinct()
    )


# Distinct foreign-key values actually present in the shipments data,
# one DataFrame per dimension.
fact_warehouse_keys = _distinct_non_null(df_clean1, "warehouse_id")
fact_carrier_keys = _distinct_non_null(df_clean1, "carrier_id")
fact_region_keys = _distinct_non_null(df_clean1, "region_id")


In [0]:
# Show the distinct non-null warehouse IDs found in the shipments data.
display(fact_warehouse_keys)

In [0]:
# Show the distinct non-null carrier IDs found in the shipments data.
display(fact_carrier_keys)


In [0]:
# Show the distinct non-null region IDs found in the shipments data.
display(fact_region_keys)