In [15]:
from work.pipeline.freshkart_io import (
    create_spark_session,
    read_customers,
    read_refunds,
    read_orders_for_date,
)

# Input location and order date, kept in one place so they can be changed
# without editing each call below.
DATA_DIR = "work/data"
ORDERS_DATE = "2025-03-01"  # adjust to the date you want to load

spark = create_spark_session()

# Load each input through the project's freshkart_io helpers.
customers_df = read_customers(spark, f"{DATA_DIR}/customers.csv")
refunds_df = read_refunds(spark, f"{DATA_DIR}/refunds.csv")
orders_df = read_orders_for_date(spark, DATA_DIR, ORDERS_DATE)

# Show the schema of each frame: customers, then orders, then refunds.
customers_df.printSchema()
orders_df.printSchema()
refunds_df.printSchema()


root
 |-- customer_id: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- city: string (nullable = true)
 |-- is_active: boolean (nullable = true)

root
 |-- channel: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- qty: long (nullable = true)
 |    |    |-- sku: string (nullable = true)
 |    |    |-- unit_price: double (nullable = true)
 |-- order_id: string (nullable = true)
 |-- payment_status: string (nullable = true)

root
 |-- refund_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- reason: string (nullable = true)
 |-- created_at: timestamp (nullable = true)



In [17]:
from pyspark.sql.functions import col

def preview_frame(title, frame, columns, n=5):
    """Print a banner, the row count, and the first rows of selected columns.

    Args:
        title: Label printed between the ``===`` markers.
        frame: Spark DataFrame to summarize.
        columns: Names of the columns to display.
        n: Number of rows to show (default 5).
    """
    print(f"=== {title} ===")
    print(f"Rows: {frame.count()}")
    # truncate=False prints full cell values instead of shortened ones.
    frame.select(*columns).show(n, truncate=False)


preview_frame("Customers", customers_df, ["customer_id", "city", "is_active"])
preview_frame(
    "Orders",
    orders_df,
    ["order_id", "customer_id", "payment_status", "channel", "created_at"],
)
preview_frame(
    "Refunds",
    refunds_df,
    ["refund_id", "order_id", "amount", "created_at"],
)


=== Customers ===
Rows: 800
+-----------+--------+---------+
|customer_id|city    |is_active|
+-----------+--------+---------+
|C0001      |Nantes  |true     |
|C0002      |Toulouse|true     |
|C0003      |Bordeaux|true     |
|C0004      |Bordeaux|true     |
|C0005      |Lyon    |true     |
+-----------+--------+---------+
only showing top 5 rows

=== Orders ===
Rows: 103
+-------------+-----------+--------------+-------+-------------------+
|order_id     |customer_id|payment_status|channel|created_at         |
+-------------+-----------+--------------+-------+-------------------+
|O202503010001|C0793      |paid          |app    |2025-03-01 20:36:44|
|O202503010002|C0676      |paid          |web    |2025-03-01 11:30:49|
|O202503010003|C0642      |paid          |web    |2025-03-01 07:27:00|
|O202503010004|C0283      |pending       |web    |2025-03-01 14:28:46|
|O202503010005|C0571      |paid          |web    |2025-03-01 22:29:42|
+-------------+-----------+--------------+-------+-------