In [0]:
from pyspark.sql.functions import *


In [0]:
# Root S3 prefix under which the retailer source CSV files are read.
# Kept exactly as before (with its trailing "/") for anything else that uses it.
base_path = "s3://dhikshith-source-files/retailer_data_sources/"

# base_path already ends with "/", so joining with another "/" produced
# ".../retailer_data_sources//<file>.csv". Strip the trailing separator once
# and build every file path from the normalized root.
_source_root = base_path.rstrip("/")

customer_path = f"{_source_root}/customer_data.csv"
product_path  = f"{_source_root}/product_data.csv"
sales_path    = f"{_source_root}/sales_returns_data.csv"
payment_path  = f"{_source_root}/card_payment_refund_data.csv"


In [0]:
from pyspark.sql.types import *

# Load the customer CSV using its header row for column names. Schema
# inference is disabled, so the columns are typed explicitly below.
customer_reader = spark.read.option("header", "true").option("inferSchema", "false")
df_customer = customer_reader.csv(customer_path)

display(df_customer)

# Target type for each customer column; casts are applied in this order.
customer_column_types = {
    "customer_id": StringType(),
    "customer_name": StringType(),
    "email": StringType(),
    "phone": StringType(),
    "gender": StringType(),
    "age": IntegerType(),
    "region": StringType(),
    "city": StringType(),
    "signup_date": DateType(),
}

df_customer_clean = df_customer
for column_name, column_type in customer_column_types.items():
    df_customer_clean = df_customer_clean.withColumn(
        column_name, col(column_name).cast(column_type)
    )

# Add the ingestion timestamp column after all casts.
df_customer_clean = df_customer_clean.withColumn("ingestion_ts", current_timestamp())

# Overwrite the bronze Delta table, allowing the table schema to be replaced.
(
    df_customer_clean.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("retailer_sales.bronze.customer_data_raw")
)




In [0]:
from pyspark.sql.types import *

# Load the product CSV using its header row for column names. Schema
# inference is disabled, so the columns are typed explicitly below.
product_reader = spark.read.option("header", "true").option("inferSchema", "false")
df_product = product_reader.csv(product_path)

display(df_product)

# Target type for each product column; casts are applied in this order.
product_column_types = {
    "product_id": StringType(),
    "product_name": StringType(),
    "sku": StringType(),
    "category": StringType(),
    "supplier": StringType(),
    "cost_price": DoubleType(),
    "selling_price": DoubleType(),
}

df_product_clean = df_product
for column_name, column_type in product_column_types.items():
    df_product_clean = df_product_clean.withColumn(
        column_name, col(column_name).cast(column_type)
    )

# Add the ingestion timestamp column after all casts.
df_product_clean = df_product_clean.withColumn("ingestion_ts", current_timestamp())

# Overwrite the bronze Delta table, allowing the table schema to be replaced.
(
    df_product_clean.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("retailer_sales.bronze.product_data_raw")
)



In [0]:
from pyspark.sql.types import *

# Load the sales/returns CSV using its header row for column names. Schema
# inference is disabled, so the columns are typed explicitly below.
sales_reader = spark.read.option("header", "true").option("inferSchema", "false")
df_sales = sales_reader.csv(sales_path)

display(df_sales)

# Replacement expression for each sales column, applied in this order.
# order_date is parsed with an explicit pattern; the rest are plain casts.
sales_column_exprs = {
    "order_id": col("order_id").cast(StringType()),
    "order_date": to_date(col("order_date"), "yyyy-MM-dd"),
    "customer_id": col("customer_id").cast(StringType()),
    "product_id": col("product_id").cast(StringType()),
    "quantity": col("quantity").cast(IntegerType()),
    "unit_price": col("unit_price").cast(DoubleType()),
    "total_amount": col("total_amount").cast(DoubleType()),
    "order_status": col("order_status").cast(StringType()),
    "return_flag": col("return_flag").cast(StringType()),  # Y / N
    "region": col("region").cast(StringType()),
    "payment_mode": col("payment_mode").cast(StringType()),
    "ingestion_ts": current_timestamp(),
}

df_sales_clean = df_sales
for column_name, column_expr in sales_column_exprs.items():
    df_sales_clean = df_sales_clean.withColumn(column_name, column_expr)

# Overwrite the bronze Delta table, allowing the table schema to be replaced.
(
    df_sales_clean.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("retailer_sales.bronze.sales_returns_raw")
)



In [0]:
from pyspark.sql.types import *

# Load the card payment/refund CSV using its header row for column names.
# Schema inference is disabled, so the columns are typed explicitly below.
payment_reader = spark.read.option("header", "true").option("inferSchema", "false")
df_payment = payment_reader.csv(payment_path)

display(df_payment)

# Replacement expression for each payment column, applied in this order.
# transaction_date is parsed with an explicit pattern; the rest are plain casts.
payment_column_exprs = {
    "transaction_id": col("transaction_id").cast(StringType()),
    "order_id": col("order_id").cast(StringType()),
    "transaction_type": col("transaction_type").cast(StringType()),  # PAYMENT / REFUND
    "amount": col("amount").cast(DoubleType()),
    "transaction_date": to_date(col("transaction_date"), "yyyy-MM-dd"),
    "payment_mode": col("payment_mode").cast(StringType()),
    "ingestion_ts": current_timestamp(),
}

df_payment_clean = df_payment
for column_name, column_expr in payment_column_exprs.items():
    df_payment_clean = df_payment_clean.withColumn(column_name, column_expr)

# Overwrite the bronze Delta table, allowing the table schema to be replaced.
(
    df_payment_clean.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("retailer_sales.bronze.card_payment_refund_raw")
)



In [0]:
# Print the row count of each bronze table this notebook writes, in write order.
bronze_tables = (
    "retailer_sales.bronze.customer_data_raw",
    "retailer_sales.bronze.product_data_raw",
    "retailer_sales.bronze.sales_returns_raw",
    "retailer_sales.bronze.card_payment_refund_raw",
)

for bronze_table in bronze_tables:
    spark.sql(f"SELECT COUNT(*) FROM {bronze_table}").show()
