In [74]:
from pyspark.sql.functions import *

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 82, Finished, Available, Finished)

### **Silver Layer Transformations for <mark>Customer</mark>**

In [75]:
customer_df = spark.read.format("csv").option("header","true").load("Files/customer_data_bronze/customers.csv")
#display(customer_df)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 83, Finished, Available, Finished)

In [76]:
customer_df.printSchema

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 84, Finished, Available, Finished)

<bound method DataFrame.printSchema of DataFrame[CustomerID: string, CustomerName: string, Email: string, Location: string, SignupDate: string]>

###### **Convert SignupDate to a DateType**

In [77]:
customer_df = customer_df.withColumn("SignupDate", to_date("SignupDate", "yyyy-MM-dd"))

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 85, Finished, Available, Finished)

###### **Derive Useful Date Columns**

In [78]:

customer_df = customer_df \
    .withColumn("SignupYear", year("SignupDate")) \
    .withColumn("SignupMonth", month("SignupDate")) \
    .withColumn("SignupDay", dayofmonth("SignupDate")) \
    .withColumn("SignupQuarter", quarter("SignupDate")) \
    .withColumn("CustomerTenureDays", datediff(current_date(), "SignupDate"))


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 86, Finished, Available, Finished)

###### **categorize customers by when they joined.**

In [79]:

customer_df = customer_df.withColumn(
    "CustomerType",
    when(col("SignupDate") >= "2025-01-01", "New")
    .otherwise("Existing")
)


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 87, Finished, Available, Finished)

In [80]:
customer_df.select(
    "CustomerID", "SignupDate", "SignupYear",
    "SignupMonth", "SignupQuarter",
    "CustomerTenureDays", "CustomerType"
).show()

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 88, Finished, Available, Finished)

+----------+----------+----------+-----------+-------------+------------------+------------+
|CustomerID|SignupDate|SignupYear|SignupMonth|SignupQuarter|CustomerTenureDays|CustomerType|
+----------+----------+----------+-----------+-------------+------------------+------------+
|   CUST100|2023-01-10|      2023|          1|            1|              1038|    Existing|
|   CUST101|2023-09-14|      2023|          9|            3|               791|    Existing|
|   CUST102|2024-09-20|      2024|          9|            3|               419|    Existing|
|   CUST103|2022-12-02|      2022|         12|            4|              1077|    Existing|
|   CUST104|2023-05-07|      2023|          5|            2|               921|    Existing|
|   CUST105|2022-06-04|      2022|          6|            2|              1258|    Existing|
|   CUST106|2024-07-15|      2024|          7|            3|               486|    Existing|
|   CUST107|2022-09-03|      2022|          9|            3|          

In [81]:
customer_df.groupBy("CustomerID") \
    .agg(count("*").alias("count")) \
    .filter("count > 1") \
    .show()

customer_df.groupBy("Email") \
    .agg(count("*").alias("count")) \
    .filter("count > 1") \
    .show()

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 89, Finished, Available, Finished)

+----------+-----+
|CustomerID|count|
+----------+-----+
+----------+-----+

+-----+-----+
|Email|count|
+-----+-----+
+-----+-----+



###### **Remove Duplicates**

In [82]:
#df_silver = df_bronze.dropDuplicates(["CustomerID", "Email"])

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 90, Finished, Available, Finished)

###### **Trim Whitespaces & Standardize Case**

In [83]:

#customer_df = customer_df.withColumn("CustomerName", trim(df_silver["CustomerName"])) \
                     #.withColumn("Location", trim(df_silver["Location"])) \
                     #.withColumn("Email", trim(df_silver["Email"])) \
                     #.withColumn("CustomerID", upper(df_silver["CustomerID"]))


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 91, Finished, Available, Finished)

###### **Check Invalid Emails**

In [84]:
invalid_emails_df = customer_df.filter(~col("Email").rlike("^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$"))
invalid_emails_df.show(truncate=False)

invalid_count = invalid_emails_df.count()
print(f"Found {invalid_count} invalid email(s).")

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 92, Finished, Available, Finished)

+----------+------------+-----+--------+----------+----------+-----------+---------+-------------+------------------+------------+
|CustomerID|CustomerName|Email|Location|SignupDate|SignupYear|SignupMonth|SignupDay|SignupQuarter|CustomerTenureDays|CustomerType|
+----------+------------+-----+--------+----------+----------+-----------+---------+-------------+------------------+------------+
+----------+------------+-----+--------+----------+----------+-----------+---------+-------------+------------------+------------+

Found 0 invalid email(s).


###### **Handle Invalid Emails**

In [85]:
#customer_df = customer_df.withColumn(
    #"Email",
    #when(col("Email").rlike("^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$"), col("Email"))
    #.otherwise(lit("unknown@example.com"))
#)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 93, Finished, Available, Finished)

###### **Extract email domains**

In [86]:
customer_df = customer_df.withColumn("EmailDomain", split(customer_df["Email"], "@")[1])

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 94, Finished, Available, Finished)

In [87]:
#display(customer_df)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 95, Finished, Available, Finished)

### **Silver Layer Transformations for <mark>Orders</mark>**

In [88]:
orders_df = spark.read.format("csv").option("header","true").load("Files/Orders_data_bronze/orders.csv")
#display(orders_df)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 96, Finished, Available, Finished)

In [89]:
orders_df.printSchema

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 97, Finished, Available, Finished)

<bound method DataFrame.printSchema of DataFrame[OrderID: string, OrderDate: string, CustomerID: string, ProductID: string, Quantity: string, TotalAmount: string, PaymentMethod: string]>

###### **Data Type Standardization**

In [90]:
orders_df = orders_df.withColumn("OrderDate", to_timestamp("OrderDate", "yyyy-MM-dd HH:mm:ss.SSSSSSSSS"))

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 98, Finished, Available, Finished)

In [91]:
order_df = orders_df \
    .withColumn("Quantity", col("Quantity").cast("int")) \
    .withColumn("TotalAmount", col("TotalAmount").cast("double"))

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 99, Finished, Available, Finished)

###### **Check Duplicates**

In [92]:
orders_df.groupBy("OrderID") \
    .agg(count("*").alias("count")) \
    .filter("count > 1") \
    .show()


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 100, Finished, Available, Finished)

+-------+-----+
|OrderID|count|
+-------+-----+
+-------+-----+



In [93]:
# Sometimes the same customer + product + order date can appear multiple times due to duplicate ingestion.
orders_df.groupBy("CustomerID", "ProductID", "OrderDate") \
    .agg(count("*").alias("count")) \
    .filter("count > 1") \
    .show()


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 101, Finished, Available, Finished)

+----------+---------+---------+-----+
|CustomerID|ProductID|OrderDate|count|
+----------+---------+---------+-----+
+----------+---------+---------+-----+



In [94]:
# Payment Method Consistency Check
orders_df.groupBy("OrderID") \
    .agg(countDistinct("PaymentMethod").alias("distinct_payments")) \
    .filter("distinct_payments > 1") \
    .show()


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 102, Finished, Available, Finished)

+-------+-----------------+
|OrderID|distinct_payments|
+-------+-----------------+
+-------+-----------------+



In [95]:
# Amount or Quantity Outliers: check invalid quantity or amount (e.g., negative, zero)
orders_df.filter("Quantity <= 0 OR TotalAmount <= 0").show()

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 103, Finished, Available, Finished)

+-------+---------+----------+---------+--------+-----------+-------------+
|OrderID|OrderDate|CustomerID|ProductID|Quantity|TotalAmount|PaymentMethod|
+-------+---------+----------+---------+--------+-----------+-------------+
+-------+---------+----------+---------+--------+-----------+-------------+



In [96]:
# Invalid or Future order date:
orders_df.filter(to_date(col("OrderDate")) > current_date()).show()

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 104, Finished, Available, Finished)

+-------+---------+----------+---------+--------+-----------+-------------+
|OrderID|OrderDate|CustomerID|ProductID|Quantity|TotalAmount|PaymentMethod|
+-------+---------+----------+---------+--------+-----------+-------------+
+-------+---------+----------+---------+--------+-----------+-------------+



###### **Extract Date Parts: helps with date and time based analysis**

In [97]:
orders_df = orders_df \
    .withColumn("OrderYear", year("OrderDate")) \
    .withColumn("OrderMonth", month("OrderDate")) \
    .withColumn("OrderDay", dayofmonth("OrderDate")) \
    .withColumn("OrderQuarter", quarter("OrderDate")) \
    .withColumn("OrderWeekday", date_format("OrderDate", "E"))

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 105, Finished, Available, Finished)

In [98]:
# Adds an average item price per order for product-level insights.
orders_df = orders_df \
    .withColumn("AvgItemPrice", round(col("TotalAmount") / col("Quantity"), 2))


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 106, Finished, Available, Finished)

In [99]:
# Classify Order Value
orders_df = orders_df.withColumn(
    "OrderValueCategory",
    when(col("TotalAmount") < 100, "Low") \
    .when((col("TotalAmount") >= 100) & (col("TotalAmount") < 300), "Medium") \
    .otherwise("High")
)


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 107, Finished, Available, Finished)

In [100]:
# Standardize Payment Method Names
orders_df = orders_df.withColumn("PaymentMethod", initcap(lower(col("PaymentMethod"))))

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 108, Finished, Available, Finished)

In [101]:
# Calculate Daily Customer Order Frequency
from pyspark.sql.window import Window

window_spec = Window.partitionBy("CustomerID", "OrderYear", "OrderMonth", "OrderDay")

orders_df = orders_df.withColumn("OrdersPerDay", count("OrderID").over(window_spec))


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 109, Finished, Available, Finished)

In [102]:
#display(orders_df)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 110, Finished, Available, Finished)

### **Silver Layer Transformations for <mark>Products</mark>**

In [103]:
products_df = spark.read.format("csv").option("header","true").load("Files/products_data_bronze/products.csv")
#display(products_df)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 111, Finished, Available, Finished)

In [104]:
products_df.printSchema()

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 112, Finished, Available, Finished)

root
 |-- ProductID: string (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Stock: string (nullable = true)
 |-- UnitPrice: string (nullable = true)



###### **Schema enforcement, trim text & canonicalize category**

In [105]:
products_df = products_df \
    .withColumn("ProductID", trim(col("ProductID"))) \
    .withColumn("ProductName", trim(col("ProductName"))) \
    .withColumn("Category", initcap(trim(col("Category")))) \
    .withColumn("Stock", col("Stock").cast("int")) \
    .withColumn("UnitPrice", col("UnitPrice").cast("double"))


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 113, Finished, Available, Finished)

###### **Data validation & null / bad-value handling**

In [106]:
# mark invalids instead of dropping (so you can review)
products_df = products_df.withColumn(
    "is_valid",
    (col("Stock").isNotNull()) &
    (col("UnitPrice").isNotNull()) &
    (col("Stock") >= 0) &
    (col("UnitPrice") > 0)
)


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 114, Finished, Available, Finished)

###### **Inventory value (Stock × UnitPrice)**

In [107]:
# Gives the monetary exposure per product (useful for inventory valuation and prioritization).
products_df = products_df.withColumn(
    "InventoryValue",
    round(col("Stock") * col("UnitPrice"), 2)
)


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 115, Finished, Available, Finished)

###### **Price buckets using quantiles**

In [108]:
# compute quantiles (approxQuantile is fast and scalable)
quantiles = products_df.approxQuantile("UnitPrice", [0.33, 0.66], 0.01)
q1, q2 = quantiles[0], quantiles[1]

products_df = products_df.withColumn(
    "PriceBucket",
    when(col("UnitPrice") <= q1, "Low")
     .when((col("UnitPrice") > q1) & (col("UnitPrice") <= q2), "Medium")
     .otherwise("High")
)


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 116, Finished, Available, Finished)

In [109]:
#display(products_df)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 117, Finished, Available, Finished)

###### **Perform INNER JOIN**

In [110]:
orders_enriched_df = orders_df \
    .join(customer_df, on="CustomerID", how="left") \
    .join(products_df, on="ProductID", how="left")

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 118, Finished, Available, Finished)

In [111]:
orders_enriched_df = orders_enriched_df.withColumn(
    "customer_missing", col("CustomerName").isNull()
).withColumn(
    "product_missing", col("ProductName").isNull()
)


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 119, Finished, Available, Finished)

In [112]:
print("Total orders:", orders_enriched_df.count())  # Should be 3000
print("Distinct customers:", orders_enriched_df.select("CustomerID").distinct().count())  # 100


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 120, Finished, Available, Finished)

Total orders: 3000
Distinct customers: 100


In [113]:
orders_enriched_df = orders_enriched_df.withColumn("Quantity", col("Quantity").cast("integer")) \
                                       .withColumn("TotalAmount", col("TotalAmount").cast("double"))

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 121, Finished, Available, Finished)

In [114]:
#display(orders_enriched_df)
orders_enriched_df.printSchema()

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 122, Finished, Available, Finished)

root
 |-- ProductID: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- OrderID: string (nullable = true)
 |-- OrderDate: timestamp (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- PaymentMethod: string (nullable = true)
 |-- OrderYear: integer (nullable = true)
 |-- OrderMonth: integer (nullable = true)
 |-- OrderDay: integer (nullable = true)
 |-- OrderQuarter: integer (nullable = true)
 |-- OrderWeekday: string (nullable = true)
 |-- AvgItemPrice: double (nullable = true)
 |-- OrderValueCategory: string (nullable = false)
 |-- OrdersPerDay: long (nullable = false)
 |-- CustomerName: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- SignupDate: date (nullable = true)
 |-- SignupYear: integer (nullable = true)
 |-- SignupMonth: integer (nullable = true)
 |-- SignupDay: integer (nullable = true)
 |-- SignupQuarter: integer (nullable = true)
 |-- Custom

In [115]:
orders_enriched_df = orders_enriched_df.select(
    # Orders info
    "OrderID", "OrderDate", "CustomerID", "ProductID", "Quantity", "TotalAmount", "PaymentMethod",
    "OrderYear", "OrderMonth", "OrderDay", "OrderQuarter", "OrderWeekday",
    "AvgItemPrice", "OrderValueCategory", "OrdersPerDay",
    
    # Customer info
    "CustomerName", "Email", "Location", "SignupDate", "SignupYear", "SignupMonth", "SignupDay",
    "SignupQuarter", "CustomerTenureDays", "CustomerType", "EmailDomain",
    
    # Product info
    "ProductName", "Category", "Stock", "UnitPrice", "is_valid", "InventoryValue", "PriceBucket",
    
    # Flags for missing data
    "customer_missing", "product_missing"
)


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 123, Finished, Available, Finished)

In [116]:
orders_enriched_df.groupBy("customer_missing").count().show()
orders_enriched_df.groupBy("product_missing").count().show()


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 124, Finished, Available, Finished)

+----------------+-----+
|customer_missing|count|
+----------------+-----+
|           false| 3000|
+----------------+-----+

+---------------+-----+
|product_missing|count|
+---------------+-----+
|           true|   61|
|          false| 2939|
+---------------+-----+



In [117]:
# Details of missing product
orders_enriched_df.filter(col("product_missing") == True).select(
    "OrderID", "CustomerID", "ProductID", "OrderDate"
).show(truncate=False)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 125, Finished, Available, Finished)

+-------+----------+---------+--------------------------+
|OrderID|CustomerID|ProductID|OrderDate                 |
+-------+----------+---------+--------------------------+
|ORD1061|CUST138   |PROD250  |2022-01-23 06:32:17.445815|
|ORD1114|CUST111   |PROD250  |2022-02-11 14:58:22.767589|
|ORD1173|CUST105   |PROD250  |2022-03-05 03:59:07.182394|
|ORD1246|CUST106   |PROD250  |2022-03-31 19:40:42.814271|
|ORD1258|CUST135   |PROD250  |2022-04-05 04:50:01.000333|
|ORD1307|CUST154   |PROD250  |2022-04-23 02:13:00.260086|
|ORD1346|CUST110   |PROD250  |2022-05-07 07:58:14.364788|
|ORD1462|CUST119   |PROD250  |2022-06-18 16:28:10.163387|
|ORD1660|CUST174   |PROD250  |2022-08-29 23:31:40.233411|
|ORD1698|CUST188   |PROD250  |2022-09-12 20:31:07.822607|
|ORD1701|CUST132   |PROD250  |2022-09-13 22:48:27.369123|
|ORD1724|CUST172   |PROD250  |2022-09-22 08:21:17.225741|
|ORD1735|CUST142   |PROD250  |2022-09-26 08:44:48.896298|
|ORD1757|CUST118   |PROD250  |2022-10-04 09:31:52.237412|
|ORD1807|CUST1

In [118]:
orders_enriched_df.write.mode("overwrite").parquet("Files/customer_data_silver/customers_orders")

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 126, Finished, Available, Finished)

### **Silver Layer Transformations for <mark>Reviews</mark>**

In [119]:
reviews_df = spark.read.json("Files/Bronze_Reviews")
# display(reviews_df)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 127, Finished, Available, Finished)

In [120]:
# Convert timestamp: 
reviews_df = reviews_df.withColumn("review_date", from_unixtime(col("timestamp") / 1000))

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 128, Finished, Available, Finished)

In [121]:
# Extract Date Components
reviews_df = reviews_df.withColumn("review_year", year(col("review_date"))) \
                       .withColumn("review_month", month(col("review_date"))) \
                       .withColumn("review_day", dayofmonth(col("review_date"))) \
                       .withColumn("review_quarter", quarter(col("review_date"))) \
                       .withColumn("review_weekday", date_format(col("review_date"), "E"))


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 129, Finished, Available, Finished)

In [122]:
# Add sentiment length or word count: Analysts can identify short vs detailed reviews.
reviews_df = reviews_df.withColumn("review_length", length(col("review_text")))

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 130, Finished, Available, Finished)

In [123]:
# Catagorize Ratings
reviews_df = reviews_df.withColumn(
    "rating_category",
    when(col("rating") >= 4, "Positive")
    .when(col("rating") == 3, "Neutral")
    .otherwise("Negative")
)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 131, Finished, Available, Finished)

In [124]:
# Handle missing or invalid data: Helps to track incomplete review data.
reviews_df = reviews_df.withColumn(
    "is_valid",
    (~(col("customer_id").isNull() | col("product_id").isNull() | col("rating").isNull()))
)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 132, Finished, Available, Finished)

In [125]:
reviews_df.groupBy("customer_id") \
    .agg(count("*").alias("count")) \
    .filter("count > 1") \
    #.show()

reviews_df.groupBy("product_id") \
    .agg(count("*").alias("count")) \
    .filter("count > 1") \
    #.show()

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 133, Finished, Available, Finished)

DataFrame[product_id: string, count: bigint]

In [126]:
#display(reviews_df)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 134, Finished, Available, Finished)

In [127]:
reviews_df.write.mode("overwrite").parquet("Files/customer_data_silver/customers_reviews")

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 135, Finished, Available, Finished)

### **Silver Layer Transformations for <mark>Social Media</mark>**

In [128]:
social_media_df = spark.read.json("Files/Bronze_Social_Media")
# display(social_media_df)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 136, Finished, Available, Finished)

In [129]:
social_media_df.groupBy("content") \
    .agg(count("*").alias("count")) \
    .filter("count > 1") \
    #.show()

social_media_df.groupBy("platform") \
    .agg(count("*").alias("count")) \
    .filter("count > 1") \
    #.show()

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 137, Finished, Available, Finished)

DataFrame[platform: string, count: bigint]

In [130]:
# Convert Timestamp
social_media_df = social_media_df.withColumn("post_date", from_unixtime(col("timestamp") / 1000))

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 138, Finished, Available, Finished)

In [131]:
# Extract Date Components
social_media_df = social_media_df.withColumn("post_year", year(col("post_date"))) \
                                 .withColumn("post_month", month(col("post_date"))) \
                                 .withColumn("post_day", dayofmonth(col("post_date"))) \
                                 .withColumn("post_quarter", quarter(col("post_date"))) \
                                 .withColumn("post_weekday", date_format(col("post_date"), "E"))


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 139, Finished, Available, Finished)

In [132]:
# Calculate Post Length
social_media_df = social_media_df.withColumn("content_length", length(col("content")))

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 140, Finished, Available, Finished)

In [133]:
# Flag invalid and empty content
social_media_df = social_media_df.withColumn(
    "is_valid",
    ~(col("content").isNull() | (col("content") == ""))
)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 141, Finished, Available, Finished)

In [134]:
# Sentiment Scoring
social_media_df = social_media_df.withColumn(
    "sentiment_score",
    when(col("sentiment") == "positive", 1)
    .when(col("sentiment") == "neutral", 0)
    .otherwise(-1)
)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 142, Finished, Available, Finished)

In [135]:
# display(social_media_df)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 143, Finished, Available, Finished)

In [136]:
social_media_df.write.mode("overwrite").parquet("Files/customer_data_silver/customers_social_media")

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 144, Finished, Available, Finished)

### **Silver Layer Transformations for <mark>Web Logs</mark>**

In [137]:
web_logs_df = spark.read.json("Files/Bronze_Web_Logs")
#display(web_logs_df)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 145, Finished, Available, Finished)

In [138]:
web_logs_df.groupBy("user_id") \
        .agg(count("*").alias("count")) \
        .filter("count > 1") \
        #.show()

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 146, Finished, Available, Finished)

DataFrame[user_id: string, count: bigint]

In [139]:
# Convert Timestamp
web_logs_df = web_logs_df.withColumn("event_time", from_unixtime(col("timestamp") / 1000))

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 147, Finished, Available, Finished)

In [140]:
# Extract date components
web_logs_df = web_logs_df.withColumn("event_year", year(col("event_time"))) \
                         .withColumn("event_month", month(col("event_time"))) \
                         .withColumn("event_day", dayofmonth(col("event_time"))) \
                         .withColumn("event_weekday", date_format(col("event_time"), "E")) \
                         .withColumn("event_hour", date_format(col("event_time"), "H"))


StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 148, Finished, Available, Finished)

In [141]:
# Categorize User Action: Useful for funnel analysis (browsing → shopping → conversion).
web_logs_df = web_logs_df.withColumn(
    "action_group",
    when(col("action").isin("click", "view"), "Browsing")
    .when(col("action") == "add_to_cart", "Shopping")
    .when(col("action") == "purchase", "Conversion")
    .otherwise("Other")
)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 149, Finished, Available, Finished)

In [142]:
# display(web_logs_df)

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 150, Finished, Available, Finished)

In [143]:
web_logs_df.write.mode("overwrite").parquet("Files/customer_data_silver/customer_web_logs")

StatementMeta(, e8231c7d-4987-4bdd-8686-abeb553243e6, 151, Finished, Available, Finished)