### Importing PySpark functions

In [0]:
from pyspark.sql.functions import *

### Reading the transactions file

In [0]:
# Load the raw transactions CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
tran_df = spark.read.csv(
    "/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv",
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first five rows of the freshly loaded frame.
tran_df.limit(5).display()

Invoice ID,Line,Customer ID,Product ID,Size,Color,Unit Price,Quantity,Date,Discount,Line Total,Store ID,Employee ID,Currency,Currency Symbol,SKU,Transaction Type,Payment Method,Invoice Total
INV-US-001-03558761,1,47162,485,M,,80.5,1,2023-01-01T15:42:00.000Z,0.0,80.5,1,7,USD,$,MASU485-M-,Sale,Cash,126.7
INV-US-001-03558761,2,47162,2779,G,,31.5,1,2023-01-01T15:42:00.000Z,0.4,18.9,1,7,USD,$,CHCO2779-G-,Sale,Cash,126.7
INV-US-001-03558761,3,47162,64,M,NEUTRAL,45.5,1,2023-01-01T15:42:00.000Z,0.4,27.3,1,7,USD,$,MACO64-M-NEUTRAL,Sale,Cash,126.7
INV-US-001-03558762,1,10142,131,M,BLUE,70.0,1,2023-01-01T20:04:00.000Z,0.4,42.0,1,6,USD,$,FECO131-M-BLUE,Sale,Cash,77.0
INV-US-001-03558762,2,10142,716,L,WHITE,26.0,1,2023-01-01T20:04:00.000Z,0.0,26.0,1,6,USD,$,MAT-716-L-WHITE,Sale,Cash,77.0


### Import Python's regular expression module

In [0]:
import re

### Fixing the column name issue (spaces and special characters become underscores)

In [0]:
# Build the sanitized column names: every space, comma, semicolon, brace,
# parenthesis, newline, tab and equals sign is swapped for an underscore.
# NOTE(review): this looks like the character set the Delta writer rejects in
# column names - confirm against the target runtime.
clean_columns = [
    re.sub(r'[ ,;{}()\n\t=]', '_', column_name)
    for column_name in tran_df.columns
]

# Apply the new names positionally, then append a column holding the
# timestamp at which this load was evaluated.
tran_df = (
    tran_df
    .toDF(*clean_columns)
    .withColumn("ingestion_ts", current_timestamp())
)

### Saving the table

In [0]:
# Persist the frame as a Delta table, replacing whatever the table
# currently holds.
tran_df.write.saveAsTable(
    "retail_analytics.bronze.transactions",
    format="delta",
    mode="overwrite",
)

In [0]:
# Read the saved table back and show five rows to confirm the write.
display(spark.table("retail_analytics.bronze.transactions").limit(5))

Invoice_ID,Line,Customer_ID,Product_ID,Size,Color,Unit_Price,Quantity,Date,Discount,Line_Total,Store_ID,Employee_ID,Currency,Currency_Symbol,SKU,Transaction_Type,Payment_Method,Invoice_Total,ingestion_ts
INV-DE-011-03185034,1,729207,2380,XL,GREEN,11.9,1,2023-01-08T20:22:00.000Z,0.0,11.9,11,134,EUR,€,MAT-2380-XL-GREEN,Sale,Cash,11.9,2026-01-16T15:06:42.722Z
INV-DE-011-03185035,1,745379,495,M,YELLOW,38.5,3,2023-01-08T17:11:00.000Z,0.4,69.3,11,132,EUR,€,FECO495-M-YELLOW,Sale,Credit Card,69.3,2026-01-16T15:06:42.722Z
INV-DE-011-03185036,1,722261,782,S,BLUE,25.2,1,2023-01-08T17:13:00.000Z,0.4,15.12,11,129,EUR,€,FESW782-S-BLUE,Sale,Credit Card,75.6,2026-01-16T15:06:42.722Z
INV-DE-011-03185036,2,722261,856,M,,33.6,3,2023-01-08T17:13:00.000Z,0.4,60.48,11,129,EUR,€,CHSW856-M-,Sale,Credit Card,75.6,2026-01-16T15:06:42.722Z
INV-DE-011-03185037,1,722762,1617,L,,25.2,1,2023-01-08T19:38:00.000Z,0.0,25.2,11,131,EUR,€,FET-1617-L-,Sale,Credit Card,25.2,2026-01-16T15:06:42.722Z
