### Importing PySpark functions

In [0]:
from pyspark.sql.functions import *

### Reading the employee file

In [0]:
# Load the raw employees CSV into a DataFrame. The first line of the file is
# used for column names and Spark infers each column's type from the data.
emp_df = spark.read.csv(
    "/Volumes/retail_analytics/raw/kaggle/global_fashion/employees.csv",
    header=True,
    inferSchema=True,
)

In [0]:
# Show the first five rows to sanity-check the parsed header and values.
emp_preview = emp_df.limit(5)
display(emp_preview)

Employee ID,Store ID,Name,Position
1,1,Stephen Johnson,Store Manager
2,1,Rebecca Myers,Assistant Manager
3,1,Katherine Buchanan,Cashier
4,1,Jessica Hicks,Stock Clerk
5,1,Ryan Gross,Sales Associate


### Import Python's regular expression module

In [0]:
import re

### Fixing the column name issue

In [0]:
# Replace every space, comma, semicolon, brace, parenthesis, newline, tab or
# equals sign in a column name with an underscore (e.g. "Employee ID" ->
# "Employee_ID").
# NOTE(review): presumably these are the characters Delta rejects in column
# names — confirm against the Delta Lake documentation.
invalid_name_chars = re.compile(r'[ ,;{}()\n\t=]')
clean_columns = []
for raw_name in emp_df.columns:
    clean_columns.append(invalid_name_chars.sub('_', raw_name))

# Rename all columns positionally, then add the ingestion timestamp column.
emp_df = (
    emp_df
    .toDF(*clean_columns)
    .withColumn("ingestion_ts", current_timestamp())
)

### Saving the table

In [0]:
# Persist the employees DataFrame as a Delta table; "overwrite" replaces the
# table's existing contents on every run.
emp_df.write.saveAsTable(
    "retail_analytics.bronze.employees",
    format="delta",
    mode="overwrite",
)

In [0]:
# Read the saved table back and show five rows to confirm the write, the
# renamed columns and the ingestion_ts column.
bronze_preview = spark.read.table("retail_analytics.bronze.employees").limit(5)
bronze_preview.display()

Employee_ID,Store_ID,Name,Position,ingestion_ts
1,1,Stephen Johnson,Store Manager,2026-01-16T05:32:22.948Z
2,1,Rebecca Myers,Assistant Manager,2026-01-16T05:32:22.948Z
3,1,Katherine Buchanan,Cashier,2026-01-16T05:32:22.948Z
4,1,Jessica Hicks,Stock Clerk,2026-01-16T05:32:22.948Z
5,1,Ryan Gross,Sales Associate,2026-01-16T05:32:22.948Z
