### Importing PySpark functions

In [0]:
from pyspark.sql.functions import *

### Reading the stores file

In [0]:
# Load the raw stores CSV: the first row is used as the header and column
# types are inferred from the data.
# NOTE(review): with inferSchema, an all-digit "ZIP Code" column may be typed
# as a number, which would drop leading zeros — confirm this is acceptable.
stor_df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/Volumes/retail_analytics/raw/kaggle/global_fashion/stores.csv")
)

In [0]:
# Preview the first five rows of the freshly loaded stores data.
stor_df.limit(5).display()

Store ID,Country,City,Store Name,Number of Employees,ZIP Code,Latitude,Longitude
1,United States,New York,Store New York,10,10001,40.7128,-74.006
2,United States,Los Angeles,Store Los Angeles,8,90001,34.0522,-118.2437
3,United States,Chicago,Store Chicago,9,60601,41.8781,-87.6298
4,United States,Houston,Store Houston,10,77001,29.7604,-95.3698
5,United States,Phoenix,Store Phoenix,9,85001,33.4484,-112.074


### Import Python's regular expression module

In [0]:
import re

### Fixing the column names (replacing spaces and special characters with underscores)

In [0]:
# Build sanitized column names: every space, comma, semicolon, brace,
# parenthesis, newline, tab or equals sign becomes an underscore
# (presumably so the table write below accepts the names — confirm).
clean_columns = [
    re.sub(r'[ ,;{}()\n\t=]', '_', column_name)
    for column_name in stor_df.columns
]

# Apply the sanitized names positionally, then stamp each row with the
# ingestion timestamp.
stor_df = (
    stor_df
    .toDF(*clean_columns)
    .withColumn("ingestion_ts", current_timestamp())
)

### Saving the table

In [0]:
# Write the stores DataFrame to retail_analytics.bronze.stores in Delta format,
# overwriting whatever the table currently holds.
stor_df.write.saveAsTable(
    "retail_analytics.bronze.stores",
    format="delta",
    mode="overwrite",
)

In [0]:
# Read the saved table back and show five rows to check the write.
(
    spark.read
    .table("retail_analytics.bronze.stores")
    .limit(5)
    .display()
)

Store_ID,Country,City,Store_Name,Number_of_Employees,ZIP_Code,Latitude,Longitude,ingestion_ts
1,United States,New York,Store New York,10,10001,40.7128,-74.006,2026-01-16T05:42:56.232Z
2,United States,Los Angeles,Store Los Angeles,8,90001,34.0522,-118.2437,2026-01-16T05:42:56.232Z
3,United States,Chicago,Store Chicago,9,60601,41.8781,-87.6298,2026-01-16T05:42:56.232Z
4,United States,Houston,Store Houston,10,77001,29.7604,-95.3698,2026-01-16T05:42:56.232Z
5,United States,Phoenix,Store Phoenix,9,85001,33.4484,-112.074,2026-01-16T05:42:56.232Z
