### Importing PySpark functions

In [0]:
from pyspark.sql.functions import *

### Creating schema for bronze layer

In [0]:
%sql
-- Create the bronze schema inside retail_analytics unless it already exists.
CREATE SCHEMA IF NOT EXISTS retail_analytics.bronze

### Reading the customer file

In [0]:
# Load the raw customers CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
cust_df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/Volumes/retail_analytics/raw/kaggle/global_fashion/customers.csv")
)

In [0]:
# Preview the first five rows of the freshly loaded customer data.
cust_df.limit(5).display()

Customer ID,Name,Email,Telephone,City,Country,Gender,Date Of Birth,Job Title
1,Tyler Garcia,tyler.garcia@fake_gmail.com,922.970.2265x47563,New York,United States,M,2003-07-15,
2,Joshua Miller,joshua.miller@fake_gmail.com,+1-958-729-6169,New York,United States,M,2000-06-16,Records manager
3,Alison Marshall DDS,alison.marshall.dds@fake_hotmail.com,+1-645-567-0876x5409,New York,United States,F,2003-07-22,
4,Jeffery Acosta,jeffery.acosta@fake_yahoo.com,212.336.0912x84994,New York,United States,M,1996-11-12,Proofreader
5,Ashley Sanders,ashley.sanders@fake_hotmail.com,7814535781,New York,United States,F,1998-02-10,Exercise physiologist


### Import Python's regular expression module

In [0]:
import re

### Fixing the column name issue

In [0]:
# Bronze-layer column names: swap every space, comma, semicolon, brace,
# parenthesis, newline, tab or equals sign for an underscore.
clean_columns = [
    re.sub(r'[ ,;{}()\n\t=]', '_', column_name)
    for column_name in cust_df.columns
]

# Apply the cleaned names, then stamp each row with the ingestion time.
cust_df = (
    cust_df
    .toDF(*clean_columns)
    .withColumn("ingestion_ts", current_timestamp())
)

### Saving the table

In [0]:
# Persist the customers DataFrame as a Delta table, replacing any existing
# contents of retail_analytics.bronze.customers.
cust_df.write.saveAsTable(
    "retail_analytics.bronze.customers",
    format="delta",
    mode="overwrite",
)

In [0]:
# Read the saved bronze table back and show five rows as a sanity check.
spark.table("retail_analytics.bronze.customers").limit(5).display()

Customer_ID,Name,Email,Telephone,City,Country,Gender,Date_Of_Birth,Job_Title,ingestion_ts
1203365,Brigitte de Huet,brigitte.de.huet@fake_voila.fr,0444057753,Villefranche-sur-Saône,France,F,1988-07-18,Clothing/textile technologist,2026-01-16T05:29:29.853Z
1203366,Audrey Gosselin de la Maréchal,audrey.gosselin.de.la.maréchal@fake_hotmail.fr,0230184124,Villefranche-sur-Saône,France,F,1983-02-16,Chemical engineer,2026-01-16T05:29:29.853Z
1203367,Marine Marchand,marine.marchand@fake_gmail.com,+33 (0)4 67 32 30 14,Villefranche-sur-Saône,France,F,1986-01-02,Database administrator,2026-01-16T05:29:29.853Z
1203368,Patrick Grondin,patrick.grondin@fake_club-internet.fr,0143258515,Villefranche-sur-Saône,France,M,1994-09-04,"Editor, magazine features",2026-01-16T05:29:29.853Z
1203369,Marianne Bertrand-Ruiz,marianne.bertrand-ruiz@fake_orange.fr,+33 7 72 55 15 65,Villefranche-sur-Saône,France,F,2003-02-15,,2026-01-16T05:29:29.853Z
