In [0]:
from pyspark.sql.functions import current_timestamp

# 1. Define the source (raw) and destination (bronze) locations.
bucket_name = "lakehouseyelp"
raw_business_path = f"s3a://{bucket_name}/raw/business_sample.json"
raw_review_path = f"s3a://{bucket_name}/raw/review_sample.json"

bronze_base_path = f"s3a://{bucket_name}/bronze/"


def _ingest_json_to_bronze(raw_path: str, target_path: str):
    """Read a JSON dataset, stamp it with an audit column, and overwrite a Delta table.

    Both Bronze tables go through exactly the same steps, so the steps live
    here once instead of being copy-pasted per dataset.

    Args:
        raw_path: Location of the JSON input handed to ``spark.read.json``.
        target_path: Location the Delta output is saved to (mode ``overwrite``).

    Returns:
        The DataFrame that was written, including the added
        ``ingestion_timestamp`` column.
    """
    # Audit column: records when the rows were ingested.
    stamped_df = spark.read.json(raw_path).withColumn(
        "ingestion_timestamp", current_timestamp()
    )
    stamped_df.write.format("delta").mode("overwrite").save(target_path)
    return stamped_df


# 2. Read the Business JSON and write it to Bronze.
print("Procesando Business...")
df_business = _ingest_json_to_bronze(raw_business_path, bronze_base_path + "business")

# 3. Read the Review JSON and write it to Bronze.
print("Procesando Reviews...")
df_review = _ingest_json_to_bronze(raw_review_path, bronze_base_path + "review")

print("¡Capa Bronze completada con éxito!")


Procesando Business...


In [0]:
from pyspark.sql import functions as F

# Preview a handful of rows, restricted to three columns of the business DataFrame.
preview_columns = [
    F.col(column_name) for column_name in ("name", "business_id", "categories")
]
df_business.select(*preview_columns).show(5)

+--------------------+--------------------+--------------------+
|                name|         business_id|          categories|
+--------------------+--------------------+--------------------+
|Abby Rappoport, L...|Pns2l4eNsfO8kk83d...|Doctors, Traditio...|
|       The UPS Store|mpf3x-BjTdTEA3yCZ...|Shipping Centers,...|
|              Target|tUFrWirKiKi_TAnsV...|Department Stores...|
|  St Honore Pastries|MTSW4McQd7CbVtyjq...|Restaurants, Food...|
|Perkiomen Valley ...|mWMc6_wTdE0EUBKIG...|Brewpubs, Breweri...|
+--------------------+--------------------+--------------------+
only showing top 5 rows


In [0]:
%sql
-- Check that the freshly written Delta table can be read back
SELECT *
FROM delta.`s3a://lakehouseyelp/bronze/business`
LIMIT 5

address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state,ingestion_timestamp
"1616 Chapala St, Ste 2","List(null, null, null, null, null, null, null, null, null, null, null, True, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null)",Pns2l4eNsfO8kk83dixA6A,"Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists",Santa Barbara,,0,34.4266787,-119.7111968,"Abby Rappoport, LAC, CMQ",93101,7,5.0,CA,2025-12-31T15:03:12.594Z
87 Grasso Plaza Shopping Center,"List(null, null, null, null, null, null, null, null, null, True, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null)",mpf3x-BjTdTEA3yCZrAYPw,"Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services",Affton,"List(8:0-18:30, 0:0-0:0, 8:0-14:0, null, 8:0-18:30, 8:0-18:30, 8:0-18:30)",1,38.551126,-90.335695,The UPS Store,63123,15,3.0,MO,2025-12-31T15:03:12.594Z
5255 E Broadway Blvd,"List(null, null, null, null, null, null, null, True, null, True, {'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}, False, False, False, null, null, False, null, null, null, null, null, False, False, null, null, null, False, null, null, False, null, 2, False, null, False, null, True, u'no')",tUFrWirKiKi_TAnsVWINQQ,"Department Stores, Shopping, Fashion, Home & Garden, Electronics, Furniture Stores",Tucson,"List(8:0-23:0, 8:0-22:0, 8:0-23:0, 8:0-22:0, 8:0-22:0, 8:0-22:0, 8:0-22:0)",0,32.223236,-110.880452,Target,85711,22,3.5,AZ,2025-12-31T15:03:12.594Z
935 Race St,"List(null, null, u'none', null, null, null, null, True, null, False, {'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}, False, True, null, null, null, null, null, null, null, null, null, null, null, null, null, null, False, null, null, False, null, 1, null, null, True, null, null, u'free')",MTSW4McQd7CbVtyjqoe9mw,"Restaurants, Food, Bubble Tea, Coffee & Tea, Bakeries",Philadelphia,"List(7:0-21:0, 7:0-20:0, 7:0-21:0, 7:0-21:0, 7:0-20:0, 7:0-20:0, 7:0-20:0)",1,39.9555052,-75.1555641,St Honore Pastries,19107,80,4.0,PA,2025-12-31T15:03:12.594Z
101 Walnut St,"List(null, null, null, null, null, null, null, True, null, True, {'garage': None, 'street': None, 'validated': None, 'lot': True, 'valet': False}, null, False, null, null, null, null, null, null, True, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, True, null, True, null)",mWMc6_wTdE0EUBKIGXDVfA,"Brewpubs, Breweries, Food",Green Lane,"List(12:0-22:0, null, 12:0-22:0, 12:0-18:0, 16:0-22:0, null, 14:0-22:0)",1,40.3381827,-75.4716585,Perkiomen Valley Brewery,18054,13,4.5,PA,2025-12-31T15:03:12.594Z


In [0]:
# Ask Delta for the table's detail record and keep only its numFiles column.
detail_query = f"DESCRIBE DETAIL delta.`{bronze_base_path}business`"
df_info = spark.sql(detail_query).select("numFiles").collect()

# The first column of the first collected row holds the numFiles value.
num_files = df_info[0][0]
print(f"Número de archivos (particiones físicas): {num_files}")

Número de archivos (particiones físicas): 2
