# _Inicio Bronce_

### Configuración inicial de scopes, keys, conns, paths

## IMPORTANTE: 
### Encriptar la cadena de conexión de Event Hubs: por seguridad y manejo interno, el conector Spark de Event Hubs no acepta directamente la cadena de conexión en texto plano.

Se necesita:
- Instalar la librería com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.21
- Cómputo con Scala 2.12 (2.13 no soporta encrypt a la fecha, 7/2025)

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json

# Secret scope / key names used to look up credentials through dbutils.secrets.
eventhub_secret_scope = "eventhub_scope"
eventhub_secret_key = "eventhub_conn"

storage_secret_scope = "storage-scope"
storage_secret_key = "storage-conn"

# Each secret is read once here and reused below.
eventhub_connection_string = dbutils.secrets.get(scope=eventhub_secret_scope, key=eventhub_secret_key)
client_secret = dbutils.secrets.get(scope="client-scope", key="client-secret")

# Starting position for the Event Hubs reader; serialized to JSON when it is
# placed into the connector options below.
# NOTE(review): offset "0" / seqNo 0 presumably mean "from the beginning of
# the stream" -- confirm against the connector documentation.
starting_position = {
    "offset": "0",
    "seqNo": 0,
    "enqueuedTime": None,
    "isInclusive": True
}

# Event Hubs connector options.
# NOTE: this dict carries the plain connection string. The notebook states the
# connector needs an encrypted one, and the stream-test cell further down
# rebuilds `eventhub_config` with the encrypted value before the stream is read.
eventhub_config = {
    "eventhubs.connectionString": eventhub_connection_string,
    "eventhubs.consumerGroup": "market-consumer-group",
    "eventhubs.startingPosition": json.dumps(starting_position)
}

# OAuth (client-credentials) settings passed as extra_configs when mounting
# the ADLS containers.
configs = {
  "fs.azure.account.auth.type": "OAuth",
  "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
  "fs.azure.account.oauth2.client.id": "42add4c7-97b5-46af-b412-3d22af3f601b",
  "fs.azure.account.oauth2.client.secret": client_secret,
  "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/f8665296-99ec-4c56-8ec6-6385ed1c7cc9/oauth2/token"
}

# Set to False once the containers are mounted so that re-running this cell
# leaves existing mounts untouched instead of unmounting and remounting them.
FORCE_REMOUNT = True


def mount_container(container, force_remount=True):
    """Mount one container of the storage account under /mnt/<container>.

    Args:
        container: Container name; it is used both in the abfss source URL
            and as the last segment of the mount point.
        force_remount: When the mount point already exists, True unmounts it
            and mounts it again; False returns without touching it.
    """
    mount_point = f"/mnt/{container}"
    already_mounted = any(m.mountPoint == mount_point for m in dbutils.fs.mounts())
    if already_mounted:
        if not force_remount:
            return
        dbutils.fs.unmount(mount_point)
    dbutils.fs.mount(
        source=f"abfss://{container}@mistorageprincipal.dfs.core.windows.net/",
        mount_point=mount_point,
        extra_configs=configs,
    )


# Same containers, same order as before: rawmarket, bronze, silver, gold.
for container in ("rawmarket", "bronze", "silver", "gold"):
    mount_container(container, force_remount=FORCE_REMOUNT)

# Storage (ADLS) connection: register the account key on the Spark session so
# abfss:// paths on this account can be accessed directly.
storage_account_name = "mistorageprincipal"
# Reuse the scope/key names declared at the top of the cell instead of
# repeating the literals.
storage_key = dbutils.secrets.get(scope=storage_secret_scope, key=storage_secret_key)
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", storage_key)

# Location of the bronze "sales" data (through the /mnt/bronze mount) and a
# checkpoint location addressed directly through abfss.
bronze_mount_path = "/mnt/bronze/sales"
bronze_checkpoint_path = f"abfss://bronze@{storage_account_name}.dfs.core.windows.net/checkpoints"

/mnt/rawmarket has been unmounted.
/mnt/bronze has been unmounted.
/mnt/silver has been unmounted.
/mnt/gold has been unmounted.


In [0]:
# List the bronze sales folder. display() renders the FileInfo entries as a
# table instead of dumping one long repr of the whole list.
display(dbutils.fs.ls(bronze_mount_path))

[FileInfo(path='dbfs:/mnt/bronze/sales/_checkpoint/', name='_checkpoint/', size=0, modificationTime=1752604704000),
 FileInfo(path='dbfs:/mnt/bronze/sales/_delta_log/', name='_delta_log/', size=0, modificationTime=1752605867000),
 FileInfo(path='dbfs:/mnt/bronze/sales/part-00000-0442ecec-8b44-4461-9471-609000481646-c000.snappy.parquet', name='part-00000-0442ecec-8b44-4461-9471-609000481646-c000.snappy.parquet', size=3237, modificationTime=1752971318000),
 FileInfo(path='dbfs:/mnt/bronze/sales/part-00000-0522e7b5-9bfe-4703-beab-2e7f8f2a7cfb-c000.snappy.parquet', name='part-00000-0522e7b5-9bfe-4703-beab-2e7f8f2a7cfb-c000.snappy.parquet', size=3280, modificationTime=1752971680000),
 FileInfo(path='dbfs:/mnt/bronze/sales/part-00000-05655117-672c-483b-9a7b-67ba703384cc-c000.snappy.parquet', name='part-00000-05655117-672c-483b-9a7b-67ba703384cc-c000.snappy.parquet', size=3192, modificationTime=1752787462000),
 FileInfo(path='dbfs:/mnt/bronze/sales/part-00000-05661a08-713f-4a86-8b58-1d2c23ef4

### Definición del esquema

In [0]:
# Schema applied to the JSON body of each event. Every field is nullable;
# "timestamp" is kept as a string here and "amount" is the only numeric field.
event_schema = (
    StructType()
    .add("transaction_id", StringType(), True)
    .add("timestamp", StringType(), True)
    .add("product_id", StringType(), True)
    .add("store", StringType(), True)
    .add("customer_id", StringType(), True)
    .add("amount", DoubleType(), True)
    .add("channel", StringType(), True)
)


Prueba para ver si el stream está activo

In [0]:
# Read the Event Hubs connection string from the secret scope.
eventhub_connection_string = dbutils.secrets.get(
    scope=eventhub_secret_scope,
    key=eventhub_secret_key,
)

# Encrypt it by calling the connector's EventHubsUtils.encrypt directly
# through the JVM gateway.
encrypted_conn_str = (
    sc._jvm.org.apache.spark.eventhubs.EventHubsUtils
    .encrypt(eventhub_connection_string)
)

# Connector options, now carrying the encrypted connection string.
eventhub_config = {}
eventhub_config["eventhubs.connectionString"] = encrypted_conn_str
eventhub_config["eventhubs.consumerGroup"] = "market-consumer-group"
eventhub_config["eventhubs.startingPosition"] = json.dumps(starting_position)

In [0]:
# Re-apply the storage account key on the Spark session (same setting as in
# the configuration cell).
spark.conf.set(
    "fs.azure.account.key.mistorageprincipal.dfs.core.windows.net",
    dbutils.secrets.get(scope="storage-scope", key="storage-conn")
)

# Streaming read from Event Hubs using the options built in `eventhub_config`.
raw_stream_df = (
        spark.readStream
        .format("eventhubs")
        .options(**eventhub_config)
        .load()
    )

# Decode the event body as a JSON string, parse it with `event_schema`,
# flatten the parsed struct into columns, then add:
#   - event_timestamp: the "timestamp" string converted with to_timestamp
#   - ingestion_time:  current_timestamp() at processing time
parsed_df = (
        raw_stream_df
        .selectExpr("cast(body as string) as json_data")
        .select(from_json(col("json_data"), event_schema).alias("data"))
        .select("data.*")
        .withColumn("event_timestamp", to_timestamp("timestamp"))
        .withColumn("ingestion_time", current_timestamp())
    )

# Append the parsed events to the bronze Delta location. The StreamingQuery
# handle is kept so the sink can be inspected (bronze_query.status,
# bronze_query.lastProgress) or stopped (bronze_query.stop()) from later cells.
# The checkpoint location is intentionally left as it was: it holds the
# stream's progress.
bronze_query = (
    parsed_df.writeStream
    .format("delta")
    .option("checkpointLocation", bronze_mount_path + "/_checkpoint")
    .option("path", bronze_mount_path)
    .outputMode("append")
    .start()
)

# NOTE(review): display() on a streaming DataFrame runs its own query, so this
# opens a second Event Hubs reader on the same consumer group as the Delta
# sink above. The connector documentation recommends a dedicated consumer
# group per stream -- consider previewing from the bronze Delta table instead.
display(parsed_df)


transaction_id,timestamp,product_id,store,customer_id,amount,channel,event_timestamp,ingestion_time
7020636d-9642-48ff-aa86-d637d3d01e91,2025-07-20T20:25:03.714485+00:00,P005,SucursalB,CUST001,197.58,SelfCheckout,2025-07-20T20:25:03.714485Z,2025-07-20T20:25:47.487Z
ece28497-b8e0-4a26-8ae9-5fc9d8b06794,2025-07-20T20:25:15.325047+00:00,P003,SucursalC,CUST004,200.32,SelfCheckout,2025-07-20T20:25:15.325047Z,2025-07-20T20:25:47.487Z
65666e16-447b-4a54-9695-195dae0a2041,2025-07-20T20:25:25.382575+00:00,P004,SucursalA,CUST003,318.25,SelfCheckout,2025-07-20T20:25:25.382575Z,2025-07-20T20:25:47.487Z
2214e6bc-454d-4139-b53a-9505b1cec184,2025-07-20T20:26:32.244801+00:00,P001,SucursalA,CUST005,426.04,POS,2025-07-20T20:26:32.244801Z,2025-07-20T20:26:34.007Z
55667286-e399-4539-9e2a-bef3d88b58c9,2025-07-20T20:26:43.594623+00:00,P004,SucursalC,CUST003,358.64,Online,2025-07-20T20:26:43.594623Z,2025-07-20T20:26:44.006Z
9248bf57-c08b-4820-87ee-2e9cfe535102,2025-07-20T20:26:53.651229+00:00,P004,SucursalA,CUST002,496.97,SelfCheckout,2025-07-20T20:26:53.651229Z,2025-07-20T20:26:54.006Z
4dcce014-4049-4ad2-b2fd-9299a2322f01,2025-07-20T20:27:03.710038+00:00,P004,SucursalA,CUST004,342.3,POS,2025-07-20T20:27:03.710038Z,2025-07-20T20:27:04.006Z
fd9d9b92-331e-43d5-b5a4-e71089e12897,2025-07-20T20:27:13.768399+00:00,P004,SucursalA,CUST002,101.29,SelfCheckout,2025-07-20T20:27:13.768399Z,2025-07-20T20:27:14.006Z
dfeab038-4eba-4191-8e10-a45b2b8bccf1,2025-07-20T20:27:23.825747+00:00,P001,SucursalA,CUST004,128.64,SelfCheckout,2025-07-20T20:27:23.825747Z,2025-07-20T20:27:24.011Z
459ff323-65d8-4f52-9165-2f836e198cca,2025-07-20T20:27:33.885256+00:00,P002,SucursalA,CUST003,50.91,SelfCheckout,2025-07-20T20:27:33.885256Z,2025-07-20T20:27:34.006Z


In [0]:
# Check the bronze Delta table: ids and timestamps, newest ingestion first.
display(
    spark.sql(f"""SELECT transaction_id,event_timestamp, ingestion_time
FROM delta.`{bronze_mount_path}` ORDER BY ingestion_time DESC""")
)

transaction_id,event_timestamp,ingestion_time
3f1c34f9-54e5-421e-ab3b-11f3f326c733,2025-07-20T00:36:29.900075Z,2025-07-20T00:36:30.514Z
92f32130-b209-47c6-83d9-79294bc91ccc,2025-07-20T00:36:19.843606Z,2025-07-20T00:36:20.605Z
d7271a22-6fe5-4bb6-a9a8-09771b47664b,2025-07-20T00:36:09.785171Z,2025-07-20T00:36:10.268Z
63b0c32b-c5c9-41ec-8740-4b04ab6d6cfc,2025-07-20T00:35:59.727645Z,2025-07-20T00:36:00.458Z
48d4f2e4-3ded-4a21-b484-867088c897e8,2025-07-20T00:35:49.671619Z,2025-07-20T00:35:50.427Z
98aa3ece-2cd4-41a1-bc96-66310f9094e7,2025-07-20T00:35:39.614308Z,2025-07-20T00:35:40.221Z
37294891-36c0-45fa-b053-e56dd80b78b8,2025-07-20T00:35:29.558157Z,2025-07-20T00:35:30.45Z
6c7cb7cd-5dc9-4c5e-99af-2950566c06b5,2025-07-20T00:35:19.494604Z,2025-07-20T00:35:20.236Z
ee476922-4782-43c7-9fb5-f8d809f8c676,2025-07-20T00:35:09.438501Z,2025-07-20T00:35:10.16Z
5ec268d8-e78a-447a-a163-05a14861fcec,2025-07-20T00:34:59.381417Z,2025-07-20T00:34:59.791Z


In [0]:
# Optional manual check (disabled): load the bronze Delta path as a batch
# DataFrame and display it.
# catalog_table = spark.read.format("delta").load(bronze_mount_path)
# display(catalog_table) 

## _Fin bronce_