Script en celda

En dos terminales ejecutar:

```bash
docker exec -it kafka bash
```
y luego en uno esto:

```bash
/opt/kafka/bin/kafka-console-consumer.sh \
  --bootstrap-server localhost:9092 \
  --topic mydata_prediction_response
```

y en otro:
```bash
/opt/kafka/bin/kafka-console-producer.sh   --broker-list kafka:9092   --topic mydata_prediction_request   --property "parse.key=false"   --property "key.separator=:"   --property "value.serializer=org.apache.kafka.common.serialization.StringSerializer"
```
y, como ejemplo, escribe en el producer:
```json
{"order_id": 1, "customer_id": "cust_123", "restaurant_id": "rest_456", "order_date_and_time": "2025-08-06T13:00:00", "delivery_date_and_time": "2025-08-06T13:45:00", "order_value": 1000, "delivery_fee": 100, "payment_method": "credit_card", "discounts_and_offers": "10% off", "commission_fee": 100, "payment_processing_fee": 20, "refunds/chargebacks": 0}
```

Puedes abrir Mongo Express y ver el resultado en http://localhost:8081/db/agile_data_science/mydata_prediction_response

In [None]:
#!/usr/bin/env python
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType, LongType
from pyspark.sql.functions import from_json, col, hour, dayofweek, when
from pyspark.ml import PipelineModel
import pymongo

# ---- SparkSession ----
# Start from the app name and master URL, then apply the extra settings one
# by one (driver bind address and the Kafka connector package).
_session_builder = SparkSession.builder.appName("script sc").master("spark://agile:7077")
for _conf_key, _conf_value in (
    ("spark.driver.bindAddress", "0.0.0.0"),
    ("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0"),
):
    _session_builder = _session_builder.config(_conf_key, _conf_value)
spark = _session_builder.getOrCreate()

# Set the Spark log level to ERROR.
spark.sparkContext.setLogLevel("ERROR")
print("✅ SparkSession creada")

# ---- Model ----
# Load the previously saved pipeline used for scoring.
_model_path = "./models/pipeline_model.bin"
modelo = PipelineModel.load(_model_path)
print("✅ Modelo cargado")

# ---- Input schema for the JSON messages read from Kafka (with UUID and decimals) ----
# (field name, Spark type) pairs in message order; every field is nullable.
_schema_fields = [
    ("UUID", StringType()),
    ("order_id", LongType()),
    ("customer_id", StringType()),
    ("restaurant_id", StringType()),
    ("order_date_and_time", TimestampType()),
    ("delivery_date_and_time", TimestampType()),
    ("order_value", DoubleType()),
    ("delivery_fee", DoubleType()),
    ("payment_method", StringType()),
    ("discounts_and_offers", StringType()),
    ("commission_fee", DoubleType()),
    ("payment_processing_fee", DoubleType()),
    ("refunds/chargebacks", DoubleType()),
]
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _schema_fields]
)

# ---- Read from Kafka ----
# Configure the streaming reader step by step: broker, topic to subscribe to,
# and the starting offsets setting.
_kafka_reader = spark.readStream.format("kafka")
_kafka_reader = _kafka_reader.option("kafka.bootstrap.servers", "kafka:9092")
_kafka_reader = _kafka_reader.option("subscribe", "mydata_prediction_request")
_kafka_reader = _kafka_reader.option("startingOffsets", "latest")
raw_stream = _kafka_reader.load()

# Cast the Kafka value to a string, parse it as JSON with the declared schema,
# and flatten the parsed struct into top-level columns.
_json_text = raw_stream.selectExpr("CAST(value AS STRING) as json_data")
_parsed = _json_text.select(from_json("json_data", schema).alias("data"))
json_df = _parsed.select("data.*")

# ---- Feature enrichment: must stay EXACTLY as in training ----
# Derived columns are listed in application order; later entries reference
# columns created by earlier ones (day_of_week, hour_of_day, has_discount).
_hour = col("hour_of_day")
_derived_columns = [
    ("day_of_week", dayofweek("order_date_and_time")),
    ("hour_of_day", hour("order_date_and_time")),
    # Weekend flag: day_of_week values 1 and 7.
    ("es_fin_de_semana", when(col("day_of_week").isin([1, 7]), 1).otherwise(0)),
    # Peak-hour flag: hours 13-15 or 20-22 (both ranges inclusive).
    ("es_hora_punta", when(_hour.between(13, 15) | _hour.between(20, 22), 1).otherwise(0)),
    ("has_discount", when(col("discounts_and_offers").isNotNull(), 1).otherwise(0)),
    # Discount amount is 10% of the order value whenever a discount is present.
    ("discount_value", when(col("has_discount") == 1, col("order_value") * 0.1).otherwise(0.0)),
    ("refunded", when(col("refunds/chargebacks") > 0, 1).otherwise(0)),
]

df_enriched = json_df
for _column_name, _column_expr in _derived_columns:
    df_enriched = df_enriched.withColumn(_column_name, _column_expr)

# ---- Keep only the columns the pipeline expects, plus UUID and order_id ----
required_numeric = [
    "order_value",
    "delivery_fee",
    "commission_fee",
    "payment_processing_fee",
    "refunds/chargebacks",
    "discount_value",
    "has_discount",
    "refunded",
    "day_of_week",
    "hour_of_day",
    "es_fin_de_semana",
    "es_hora_punta",
]
required_categorical = [
    "payment_method",
    "discounts_and_offers",
]
required_cols = ["UUID", "order_id", *required_numeric, *required_categorical]

features_df = df_enriched.select(required_cols)

# ---- Prediction ----
predicciones = modelo.transform(features_df)

# Columns that get persisted to Mongo and published to Kafka.
_output_columns = ["UUID", "order_id", "prediction"]
resultado = predicciones.select(_output_columns)

# ---- foreachBatch: write to Mongo and Kafka within the same micro-batch ----
def write_to_mongo_and_kafka(batch_df, epoch_id):
    """Persist one micro-batch to MongoDB and publish it to Kafka.

    The rows of ``batch_df`` are inserted into the
    ``agile_data_science.mydata_prediction_response`` collection and then
    written as JSON to the ``mydata_prediction_response`` Kafka topic, keyed
    by ``UUID``.

    Failures while collecting, inserting or publishing are not re-raised:
    they are recorded in the ``mydata_prediction_errors`` collection together
    with the epoch id and up to three sample rows.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        epoch_id: Identifier of the micro-batch, stored with any error record.
    """
    client = pymongo.MongoClient("mongo")
    # Defined before the inner try so the error handler can always reference it.
    rows = []
    try:
        db = client["agile_data_science"]
        out = db["mydata_prediction_response"]
        err = db["mydata_prediction_errors"]
        # Inside the outer try so the client is closed even if this call fails.
        out.create_index("UUID", unique=False)

        # The batch is consumed by two actions (collect + Kafka write); cache it
        # so the upstream plan, including the model transform, runs only once.
        batch_df.persist()
        try:
            rows = [r.asDict() for r in batch_df.collect()]
            if rows:
                out.insert_many(rows)

            (batch_df
             .selectExpr("UUID as key", "to_json(struct(*)) as value")
             .write
             .format("kafka")
             .option("kafka.bootstrap.servers", "kafka:9092")
             .option("topic", "mydata_prediction_response")
             .save())
        except Exception as e:
            # Best-effort: record the failure instead of stopping the stream.
            err.insert_one({
                "epoch_id": int(epoch_id),
                "error": str(e),
                "sample_rows": rows[:3],
            })
        finally:
            batch_df.unpersist()
    finally:
        client.close()

# Configure the streaming writer step by step, then start the query.
_stream_writer = resultado.writeStream.outputMode("append")
# Checkpoint directory: change or delete it when restarting.
_stream_writer = _stream_writer.option("checkpointLocation", "/tmp/checkpoints-foreachbatch-v2")
_stream_writer = _stream_writer.foreachBatch(write_to_mongo_and_kafka)
query = _stream_writer.start()

# Block until the streaming query terminates.
query.awaitTermination()



:: loading settings :: url = jar:file:/usr/local/spark-3.2.0-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-63d684ea-3e9a-4df0-aad0-fb4e907d3237;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central

✅ SparkSession creada


                                                                                

✅ Modelo cargado


                                                                                