Script en celda

En dos terminales ejecutar:

```bash
docker exec -it kafka bash
```
y luego, en uno de ellos, ejecutar el consumidor (la última línea del bloque es un ejemplo de la salida que se recibe):

```bash
 ./kafka-console-consumer.sh \
  --bootstrap-server localhost:9092 \
  --topic mydata_prediction_response
{"order_id":1,"prediction":66.68752312402263}
```

y en el otro, ejecutar el productor:
```bash
kafka-console-producer.sh   --broker-list kafka:9092   --topic mydata_prediction_request   --property "parse.key=false"   --property "key.separator=:"   --property "value.serializer=org.apache.kafka.common.serialization.StringSerializer"
```
y, como mensaje de ejemplo, pegar en el productor:
```json
{"order_id": 1, "customer_id": "cust_123", "restaurant_id": "rest_456", "order_date_and_time": "2025-08-06T13:00:00", "delivery_date_and_time": "2025-08-06T13:45:00", "order_value": 1000, "delivery_fee": 100, "payment_method": "credit_card", "discounts_and_offers": "10% off", "commission_fee": 100, "payment_processing_fee": 20, "refunds/chargebacks": 0}
```

In [None]:
#!/usr/bin/env python

#
# Run with: spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 Food_delivery/Deploying_Predictive_Systems/MY_make_predictions_streaming.py $PROJECT_PATH
#
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, LongType
from pyspark.sql.functions import from_json, col, hour, dayofweek, when
from pyspark.ml import PipelineModel

# Build the SparkSession against the cluster master, one setting at a time.
session_builder = SparkSession.builder.appName("script sc").master("spark://agile:7077")
session_builder = session_builder.config("spark.driver.bindAddress", "0.0.0.0")
# Maven coordinates of the Kafka connector used by the "kafka" source and sink below.
session_builder = session_builder.config(
    "spark.jars.packages",
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0",
)
spark = session_builder.getOrCreate()


# Only let ERROR-level Spark log messages through.
spark.sparkContext.setLogLevel("ERROR")

print("✅ SparkSession creada")

# Load the trained pipeline model from disk.
modelo = PipelineModel.load("./models/pipeline_model.bin")

print("✅ Recibiendo mensaje de Kafka")

# Schema of the JSON payload received through Kafka.
# Every field is nullable; the table below lists (field name, Spark type) in order.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("order_id", LongType),
        ("customer_id", StringType),
        ("restaurant_id", StringType),
        ("order_date_and_time", TimestampType),
        ("delivery_date_and_time", TimestampType),
        ("order_value", LongType),
        ("delivery_fee", LongType),
        ("payment_method", StringType),
        ("discounts_and_offers", StringType),
        ("commission_fee", LongType),
        ("payment_processing_fee", LongType),
        ("refunds/chargebacks", LongType),
    )
])

print("✅ Datos crudos de Kafka cargados")
# Read the prediction requests from Kafka as a streaming DataFrame.
raw_stream = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "mydata_prediction_request")
    .option("startingOffsets", "latest")
    .load()
)

# Decode the JSON: cast the Kafka value to a string, parse it with the
# declared schema, then flatten the parsed struct into top-level columns.
# NOTE(review): payloads that fail to parse appear to come through with null
# fields rather than raising — confirm the downstream pipeline tolerates nulls.
json_text = raw_stream.select(col("value").cast("string").alias("json_data"))
parsed = json_text.select(from_json(col("json_data"), schema).alias("data"))
json_df = parsed.select("data.*")

# Add the derived feature columns, built one at a time from the decoded stream.

# Calendar features taken from the order timestamp.
df_enriched = json_df.withColumn("day_of_week", dayofweek(col("order_date_and_time")))
df_enriched = df_enriched.withColumn("hour_of_day", hour(col("order_date_and_time")))

# Weekend flag: dayofweek() numbers Sunday as 1 and Saturday as 7.
df_enriched = df_enriched.withColumn(
    "es_fin_de_semana",
    when(col("day_of_week").isin([1, 7]), 1).otherwise(0),
)

# Peak-hour flag: order hour within 13-15 or 20-22 (both ranges inclusive).
df_enriched = df_enriched.withColumn(
    "es_hora_punta",
    when(
        (col("hour_of_day").between(13, 15)) | (col("hour_of_day").between(20, 22)),
        1,
    ).otherwise(0),
)

# Discount features: the flag is set whenever the offer text is present, and
# the discount amount is taken as a flat 10% of the order value.
df_enriched = df_enriched.withColumn(
    "has_discount",
    when(col("discounts_and_offers").isNotNull(), 1).otherwise(0),
)
df_enriched = df_enriched.withColumn(
    "discount_value",
    when(col("has_discount") == 1, col("order_value") * 0.1).otherwise(0.0),
)

# Refund flag: 1 when a positive refund/chargeback amount is reported.
df_enriched = df_enriched.withColumn(
    "refunded",
    when(col("refunds/chargebacks") > 0, 1).otherwise(0),
)

# Apply the loaded pipeline model to the enriched stream.
predicciones = modelo.transform(df_enriched)

# Keep only what the response needs: the order identifier and its prediction.
resultado = predicciones.select("order_id", "prediction")

# Send the results to Kafka: the order id (as a string) is the message key and
# the selected columns, serialized as one JSON object, are the message value.
query = (
    resultado
    .selectExpr("CAST(order_id AS STRING) AS key", "to_json(struct(*)) AS value")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "mydata_prediction_response")
    .option("checkpointLocation", "/tmp/kafka-checkpoints")
    .start()
)

# Block until the query ends. If the wait exits for any reason (including an
# interrupted cell or a failed query), stop the query so it does not keep
# running in the background and holding the checkpoint location when this
# code is run again.
try:
    query.awaitTermination()
finally:
    query.stop()



:: loading settings :: url = jar:file:/usr/local/spark-3.2.0-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-42f5edbd-1eba-4852-8257-c51f4eac8b0d;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central

✅ Script iniciado 2


                                                                                

✅ Recibiendo mensaje de Kafka
✅ Datos crudos de Kafka cargados


ERROR:root:Exception while sending command.                                     
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=68>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 503, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 475, in send_command
    answer

Py4JError: An error occurred while calling o523.awaitTermination