In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

from datetime import datetime
import time
import pytz


# Create (or reuse) the Spark session for this job.
spark = (
    SparkSession.builder
    .appName("data_read_write")
    .getOrCreate()
)

# Schema applied to the incoming JSON records. It is built from
# (column name, Spark type) pairs; every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("latitude", DoubleType()),
        ("longitude", DoubleType()),
        ("date", StringType()),
        ("customer_id", IntegerType()),
        ("employee_id", IntegerType()),
        ("quantity_products", IntegerType()),
        ("order_id", StringType()),
    )
])

# Handle to the streaming query. It stays None until `.start()` succeeds so
# that the cleanup in `finally` can tell whether there is anything to stop;
# without this, a failure before `.start()` would raise NameError in
# `finally` and hide the original error.
query = None

try:
    # Read the raw stream from the socket source.
    raw_data = (
        spark.readStream
        .format("socket")
        .option("host", "host.docker.internal")
        .option("port", 5555)
        .load()
    )

    # Parse the `value` column as JSON using `schema`, then flatten the
    # resulting struct into top-level columns.
    json_df = raw_data.select(
        from_json(col("value").cast("string"), schema).alias("data")
    ).select("data.*")

    # Processing-time trigger interval for the streaming write below.
    interval = "30 seconds"

    # Collapse to a single partition before writing.
    # NOTE(review): presumably to limit the number of parquet files written
    # per micro-batch -- confirm.
    json_df = json_df.repartition(1)

    # Timestamp in the America/Bogota timezone, formatted ddmmYYYY_HHMMSS.
    # NOTE(review): not used anywhere else in this cell; possibly intended
    # to be part of the output path -- confirm before removing.
    date_now = datetime.now(pytz.timezone('America/Bogota')).strftime("%d%m%Y_%H%M%S")

    # Append the parsed records as parquet files, triggering every `interval`.
    query = (
        json_df.writeStream
        .format("parquet")
        .outputMode("append")
        .option("path", "/datalake/raw/stagging")
        .option("checkpointLocation", "/checkpoints")
        .trigger(processingTime=interval)
        .start()
    )

    # Block until the query terminates; a failure of the query is raised here.
    query.awaitTermination()

except Exception as e:
    # Report the failure instead of propagating it out of the cell.
    print(f"Error: {e}")

finally:
    # Stop the query only if it was actually started.
    if query is not None:
        query.stop()

    

Error: Connection refused (Connection refused)
=== Streaming Query ===
Identifier: [id = 66e5dd64-bea9-454c-8dbd-59991acd65dc, runId = 0e29a7f9-f253-4df3-be4d-037173b9d7ec]
Current Committed Offsets: {}
Current Available Offsets: {TextSocketV2[host: host.docker.internal, port: 5555]: -1}

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
Repartition 1, true
+- Project [data#27.latitude AS latitude#29, data#27.longitude AS longitude#30, data#27.date AS date#31, data#27.customer_id AS customer_id#32, data#27.employee_id AS employee_id#33, data#27.quantity_products AS quantity_products#34, data#27.order_id AS order_id#35]
   +- Project [from_json(StructField(latitude,DoubleType,true), StructField(longitude,DoubleType,true), StructField(date,StringType,true), StructField(customer_id,IntegerType,true), StructField(employee_id,IntegerType,true), StructField(quantity_products,IntegerType,true), StructField(order_id,StringType,true), cast(value#25 as string), Some(Etc/UTC)) AS data#2

24/06/15 06:40:26 ERROR streaming.MicroBatchExecution: Query [id = 66e5dd64-bea9-454c-8dbd-59991acd65dc, runId = 0e29a7f9-f253-4df3-be4d-037173b9d7ec] terminated with error
java.net.ConnectException: Connection refused (Connection refused)
	at java.net.PlainSocketImpl.socketConnect(Native Method)
	at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350)
	at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206)
	at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188)
	at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392)
	at java.net.Socket.connect(Socket.java:607)
	at java.net.Socket.connect(Socket.java:556)
	at java.net.Socket.<init>(Socket.java:452)
	at java.net.Socket.<init>(Socket.java:229)
	at org.apache.spark.sql.execution.streaming.sources.TextSocketMicroBatchStream.initialize(TextSocketMicroBatchStream.scala:71)
	at org.apache.spark.sql.execution.streaming.sources.TextSocketMicroBatchStream.planI