In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Create the Spark session (or reuse one that already exists)
spark = (
    SparkSession.builder
    .appName("data_read_write")
    .getOrCreate()
)

# Schema used to parse each incoming JSON record; every field is nullable
schema = (
    StructType()
    .add("latitude", DoubleType(), True)
    .add("longitude", DoubleType(), True)
    .add("date", StringType(), True)
    .add("customer_id", IntegerType(), True)
    .add("employee_id", IntegerType(), True)
    .add("quantity_products", IntegerType(), True)
    .add("order_id", StringType(), True)
)

def receive_data(interval):
    """Stream JSON records from a TCP socket into Parquet files.

    Reads lines from the socket source at ``host.docker.internal:4223``,
    parses each line as JSON using the module-level ``schema``, and appends
    the resulting rows as Parquet under ``/datalake/raw/stagging`` with
    checkpoints stored in ``/checkpoints``. The call blocks until the
    streaming query terminates.

    Args:
        interval: Processing-time trigger interval passed to
            ``trigger(processingTime=...)``, e.g. ``"30 seconds"``.

    Returns:
        None. Any ``Exception`` raised while setting up or running the
        stream is printed as ``Error: ...`` and not re-raised.
    """
    # Bound before the try block so the finally clause can tell whether the
    # streaming query was ever started; otherwise a failure during setup
    # would raise UnboundLocalError there and hide the original error.
    query = None
    try:
        # Read the raw data from the socket
        raw_data = (
            spark.readStream
            .format("socket")
            .option("host", "host.docker.internal")
            .option("port", 4223)
            .load()
        )

        # Parse the "value" column as JSON and flatten it into schema columns
        json_df = raw_data.select(
            from_json(col("value").cast("string"), schema).alias("data")
        ).select("data.*")

        # Collapse to a single partition before writing
        # (presumably to limit the number of files per micro-batch -- confirm)
        json_df = json_df.repartition(1)

        query = (
            json_df.writeStream
            .format("parquet")
            .outputMode("append")
            .option("path", "/datalake/raw/stagging")
            .option("checkpointLocation", "/checkpoints")
            .trigger(processingTime=interval)
            .start()
        )
        query.awaitTermination()

    except Exception as e:
        print(f"Error: {e}")

    finally:
        # Only stop a query that was actually created
        if query is not None:
            query.stop()

In [None]:
# Start the stream with a 30-second trigger
# (a "30 minutes" interval was the previously used alternative)
receive_data(interval="30 seconds")