# Notebook cell In [6]
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

from datetime import datetime
import time
import pytz


# Create (or reuse) the Spark session for this job.
spark = (
    SparkSession.builder
    .appName("data_read_write")
    .getOrCreate()
)

# Schema of the incoming JSON records; every field is nullable.
schema = (
    StructType()
    .add("latitude", DoubleType(), True)
    .add("longitude", DoubleType(), True)
    .add("date", StringType(), True)
    .add("customer_id", IntegerType(), True)
    .add("employee_id", IntegerType(), True)
    .add("quantity_products", IntegerType(), True)
    .add("order_id", StringType(), True)
)

# Handle of the running streaming query. It stays None if start-up fails, so
# the cleanup in `finally` never touches an unbound name (which would raise a
# NameError and hide the original error).
query = None

try:
    # Read raw text lines from the socket source.
    raw_data = (
        spark.readStream.format("socket")
        .option("host", "host.docker.internal")
        .option("port", 4221)
        .load()
    )

    # Parse each line as JSON with the schema and flatten the resulting
    # struct into top-level columns.
    json_df = raw_data.select(
        from_json(col("value").cast("string"), schema).alias("data")
    ).select("data.*")

    # Micro-batch trigger interval: a batch is processed every 30 seconds.
    interval = "30 seconds"

    # Collapse to a single partition before writing.
    # NOTE(review): presumably to get one output file per micro-batch - confirm.
    json_df = json_df.repartition(1)

    # Timestamp (America/Bogota) taken once at start-up; it names the output
    # directory for the whole lifetime of this query.
    date_now = datetime.now(pytz.timezone('America/Bogota')).strftime("%d%m%Y_%H%M%S")

    # The sink path and the checkpoint location are fixed when start() is
    # called. Changing Spark conf values afterwards does not redirect a query
    # that is already running, so no post-start reconfiguration is attempted.
    # NOTE(review): the checkpoint directory is shared across runs while the
    # output path is per run - confirm this is intended.
    query = (
        json_df.writeStream.format("parquet")
        .outputMode("append")
        .option("path", f"/datalake/raw/stagging/{date_now}")
        .option("checkpointLocation", "/checkpoints")
        .trigger(processingTime=interval)
        .start()
    )

    # Block until the query is stopped or fails.
    query.awaitTermination()

except Exception as e:
    print(f"Error: {e}")

finally:
    # Stop the query only if it was actually started.
    if query is not None:
        query.stop()

    

# Notebook output: KeyboardInterrupt