# Spark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

from datetime import datetime
import time
import pytz

## Sesión de Spark

In [2]:
# Obtain the Spark session for this notebook (an already-active one is reused).
spark = (
    SparkSession.builder
    .appName("data_read_write")
    .getOrCreate()
)

# Schema of the incoming records; every column is declared nullable.
schema = (
    StructType()
    .add("latitude", DoubleType(), True)
    .add("longitude", DoubleType(), True)
    .add("date", StringType(), True)
    .add("customer_id", IntegerType(), True)
    .add("employee_id", StringType(), True)
    .add("quantity_products", IntegerType(), True)
    .add("order_id", StringType(), True)
    .add("commune", StringType(), True)
    .add("neighborhood", StringType(), True)
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [None]:
# Stream JSON records from a local socket into parquet files and block until
# the query terminates or the user interrupts the cell.
#
# `query` is bound before the `try` so the handlers below can distinguish
# "never started" from "running". Without this, any failure before `.start()`
# makes the cleanup code raise NameError and hides the original error (or, on
# a reused kernel, stops a stale query left over from a previous run).
query = None

try:

    # Read the data from the socket
    raw_data = (
        spark.readStream
        .format("socket")
        .option("host", "localhost")
        .option("port", 2222)
        .load()
    )

    # Parse the `value` column as JSON using the schema, then flatten the
    # resulting struct into top-level columns
    json_df = raw_data.select(from_json(col("value").cast("string"), schema).alias("data")).select("data.*")

    # Processing-time trigger interval: data is accumulated for 30 seconds
    # between writes
    interval = "30 seconds"

    # Collapse to a single partition before writing
    # (presumably to limit the number of output files per batch - TODO confirm)
    json_df = json_df.repartition(1)

    # NOTE(review): `date_now` is not used anywhere in this cell; it is kept
    # in case a later cell reads it - confirm and remove if not.
    date_now = datetime.now(pytz.timezone('America/Bogota')).strftime("%d%m%Y_%H%M%S")

    query = (
        json_df.writeStream
        .format("parquet")
        .outputMode("append")
        .option("path", "/datalake/raw/stagging")
        .option("checkpointLocation", "/checkpoints")
        .trigger(processingTime=interval)
        .start()
    )

    print("Started the streaming query.")
    query.awaitTermination()

except KeyboardInterrupt:
    print("Disconnecting from the socket ...")
    if query is not None:
        query.stop()
    print("Disconnected from the socket.")

except Exception as e:
    # Best-effort reporting: show the error in the cell output instead of a
    # traceback
    print(f"Error: {e}")

finally:
    # Only stop a query that was actually started in this run
    if query is not None:
        query.stop()

Started the streaming query.


                                                                                