In [1]:
import findspark
findspark.init()
import pyspark

In [2]:
# Initialize the Spark session (getOrCreate reuses an active session if one exists)
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

###  Streaming Estructurado

#### DataFrame Estático

In [3]:
# Read every per-day retail CSV as one static (batch) DataFrame, taking
# column names from the header row and letting Spark infer column types.
staticDataFrame = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("D:/APRENDER/Spark/Spark-The-Definitive-Guide-master/data/retail-data/by-day/*.csv")
)

# Keep the inferred schema so it can be reused when reading the same files
# as a stream, and expose the data to Spark SQL as the "retail_data" view.
staticSchema = staticDataFrame.schema
staticDataFrame.createOrReplaceTempView("retail_data")

In [4]:
"""En este ejemplo veremos las horas de venta durante las cuales 
un cliente determinado (CustomerId) hace una compra grande, con costo
total sabremos qué días el cliente gastó más, la función ventana incluirá
los datos de c/día en la agrgación"""

from pyspark.sql.functions import window, column, desc, col
staticDataFrame.selectExpr("CustomerId", 
                           "(UnitPrice * Quantity) as total_cost",
                          "InvoiceDate")\
.groupBy(col("CustomerId"), window(col("InvoiceDate"),"1 day"))\
.sum("total_cost").show(6)

+----------+--------------------+-----------------+
|CustomerId|              window|  sum(total_cost)|
+----------+--------------------+-----------------+
|   16057.0|[2011-12-04 19:00...|            -37.6|
|   14126.0|[2011-11-28 19:00...|643.6300000000001|
|   13500.0|[2011-11-15 19:00...|497.9700000000001|
|   17160.0|[2011-11-07 19:00...|516.8499999999999|
|   15608.0|[2011-11-10 19:00...|            122.4|
|   15253.0|[2011-11-22 19:00...|            277.6|
+----------+--------------------+-----------------+
only showing top 6 rows



In [5]:
# The default is 200 shuffle partitions, but we set it to only 5
spark.conf.set(
    "spark.sql.shuffle.partitions",
    "5",
)

#### DataFrame Streaming

In [6]:
# Read the same per-day CSV files as a stream, reusing the schema captured
# from the static DataFrame. maxFilesPerTrigger=1 caps each trigger at one
# new file.
streamingDataFrame = (
    spark.readStream
    .schema(staticSchema)
    .option("maxFilesPerTrigger", 1)
    .format("csv")
    .option("header", "true")
    .load("D:/APRENDER/Spark/Spark-The-Definitive-Guide-master/data/retail-data/by-day/*.csv")
)

In [7]:
# Check whether our DataFrame is a streaming DataFrame
streamingDataFrame.isStreaming

True

In [8]:
# Total cost per customer per window, computed over the streaming DataFrame.
# NOTE: despite the "PerHour" in the name, the window used here is "1 day".
purchaseByCustomerPerHour = (
    streamingDataFrame
    .selectExpr(
        "CustomerId",
        "(UnitPrice * Quantity) as total_cost",
        "InvoiceDate",
    )
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))
    .sum("total_cost")
)

In [9]:
# Start the streaming aggregation, writing its results to an in-memory table
# named "customer_purchases". With the "complete" output mode the whole
# result table is written out on every trigger.
#
# The StreamingQuery returned by start() is bound to a name instead of being
# discarded, so the running query can be inspected or stopped later
# (e.g. customerPurchasesQuery.stop()).
customerPurchasesQuery = (
    purchaseByCustomerPerHour.writeStream
    .format("memory")
    .queryName("customer_purchases")
    .outputMode("complete")
    .start()
)

# Echo the handle so the cell still displays the StreamingQuery object
customerPurchasesQuery

<pyspark.sql.streaming.StreamingQuery at 0x25ee75ab6d8>