## 스트리밍 코드

In [1]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.utils import AnalysisException

In [2]:
# Read a single day's CSV as a static (batch) DataFrame: the first row is
# treated as the header and the column types are inferred from the data.
staticDataFrame = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("../../assets/exercises/week03/by-day/2010-12-01.csv")
)

In [3]:
# Keep the schema inferred for the static frame; the streaming reader later passes it to .schema().
staticSchema = staticDataFrame.schema

In [4]:
# Print the inferred column names, types and nullability as a tree.
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [14]:
# Batch version of the aggregation: total spend (UnitPrice * Quantity)
# per customer per one-day window, showing the first five rows.
(
    staticDataFrame
    .selectExpr(
        "CustomerID",
        "(UnitPrice * Quantity) as total_cost",
        "to_date(to_timestamp(InvoiceDate, 'M/d/yyyy H:mm')) as InvoiceDate",
    )
    .groupBy(F.col("CustomerID"), F.window(F.col("InvoiceDate"), "1 day"))
    .sum("total_cost")
    .show(5)
)

+----------+--------------------+------------------+
|CustomerID|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|   17920.0|{2010-11-30 09:00...| 514.4099999999999|
|   17025.0|{2010-11-30 09:00...|            160.97|
|   17951.0|{2010-11-30 09:00...|295.50000000000006|
|   13468.0|{2010-11-30 09:00...|360.05000000000007|
|   17690.0|{2010-11-30 09:00...|376.80000000000007|
+----------+--------------------+------------------+
only showing top 5 rows



In [6]:
# Streaming counterpart of the static read: every CSV matching by-day/*.csv is
# treated as input to the stream. The schema captured from the static frame is
# supplied explicitly instead of being inferred.
streamingDataFrame = (
    spark.readStream.format("csv")
    .schema(staticSchema)
    .option("maxFilesPerTrigger", 1)  # consume at most one new file per trigger
    .option("header", "true")
    .load("../../assets/exercises/week03/by-day/*.csv")
)

In [7]:
# Check that the DataFrame built with readStream is a streaming DataFrame.
streamingDataFrame.isStreaming

True

In [9]:
# Streaming aggregation: total spend (UnitPrice * Quantity) per customer per window.
# NOTE: despite the "PerHour" suffix in the name, the window duration is "1 day".
# This only defines the transformation; no streaming query is started here.
purchaseByCustomerPerHour = (
    streamingDataFrame
    .selectExpr(
        "CustomerID",
        "(UnitPrice * Quantity) as total_cost",
        "to_date(to_timestamp(InvoiceDate, 'M/d/yyyy H:mm')) as InvoiceDate",
    )
    .groupBy(F.col("CustomerID"), F.window(F.col("InvoiceDate"), "1 day"))
    .sum("total_cost")
)

In [10]:
# This is a lazy operation, so a streaming action must be called to execute the data flow.
# Streaming actions behave differently from static (batch) actions.
# The results are stored in an in-memory table that is refreshed after each trigger fires.
# Spark updates the in-memory table only when a value larger than the previous aggregate appears.
# NOTE(review): with outputMode("complete") the whole result table is rewritten on every
# trigger - confirm the wording of the line above.

```python
# Annotated example of starting the streaming query. The chain is wrapped in
# parentheses because a comment cannot follow a line-continuation backslash.
(
    purchaseByCustomerPerHour.writeStream
    .format("memory")                 # store the results in an in-memory table
    .queryName("customer_purchases")  # name of the table stored in memory
    .outputMode("complete")           # complete = keep every aggregated result in the table
    .start()
)
```


In [12]:
# Start the streaming query with the "memory" sink under the query name
# "customer_purchases", using the "complete" output mode.
(
    purchaseByCustomerPerHour.writeStream
    .format("memory")
    .queryName("customer_purchases")
    .outputMode("complete")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x10b30a4f0>

In [18]:
# Inspect how the query results are being recorded in the in-memory table:
# the five rows with the largest summed cost.
spark.sql(
    """
SELECT *
FROM customer_purchases
ORDER BY `sum(total_cost)` DESC
"""
).show(5)

+----------+--------------------+------------------+
|CustomerID|              window|   sum(total_cost)|
+----------+--------------------+------------------+
|      null|{2010-12-20 09:00...|31347.479999999938|
|   18102.0|{2010-12-06 09:00...|          25920.37|
|      null|{2010-12-09 09:00...|25399.560000000012|
|      null|{2010-12-16 09:00...|25371.769999999768|
|      null|{2010-12-05 09:00...|23395.099999999904|
+----------+--------------------+------------------+
only showing top 5 rows



In [19]:
# Start a second streaming query over the same aggregation, this time with the
# "console" sink and the query name "customer_purchases_2".
(
    purchaseByCustomerPerHour.writeStream
    .format("console")
    .queryName("customer_purchases_2")
    .outputMode("complete")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x10b34c2b0>

In [20]:
# The query started with the "console" sink only prints its results to stdout.
# Unlike the "memory" sink, it does not register a table named after queryName,
# so looking up customer_purchases_2 is expected to fail. Catch the error and
# print it so the notebook still runs end to end while showing the failure.
try:
    spark.sql("""
SELECT *
FROM customer_purchases_2
ORDER BY `sum(total_cost)` DESC
""").show(5)
except AnalysisException as err:
    print(f"AnalysisException: {err}")

AnalysisException: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient