## 정적 데이터프레임 버전

In [89]:
import pyspark
import pyspark.sql.functions as F

# Working with a static DataFrame

In [118]:
# When running this code in local mode, set a shuffle partition count suited to local mode.
# The default is 200, but local mode does not need many executors, so it is set to 5.
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [119]:
# Load the online-retail CSV into a static DataFrame, taking column names from
# the header row and letting Spark infer the column types.
staticDataFrame = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load("../../assets/exercises/week03/all/online-retail-dataset.csv")
)

In [120]:
# Peek at the first three rows; the result is a list of Row objects.
staticDataFrame.take(3)

[Row(InvoiceNo='536365', StockCode='85123A', Description='WHITE HANGING HEART T-LIGHT HOLDER', Quantity=6, InvoiceDate='12/1/2010 8:26', UnitPrice=2.55, CustomerID=17850, Country='United Kingdom'),
 Row(InvoiceNo='536365', StockCode='71053', Description='WHITE METAL LANTERN', Quantity=6, InvoiceDate='12/1/2010 8:26', UnitPrice=3.39, CustomerID=17850, Country='United Kingdom'),
 Row(InvoiceNo='536365', StockCode='84406B', Description='CREAM CUPID HEARTS COAT HANGER', Quantity=8, InvoiceDate='12/1/2010 8:26', UnitPrice=2.75, CustomerID=17850, Country='United Kingdom')]

In [121]:
# Print the column names and the types produced by schema inference.
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [122]:
# Register the DataFrame as the temp view "retail_data" so it can be queried through spark.sql.
staticDataFrame.createOrReplaceTempView("retail_data")

In [123]:
# Capture the inferred schema object.
# NOTE(review): not used in the visible cells; presumably reused later
# (e.g. for a streaming read of the same data) — confirm.
staticSchema = staticDataFrame.schema

- 시계열 데이터를 그룹화하고 집계할 수 있다.
- 특정 고객이 대량으로 구매하는 영업시간을 확인할 수 있다.

In [124]:
# Total spend per customer: price each line item (UnitPrice * Quantity),
# then sum those line totals grouped by CustomerID and show five rows.
(
    staticDataFrame
    .select(
        "CustomerID",
        (F.col("UnitPrice") * F.col("Quantity")).alias("total_cost"),
        "InvoiceDate",
    )
    .groupBy("CustomerID")
    .sum("total_cost")
    .show(5)
)

+----------+------------------+
|CustomerID|   sum(total_cost)|
+----------+------------------+
|     17850| 5288.630000000009|
|     13047|3079.1000000000004|
|     15311|59419.339999999975|
|     18074|             489.6|
|     17420| 598.8299999999999|
+----------+------------------+
only showing top 5 rows



In [125]:
# Prepare a time column for F.window. InvoiceDate is a string column, so it is
# parsed with the "M/d/yyyy H:mm" pattern and then truncated to a calendar date.
# Reference: https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.window.html
# NOTE(review): the reference says the time column must be TimestampType, but
# this produces a DateType column; F.window still accepts it in the later cell,
# presumably through an implicit cast — confirm.
staticDataFrame = staticDataFrame.withColumn(
    "date",
    F.to_date(F.to_timestamp("InvoiceDate", "M/d/yyyy H:mm")),
)

In [126]:
# Print the schema again to confirm the new `date` column and its type.
staticDataFrame.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- date: date (nullable = true)



In [127]:
# Total spend per customer per one-day window over the `date` column.
# NOTE(review): some totals are negative (e.g. -27.5 for customer 14527);
# presumably these come from returns/cancellations recorded with a negative
# Quantity — confirm against the data.
# NOTE(review): window starts are displayed at 09:00; presumably the session
# time zone is UTC+9 and one-day windows are aligned to UTC midnight — confirm.
(
    staticDataFrame
    .select(
        "CustomerID",
        (F.col("UnitPrice") * F.col("Quantity")).alias("total_cost"),
        "date",
    )
    .groupBy("CustomerID", F.window("date", "1 day"))
    .sum("total_cost")
    .show(5)
)

+----------+--------------------+-----------------+
|CustomerID|              window|  sum(total_cost)|
+----------+--------------------+-----------------+
|     13047|{2010-11-30 09:00...|366.6300000000001|
|     13748|{2010-11-30 09:00...|            204.0|
|     15100|{2010-11-30 09:00...|            350.4|
|     14527|{2010-11-30 09:00...|            -27.5|
|     12431|{2010-11-30 09:00...|           358.25|
+----------+--------------------+-----------------+
only showing top 5 rows



In [128]:
# Per-customer, per-day totals written in SQL against the "retail_data" view.
# InvoiceDate is parsed with the same 'M/d/yyyy H:mm' pattern and reduced to a date.
# NOTE(review): `date` in GROUP BY presumably resolves to the SELECT alias
# (Spark's group-by-alias behavior) — confirm.
spark.sql("""
SELECT CustomerID, to_date(to_timestamp(InvoiceDate, 'M/d/yyyy H:mm')) as date, sum(UnitPrice * Quantity) as total_cost
FROM retail_data
GROUP BY CustomerID, date""").show(5)

+----------+----------+-----------------+
|CustomerID|      date|       total_cost|
+----------+----------+-----------------+
|     13047|2010-12-01|366.6300000000001|
|     15100|2010-12-01|            350.4|
|     15291|2010-12-01|            328.8|
|     17809|2010-12-01|             34.8|
|     17924|2010-12-01|            279.0|
+----------+----------+-----------------+
only showing top 5 rows

