In [1]:
import pyspark
import pyspark.sql.functions as F

### 동작 테스트

In [2]:
# Build a one-column DataFrame named "number" holding the values 0 through 999.
myRange = (
    spark
    .range(1000)
    .toDF("number")
)

In [3]:
# Echo the DataFrame object itself; its repr lists column names and types, not rows.
myRange

DataFrame[number: bigint]

In [4]:
# Recreate the same 0..999 "number" DataFrame (identical to the earlier definition),
# so the cells below also work when execution starts from here.
myRange = (
    spark.range(1000)
    .toDF("number")
)

In [5]:
# Keep only the even values; filter() is the method that where() aliases.
divisBy2 = myRange.filter("number % 2 = 0")

In [6]:
# Count the rows that passed the even-number filter (half of the 1000 generated values).
divisBy2.count()

                                                                                

500

### Example

In [10]:
# Read the 2015 flight summary CSV, treating the first line as the header and
# letting Spark infer each column's type from the data.
flightData2015 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("./2015-summary.csv")
)

In [11]:
# Fetch the first three rows as a list of Row objects.
flightData2015.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [12]:
# Note: sort() does not modify flightData2015; it returns a new DataFrame
# (DataFrames are immutable). explain() prints the physical plan of that result.
(
    flightData2015
    .sort("count")
    .explain()
)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#49 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(count#49 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=89]
      +- FileScan csv [DEST_COUNTRY_NAME#47,ORIGIN_COUNTRY_NAME#48,count#49] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/anchangbae/spark/spark-the-definitive-guide/assets/exercis..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




In [13]:
# The default is 200 shuffle partitions; set it to 5 for the queries that follow.
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [14]:
# Sort ascending by "count" and fetch the first two rows.
(
    flightData2015
    .sort("count")
    .take(2)
)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

### SQL
- Q. 스칼라 스타일이라서 자꾸 카멜케이스?

In [15]:
# Register the DataFrame as a temporary view named "flight_data_2015" (replacing any
# existing view of that name) so it can be queried through spark.sql.
flightData2015.createOrReplaceTempView("flight_data_2015")

In [16]:
# Express the same per-destination row count twice: first as SQL against the temp view ...
sqlWay = spark.sql(
    """
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
"""
)

# ... and then with the DataFrame API.
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

# Print both physical plans so they can be compared side by side.
sqlWay.explain()
dataFrameWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#47], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#47, 5), ENSURE_REQUIREMENTS, [plan_id=111]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#47], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#47] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/anchangbae/spark/spark-the-definitive-guide/assets/exercis..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#47], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#47, 5), ENSURE_REQUIREMENTS, [plan_id=124]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#47], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#47] Batched: false, DataFilters: [], Format: CSV, Location: InMe

In [19]:
# SQL route: largest value in the "count" column of the flight_data_2015 view.
(
    spark
    .sql("SELECT max(count) from flight_data_2015")
    .take(1)
)

[Row(max(count)=370002)]

In [18]:
# Importing max directly (commented out below) would shadow Python's built-in max,
# so the functions module is imported under the alias F instead.
# from pyspark.sql.functions import max
import pyspark.sql.functions as F

# DataFrame-API route to the maximum of the "count" column.
(
    flightData2015
    .select(F.max("count"))
    .take(1)
)

[Row(max(count)=370002)]

In [21]:
# desc is reached through the F alias rather than a direct import:
# from pyspark.sql.functions import desc
# Show the five destination countries with the largest summed "count".
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(F.desc("destination_total"))
    .limit(5)
    .show()
)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [22]:
# in Python
# Top-five destination aggregation (group, sum "count", rename, sort descending,
# limit 5), printing the physical plan instead of the rows.
# desc is called through the F alias: no visible cell imports a bare `desc`, so the
# unqualified name would raise NameError after Restart Kernel -> Run All.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(F.desc("destination_total"))
    .limit(5)
    .explain()
)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[destination_total#148L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#47,destination_total#148L])
   +- HashAggregate(keys=[DEST_COUNTRY_NAME#47], functions=[sum(count#49)])
      +- Exchange hashpartitioning(DEST_COUNTRY_NAME#47, 5), ENSURE_REQUIREMENTS, [plan_id=324]
         +- HashAggregate(keys=[DEST_COUNTRY_NAME#47], functions=[partial_sum(count#49)])
            +- FileScan csv [DEST_COUNTRY_NAME#47,count#49] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/anchangbae/spark/spark-the-definitive-guide/assets/exercis..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>


