In [3]:
# [+] PySpark setup
from pyspark import SparkConf, SparkContext

# Local master; the app name is the label this job is registered under.
conf = SparkConf().setMaster('local').setAppName('shuflling-and-partitioning')
# getOrCreate() returns the already-running context if there is one, so
# re-executing this cell in a live kernel does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# Dataset: name of the CSV file to analyse
filename = 'fhvhv_tripdata_2020-03_short.csv'

In [7]:
# [+] Load the dataset as an RDD of text lines and take its first line as the header
lines = sc.textFile(f'./data/{filename}')
header = lines.first()

In [8]:
# Display the header line (the first line of the file)
header

'hvfhs_license_num,dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,SR_Flag'

In [9]:
# [+] Remove the header with filter(): keep only lines that differ from the header text
filtered_lines = lines.filter(
    lambda row: row != header
)

In [10]:
# Peek at the first remaining line to confirm the header is gone
filtered_lines.first()

'HV0005,B02510,2020-03-01 00:03:40,2020-03-01 00:23:39,81,159,'

In [19]:
# [+] Extract the pickup date with map() and store it as a key-value RDD.
# Column 2 of each CSV row is pickup_datetime; the text before its first space
# is the date. Every row is emitted as (date, 1) so the 1s can be summed per date.
dates = filtered_lines.map(
    lambda row: (row.split(",")[2].partition(" ")[0], 1)
)

In [20]:
# Print only the first element; calling collect() is inefficient when the data is large
dates.first()

('2020-03-01', 1)

In [23]:
"""
  Daily ride counts: reduceByKey()
"""

# [+] Compute the number of rides per day with reduceByKey():
# the values (all 1s) sharing a date key are added together pairwise.
reduced = dates.reduceByKey(lambda left, right: left + right)

In [24]:
# Fetch the (date, ride count) pairs produced by reduceByKey()
reduced.collect()

[('2020-03-04', 707879),
 ('2020-03-01', 784246),
 ('2020-03-03', 697880),
 ('2020-03-02', 648986),
 ('2020-03-06', 872012),
 ('2020-03-07', 418828),
 ('2020-03-05', 731165)]

In [25]:
"""
  Daily ride counts: groupByKey()
"""

# [+] Group the (date, 1) pairs by date with groupByKey();
# each date key ends up paired with an iterable of its values.
groups = dates.groupByKey()

In [27]:
groups.first() # the value is an iterable of 1s: (1, 1, 1, 1, ...)

('2020-03-04', <pyspark.resultiterable.ResultIterable at 0x2760d4dc7f0>)

In [28]:
# [+] Compute the number of rides per day with mapValues():
# the length of each date's group of 1s is that date's ride count.
counts = groups.mapValues(len)

In [29]:
# Fetch the (date, ride count) pairs; these match the reduceByKey() result
counts.collect()

[('2020-03-04', 707879),
 ('2020-03-01', 784246),
 ('2020-03-03', 697880),
 ('2020-03-02', 648986),
 ('2020-03-06', 872012),
 ('2020-03-07', 418828),
 ('2020-03-05', 731165)]

In [30]:
# partitionBy: re-partition a key-value RDD with a custom partition function.
# The function returns the key unchanged, so keys 0, 1 and 2 each land in
# their own partition when three partitions are requested.
x = sc.parallelize([(0, 1), (1, 2), (2, 3)], numSlices=2)
y = x.partitionBy(numPartitions=3, partitionFunc=lambda key: key)
# glom() turns each partition into a list, making the layout visible
print(x.glom().collect())
print(y.glom().collect())

[[(0, 1)], [(1, 2), (2, 3)]]
[[(0, 1)], [(1, 2)], [(2, 3)]]


In [31]:
# repartition: change the number of partitions (here from 2 to 3)

x = sc.parallelize([1, 2, 3, 4, 5], numSlices=2)
y = x.repartition(3)
# Print the per-partition contents before and after
print(x.glom().collect())
print(y.glom().collect())

[[1, 2], [3, 4, 5]]
[[], [1, 2], [3, 4, 5]]


In [32]:
# coalesce: reduce the number of partitions (here from 2 to 1)

x = sc.parallelize([1, 2, 3, 4, 5], numSlices=2)
y = x.coalesce(1)

# Print the per-partition contents before and after
print(x.glom().collect())
print(y.glom().collect())

[[1, 2], [3, 4, 5]]
[[1, 2, 3, 4, 5]]
