In [1]:
import findspark
# Run before importing pyspark: findspark.init() makes the local Spark installation
# importable as a regular library (presumably located via SPARK_HOME — confirm for this machine).
findspark.init()

In [2]:
from pyspark import SparkContext

sc = SparkContext("local", "first app")
# "local" is the master URL: Spark runs inside this machine instead of on a cluster.
# NOTE(review): the original note described this as multi-threaded Standalone mode; per the
# Spark master-URL docs plain "local" uses one worker thread and "local[N]" uses N — confirm.

In [3]:
data = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 1)
# The second argument is the number of partitions.

In [4]:
# reduce() combines the elements pairwise with the given function; with addition
# the result is the total of all elements.
data_reduce = data.reduce(lambda left, right: left + right)
print(data_reduce)

55


In [6]:
# collect() returns every element of the RDD as a Python list.
print(data.collect())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [7]:
# count() returns the number of elements in the RDD.
print(data.count())

10


In [8]:
# first() returns the first element of the RDD.
print(data.first())

1


In [10]:
# take(n) returns the first n elements as a list.
print(data.take(7))

[1, 2, 3, 4, 5, 6, 7]


In [11]:
# takeSample(withReplacement, num): draws 3 random elements without replacement.
# No seed is passed, so the printed values can differ from run to run.
print(data.takeSample(False, 3))

[7, 9, 3]


In [13]:
# takeOrdered(n) returns the n smallest elements in ascending order, while take(n)
# returns the first n elements in RDD order. The two agree here only because the
# source list is already sorted.
print(data.takeOrdered(3))
print(data.take(3))

[1, 2, 3]
[1, 2, 3]


In [14]:
import shutil

# saveAsTextFile() writes a directory of part files and fails if that directory
# already exists, so this cell could only be run once. Remove the previous output
# first to keep the notebook re-runnable.
# NOTE(review): this deletes any local "data" directory and assumes the default
# filesystem is the local one (master is "local") — confirm before pointing at real data.
shutil.rmtree("data", ignore_errors=True)
data.saveAsTextFile("data")

In [15]:
# textFile() reads the saved directory back line by line; every element comes back
# as a string, not as the original int (see the quoted values in the output).
data_reread = sc.textFile("data")
print(data_reread.collect())

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']


In [17]:
# countByKey() counts how many elements exist per key; the values themselves are
# not summed ('a' appears twice, so its count is 2).
data2 = sc.parallelize([('a', 1), ('b', 2), ('c', 3), ('a', 4)])
print(data2.countByKey())

defaultdict(<class 'int'>, {'a': 2, 'b': 1, 'c': 1})


In [18]:
# map() applies the function to every element and returns a new RDD of the results.
print(data.map(lambda value: value + 1).collect())

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


In [19]:
# filter() keeps only the elements for which the predicate is true.
print(data.filter(lambda value: value < 5).collect())

[1, 2, 3, 4]


In [20]:
# map() yields exactly one output per input, so each [n, n*n] stays a nested list.
print(data.map(lambda n: [n, n * n]).collect())
# flatMap() flattens the returned lists into a single sequence of elements.
print(data.flatMap(lambda n: [n, n * n]).collect())

[[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36], [7, 49], [8, 64], [9, 81], [10, 100]]
[1, 1, 2, 4, 3, 9, 4, 16, 5, 25, 6, 36, 7, 49, 8, 64, 9, 81, 10, 100]


In [22]:
# Rebuild the RDD with 3 partitions so that per-partition results become visible.
data = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3)

def f(x):
    """Yield a single value: the sum of all items in the iterable ``x``."""
    total = sum(x)
    yield total

# mapPartitions() calls f once per partition with an iterator over that partition,
# so the output holds one sum per partition.
print(data.mapPartitions(f).collect())

[6, 15, 34]


In [23]:
# Pair RDD of (key, list-of-fruit-names) used to demonstrate mapValues().
x = sc.parallelize([('a', ['apple', 'banana', 'lemon']), ('b', ['grapes'])])

def f(x):
    """Return the number of items in ``x``.

    Note: this rebinds the notebook-level name ``f`` that the mapPartitions
    cell defined earlier.
    """
    size = len(x)
    return size

# mapValues() applies f to the value of each pair and leaves the key untouched.
print(x.mapValues(f).collect())

[('a', 3), ('b', 1)]


In [24]:
# reduceByKey() merges the values that share a key with the given function;
# here the two 'a' values are added together.
data = sc.parallelize([('a', 1), ('b', 2), ('c', 3), ('a', 4)])
print(data.reduceByKey(lambda left, right: left + right).collect())

[('a', 5), ('b', 2), ('c', 3)]


In [26]:
# groupByKey() gathers all values of a key into one iterable; mapValues(list)
# turns that iterable into a plain list so it prints readably.
print(data.groupByKey().mapValues(list).collect())

[('a', [1, 4]), ('b', [2]), ('c', [3])]


In [29]:
words = ['one', 'two', 'two', 'three', 'three', 'three']
wordPairsRDD = sc.parallelize(words).map(lambda w: (w, 1))

# Two ways to count the words; reduceByKey is the cheaper of the two.
print(wordPairsRDD.reduceByKey(lambda left, right: left + right).collect())
print(wordPairsRDD.groupByKey().map(lambda pair: (pair[0], sum(pair[1]))).collect())

[('one', 1), ('two', 2), ('three', 3)]
[('one', 1), ('two', 2), ('three', 3)]


In [30]:
# sortByKey() sorts the pairs by key: ascending by default, descending when
# ascending is False.
print(data.sortByKey().collect())
print(data.sortByKey(ascending=False).collect())

[('a', 1), ('a', 4), ('b', 2), ('c', 3)]
[('c', 3), ('b', 2), ('a', 1), ('a', 4)]


In [31]:
data = sc.parallelize([1, 2, 3, 4, 5], 3)
print(data.glom().collect())
print(data.coalesce(1).glom().collect())
# coalesce(numPartitions, shuffle = False)
# Creates a new RDD whose partition count is reduced to numPartitions.

[[1], [2, 3], [4, 5]]
[[1, 2, 3, 4, 5]]


In [32]:
print(data.glom().collect())
# glom() gathers the elements of each partition into one list.

[[1], [2, 3], [4, 5]]


In [35]:
data = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 4)
print(data.glom().collect())

print(data.repartition(2).glom().collect())
# repartition() redistributes the data into a new set of partitions.
# This triggers a shuffle.

[[1, 2], [3, 4], [5, 6], [7, 8, 9, 10]]
[[1, 2, 5, 6, 7, 8, 9, 10], [3, 4]]


In [36]:
data = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 3)
sample = data.sample(False, 0.5, 777)
print(sample.collect())
# sample(withReplacement, fraction, seed)
# Arguments: whether duplicates may be drawn, the size ratio between the returned
# dataset and the full dataset, and the random seed.

[2, 3, 4, 5, 9]


In [37]:
data = sc.parallelize([1, 2, 3, 4, 5, 4, 5], 1)
print(data.distinct().collect())
# distinct() returns a new RDD with duplicate values removed.

[1, 2, 3, 4, 5]


In [39]:
data1 = sc.parallelize([1, 2, 3, 4, 5, 6], 1)
data2 = sc.parallelize([6, 7, 8, 9, 10], 1)
print(data1.union(data2).collect())
# union() does not remove duplicates (6 appears twice in the result).

[1, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10]


In [40]:
data1 = sc.parallelize([1, 2, 3, 4, 5], 1)
data2 = sc.parallelize([3, 4, 5, 6, 7], 1)
# intersection() keeps only the elements present in both RDDs; the recorded
# output shows that the result is not returned in sorted order.
print(data1.intersection(data2).collect())

[4, 3, 5]


In [41]:
data1 = sc.parallelize([(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')])
data2 = sc.parallelize([(1, 'e'), (2, 'f'), (3, 'g'), (4, 'h')])

# cogroup() pairs each key with one iterable of values per RDD; printed raw,
# those iterables show up as ResultIterable objects.
group1 = data1.cogroup(data2).collect()
print(group1[0])

# Convert both iterables to lists so the grouped values are readable.
group2 = (
    data1.cogroup(data2)
    .mapValues(lambda sides: (list(sides[0]), list(sides[1])))
    .collect()
)
print(group2)

(2, (<pyspark.resultiterable.ResultIterable object at 0x7f363a85a908>, <pyspark.resultiterable.ResultIterable object at 0x7f363a85a588>))
[(2, (['b'], ['f'])), (4, (['d'], ['h'])), (1, (['a'], ['e'])), (3, (['c'], ['g']))]


In [45]:
pairs = sc.parallelize([1, 2, 3, 4, 2, 4, 1]).map(lambda n: (n, n))
print(pairs.collect())
# partitionBy(2) distributes the pairs by key over 2 partitions; glom() exposes
# each partition as a list.
sets = pairs.partitionBy(2).glom().collect()
print(sets)
# Size of the overlap between the two partitions (0 means no pair is in both).
print(len(set(sets[0]) & set(sets[1])))

[(1, 1), (2, 2), (3, 3), (4, 4), (2, 2), (4, 4), (1, 1)]
[[(2, 2), (4, 4), (2, 2), (4, 4)], [(1, 1), (3, 3), (1, 1)]]
0


In [47]:
x = sc.parallelize([('a', 1), ('b', 4)])
y = sc.parallelize([('a', 2), ('a', 3)])
# join() pairs up values that share a key: 'b' has no match in y and is absent
# from the result. sorted() gives the printed list a stable order.
print(sorted(x.join(y).collect()))

[('a', (1, 2)), ('a', (1, 3))]
