In [5]:
import findspark
findspark.init()

import pyspark
from pyspark import SparkContext
from datetime import datetime

## Initialize Spark Context

In [2]:
# Reuse the SparkContext that is already running when this cell is executed
# again. Constructing SparkContext(...) directly a second time in the same
# kernel raises "ValueError: Cannot run multiple SparkContexts at once".
# Master ("local") and application name ("tutorial") are unchanged.
sc = SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster("local").setAppName("tutorial")
)

## Word Count

In [7]:
words = sc.parallelize (
   ["scala", 
   "java", 
   "hadoop", 
   "spark", 
   "akka",
   "spark vs hadoop", 
   "pyspark",
   "pyspark and spark"]
)
counts = words.count()

%time print(f'counts: {counts}')

counts: 8
CPU times: user 69 µs, sys: 57 µs, total: 126 µs
Wall time: 132 µs


## Collect

In [8]:
# collect() returns every element of the RDD to the driver as a Python list.
words.collect()

['scala',
 'java',
 'hadoop',
 'spark',
 'akka',
 'spark vs hadoop',
 'pyspark',
 'pyspark and spark']

## ForEach

In [23]:
def f(x):
    """Write a single element, followed by a newline, to standard output.

    Args:
        x: The value to print; it is rendered the same way print() renders it.
    """
    print(x, end="\n")
    
# Apply f to every element of the RDD for its side effect only.
# NOTE(review): f presumably runs inside the Spark worker process, so the
# printed lines may go to the worker/console stdout rather than to this
# cell's output -- confirm in your environment.
words.foreach(f)

## Filter

In [26]:
# Keep only the elements that contain the substring 'py', then bring the
# matching elements back to the driver as a list.
(
    words
    .filter(lambda word: 'py' in word)
    .collect()
)

['pyspark', 'pyspark and spark']

## Map

In [31]:
# Turn every element into a 3-tuple: the element itself followed by the
# constants 1 and 3.
(
    words
    .map(lambda word: (word, 1, 3))
    .collect()
)

[('scala', 1, 3),
 ('java', 1, 3),
 ('hadoop', 1, 3),
 ('spark', 1, 3),
 ('akka', 1, 3),
 ('spark vs hadoop', 1, 3),
 ('pyspark', 1, 3),
 ('pyspark and spark', 1, 3)]

## Reduce 

In [33]:
# operator.add is the two-argument "+" function used to combine elements.
from operator import add

# reduce() folds all elements of the RDD into one value with the given
# function; here it sums the six numbers.
nums = sc.parallelize([1, 2, 3, 4, 5, 10])
nums.reduce(add)

25

## Join

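1. **join**: 두 RDD에 모두 존재하는 key의 elements만 join되고, 나머지는 제외된다
2. **fullOuterJoin**: 양쪽 RDD의 모든 key를 join하며, 한쪽에만 있는 key는 반대쪽 값이 `None`으로 채워진다
3. **leftOuterJoin**: 왼쪽 RDD의 모든 key를 유지하며, 오른쪽 RDD에 없는 key는 값이 `None`으로 채워진다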

In [37]:
# Two pair RDDs of (key, value) tuples. The keys 'spark' and 'ml' appear in
# both; 'power' exists only in x and 'happy' only in y.
x = sc.parallelize([
    ('spark', 1),
    ('ml', 10),
    ('power', 2),
])
y = sc.parallelize([
    ('spark', 2),
    ('ml', 5),
    ('happy', 3),
])

# Inner join on the key: each result is (key, (value_from_x, value_from_y)).
joined = x.join(y)
joined.collect()

[('ml', (10, 5)), ('spark', (1, 2))]

In [39]:
# Full outer join: keys from both RDDs are kept; the side that lacks a key
# is filled with None.
x.fullOuterJoin(y).collect()

[('ml', (10, 5)),
 ('power', (2, None)),
 ('spark', (1, 2)),
 ('happy', (None, 3))]

In [40]:
# Left outer join: every key of x is kept; keys missing from y get None as
# the right-hand value, and keys that exist only in y are dropped.
x.leftOuterJoin(y).collect()

[('ml', (10, 5)), ('power', (2, None)), ('spark', (1, 2))]

## Cache

`cache()`는 storage level이 "MEMORY_ONLY"인 `persist()`와 같으며, RDD를 메모리에 persist시킨다

In [42]:
# cache() marks the RDD for in-memory persistence (it is persist() with the
# MEMORY_ONLY storage level) and returns the same RDD, so the flag can be
# read straight from its result; a second persist() call is redundant.
words.cache().is_cached

True