In [263]:
from pyspark import SparkConf, SparkContext
import os
import shutil

In [264]:
number_cores = 2
memory_gb = 4

# Describe the application: its name, a local master that uses
# `number_cores` worker threads, and the memory given to the driver.
conf = (
    SparkConf()
        .setAppName("SparkExample")
        .setMaster('local[{}]'.format(number_cores))
        .set('spark.driver.memory', '{}g'.format(memory_gb))
)

# Reuse the active SparkContext when this cell is executed again: calling the
# SparkContext constructor a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once". An already-running context
# keeps its original settings, so call sc.stop() (or restart the kernel)
# before re-running this cell if the configuration above was changed.
sc = SparkContext.getOrCreate(conf=conf)

# Reduce

In [265]:
# Distribute 0..9 over 3 partitions so the per-partition layout can be inspected.
numbers = sc.parallelize(range(10), numSlices=3)

In [266]:
# glom() turns each partition into one list, so the result shows how the
# elements were split across the 3 slices.
numbers.glom().collect()

[[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]

In [267]:
# Reduce with the built-in max: combines elements pairwise and keeps the largest.
numbers.reduce(max)

9

In [268]:
# Reduce with addition: the result is the sum of all elements (0 + 1 + ... + 9).
numbers.reduce(lambda x,y: x + y)

45

# Collect

In [269]:
# Small RDD of 0..9; numSlices is omitted, so the default partitioning is used.
numbers = sc.parallelize(range(10))

In [270]:
# collect() returns every element of the RDD as a single Python list,
# so it is only appropriate when the whole RDD is small.
numbers.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [271]:
# A larger RDD (100,000 elements) used to show sampling instead of a full collect().
huge_rdd = sc.parallelize(range(100000))

In [272]:
# Collect only a small random sample. `fraction` is not an exact size:
# 0.00001 * 100000 is 1, yet this run returned 2 elements. The fixed seed
# keeps the sample repeatable.
huge_rdd.sample(withReplacement=False, fraction=0.00001, seed=1).collect()

[20963, 45815]

# Count

In [273]:
# One million integers spread over 10 partitions, used by the counting examples.
numbers = sc.parallelize(range(1000000), numSlices=10)

In [274]:
# Exact number of elements in the RDD.
numbers.count()

1000000

In [275]:
# Approximate count bounded by a timeout (in milliseconds per the PySpark API
# docs) and a confidence level. In this run the result equals the exact count.
numbers.countApprox(timeout=200, confidence=0.5)

1000000

In [276]:
# Approximate number of distinct elements. The true value is 1,000,000; the
# estimate below is about 3.7% too high, within the requested relativeSD of 4%.
numbers.countApproxDistinct(relativeSD=0.04)

                                                                                

1036503

# First

In [277]:
# RDD of 0..99 for the first() example.
numbers = sc.parallelize(range(100))

In [278]:
# first() returns a single element (not a list): the first one in the RDD.
numbers.first()

0

In [279]:
# first() follows the RDD's order, so sort by value first to get the smallest element.
sc.parallelize([3,2,1]).sortBy(lambda x:x).first()

1

# Take

In [280]:
# RDD of 0..9 for the take() examples.
numbers = sc.parallelize(range(10))

In [281]:
# take(1) returns a one-element list; compare with first() in the next cell.
numbers.take(1)

[0]

In [282]:
# first() returns the bare element rather than a list.
numbers.first()

0

In [283]:
# The first five elements, in RDD order.
numbers.take(5)

[0, 1, 2, 3, 4]

In [284]:
# Draw exactly `num` random elements; fixing the seed makes the draw repeatable.
numbers.takeSample(withReplacement=False, num=1, seed=1)

[6]

In [285]:
# Same call with a different seed: a different element is drawn.
numbers.takeSample(withReplacement=False, num=1, seed=2)

[5]

In [286]:
# Without replacement the sample cannot exceed the RDD size: asking for
# 100000 elements from a 10-element RDD yields only 10.
len(numbers.takeSample(withReplacement=False, num=100000, seed=1))

10

In [287]:
# With replacement elements may repeat, so all 100000 requested samples are returned.
len(numbers.takeSample(withReplacement=True, num=100000, seed=1))

100000

In [288]:
# Unsorted 9-element RDD (with a duplicate 1) to contrast take() and takeOrdered().
numbers = sc.parallelize([1,2,3,6,7,8,4,5,1])

In [289]:
# Asking for more elements than exist returns all 9, in their original order.
numbers.take(10)

[1, 2, 3, 6, 7, 8, 4, 5, 1]

In [290]:
# takeOrdered() returns the elements in ascending order (again capped at the 9 available).
numbers.takeOrdered(10)

[1, 1, 2, 3, 4, 5, 6, 7, 8]

In [291]:
# Ordering by the negated value gives the elements in descending order.
numbers.takeOrdered(10, key=lambda x: -x)

[8, 7, 6, 5, 4, 3, 2, 1, 1]

# Save as text file

In [292]:
# 1000 integers in 5 partitions; this RDD is written to disk by the next two cells.
numbers = sc.parallelize(range(1000), numSlices=5)

In [293]:
# Write the RDD as plain text under `outpath`. Any directory left over from
# a previous run is deleted first so the save starts from a clean path.
outpath = 'output/test_numbers'

# isdir() is already False for a path that does not exist.
if os.path.isdir(outpath):
    shutil.rmtree(outpath)

numbers.saveAsTextFile(outpath)

In [294]:
# Same as the plain-text save, but each output file is compressed with the
# Hadoop Gzip codec. A directory left over from a previous run is removed first.
outpath = 'output/test_numbers_gz'

# isdir() is already False for a path that does not exist.
if os.path.isdir(outpath):
    shutil.rmtree(outpath)

numbers.saveAsTextFile(
    outpath,
    compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
)

# Count by key

In [295]:
# Key/value RDD: key "a" appears once, key "b" twice.
pairs = sc.parallelize([("a", 1), ("b", 2), ("b", 3)])

In [296]:
# Counts how many elements carry each key; the values (1, 2, 3) are not
# summed, which is why "b" maps to 2 rather than 5.
pairs.countByKey()

defaultdict(int, {'a': 1, 'b': 2})

# ForEach

In [297]:
def add_to_queue(x, queue=[]):
    """Add ``x`` to ``queue`` and return the queue.

    NOTE: the mutable default looks intentional for this demonstration. The
    default list is created once, when the function is defined, so calls
    that omit ``queue`` keep accumulating into the same list within this
    Python process (see the outputs of the following cells).
    """
    queue += [x]  # in-place extension: the shared default list itself grows
    return queue

In [298]:
# First local call: the shared default list now holds [3].
add_to_queue(3)

[3]

In [299]:
# Second local call: the same default list is reused, so 3 is still there.
add_to_queue(4)

[3, 4]

In [300]:
# RDD of 0..99 whose elements are fed to add_to_queue via foreach().
numbers = sc.parallelize(range(100))

In [301]:
# Apply add_to_queue to every element purely for its side effect; foreach()
# returns nothing. The next cell shows the local default list was NOT
# extended by this call, presumably because the function runs in Spark
# worker processes, each with its own copy of the list.
numbers.foreach(add_to_queue)

In [302]:
# Back in the notebook process the list only contains the locally added
# values; none of the 100 elements processed by foreach() appear in it.
add_to_queue(5)

[3, 4, 5]

# Stop the Spark Context

In [303]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()