In [177]:
from pyspark import SparkConf, SparkContext, StorageLevel
import sys
import os
import shutil

In [178]:
# Local-mode resources: thread count used in the master URL and the
# driver memory size in gigabytes.
number_cores = 2
memory_gb = 4

# Build the application configuration: application name, a local master
# using `number_cores` threads, and the driver memory limit.
conf = (
    SparkConf()
        .setAppName("SparkExample")
        .setMaster('local[{}]'.format(number_cores))
        .set('spark.driver.memory', '{}g'.format(memory_gb))
)

# Reuse the active SparkContext when this cell is re-run: constructing
# SparkContext(conf=conf) a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once". When a context already
# exists, getOrCreate returns it unchanged and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)

# RDDs

In [179]:
# Turn a small in-memory Python list into an RDD.
rdd = sc.parallelize([1,2,3,4,5])

In [180]:
# Display the RDD object itself: the output is a description of the RDD,
# not its elements.
rdd

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274

In [181]:
# Derive a new RDD in which every element of `rdd` is multiplied by two.
doubled = rdd.map(lambda x: x * 2)

In [182]:
# Display the derived RDD object (again a description, not the values).
doubled

PythonRDD[1] at RDD at PythonRDD.scala:53

In [183]:
# Show the lineage of `doubled`: the PythonRDD and the
# ParallelCollectionRDD it was derived from. The value is a bytes object,
# so newlines appear as "\n" escapes in the output.
doubled.toDebugString()

b'(2) PythonRDD[1] at RDD at PythonRDD.scala:53 []\n |  ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:274 []'

# Text file

In [184]:
# Load the sample text file as an RDD of lines (path is relative to the
# notebook's working directory).
text = sc.textFile("input/sample.txt")

In [185]:
# Fetch every line as a Python list. Fine for this small sample file;
# avoid collect() on large inputs.
text.collect()

['Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
 'Donec vitae consectetur nisl, vel blandit magna.',
 'Pellentesque vel magna faucibus lectus suscipit varius ut id lorem.',
 'Nulla pretium diam mauris, vel tincidunt quam tempor non.',
 'Praesent non convallis ante. Nunc in velit ut dolor malesuada pellentesque.',
 'Nulla ultrices bibendum posuere. Mauris at est pulvinar sapien ultricies commodo.',
 'Morbi eu nunc turpis. Nunc interdum sem eget nunc aliquet suscipit.',
 'Nullam aliquet varius neque, sit amet condimentum odio euismod at.',
 'Donec semper dapibus odio, non luctus sem commodo nec.',
 'Quisque at tristique sapien, ut porta nibh.',
 'Donec mauris arcu, elementum eu commodo sed, condimentum et enim.',
 'In sed nunc neque.',
 'Vestibulum a suscipit velit. Sed viverra purus sit amet facilisis vehicula.']

In [186]:
# Re-read the same file, this time passing minPartitions=20.
# Note: this rebinds `text`, replacing the RDD created earlier.
text = sc.textFile("input/sample.txt", minPartitions=20)

In [187]:
# The collected lines are the same as before; the partitioning hint does
# not change the content returned.
text.collect()

['Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
 'Donec vitae consectetur nisl, vel blandit magna.',
 'Pellentesque vel magna faucibus lectus suscipit varius ut id lorem.',
 'Nulla pretium diam mauris, vel tincidunt quam tempor non.',
 'Praesent non convallis ante. Nunc in velit ut dolor malesuada pellentesque.',
 'Nulla ultrices bibendum posuere. Mauris at est pulvinar sapien ultricies commodo.',
 'Morbi eu nunc turpis. Nunc interdum sem eget nunc aliquet suscipit.',
 'Nullam aliquet varius neque, sit amet condimentum odio euismod at.',
 'Donec semper dapibus odio, non luctus sem commodo nec.',
 'Quisque at tristique sapien, ut porta nibh.',
 'Donec mauris arcu, elementum eu commodo sed, condimentum et enim.',
 'In sed nunc neque.',
 'Vestibulum a suscipit velit. Sed viverra purus sit amet facilisis vehicula.']

# Actions

In [188]:
# RDD holding the integers 0..19, used to demonstrate actions.
numbers = sc.parallelize(range(20))

In [189]:
# Action: return all elements as a Python list.
numbers.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [190]:
# Action: add up all elements (0 + 1 + ... + 19 = 190).
numbers.sum()

190

In [191]:
# Action: return only the first three elements.
numbers.take(3)

[0, 1, 2]

In [192]:
# Action: number of elements in the RDD.
numbers.count()

20

In [193]:
# Start from a clean slate: delete any directory left by a previous run,
# because saveAsTextFile will not overwrite an existing output path.
outpath = 'output/out_numbers'
if os.path.isdir(outpath):  # isdir is already False for a missing path
    shutil.rmtree(outpath)

# Action: write the RDD's elements as text files under `outpath`.
numbers.saveAsTextFile(outpath)

# Transformations

In [194]:
# Two source RDDs for the transformation examples: 0..19 and 0..4.
numbers = sc.parallelize(range(20))
small_numbers = sc.parallelize(range(5))

In [195]:
# Inspect the contents of the first source RDD.
numbers.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [196]:
# Inspect the contents of the second source RDD.
small_numbers.collect()

[0, 1, 2, 3, 4]

In [197]:
# Transformation: combine both RDDs into one.
combined = numbers.union(small_numbers)

In [198]:
# The union keeps duplicates: 0..4 appear twice in the output, once from
# each source RDD.
combined.collect()

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 0,
 1,
 2,
 3,
 4]

# Persisting data

In [199]:
# Source RDD with the integers 0..999 for the persistence example.
# NOTE(review): the name `input` shadows Python's built-in input() for the
# rest of the kernel session. A rename (e.g. `source_numbers`) would be
# cleaner but must be applied to every later cell that uses `input`.
input = sc.parallelize(range(1000))

In [200]:
# Derived RDD: each element raised to the fifth power. Both save cells
# below read from this RDD.
result = input.map(lambda x: x ** 5)

In [201]:
# Delete any directory left by a previous run, because saveAsTextFile
# will not overwrite an existing output path.
outpath = 'output/first_output'
if os.path.isdir(outpath):  # isdir is already False for a missing path
    shutil.rmtree(outpath)

# First job on `result`: write the computed values as text.
result.saveAsTextFile(outpath)

In [202]:
# Delete any directory left by a previous run, because saveAsTextFile
# will not overwrite an existing output path.
outpath = 'output/second_output'
if os.path.isdir(outpath):  # isdir is already False for a missing path
    shutil.rmtree(outpath)


def _label_number(value):
    """Render one value of `result` as a labelled text line."""
    return "number: {0}".format(value)


# Second job on `result`: label each value, then write the lines as text.
labelled = result.map(_label_number)
labelled.saveAsTextFile(outpath)

In [203]:
# Mark `input` to be kept with the MEMORY_ONLY storage level.
# NOTE(review): this runs after both saveAsTextFile cells, and those jobs
# read `result`, not `input`, so nothing above benefits from this persist.
# If the intent is to avoid recomputing the x ** 5 map for the second
# save, the persist likely belongs on `result` before the first save.
# Confirm the intended demonstration.
input.persist(StorageLevel.MEMORY_ONLY)

PythonRDD[26] at RDD at PythonRDD.scala:53

In [204]:
# After persist(), the RDD reports itself as cached (True).
input.is_cached

True

In [205]:
# Undo the persist request; the call returns the RDD itself.
input.unpersist()

PythonRDD[26] at RDD at PythonRDD.scala:53

In [206]:
# After unpersist(), the RDD no longer reports itself as cached (False).
input.is_cached

False

# Stop the Spark Context

In [207]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()