In [323]:
from pyspark import SparkConf, SparkContext

In [324]:
number_cores = 2
memory_gb = 4
# Build the Spark configuration: application name, a local master using
# `number_cores` worker threads, and the driver memory in gigabytes.
conf = (
    SparkConf()
        .setAppName("SparkExample")
        .setMaster('local[{}]'.format(number_cores))
        .set('spark.driver.memory', '{}g'.format(memory_gb))
)
# Reuse the active SparkContext if one already exists, otherwise create it.
# Calling SparkContext(conf=conf) directly fails when this cell is re-run in
# the same kernel, because only one context may be active at a time.
# NOTE: when an existing context is reused, `conf` is not applied to it.
sc = SparkContext.getOrCreate(conf=conf)

# Map

In [325]:
numbers = sc.parallelize(range(10))

In [326]:
# Scale every element by 10 with an inline lambda, then bring the results back.
numbers.map(lambda n: n * 10).collect()

                                                                                

[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

In [327]:
def times_ten(x):
    """Return ``x`` multiplied by 10."""
    factor = 10
    return x * factor

numbers.map(times_ten).collect()

[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

# Filter

In [328]:
numbers = sc.parallelize(range(10))

In [329]:
numbers.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [330]:
def is_even(x):
    """Return True when ``x`` leaves no remainder on division by 2."""
    remainder = x % 2
    return remainder == 0

numbers.filter(is_even).collect()

[0, 2, 4, 6, 8]

In [331]:
def is_odd(x):
    """Return True when ``x`` is not divisible by 2, False otherwise.

    The result is a real ``bool`` rather than the raw remainder, so this
    predicate has the same return type as ``is_even``. Truthiness is
    unchanged, so it filters exactly the same elements as before.
    """
    return (x % 2) != 0

numbers.filter(is_odd).collect()

[1, 3, 5, 7, 9]

# FlatMap

In [332]:
text = sc.textFile("input/sample.txt")

In [333]:
text.collect()

['Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
 'Donec vitae consectetur nisl, vel blandit magna.',
 'Pellentesque vel magna faucibus lectus suscipit varius ut id lorem.',
 'Nulla pretium diam mauris, vel tincidunt quam tempor non.',
 'Praesent non convallis ante. Nunc in velit ut dolor malesuada pellentesque.',
 'Nulla ultrices bibendum posuere. Mauris at est pulvinar sapien ultricies commodo.',
 'Morbi eu nunc turpis. Nunc interdum sem eget nunc aliquet suscipit.',
 'Nullam aliquet varius neque, sit amet condimentum odio euismod at.',
 'Donec semper dapibus odio, non luctus sem commodo nec.',
 'Quisque at tristique sapien, ut porta nibh.',
 'Donec mauris arcu, elementum eu commodo sed, condimentum et enim.',
 'In sed nunc neque.',
 'Vestibulum a suscipit velit. Sed viverra purus sit amet facilisis vehicula.']

In [334]:
# Split every line on single spaces and flatten the pieces into one RDD of words.
words = text.flatMap(lambda line: line.split(" "))

In [335]:
words.collect()

['Lorem',
 'ipsum',
 'dolor',
 'sit',
 'amet,',
 'consectetur',
 'adipiscing',
 'elit.',
 'Donec',
 'vitae',
 'consectetur',
 'nisl,',
 'vel',
 'blandit',
 'magna.',
 'Pellentesque',
 'vel',
 'magna',
 'faucibus',
 'lectus',
 'suscipit',
 'varius',
 'ut',
 'id',
 'lorem.',
 'Nulla',
 'pretium',
 'diam',
 'mauris,',
 'vel',
 'tincidunt',
 'quam',
 'tempor',
 'non.',
 'Praesent',
 'non',
 'convallis',
 'ante.',
 'Nunc',
 'in',
 'velit',
 'ut',
 'dolor',
 'malesuada',
 'pellentesque.',
 'Nulla',
 'ultrices',
 'bibendum',
 'posuere.',
 'Mauris',
 'at',
 'est',
 'pulvinar',
 'sapien',
 'ultricies',
 'commodo.',
 'Morbi',
 'eu',
 'nunc',
 'turpis.',
 'Nunc',
 'interdum',
 'sem',
 'eget',
 'nunc',
 'aliquet',
 'suscipit.',
 'Nullam',
 'aliquet',
 'varius',
 'neque,',
 'sit',
 'amet',
 'condimentum',
 'odio',
 'euismod',
 'at.',
 'Donec',
 'semper',
 'dapibus',
 'odio,',
 'non',
 'luctus',
 'sem',
 'commodo',
 'nec.',
 'Quisque',
 'at',
 'tristique',
 'sapien,',
 'ut',
 'porta',
 'nibh.',
 'Do

In [336]:
# Same split as the flatMap cell, but map() keeps one list of words per line
# instead of flattening them into a single sequence.
text.map(lambda line: line.split(" ")).collect()

[['Lorem',
  'ipsum',
  'dolor',
  'sit',
  'amet,',
  'consectetur',
  'adipiscing',
  'elit.'],
 ['Donec', 'vitae', 'consectetur', 'nisl,', 'vel', 'blandit', 'magna.'],
 ['Pellentesque',
  'vel',
  'magna',
  'faucibus',
  'lectus',
  'suscipit',
  'varius',
  'ut',
  'id',
  'lorem.'],
 ['Nulla',
  'pretium',
  'diam',
  'mauris,',
  'vel',
  'tincidunt',
  'quam',
  'tempor',
  'non.'],
 ['Praesent',
  'non',
  'convallis',
  'ante.',
  'Nunc',
  'in',
  'velit',
  'ut',
  'dolor',
  'malesuada',
  'pellentesque.'],
 ['Nulla',
  'ultrices',
  'bibendum',
  'posuere.',
  'Mauris',
  'at',
  'est',
  'pulvinar',
  'sapien',
  'ultricies',
  'commodo.'],
 ['Morbi',
  'eu',
  'nunc',
  'turpis.',
  'Nunc',
  'interdum',
  'sem',
  'eget',
  'nunc',
  'aliquet',
  'suscipit.'],
 ['Nullam',
  'aliquet',
  'varius',
  'neque,',
  'sit',
  'amet',
  'condimentum',
  'odio',
  'euismod',
  'at.'],
 ['Donec',
  'semper',
  'dapibus',
  'odio,',
  'non',
  'luctus',
  'sem',
  'commodo',
  'n

In [337]:
words.count()

118

# Map partitions

In [338]:
text = sc.textFile("input/sample.txt", minPartitions=5)

In [339]:
words = text.flatMap(lambda x:x.split(" "))

In [340]:
def count_words(iterator):
    """Tally how often each item occurs in one partition.

    Parameters
    ----------
    iterator : iterable
        The items (words) of a single partition; each must be hashable.

    Yields
    ------
    dict
        Exactly one dictionary mapping each distinct item to its number of
        occurrences, in first-seen order. An empty input yields an empty dict.
    """
    tally = {}
    for word in iterator:
        # .get() supplies 0 the first time a word is seen.
        tally[word] = tally.get(word, 0) + 1
    yield tally

word_counts = words.mapPartitions(count_words)

In [341]:
word_counts.collect()

[{'Lorem': 1,
  'ipsum': 1,
  'dolor': 1,
  'sit': 1,
  'amet,': 1,
  'consectetur': 2,
  'adipiscing': 1,
  'elit.': 1,
  'Donec': 1,
  'vitae': 1,
  'nisl,': 1,
  'vel': 2,
  'blandit': 1,
  'magna.': 1,
  'Pellentesque': 1,
  'magna': 1,
  'faucibus': 1,
  'lectus': 1,
  'suscipit': 1,
  'varius': 1,
  'ut': 1,
  'id': 1,
  'lorem.': 1},
 {'Nulla': 2,
  'pretium': 1,
  'diam': 1,
  'mauris,': 1,
  'vel': 1,
  'tincidunt': 1,
  'quam': 1,
  'tempor': 1,
  'non.': 1,
  'Praesent': 1,
  'non': 1,
  'convallis': 1,
  'ante.': 1,
  'Nunc': 1,
  'in': 1,
  'velit': 1,
  'ut': 1,
  'dolor': 1,
  'malesuada': 1,
  'pellentesque.': 1,
  'ultrices': 1,
  'bibendum': 1,
  'posuere.': 1,
  'Mauris': 1,
  'at': 1,
  'est': 1,
  'pulvinar': 1,
  'sapien': 1,
  'ultricies': 1,
  'commodo.': 1},
 {'Morbi': 1,
  'eu': 1,
  'nunc': 2,
  'turpis.': 1,
  'Nunc': 1,
  'interdum': 1,
  'sem': 1,
  'eget': 1,
  'aliquet': 2,
  'suscipit.': 1,
  'Nullam': 1,
  'varius': 1,
  'neque,': 1,
  'sit': 1,
  'ame

# Sample

In [342]:
data = sc.parallelize(range(10000))

In [343]:
data.count()

10000

In [344]:
# Sample roughly 10% of the elements without replacement. The seed is fixed so
# the sampled subset, and therefore its count, is the same on every re-run;
# without a seed each execution draws a different sample.
data.sample(False, 0.1, seed=42).count()

1009

# Union

In [345]:
rdd1 = sc.parallelize(range(5))

In [346]:
rdd1.collect()

[0, 1, 2, 3, 4]

In [347]:
rdd2 = sc.parallelize(range(5,10))

In [348]:
rdd2.collect()

[5, 6, 7, 8, 9]

In [349]:
rdd1.union(rdd2).collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

# Intersection

In [350]:
rdd1 = sc.parallelize([1,1,2,3,4,5])

In [351]:
rdd2 = sc.parallelize([1,1,4,6])

In [352]:
# Keep only the values present in both RDDs. The recorded result lists each
# common value once, even though 1 appears twice in both inputs.
rdd1.intersection(rdd2).collect()

[4, 1]

# Distinct

In [353]:
rdd = sc.parallelize(["a", "b"]).cartesian(sc.parallelize(range(100)))

In [354]:
rdd.collect()

[('a', 0),
 ('a', 1),
 ('a', 2),
 ('a', 3),
 ('a', 4),
 ('a', 5),
 ('a', 6),
 ('a', 7),
 ('a', 8),
 ('a', 9),
 ('a', 10),
 ('a', 11),
 ('a', 12),
 ('a', 13),
 ('a', 14),
 ('a', 15),
 ('a', 16),
 ('a', 17),
 ('a', 18),
 ('a', 19),
 ('a', 20),
 ('a', 21),
 ('a', 22),
 ('a', 23),
 ('a', 24),
 ('a', 25),
 ('a', 26),
 ('a', 27),
 ('a', 28),
 ('a', 29),
 ('a', 30),
 ('a', 31),
 ('a', 32),
 ('a', 33),
 ('a', 34),
 ('a', 35),
 ('a', 36),
 ('a', 37),
 ('a', 38),
 ('a', 39),
 ('a', 40),
 ('a', 41),
 ('a', 42),
 ('a', 43),
 ('a', 44),
 ('a', 45),
 ('a', 46),
 ('a', 47),
 ('a', 48),
 ('a', 49),
 ('a', 50),
 ('a', 51),
 ('a', 52),
 ('a', 53),
 ('a', 54),
 ('a', 55),
 ('a', 56),
 ('a', 57),
 ('a', 58),
 ('a', 59),
 ('a', 60),
 ('a', 61),
 ('a', 62),
 ('a', 63),
 ('a', 64),
 ('a', 65),
 ('a', 66),
 ('a', 67),
 ('a', 68),
 ('a', 69),
 ('a', 70),
 ('a', 71),
 ('a', 72),
 ('a', 73),
 ('a', 74),
 ('a', 75),
 ('a', 76),
 ('a', 77),
 ('a', 78),
 ('a', 79),
 ('a', 80),
 ('a', 81),
 ('a', 82),
 ('a', 83),
 (

In [355]:
# Project each (letter, number) pair down to its first component.
first = rdd.map(lambda pair: pair[0])

In [356]:
first.collect()

['a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'a',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b',
 'b'

In [357]:
first.distinct().collect()

['b', 'a']

# Cartesian

In [358]:
ice_creams = range(5)

In [359]:
cookies = range(7)

In [360]:
# Pure-Python cross product: pair every ice cream with every cookie.
[(ice_cream, cookie) for ice_cream in ice_creams for cookie in cookies]

[(0, 0),
 (0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (1, 0),
 (1, 1),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (2, 0),
 (2, 1),
 (2, 2),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (3, 0),
 (3, 1),
 (3, 2),
 (3, 3),
 (3, 4),
 (3, 5),
 (3, 6),
 (4, 0),
 (4, 1),
 (4, 2),
 (4, 3),
 (4, 4),
 (4, 5),
 (4, 6)]

In [361]:
ice_creams = sc.parallelize(range(5))

In [362]:
cookies = sc.parallelize(range(7))

In [363]:
combinations = ice_creams.cartesian(cookies)

In [364]:
combinations.collect()

[(0, 0),
 (0, 1),
 (0, 2),
 (1, 0),
 (1, 1),
 (1, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (0, 6),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (2, 0),
 (2, 1),
 (2, 2),
 (3, 0),
 (4, 0),
 (3, 1),
 (3, 2),
 (4, 1),
 (4, 2),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (3, 3),
 (4, 3),
 (3, 4),
 (3, 5),
 (4, 4),
 (4, 5),
 (3, 6),
 (4, 6)]

# Pipe

In [365]:
numbers = sc.parallelize(range(11))

In [366]:
# Stream every element through the external `grep 1` command. The lines that
# grep keeps come back as strings, not ints (see the recorded output).
numbers.pipe("grep 1").collect()

['1', '10']

In [367]:
rdd = sc.parallelize(["b,b", "c,c,c", "a"])

In [368]:
rdd.pipe("tr '[a-z]' '[A-Z]'").collect()

['B,B', 'C,C,C', 'A']

In [369]:
rdd.pipe("grep a").collect()

['a']

In [370]:
# NOTE: this is a regular (non-raw) Python string, so "\n" is already a real
# newline character when the command is handed to `tr`. `tr` then turns each
# comma into a newline (-s squeezes repeats), so every comma-separated token
# becomes its own output line and therefore its own RDD element.
rdd.pipe("tr -s ',' '[\n*]'").collect()

['b', 'b', 'c', 'c', 'c', 'a']

# Coalesce

In [371]:
rdd = sc.parallelize(range(10000), numSlices=100)

In [372]:
rdd2 = rdd.coalesce(10)

# Repartition

In [373]:
numbers = sc.parallelize(range(1000), numSlices=1)

In [374]:
# repartition() returns a new RDD rather than modifying `numbers`. The result
# is not assigned here, so this cell only displays the new RDD's repr.
numbers.repartition(100)

MapPartitionsRDD[62] at coalesce at NativeMethodAccessorImpl.java:0

In [375]:
# Two-element records used as key/value pairs by the cells below.
pairs = sc.parallelize([
    [1, 1],
    [1, 2],
    [2, 3],
    [3, 3],
])

In [376]:
pairs.repartitionAndSortWithinPartitions(2).glom().collect()

[[(2, 3)], [(1, 1), (1, 2), (3, 3)]]

In [377]:
# Custom partitioner: keys equal to 1 map to True (partition 1), every other
# key maps to False (partition 0). glom() groups elements by partition so the
# placement is visible in the collected result.
pairs.repartitionAndSortWithinPartitions(
    numPartitions=2,
    partitionFunc=lambda key: key == 1,
).glom().collect()

[[(2, 3), (3, 3)], [(1, 1), (1, 2)]]

# Stop the Spark Context

In [378]:
# Shut down the SparkContext created at the top of the notebook; `sc` cannot
# be used to run further jobs after this call.
sc.stop()