In [1]:
from pyspark import SparkContext

## Basic Filter

In [2]:
# Reuse the running SparkContext (or start one) and distribute the integers 1..7.
sc = SparkContext.getOrCreate()
nums = sc.parallelize(list(range(1, 8)))

In [3]:
# Keep the odd integers: for ints, a non-zero remainder mod 2 means odd.
filtered1 = nums.filter(lambda n: n % 2 != 0)
filtered1.collect()

[1, 3, 5, 7]

In [4]:
# Keep the even integers: a zero remainder mod 2 is falsy, so negate it.
filtered2 = nums.filter(lambda n: not n % 2)
filtered2.collect()

[2, 4, 6]

## Find average by using combineByKey()

In [5]:
# (key, value) pairs to be averaged per key. Named `pairs` rather than `input`
# so the built-in input() is not shadowed for the rest of the kernel session.
pairs = [("k1", 1), ("k1", 2), ("k1", 3), ("k1", 4), ("k1", 5),
         ("k2", 6), ("k2", 7), ("k2", 8), ("k3", 10), ("k3", 12)]
sc = SparkContext.getOrCreate()
rdd = sc.parallelize(pairs)
# Build one (sum, count) accumulator per key.
sumCount = rdd.combineByKey(
    lambda value: (value, 1),                         # createCombiner: first value seen for a key
    lambda acc, value: (acc[0] + value, acc[1] + 1),  # mergeValue: fold another value into an accumulator
    lambda a, b: (a[0] + b[0], a[1] + b[1]),          # mergeCombiners: merge two accumulators of one key
)
sumCount.collect()

[('k1', (15, 5)), ('k2', (21, 3)), ('k3', (22, 2))]

In [6]:
# Turn each key's (sum, count) accumulator into its average.
avg = sumCount.mapValues(lambda sum_and_count: sum_and_count[0] / sum_and_count[1])
avg.collect()

[('k1', 3.0), ('k2', 7.0), ('k3', 11.0)]

## How to find the average with map() and fold()

In [7]:
sc = SparkContext.getOrCreate()
nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 20])
# Pair every number with a count of 1, then fold all pairs into a single
# (sum, count) tuple, starting from the neutral element (0, 0).
sumAndCount = (
    nums
    .map(lambda n: (n, 1))
    .fold((0, 0), lambda acc, pair: (acc[0] + pair[0], acc[1] + pair[1]))
)
sumAndCount

(56, 9)

In [8]:
# Average = sum / count; the explicit float() conversions force a fractional result.
total, count = sumAndCount
avg = float(total) / float(count)
avg

6.222222222222222

## Cartesian Product: rdd1.cartesian(rdd2)

In [9]:
sc = SparkContext.getOrCreate()

# Left-hand side: two (key, value) pairs.
a = [('k1','v1'), ('k2', 'v2')]
rdd1 = sc.parallelize(a)

# Right-hand side: three (key, value) pairs.
b = [('k3','v3'), ('k4', 'v4'), ('k5', 'v5')]
rdd2 = sc.parallelize(b)

# Every element of rdd1 paired with every element of rdd2.
rdd3 = rdd1.cartesian(rdd2)
rdd3.collect()

[(('k1', 'v1'), ('k3', 'v3')),
 (('k1', 'v1'), ('k4', 'v4')),
 (('k1', 'v1'), ('k5', 'v5')),
 (('k2', 'v2'), ('k3', 'v3')),
 (('k2', 'v2'), ('k4', 'v4')),
 (('k2', 'v2'), ('k5', 'v5'))]

## Sort By Key: sortByKey() ascending/descending

In [10]:
sc = SparkContext.getOrCreate()
# Read the text file as an RDD of lines, asking for a minimum of one partition.
lines = sc.textFile('data.txt', minPartitions=1)
lines.collect()

['crazy crazy fox jumped',
 'crazy fox jumped',
 'fox is fast',
 'fox is smart',
 'dog is smart']

In [11]:
# Word count: split each line on single spaces, emit (word, 1) for every word,
# then add up the 1s per word.
frequencies = (
    lines
    .flatMap(lambda line: line.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
)
frequencies.collect()

[('crazy', 3),
 ('fox', 4),
 ('jumped', 2),
 ('is', 3),
 ('fast', 1),
 ('smart', 2),
 ('dog', 1)]

In [12]:
# Number of (word, count) pairs, i.e. the number of distinct words after reduceByKey.
frequencies.count()

7

In [13]:
# Order the (word, count) pairs by word, ascending (the sortByKey default, spelled out).
sorted_freq = frequencies.sortByKey(ascending=True)
sorted_freq.collect()

[('crazy', 3),
 ('dog', 1),
 ('fast', 1),
 ('fox', 4),
 ('is', 3),
 ('jumped', 2),
 ('smart', 2)]

In [14]:
# Order the (word, count) pairs by word, descending.
sorted_freq_descending = frequencies.sortByKey(ascending=False)
sorted_freq_descending.collect()

[('smart', 2),
 ('jumped', 2),
 ('is', 3),
 ('fox', 4),
 ('fast', 1),
 ('dog', 1),
 ('crazy', 3)]

## How to Add Indices: zipWithIndex()

In [15]:
sc = SparkContext.getOrCreate()
# (group, value) pairs keyed by the group name.
a = [('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)]
rdd = sc.parallelize(a)
# Sort by the group name, ascending (the sortByKey default, spelled out).
sort_rdd = rdd.sortByKey(ascending=True)
sort_rdd.collect()

[('g1', 2), ('g2', 4), ('g3', 3), ('g4', 8)]

In [16]:
# Swap each pair, (key, value) -> (value, key), so the number becomes the key.
rdd2 = rdd.map(lambda pair: pair[::-1])
rdd2.collect()

[(2, 'g1'), (4, 'g2'), (3, 'g3'), (8, 'g4')]

In [17]:
# Sort the swapped pairs by their numeric key, ascending.
sort_rdd2 = rdd2.sortByKey(ascending=True)
sort_rdd2.collect()

[(2, 'g1'), (3, 'g3'), (4, 'g2'), (8, 'g4')]

In [18]:
# Sort the swapped pairs by their numeric key, descending.
sort2_rdd2 = rdd2.sortByKey(ascending=False)
sort2_rdd2.collect()

[(8, 'g4'), (4, 'g2'), (3, 'g3'), (2, 'g1')]

In [19]:
# Attach a 0-based position to each element of the ascending-sorted RDD,
# producing (element, index) pairs.
indices = sort_rdd2.zipWithIndex()
indices.collect()

[((2, 'g1'), 0), ((3, 'g3'), 1), ((4, 'g2'), 2), ((8, 'g4'), 3)]

## Map Partitions: mapPartitions() by Examples

In [20]:
# The integers 1..10, explicitly spread over three partitions.
numbers = list(range(1, 11))
rdd = sc.parallelize(numbers, numSlices=3)
rdd.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [21]:
# Confirm the partition count requested in parallelize(numbers, 3).
rdd.getNumPartitions()

3

In [22]:
def f(iterator):
    """Print every element of ``iterator`` on its own line, then a "===" separator.

    Intended to be applied to one partition at a time, so the separator
    marks where a partition ends. Returns None.
    """
    for element in iterator:
        print(element)
    # Separator line emitted once, after the last element.
    print("===")

In [23]:
# Apply f to each partition for its side effect (printing).
# NOTE(review): no printed output is shown below this cell. Presumably print()
# runs inside the Spark worker processes, whose stdout is not routed to the
# notebook -- confirm by checking the worker/driver logs.
rdd.foreachPartition(f)

In [24]:
def adder(iterator):
    """Yield exactly one value: the sum of all elements in ``iterator``.

    An empty iterator yields 0.
    """
    total = 0
    for value in iterator:
        total = total + value
    yield total

In [25]:
# Run adder once per partition; the result holds one sum per partition
# (three partitions, so three sums).
rdd.mapPartitions(adder).collect()

[6, 15, 34]