## Imports

In [1]:
from pyspark.sql import SparkSession

## Create SparkSession

In [2]:
# Reuse the active SparkSession if one exists, otherwise start a new one.
spark = SparkSession.builder.appName("TesttinRDDs").getOrCreate()

In [3]:
# Break the sample sentence into tokens on single spaces; punctuation stays
# attached to its word (e.g. "Spirits," and "Awesome!").
words_list = (
    "Spark makes life a lot easier and puts me into good Spirits, Spark is Awesome!"
    .split(" ")
)

In [4]:
# Check the Python type of the tokenised sentence.
type(words_list)

list

In [5]:
# Show the tokens before they are handed to Spark.
print(words_list)

['Spark', 'makes', 'life', 'a', 'lot', 'easier', 'and', 'puts', 'me', 'into', 'good', 'Spirits,', 'Spark', 'is', 'Awesome!']


In [6]:
# Distribute the local Python list as an RDD via the session's SparkContext.
words_rdd = spark.sparkContext.parallelize(words_list)

In [7]:
# collect() is an action: it brings the RDD's elements back to the driver
# as a local collection that can be iterated with plain Python.
words_data = words_rdd.collect()

In [8]:
# Print each collected word on its own line.
for word in words_data:
    print(word)

Spark
makes
life
a
lot
easier
and
puts
me
into
good
Spirits,
Spark
is
Awesome!


In [9]:
# Total number of elements in the RDD (duplicates included).
words_rdd.count()

15

In [10]:
# Number of unique words; "Spark" appears twice in the sentence, so this is
# one less than the total count.
words_rdd.distinct().count()

14

## Return words that start with S

In [12]:
def wordStartsWith(word, letter):
    """Tell whether ``word`` begins with ``letter``.

    The check is delegated to ``word.startswith``, so it is case-sensitive
    and ``letter`` may be anything that method accepts as a prefix.

    Args:
        word: The string to test.
        letter: The prefix to look for at the start of ``word``.

    Returns:
        bool: ``True`` if ``word`` starts with ``letter``, else ``False``.
    """
    has_prefix = word.startswith(letter)
    return has_prefix

In [13]:
# Keep only the words beginning with an uppercase "S" and bring them back
# to the driver.
(
    words_rdd
    .filter(lambda candidate: wordStartsWith(candidate, "S"))
    .collect()
)

['Spark', 'Spirits,', 'Spark']

## SortByKey Transformation

In [14]:
# (name, number) pairs; the first element of each tuple acts as the key
# for the key-based RDD operations below.
countries_list = [
    ("india", 91),
    ("USA", 4),
    ("Greece", 13),
]
countries_rdd = spark.sparkContext.parallelize(countries_list)

In [15]:
# Sort the pairs by key (the country name) in the default ascending order.
# String comparison is case-sensitive, so lowercase "india" sorts after
# the capitalised names.
srtd_countries_list = countries_rdd.sortByKey().collect()

In [16]:
# Print the sorted pairs, one tuple per line.
for country in srtd_countries_list:
    print(country)

('Greece', 13)
('USA', 4)
('india', 91)


In [17]:
# Swap each (name, number) pair so the number becomes the key, then sort by
# that key in descending order.
# Note: this rebinds srtd_countries_list to (number, name) tuples.
srtd_countries_list = (
    countries_rdd
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
    .collect()
)

In [18]:
# Print the (number, name) tuples, largest number first.
for country in srtd_countries_list:
    print(country)

(91, 'india')
(13, 'Greece')
(4, 'USA')


## Actions

In [19]:
# Small unsorted sample of integers used by the action examples.
num_list = [1, 5, 2, 3, 4]

## reduce

In [20]:
# Fold the numbers into a single total by adding them pairwise.
result = (
    spark.sparkContext
    .parallelize(num_list)
    .reduce(lambda running_total, value: running_total + value)
)
print(result)

15


In [21]:
def sumList(x, y):
    """Add two values, printing both operands first.

    Despite the name, this combines exactly two values; it is meant to be
    used as a pairwise reducer so that each intermediate step is visible.

    Args:
        x: Left operand.
        y: Right operand.

    Returns:
        The result of ``x + y``.
    """
    # Trace the operands so every pairwise combination shows up in the output.
    print(x, y)
    total = x + y
    return total

In [22]:
# Same sum as before, but with sumList as the reducer so each pairwise
# combination is printed while the reduction runs.
result = (
    spark.sparkContext
    .parallelize(num_list)
    .reduce(sumList)
)
print(result)

1 5
6 2
8 7
15


In [23]:
def wordLengthReducer(leftWord, rightWord):
    """Pick the longer of two words, with a deterministic tie-break.

    Intended as the binary operator for ``RDD.reduce``, which is documented
    as requiring a commutative and associative function. Returning one fixed
    side on a tie would make the result depend on argument order, so
    equal-length words are resolved lexicographically instead.

    Args:
        leftWord: First candidate word.
        rightWord: Second candidate word.

    Returns:
        The longer word. If both have the same length, the one that compares
        smaller (``min(leftWord, rightWord)``) is returned, so the answer is
        the same whichever side each word arrives on.
    """
    left_len, right_len = len(leftWord), len(rightWord)
    if left_len != right_len:
        return leftWord if left_len > right_len else rightWord
    # Tie: choose by value, not by position, to keep the reducer commutative.
    return min(leftWord, rightWord)

In [24]:
# Reduce the RDD to a single word using wordLengthReducer as the pairwise
# combiner.
words_rdd.reduce(wordLengthReducer)

'Awesome!'

In [25]:
# Fetch the first element of the RDD.
words_rdd.first()

'Spark'

In [26]:
# Largest element of an RDD built from range(1, 21), i.e. the integers 1..20.
spark.sparkContext.parallelize(range(1, 21)).max()

20

In [27]:
# Smallest element of an RDD built from range(1, 21), i.e. the integers 1..20.
spark.sparkContext.parallelize(range(1, 21)).min()

1