In [1]:
from pyspark.sql import SparkSession
# Get an existing SparkSession or create one, with the application named "TestingRDDS".
spark = (
    SparkSession.builder
    .appName("TestingRDDS")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/16 13:59:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Split a sample sentence on single spaces to get the word list used in this notebook.
words_list = (
    "Spark makes life a lot easier and puts me into good Spirits, "
    "Spark is too Awesome!"
).split(" ")

In [3]:
# str.split returns a plain Python list.
type(words_list)

list

In [4]:
# Show the tokens. Punctuation stays attached ('Spirits,', 'Awesome!') because the
# sentence was split on spaces only.
print(words_list)

['Spark', 'makes', 'life', 'a', 'lot', 'easier', 'and', 'puts', 'me', 'into', 'good', 'Spirits,', 'Spark', 'is', 'too', 'Awesome!']


In [5]:
# Turn the Python list into an RDD via the session's SparkContext.
words_rdd = spark.sparkContext.parallelize(words_list)

In [6]:
# collect() is an action: it runs the job and returns the RDD's elements as a Python list.
words_data = words_rdd.collect()

[Stage 0:>                                                          (0 + 4) / 4]                                                                                

In [7]:
# Print one collected word per line.
for word in words_data:
    print(word)

Spark
makes
life
a
lot
easier
and
puts
me
into
good
Spirits,
Spark
is
too
Awesome!


In [8]:
# distinct() lets us remove duplicates, e.g. the repeated word "Spark".
# First, count every element, duplicates included.

words_rdd.count()

                                                                                

16

In [9]:
# Count again after de-duplication; "Spark" appears twice, so the result is one lower.
words_rdd.distinct().count()

                                                                                

15

In [10]:
# Re-collect the original RDD: both "Spark" entries are still printed, so the earlier
# distinct() call did not modify words_rdd itself.
words_data = words_rdd.collect()
for word in words_data:
    print(word)

Spark
makes
life
a
lot
easier
and
puts
me
into
good
Spirits,
Spark
is
too
Awesome!


In [11]:
# Keep the de-duplicated words as their own RDD.
words_unique_rdd = words_rdd.distinct()

In [12]:
# Print the unique words; note that the printed order differs from the sentence order.
for word in words_unique_rdd.collect():
    print(word)

good
makes
life
a
lot
and
puts
Awesome!
Spark
into
Spirits,
is
easier
me
too


In [13]:
# Helper to check whether a word starts with a given letter, using Python's built-in str.startswith.

def wordStartsWith(word, letter):
    """Return True when ``word`` begins with ``letter``, otherwise False.

    Thin wrapper over ``word.startswith``, so ``letter`` may be any prefix
    that method accepts.
    """
    has_prefix = word.startswith(letter)
    return has_prefix

In [14]:
# Filtering records - the predicate is normally an anonymous (lambda) function.
# Here the lambda keeps a word only when it starts with "S";
# collect() then gathers the surviving words into a Python list.

(
    words_rdd
    .filter(lambda w: wordStartsWith(w, "S"))
    .collect()
)

['Spark', 'Spirits,', 'Spark']

In [15]:
# lambda word: wordStartsWith(word, "S")
# The lambda's parameter is just `word`.
# After the colon we write any expression that can operate on the `word` variable.
# For the expression we have chosen to call a function that returns either True or False,
# depending on whether a match has been found or not.
# If True (a match is found), the word is collected into the Python list shown in the output above.

In [16]:
# Map and Flat Map RDD Transformations

# Map Transformation - Used to apply any operation to every record, such as adding/updating
# a column or just transforming the data. The output of a map transformation always has
# the same number of records as the input.

# Example with list of numbers. For each number calculate the square and return the RDD that 
# contains a list of tuples.

In [17]:
# Build the list of integers 1 through 20 (range's upper bound is exclusive).
# list() turns the range object into a concrete list.
num_list = list(range(1, 21))
print(num_list)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [18]:
# Create an RDD holding the list of numbers built in the previous cell.
nums_rdd = spark.sparkContext.parallelize(num_list)

In [19]:
# Map each number to an (original number, squared value) tuple; one output per input.
nums_squared_rdd = nums_rdd.map(lambda n: (n, n ** 2))

In [20]:
# Print one (number, square) tuple per line.
for pair in nums_squared_rdd.collect():
    print(pair)

(1, 1)
(2, 4)
(3, 9)
(4, 16)
(5, 25)
(6, 36)
(7, 49)
(8, 64)
(9, 81)
(10, 100)
(11, 121)
(12, 144)
(13, 169)
(14, 196)
(15, 225)
(16, 256)
(17, 289)
(18, 324)
(19, 361)
(20, 400)


In [21]:
# Map every word to a (word, first letter, starts-with-"S" flag) tuple.
# word[:1] is used rather than word[0] so that an empty token (which split(" ") yields
# for consecutive spaces) gives '' instead of raising IndexError inside the Spark task.
words_trd_rdd = words_rdd.map(lambda word: (word, word[:1], wordStartsWith(word, "S")))

In [22]:
# Print one (word, first letter, starts-with-"S") tuple per line.
for element in words_trd_rdd.collect():
    print(element)

('Spark', 'S', True)
('makes', 'm', False)
('life', 'l', False)
('a', 'a', False)
('lot', 'l', False)
('easier', 'e', False)
('and', 'a', False)
('puts', 'p', False)
('me', 'm', False)
('into', 'i', False)
('good', 'g', False)
('Spirits,', 'S', True)
('Spark', 'S', True)
('is', 'i', False)
('too', 't', False)
('Awesome!', 'A', False)


In [23]:
# Flat Map Transformation - Provides a simple extension of the map function. Sometimes you
# may want to take in a list of words and flatten the structure into a list of
# letters. flatMap can be used to do this:

In [24]:
# flatMap turns each word into its list of characters and joins those lists into one
# flat RDD of letters, so the whole word list becomes one long sequence of letters.
# take(10) then returns only the first 10 of them.

words_rdd.flatMap(list).take(10)

['S', 'p', 'a', 'r', 'k', 'm', 'a', 'k', 'e', 's']

In [25]:
# Sorting using SortByKey() transformation.

In [26]:
# We will need a list of tuples that contain countries and some ranking.
# The sortByKey() transformation requires key-value pairs.

In [27]:
# (country, ranking) pairs: the first tuple item acts as the key, the second as the value.
countries_list = [
    ("India", 91),
    ("USA", 4),
    ("Greece", 13),
]

# Create an RDD from the countries list:
countries_rdd = spark.sparkContext.parallelize(countries_list)

In [28]:
# Create a list from a sorted RDD. Called with no arguments, sortByKey() orders the
# pairs by their key (the country name) in ascending order.

sorted_countries_list = countries_rdd.sortByKey().collect()

In [29]:
# The list is sorted by country name, because the country is the key and the ranking
# is the value.

for country in sorted_countries_list:
    print(country)

('Greece', 13)
('India', 91)
('USA', 4)


In [30]:
# To sort by ranking (the value) in descending order instead, swap each pair to
# (ranking, country) so the ranking becomes the key, then sort the keys descending.

sorted_countries_list = (
    countries_rdd
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
    .collect()
)

In [31]:
# Each tuple is now (ranking, country), printed from the highest ranking to the lowest.
for country in sorted_countries_list:
    print(country)

(91, 'India')
(13, 'Greece')
(4, 'USA')


In [32]:
# Now work with Spark Actions. Collect, Count and Take were used above; next come Reduce, First, Max and Min.

In [33]:
# Reduce Action - used to reduce an RDD. By reduce we mean to aggregate values into just
# one value as a result. Given a set of numbers, we can reduce the set of numbers to one
# value by summing them up.

In [34]:
# Small list of numbers to reduce. This rebinds num_list, which held 1..20 earlier.
num_list = [1, 5, 2, 3, 4]

In [35]:
# reduce() folds the RDD into a single value by repeatedly applying the two-argument
# function; here that function adds its operands, so the result is the sum.
result = spark.sparkContext.parallelize(num_list).reduce(lambda a, b: a + b)
print(result)

15


In [36]:
# To explain the lambda function above, use a named function that prints its two arguments on every call:

def sumList(x, y):
    """Print the two operands, then return their sum.

    The print makes each pairwise call visible when this function is used as
    a reducer.
    """
    print(x, y)
    total = x + y
    return total

In [37]:
# Pass sumList directly as the reducer; every printed "x y" line is one pairwise call.
# The lines need not appear in call order: partial sums are computed per partition and
# then merged (NOTE(review): presumably why "3 4" shows up after the final result).
result = spark.sparkContext.parallelize(num_list).reduce(sumList)
print(result)

1 5
6 2
8 7
15


3 4


In [41]:
# Test whether the left word is longer than the right word, so that the reduce function
# gives the longest word in our list of words.

def wordLengthReducer(leftWord, rightWord):
    """Return the longer of the two words, for use as a reduce function.

    ``leftWord`` is returned only when it is strictly longer; when both words
    have the same length, ``rightWord`` is returned.
    """
    return leftWord if len(leftWord) > len(rightWord) else rightWord

In [42]:
# Reduce the words to the longest one. 'Spirits,' and 'Awesome!' are both 8 characters
# long, and wordLengthReducer returns its right-hand argument when lengths are equal.
words_rdd.reduce(wordLengthReducer)

'Awesome!'

In [44]:
# Get the first element of words_rdd
words_rdd.first()

'Spark'

In [45]:
# Max value of an RDD built from the numbers 1 through 20:
spark.sparkContext.parallelize(range(1, 21)).max()

20

In [47]:
# Min value of an RDD built from the numbers 1 through 20:
spark.sparkContext.parallelize(range(1, 21)).min()

1