In [1]:
# Creating a session
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) a SparkSession named "RDDExamples".
# master("local[*]") runs Spark locally rather than on a cluster.
spark = (
    SparkSession.builder
    .appName("RDDExamples")
    .master("local[*]")
    .getOrCreate()
)

# The SparkContext is the entry point for the RDD API used in the cells below.
sc = spark.sparkContext

In [7]:
# Build an RDD from an in-memory Python list.
# sc.parallelize() turns a local collection into an RDD.
nums = sc.parallelize(list(range(1, 7)))  # the integers 1 through 6


In [10]:
# Actions -> Spark only starts executing once an action is called.
# A notebook cell echoes just its LAST expression, so the earlier results are
# printed explicitly; otherwise each call would run and silently drop its result.

# .collect() -> returns all the elements of the RDD as a list
print(nums.collect())

# .count() -> returns the number of elements in the RDD
print(nums.count())

# .take(n) -> returns the first n elements of the RDD
nums.take(4)

[1, 2, 3, 4]

In [12]:
# Transformations are lazy: map() and filter() only describe the work to do;
# nothing is computed until an action such as collect() is called.
squares = nums.map(lambda n: n ** 2)
even_squares = squares.filter(lambda sq: sq % 2 == 0)

# Action: this is what actually triggers the computation defined above.
# (squares.collect() would instead return every square, odd ones included.)
even_squares.collect()

[4, 16, 36]

In [13]:
# A second RDD, this time holding sentences instead of numbers.
lines = sc.parallelize([
    "spark makes big data simple",
    "rdds are distributed datasets",
    "spark runs fast",
])

# Transformations (lazy): the classic word-count pipeline.
word_counts = (
    lines
    # 1. Split every sentence on whitespace and flatten into one RDD of words.
    .flatMap(lambda sentence: sentence.split())
    # 2. Pair each lower-cased word with a count of 1,
    #    e.g. [("spark", 1), ("makes", 1), ...].
    .map(lambda word: (word.lower(), 1))
    # 3. Sum the counts of identical words,
    #    e.g. [("spark", 2), ("makes", 1), ...].
    .reduceByKey(lambda left, right: left + right)
)

# Action: triggers the actual execution and returns up to 10 (word, count) pairs.
word_counts.take(10)

[('big', 1),
 ('are', 1),
 ('distributed', 1),
 ('datasets', 1),
 ('runs', 1),
 ('fast', 1),
 ('spark', 2),
 ('makes', 1),
 ('data', 1),
 ('simple', 1)]