In [1]:
import pyspark

In [2]:
# SparkContext is the entry point for loading data and parallelizing it.
# getOrCreate() reuses a context that is already running in this kernel, so the
# cell can be re-run without "Cannot run multiple SparkContexts at once".
spark = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setAppName("map-and-lazy-evaluation")
)

# Small in-memory log of played songs; note the inconsistent casing of "Despacito".
log_of_songs = [
    "Despacito",
    "Nice for what",
    "No tears left to cry",
    "Despacito",
    "Havana",
    "In my feelings",
    "Nice for what",
    "despacito",
    "All the stars",
]

# Turn the local Python list into an RDD that Spark can process in parallel.
distributed_log_songs = spark.parallelize(log_of_songs)
distributed_log_songs

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

In [3]:
def convert_song_to_lowercase(song):
    """Return the given song title with every character lower-cased."""
    lowered_title = song.lower()
    return lowered_title

convert_song_to_lowercase("Despacito")

'despacito'

- We can map the function across all of the data in the cluster. However, Spark uses lazy evaluation: `map` is a transformation that only records what should be done, and nothing is actually computed until an `action` (such as `collect`) is called.

In [6]:
%%time
# map() only registers the transformation; collect() is the action that triggers it
# and gathers the lower-cased titles into a single Python list on the driver (master) node.
distributed_log_songs.map(convert_song_to_lowercase).collect()

CPU times: user 2.7 ms, sys: 4.44 ms, total: 7.14 ms
Wall time: 176 ms


['despacito',
 'nice for what',
 'no tears left to cry',
 'despacito',
 'havana',
 'in my feelings',
 'nice for what',
 'despacito',
 'all the stars']

In [7]:
# map() returned a new RDD and left the source untouched, so collecting the
# original distributed data still yields the titles with their original casing.
distributed_log_songs.collect()

['Despacito',
 'Nice for what',
 'No tears left to cry',
 'Despacito',
 'Havana',
 'In my feelings',
 'Nice for what',
 'despacito',
 'All the stars']

- We can also use a `lambda` function on the distributed data

In [8]:
# Same lower-casing as convert_song_to_lowercase, written inline as an anonymous function.
distributed_log_songs.map(lambda song: song.lower()).collect()

['despacito',
 'nice for what',
 'no tears left to cry',
 'despacito',
 'havana',
 'in my feelings',
 'nice for what',
 'despacito',
 'all the stars']