# Question 4

In [3]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.session import SparkSession
# Create (or reuse) a Spark session running against a local master.
# NOTE(review): the app name looks carried over from a JDBC example — confirm before renaming.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("jdbc data sources")
    .getOrCreate()
)

## Map vs FlatMap
Both map() and flatMap() are used for transformations. 
The map() transformation takes in a function and applies it to each element in the RDD 
and the result of the function is a new value of each element in the resulting RDD.
The flatMap() is used to produce multiple output elements for each input element.
As with map(), the function we provide to flatMap() is called individually for each element
in our input RDD. Instead of returning a single element, however, it returns an iterable of
values, and the elements of all those iterables are flattened into the resulting RDD.

In [4]:
# Grab the SparkContext behind the session; the RDD examples below are created through it.
sc = spark.sparkContext

In [30]:
# Two short lines of text serve as the sample dataset for the map/flatMap demos.
sample_lines = ["Roses are red", "Violets are blue"]
rdd = sc.parallelize(sample_lines)
rdd.collect()

['Roses are red', 'Violets are blue']

#### map transforms an RDD of length N into another RDD of length N.
For example, it maps from two lines into two line-lengths:

In [32]:
# map applies the function to every element: each line is replaced by its length.
rdd.map(len).collect()

[13, 16]

In [36]:
# With map, each line becomes one element holding the list of its words,
# so the result is a list of lists (one inner list per input line).
rdd.map(lambda line: line.split(" ")).collect()

[['Roses', 'are', 'red'], ['Violets', 'are', 'blue']]

#### flatMap (loosely speaking) transforms an RDD of length N into a collection of N collections, 
then flattens these into a single RDD of results.

In [34]:
# With flatMap, the per-line word lists are flattened into a single RDD of words.
rdd.flatMap(lambda line: line.split(" ")).collect()

['Roses', 'are', 'red', 'Violets', 'are', 'blue']

### Question 4: You've been given the following content in three different files.
Which of the following code snippets performs the activity described below?
Concatenate all the data into a single RDD and then count all the words across all three files.

In [8]:
# Directory (relative to the working directory) that holds the three input files.
pathQ4 = "tmp/Quest4/"

# Read each exam file as an RDD of text lines; textFile is called once per file name.
linesA, linesB, linesC = (
    sc.textFile(pathQ4 + file_name)
    for file_name in ("trainExam4A.txt", "trainExam4B.txt", "trainExam4C.txt")
)

## Using Map

In [43]:
# Tokenise every file with map: each element becomes the list of words on one line,
# so these RDDs still contain one element per input line.
# NOTE: split(" ") splits on single spaces only, so blank lines or repeated spaces
# produce empty-string tokens.
linesA1, linesB1, linesC1 = (
    lines.map(lambda text: text.split(" "))
    for lines in (linesA, linesB, linesC)
)

In [44]:
# Concatenate the three per-line word-list RDDs: first A with B, then the result with C.
linesAB = sc.union([linesA1, linesB1])
linesABC = sc.union([linesAB, linesC1])

In [46]:
# Each element of linesABC is a list of words, so an identity flatMap unpacks
# those lists into one RDD of individual words, which is then counted.
flatData = linesABC.flatMap(lambda words: words)
flatData.count()

15

## Using FlatMap

In [52]:
# Tokenise every file with flatMap: the words of each line are emitted as separate
# elements, so these RDDs are already flat collections of words.
# NOTE: split(" ") splits on single spaces only, so blank lines or repeated spaces
# produce empty-string tokens.
linesA1, linesB1, linesC1 = (
    lines.flatMap(lambda text: text.split(" "))
    for lines in (linesA, linesB, linesC)
)

In [53]:
# Concatenate the three word RDDs: first A with B, then the result with C.
linesAB = sc.union([linesA1, linesB1])
linesABC = sc.union([linesAB, linesC1])

In [55]:
# The words were already flattened by flatMap before the unions, so no further
# transformation is needed; the former identity map (lambda y: y) only added a
# no-op pass over the data. flatData is kept as the name of the word RDD.
flatData = linesABC
flatData.count()

15