In [6]:
# Imports come first so the cell reads top-down and every name is bound
# before it is used.
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Stop a SparkContext left over from an earlier run of this cell before a new
# one is created below. On a fresh kernel `sc` does not exist yet, which is
# the only failure expected here; any other error should surface instead of
# being silently swallowed.
try:
    sc.stop()
except NameError:
    pass

sc = SparkContext()
spark = SparkSession(sparkContext=sc)

In [7]:
# Relative path to the mtcars CSV file that the cells below read from.
mtcarsfile="../../data/mtcars.csv"

In [8]:
# Load the CSV into a DataFrame: header=True takes the column names from the
# first line and inferSchema=True asks Spark to infer each column's type.
mtcars_df = spark.read.csv(mtcarsfile, inferSchema=True, header=True)

In [9]:
# Preview the first two rows of the DataFrame.
mtcars_df.show(2)

+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|        model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|    Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 2 rows



### Aggregate functions
Two aggregate functions:

* aggregate()
* aggregateByKey()
#### aggregate(zeroValue, seqOp, combOp)
* **zeroValue** is the initial value of the accumulator; it acts like a data container. Its structure should match the structure of the values returned by the seqOp function.
* **seqOp** is a function that takes two arguments: the first argument is the current accumulator (initially the zeroValue) and the second argument is an element from the RDD. The accumulator is replaced by the returned value after every call.
* **combOp** is a function that takes two arguments: the first argument is the final accumulator from one partition and the second is the final accumulator from another partition.
The code below calculates the total sum of squares (the sum of squared deviations from the mean) for **mpg** and **disp** in the mtcars data set.


In [22]:
# Step 1: get some data (the mtcars_df DataFrame).
# Step 2: calculate the averages of mpg and disp.

def _column_mean(df, column):
    """Return the mean of one DataFrame column, computed on its RDD."""
    values = df.select(column).rdd.map(lambda row: row[0])
    return values.mean()

mpg_mean = _column_mean(mtcars_df, 'mpg')
disp_mean = _column_mean(mtcars_df, 'disp')

report_template = """
   Mean of mpg  : {0}
   Mean of disp : {1}
"""
print(report_template.format(mpg_mean, disp_mean))


   Mean of mpg  : 20.090625000000003
   Mean of disp : 230.721875



In [23]:
# Step 3: build zeroValue, seqOp and combOp
# zeroValue is the starting accumulator: one running total for mpg and one
# for disp.
zeroValue = (0, 0)

In [24]:
# List every column name together with its positional index.
list(enumerate(mtcars_df.columns))

[(0, 'model'),
 (1, 'mpg'),
 (2, 'cyl'),
 (3, 'disp'),
 (4, 'hp'),
 (5, 'drat'),
 (6, 'wt'),
 (7, 'qsec'),
 (8, 'vs'),
 (9, 'am'),
 (10, 'gear'),
 (11, 'carb')]

In [29]:
# Keep only the two columns being aggregated, so that in every row
# index 0 is mpg and index 1 is disp.
mtcars_df_1 = mtcars_df.select(['mpg','disp'])
mtcars_df_1.show(2)
def seqOp(z, x):
    """Add one row's squared deviations from the means to the accumulator.

    z is the running (mpg, disp) pair of sums; x is a row whose first two
    fields are mpg and disp.
    """
    mpg_term = (x[0] - mpg_mean)**2
    disp_term = (x[1] - disp_mean)**2
    return (z[0] + mpg_term, z[1] + disp_term)


def combOp(px, py):
    """Merge two partial (mpg, disp) accumulators element-wise."""
    return (px[0] + py[0], px[1] + py[1])

+----+-----+
| mpg| disp|
+----+-----+
|21.0|160.0|
|21.0|160.0|
+----+-----+
only showing top 2 rows



In [30]:
# Run the aggregation; the result is the pair
# (sum of squared deviations of mpg, sum of squared deviations of disp).
mtcars_df_1.rdd.aggregate(zeroValue, seqOp, combOp)

(1126.0471874999998, 476184.7946875)

### aggregateByKey(zeroValue, seqOp, combOp)
This function does something similar to **aggregate()**. The difference is that **aggregate()** combines all elements into a single final result, whereas **aggregateByKey()** merges the results separately for each key.

In [31]:
# Read the iris CSV as an RDD of raw text lines and peek at the first two.
iris_rdd = sc.textFile('../../data/iris.csv')
iris_rdd.take(2)

['sepal_length,sepal_width,petal_length,petal_width,species',
 '5.1,3.5,1.4,0.2,setosa']

In [33]:
# Same read as the previous cell, this time passing use_unicode explicitly;
# the first two lines shown are unchanged.
iris_rdd = sc.textFile('../../data/iris.csv',use_unicode=True)
iris_rdd.take(2)

['sepal_length,sepal_width,petal_length,petal_width,species',
 '5.1,3.5,1.4,0.2,setosa']

In [37]:
# The first line of the file holds the column names; split it into a list.
header=iris_rdd.map(lambda x : x.split(',')).first()
header

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

In [41]:
# Split every line into its fields and drop the header row.
# The header list is bound as a default argument, so this lazily evaluated
# filter keeps comparing against the iris column names even though the global
# name `header` is reassigned later in this notebook (to the mtcars columns).
iris_row = (
    iris_rdd
    .map(lambda line: line.split(','))
    .filter(lambda fields, iris_header=header: fields != iris_header)
)
iris_row.take(2)

[['5.1', '3.5', '1.4', '0.2', 'setosa'],
 ['4.9', '3.0', '1.4', '0.2', 'setosa']]

In [45]:
# Turn each row into a (species, [numeric features]) pair: the last field is
# the key and the remaining fields are converted from strings to floats.
iris_row_tuple = iris_row.map(
    lambda fields: (fields[-1], [float(value) for value in fields[:-1]])
)

In [47]:
# Inspect the first four (species, features) pairs.
iris_row_tuple.take(4)

[('setosa', [5.1, 3.5, 1.4, 0.2]),
 ('setosa', [4.9, 3.0, 1.4, 0.2]),
 ('setosa', [4.7, 3.2, 1.3, 0.2]),
 ('setosa', [4.6, 3.1, 1.5, 0.2])]

### Define initial values, seqOp and combOp

In [48]:
# Starting accumulator: one running total per aggregated feature.
zero_value = (0, 0)


def seqOp(x, y):
    """Add the squares of the first two values of y to the accumulator x."""
    first_squared = (y[0])**2
    second_squared = (y[1])**2
    return (x[0] + first_squared, x[1] + second_squared)


def combOp(x, y):
    """Merge two partial accumulators element-wise."""
    return (x[0] + y[0], x[1] + y[1])

### Implement aggregateByKey()

In [50]:
# For each species (the key), sum the squares of the first two feature values
# (sepal_length and sepal_width) and collect the per-key results.
iris_row_tuple.aggregateByKey(zero_value, seqOp, combOp).collect()

[('setosa', (1259.0899999999997, 591.2500000000002)),
 ('versicolor', (1774.8600000000006, 388.4700000000001)),
 ('virginica', (2189.9000000000005, 447.33))]

### Map functions
These functions are probably the most commonly used functions when dealing with an RDD object.

* map()
* mapValues()
* flatMap()
* flatMapValues()

#### map
The map() method applies a function to each element of the RDD. Each element has to be a valid input to the function. The returned RDD has the function outputs as its new elements.

Elements in the RDD object mtcars_rdd below are rows of the mtcars data set in string format. We are going to apply the map() function multiple times to convert each string element into a list element. Each resulting element has two values: the first value is the auto model in string format; the second value is a list of numeric values.

In [51]:
# Preview the first four rows of the DataFrame.
mtcars_df.show(4)

+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|         model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|     Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
| Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|    Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 4 rows



In [53]:
# Column names of the mtcars DataFrame.
# NOTE(review): this rebinds `header`, which earlier in the notebook held the
# iris column names.
header = mtcars_df.columns
header

['model',
 'mpg',
 'cyl',
 'disp',
 'hp',
 'drat',
 'wt',
 'qsec',
 'vs',
 'am',
 'gear',
 'carb']

In [54]:
# Read the same CSV as an RDD of raw text lines (header line included) and
# peek at the first two.
mtcars_rdd = sc.textFile(mtcarsfile)
mtcars_rdd.take(2)

['model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb',
 'Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4']

In [55]:
# Split each comma-separated line into a list of string fields.
mtcars_rdd_map = mtcars_rdd.map(lambda x : x.split(','))

In [57]:
# Inspect the first two split rows; the first one is the header.
mtcars_rdd_map.take(2)

[['model',
  'mpg',
  'cyl',
  'disp',
  'hp',
  'drat',
  'wt',
  'qsec',
  'vs',
  'am',
  'gear',
  'carb'],
 ['Mazda RX4',
  '21',
  '6',
  '160',
  '110',
  '3.9',
  '2.62',
  '16.46',
  '0',
  '1',
  '4',
  '4']]

In [61]:
# The first record is the header row: keep it aside, then filter it out so
# that only data rows remain.
mtcars_rdd_map_header = mtcars_rdd_map.first()
print(mtcars_rdd_map_header)

mtcars_rdd_map_rows = mtcars_rdd_map.filter(
    lambda fields: fields != mtcars_rdd_map_header
)
first_two_rows = mtcars_rdd_map_rows.take(2)
print(first_two_rows)

['model', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']
[['Mazda RX4', '21', '6', '160', '110', '3.9', '2.62', '16.46', '0', '1', '4', '4'], ['Mazda RX4 Wag', '21', '6', '160', '110', '3.9', '2.875', '17.02', '0', '1', '4', '4']]


In [64]:
# Split the auto model (first field) from the remaining feature values and
# convert those values from strings to floats.
mtcars_rdd_map_rows_s_f = mtcars_rdd_map_rows.map(
    lambda fields: (fields[0], [float(value) for value in fields[1:]])
)

In [65]:
# Inspect the first (model, [numeric values]) pair.
mtcars_rdd_map_rows_s_f.take(1)

[('Mazda RX4',
  [21.0, 6.0, 160.0, 110.0, 3.9, 2.62, 16.46, 0.0, 1.0, 4.0, 4.0])]

### mapValues
The mapValues function requires that each element in the RDD has a **key/value** pair structure, for example, a tuple of 2 items, or a list of 2 items. The mapValues function applies a function to each of the element values. The element key will remain unchanged.

We can apply the mapValues function to the RDD object mtcars_rdd_map_rows_s_f created above, whose elements are (model, list of numeric values) pairs.

In [67]:
# The identity function leaves every value as it is, so the result matches
# the input RDD.
mtcars_rdd_map_rows_s_f.mapValues(lambda x:x).take(1)

[('Mazda RX4',
  [21.0, 6.0, 160.0, 110.0, 3.9, 2.62, 16.46, 0.0, 1.0, 4.0, 4.0])]

In [70]:
import numpy as np

In [75]:
# When using mapValues(), the x in the lambda below refers to the element
# value only (here the list of numeric features), not the element key.
# take(4) fetches just the first four results rather than collecting every
# element to the driver and slicing afterwards.
mtcars_rdd_map_rows_s_f.mapValues(lambda x:np.mean(x)).take(4)

[('Mazda RX4', 29.90727272727273),
 ('Mazda RX4 Wag', 29.98136363636364),
 ('Datsun 710', 23.59818181818182),
 ('Hornet 4 Drive', 38.73954545454546)]

### flatMap
This function first applies a function to each element of an RDD and then flattens the results. By passing the identity function, we can also use it simply to flatten the elements of an RDD without any extra operation on each element.

In [81]:
# A small list of tuples of differing lengths, used to demonstrate flattening.
x = [('a','b','c'),('d','e','e'),('f','g')]
x  # not the last expression of the cell, so it is not displayed
flapMap_x = sc.parallelize(x)
flapMap_x.collect()

[('a', 'b', 'c'), ('d', 'e', 'e'), ('f', 'g')]

In [83]:
# The identity function returns each tuple unchanged; flatMap then flattens
# the tuples into a single list of items.
flapMap_x.flatMap(lambda x:x).collect()

['a', 'b', 'c', 'd', 'e', 'e', 'f', 'g']

In [84]:
# Build a 3x3 array holding 1..9; parallelize() turns each row into one RDD
# element.
y = np.arange(1, 10).reshape((3, 3))
flapMap_y = sc.parallelize(y)
flapMap_y.collect()

[array([1, 2, 3]), array([4, 5, 6]), array([7, 8, 9])]

In [85]:
# Flatten the row arrays into a single list of numbers.
flapMap_y.flatMap(lambda y:y).collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9]

### flatMapValues
The flatMapValues function requires that each element in the RDD has a key/value pair structure. It applies a function to each element value of the RDD object and then flattens the results, pairing every flattened item with the original key.

For example, my raw data looks like the table below. I would like to transform the data so that it has three columns: the first column is the sample id; the second column is the type (A, B or C); the third column is the value.

 
|sample id|A    |B    |C    |
|---------|-----|-----|-----|
|1        |23   |28   |32   |
|2        |18   |29   |31   |
|3        |34   |21   |18   |   

In [90]:
# Each record is [sample id, (A, B, C)].
my_data = [
    [1, (23, 28, 32)],
    [2, (18, 29, 31)],
    [3, (34, 21, 18)],
]

flatMapValues_rdd = sc.parallelize(my_data)
print(flatMapValues_rdd.collect())

# Pair every value with its type label; flatMapValues then emits one
# (sample id, (type, value)) record per pair.
flatMapValues_rdd_1 = flatMapValues_rdd.flatMapValues(
    lambda values: list(zip('ABC', values))
)
flatMapValues_rdd_1.collect()

[[1, (23, 28, 32)], [2, (18, 29, 31)], [3, (34, 21, 18)]]


[(1, ('A', 23)),
 (1, ('B', 28)),
 (1, ('C', 32)),
 (2, ('A', 18)),
 (2, ('B', 29)),
 (2, ('C', 31)),
 (3, ('A', 34)),
 (3, ('B', 21)),
 (3, ('C', 18))]

In [103]:
def changefmv(x):
    """Return a flat list: x[0] followed by every item of x[1]."""
    return [x[0], *x[1]]
# Each value here is a (type, value) pair. Wrapping one field in a
# single-item list makes flatMapValues emit exactly one record per input
# record: first the type labels, then the numeric values.
print(flatMapValues_rdd_1.flatMapValues(lambda x: [x[0]]).collect())
print(flatMapValues_rdd_1.flatMapValues(lambda x: [x[1]]).collect())

[(1, 'A'), (1, 'B'), (1, 'C'), (2, 'A'), (2, 'B'), (2, 'C'), (3, 'A'), (3, 'B'), (3, 'C')]
[(1, 23), (1, 28), (1, 32), (2, 18), (2, 29), (2, 31), (3, 34), (3, 21), (3, 18)]


## First Data Check

In [111]:
def describe_columns(df):
    """Print summary statistics for every column of df, one column at a time.

    For each name in df.columns, a 'Column: <name>' line is printed and the
    describe() summary of that single column is shown.
    """
    for column_name in df.columns:
        print('Column: ' + column_name)
        column_summary = df.select(column_name).describe()
        column_summary.show()
# Summarize every column of the mtcars DataFrame.
describe_columns(mtcars_df)

Column: model
+-------+-----------+
|summary|      model|
+-------+-----------+
|  count|         32|
|   mean|       null|
| stddev|       null|
|    min|AMC Javelin|
|    max| Volvo 142E|
+-------+-----------+

Column: mpg
+-------+------------------+
|summary|               mpg|
+-------+------------------+
|  count|                32|
|   mean|20.090624999999996|
| stddev| 6.026948052089103|
|    min|              10.4|
|    max|              33.9|
+-------+------------------+

Column: cyl
+-------+------------------+
|summary|               cyl|
+-------+------------------+
|  count|                32|
|   mean|            6.1875|
| stddev|1.7859216469465444|
|    min|                 4|
|    max|                 8|
+-------+------------------+

Column: disp
+-------+------------------+
|summary|              disp|
+-------+------------------+
|  count|                32|
|   mean|230.72187500000004|
| stddev|123.93869383138195|
|    min|              71.1|
|    max|             4