# RDD - Resilient Distributed Dataset

In [1]:
#### First import modules and start spark context.

try:
    sc.stop()
except:
    pass

from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession

In [2]:
sc=SparkContext()
spark = SparkSession(sparkContext=sc)

In [7]:
#from list
List = list(range(1,4))
print("List %s" %(List))

rdd = sc.parallelize(List)
print("RDD List %s" %(rdd.collect()))

List [1, 2, 3]
RDD List [1, 2, 3]


In [8]:
Tuple = ("ashok","kumar","rama","krish")
rdd = sc.parallelize(Tuple)
rdd.collect()

['ashok', 'kumar', 'rama', 'krish']

In [9]:
Set = set(Tuple)
rdd = sc.parallelize(Set)
rdd.collect()

['rama', 'krish', 'kumar', 'ashok']

In [11]:
s = {'cat', 'dog', 'fish', 'cat', 'dog', 'dog'}
s

{'cat', 'dog', 'fish'}

In [12]:
rdd = sc.parallelize(s)
rdd.collect()

['dog', 'fish', 'cat']

In [13]:
list_t = [('cat', 'dog', 'fish'), ('orange', 'apple')]
rdd = sc.parallelize(list_t)
rdd.collect()

[('cat', 'dog', 'fish'), ('orange', 'apple')]

In [14]:
employee={}
employee['firstname']="Ashokkumar"
employee['lastname']="Ramakrishnan"
employee['gender']='Male'

rdd = sc.parallelize(employee)
rdd.collect()

#### When it is a dict, only the keys are used to form the RDD.

['firstname', 'lastname', 'gender']

In [23]:
# read csv file 
mtcarsfile = "../../data/mtcars.csv"
rdd = sc.textFile(mtcarsfile)
rdd.take(5)

['model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb',
 'Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4',
 'Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4',
 'Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1',
 'Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1']

In [22]:
# read a txt file
rdd = sc.textFile('../../data/twitter.txt')
rdd.take(5)

['Fresh install of XP on new computer. Sweet relief! fuck vista 1018769417 1.0 Well. Now I know where to go when I want my knives. #ChiChevySXSW http://post.ly/RvDl 10284216536 1.0 “Literally six weeks before I can take off “”SSC Chair”” off my email. Its like the torturous 4th mile before everything stops hurting.” 10298589026 1.0 Mitsubishi i MiEV - Wikipedia, the free encyclopedia - http://goo.gl/xipe Cutest car ever! 109017669432377344 1.0 ‘Cheap Eats in SLP’ - http://t.co/4w8gRp7 109642968603963392 1.0 Teenage Mutant Ninja Turtle art is never a bad thing... http://bit.ly/aDMHyW 10995492579 1.0 New demographic survey of online video viewers: http://bit.ly/cx8b7I via @KellyOlexa 11713360136 1.0 hi all - i’m going to be tweeting things lookstat at the @lookstat twitter account. please follow me there 1208319583 1.0 Holy carp, no. That movie will seriously suffer for it. RT @MouseInfo: Anyone excited for The Little Mermaid in 3D? 121330835726155776 1.0 “Did I really need to learn “”I 

# DataFrame

## Create a DataFrame object

### Creat DataFrame by reading a file

In [24]:
mtcars = spark.read.csv(mtcarsfile,sep=",",
                        header=True, comment=None,
                        inferSchema=True,encoding='UTF-8',
                       )

In [26]:
mtcars.show(5)

+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|            model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|        Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|    Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 5 rows



In [27]:
mtcars.show(n=5,truncate=False)

+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|model            |mpg |cyl|disp |hp |drat|wt   |qsec |vs |am |gear|carb|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|Mazda RX4        |21.0|6  |160.0|110|3.9 |2.62 |16.46|0  |1  |4   |4   |
|Mazda RX4 Wag    |21.0|6  |160.0|110|3.9 |2.875|17.02|0  |1  |4   |4   |
|Datsun 710       |22.8|4  |108.0|93 |3.85|2.32 |18.61|1  |1  |4   |1   |
|Hornet 4 Drive   |21.4|6  |258.0|110|3.08|3.215|19.44|1  |0  |3   |1   |
|Hornet Sportabout|18.7|8  |360.0|175|3.15|3.44 |17.02|0  |0  |3   |2   |
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 5 rows



In [29]:
mtcars.collect()[0]

Row(model='Mazda RX4', mpg=21.0, cyl=6, disp=160.0, hp=110, drat=3.9, wt=2.62, qsec=16.46, vs=0, am=1, gear=4, carb=4)

### Create DataFrame with createDataFrame function

#### From an RDD

In [30]:
from pyspark.sql import Row

rdd = sc.parallelize([
    Row(x=[1,2,3],y=['a','b','c']),
    Row(x=[4,5,6],y=['e','f','g']),
])

rdd.collect()

[Row(x=[1, 2, 3], y=['a', 'b', 'c']), Row(x=[4, 5, 6], y=['e', 'f', 'g'])]

In [31]:
df = spark.createDataFrame(rdd)
df.show(5)

+---------+---------+
|        x|        y|
+---------+---------+
|[1, 2, 3]|[a, b, c]|
|[4, 5, 6]|[e, f, g]|
+---------+---------+



In [32]:
import string
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [36]:
alphabets = [[y,x+1] for x,y in enumerate(string.ascii_lowercase) ] 
alphabets[:5]

[['a', 1], ['b', 2], ['c', 3], ['d', 4], ['e', 5]]

In [38]:
df = spark.createDataFrame(alphabets,['letter','number'])
df.show(5,truncate=False)

+------+------+
|letter|number|
+------+------+
|a     |1     |
|b     |2     |
|c     |3     |
|d     |4     |
|e     |5     |
+------+------+
only showing top 5 rows



In [39]:

df.dtypes

[('letter', 'string'), ('number', 'bigint')]

In [43]:
SPARK=spark.createDataFrame([employee])

In [44]:
SPARK.show()

+----------+------+------------+
| firstname|gender|    lastname|
+----------+------+------------+
|Ashokkumar|  Male|Ramakrishnan|
+----------+------+------------+



In [45]:
SPARK.collect()

[Row(firstname='Ashokkumar', gender='Male', lastname='Ramakrishnan')]

In [46]:
df = spark.createDataFrame(alphabets)
df.show(5,truncate=False)

+---+---+
|_1 |_2 |
+---+---+
|a  |1  |
|b  |2  |
|c  |3  |
|d  |4  |
|e  |5  |
+---+---+
only showing top 5 rows



In [48]:
def collect_show(df):
    print("""
    Printing collect \n
    %s
    \n
    Printing show \n
    %s
    \n
    """ %(df.collect(),df.show()))
my_list = [(['a', 1], ['b', 2])]
df = spark.createDataFrame(my_list, ['x', 'y'])
collect_show(df)

+------+------+
|     x|     y|
+------+------+
|[a, 1]|[b, 2]|
+------+------+


    Printing collect 

    [Row(x=['a', '1'], y=['b', '2'])]
    

    Printing show 

    None
    

    


## Column instance
### Column instances can be created in two ways:

directly select a column out of a DataFrame: df.colName
create from a column expression: df.colName + 1
Technically, there is only one way to create a column instance. Column expressions start from a column instance.

#### Remember how to create column instances, because this is usually the starting point if we want to operate DataFrame columns.

The column classes come with some methods that can operate on a column instance. ```However, almost all functions from the pyspark.sql.functions module take one or more column instances as argument(s)```. These functions are important for data manipulation tools.

## DataFrame column methods
### Methods that take column names as arguments:
* corr(col1, col2): two column names.
* cov(col1, col2): two column names.
* crosstab(col1, col2): two column names.
* describe(*cols): `cols` refers to only column names (strings).*
### Methods that take column names or column expressions or both as arguments:
* cube(*cols): column names (string) or column expressions or both.
* drop(*cols): a list of column names OR a single column expression.
* groupBy(*cols): column name (string) or column expression or both.
* rollup(*cols): column name (string) or column expression or both.
* select(*cols): column name (string) or column expression or both.
* sort(*cols, **kwargs): column name (string) or column expression or both.
* sortWithinPartitions(*cols, **kwargs): column name (string) or column expression or both.
* orderBy(*cols, **kwargs): column name (string) or column expression or both.
* sampleBy(col, fractions, sed=None): a column name.
* toDF(*cols): a list of column names (string).
* withColumn(colName, col): colName refers to column name; col refers to a column expression.
* withColumnRenamed(existing, new): takes column names as arguments.
* filter(condition): *condition refers to a column expression that returns types.BooleanType of values.

In [51]:
mtcars.count()

32

In [52]:
mtcars.show(5)

+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|            model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|        Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|    Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|       Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 5 rows



In [54]:
mtcars.corr("disp","hp")

0.7909485863698064

In [55]:
mtcars.cov("disp","hp")

6721.158669354838

In [56]:
mtcars.crosstab("disp","hp")

DataFrame[disp_hp: string, 105: bigint, 109: bigint, 110: bigint, 113: bigint, 123: bigint, 150: bigint, 175: bigint, 180: bigint, 205: bigint, 215: bigint, 230: bigint, 245: bigint, 264: bigint, 335: bigint, 52: bigint, 62: bigint, 65: bigint, 66: bigint, 91: bigint, 93: bigint, 95: bigint, 97: bigint]

In [58]:
mtcars.show(4)

+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|         model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|     Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
| Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|    Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
|Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 4 rows



In [60]:
mtcars.orderBy("model").show(4)

+------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|             model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|       AMC Javelin|15.2|  8|304.0|150|3.15|3.435| 17.3|  0|  0|   3|   2|
|Cadillac Fleetwood|10.4|  8|472.0|205|2.93| 5.25|17.98|  0|  0|   3|   4|
|        Camaro Z28|13.3|  8|350.0|245|3.73| 3.84|15.41|  0|  0|   3|   4|
| Chrysler Imperial|14.7|  8|440.0|230|3.23|5.345|17.42|  0|  0|   3|   4|
+------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 4 rows



In [68]:
mtcars.drop('carb').show(4)

+--------------+----+---+-----+---+----+-----+-----+---+---+----+
|         model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+
|     Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|
| Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|
|    Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|
|Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+
only showing top 4 rows



In [73]:
row_group_cars = mtcars.groupBy().avg().collect()
row_group_cars

[Row(avg(mpg)=20.090624999999996, avg(cyl)=6.1875, avg(disp)=230.72187500000004, avg(hp)=146.6875, avg(drat)=3.5965625000000006, avg(wt)=3.2172499999999995, avg(qsec)=17.848750000000003, avg(vs)=0.4375, avg(am)=0.40625, avg(gear)=3.6875, avg(carb)=2.8125)]

In [76]:
len(row_group_cars[0])

11

In [97]:
row_group_cars[0].asDict().items()

dict_items([('avg(mpg)', 20.090624999999996), ('avg(cyl)', 6.1875), ('avg(disp)', 230.72187500000004), ('avg(hp)', 146.6875), ('avg(drat)', 3.5965625000000006), ('avg(wt)', 3.2172499999999995), ('avg(qsec)', 17.848750000000003), ('avg(vs)', 0.4375), ('avg(am)', 0.40625), ('avg(gear)', 3.6875), ('avg(carb)', 2.8125)])

In [99]:
mtcars.select('*').show(2)

+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|        model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|    Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|
+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 2 rows



In [100]:
mtcars.select("model","mpg").show(2)

+-------------+----+
|        model| mpg|
+-------------+----+
|    Mazda RX4|21.0|
|Mazda RX4 Wag|21.0|
+-------------+----+
only showing top 2 rows



In [101]:
mtcars.select(mtcars.model, mtcars.mpg).show(2)

+-------------+----+
|        model| mpg|
+-------------+----+
|    Mazda RX4|21.0|
|Mazda RX4 Wag|21.0|
+-------------+----+
only showing top 2 rows



In [102]:
mtcars.select(mtcars.model, (mtcars.mpg+10).alias('MPG')).show(2)

+-------------+----+
|        model| MPG|
+-------------+----+
|    Mazda RX4|31.0|
|Mazda RX4 Wag|31.0|
+-------------+----+
only showing top 2 rows



In [106]:
mtcars.sort(mtcars.model.desc()).show(4)

+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|         model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|    Volvo 142E|21.4|  4|121.0|109|4.11| 2.78| 18.6|  1|  1|   4|   2|
|       Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|
| Toyota Corona|21.5|  4|120.1| 97| 3.7|2.465|20.01|  1|  0|   3|   1|
|Toyota Corolla|33.9|  4| 71.1| 65|4.22|1.835| 19.9|  1|  1|   4|   1|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 4 rows



In [108]:
mtcars.sort("model", ascending=False).show(3)

+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|        model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|   Volvo 142E|21.4|  4|121.0|109|4.11| 2.78| 18.6|  1|  1|   4|   2|
|      Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|
|Toyota Corona|21.5|  4|120.1| 97| 3.7|2.465|20.01|  1|  0|   3|   1|
+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 3 rows



In [110]:
mtcars.orderBy(["model","cyl"],ascending=[1,0]).show(3)

+------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|             model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|       AMC Javelin|15.2|  8|304.0|150|3.15|3.435| 17.3|  0|  0|   3|   2|
|Cadillac Fleetwood|10.4|  8|472.0|205|2.93| 5.25|17.98|  0|  0|   3|   4|
|        Camaro Z28|13.3|  8|350.0|245|3.73| 3.84|15.41|  0|  0|   3|   4|
+------------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 3 rows



In [111]:
mtcars.orderBy(["model","cyl"],ascending=[0,1]).show(3)

+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|        model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+
|   Volvo 142E|21.4|  4|121.0|109|4.11| 2.78| 18.6|  1|  1|   4|   2|
|      Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1|
|Toyota Corona|21.5|  4|120.1| 97| 3.7|2.465|20.01|  1|  0|   3|   1|
+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+
only showing top 3 rows



In [115]:
mtcars.sortWithinPartitions("model",ascending=False).count()

32

In [116]:
mtcars.count()

32

In [118]:
mtcars.sortWithinPartitions("mpg",ascending=False).show(4)

+--------------+----+---+----+---+----+-----+-----+---+---+----+----+
|         model| mpg|cyl|disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+--------------+----+---+----+---+----+-----+-----+---+---+----+----+
|Toyota Corolla|33.9|  4|71.1| 65|4.22|1.835| 19.9|  1|  1|   4|   1|
|      Fiat 128|32.4|  4|78.7| 66|4.08|  2.2|19.47|  1|  1|   4|   1|
|   Honda Civic|30.4|  4|75.7| 52|4.93|1.615|18.52|  1|  1|   4|   2|
|  Lotus Europa|30.4|  4|95.1|113|3.77|1.513| 16.9|  1|  1|   5|   2|
+--------------+----+---+----+---+----+-----+-----+---+---+----+----+
only showing top 4 rows



In [119]:
df = spark.createDataFrame([("2016-03-11 09:00:07", 1)]).toDF("date", "val")

In [120]:
df.show()

+-------------------+---+
|               date|val|
+-------------------+---+
|2016-03-11 09:00:07|  1|
+-------------------+---+



In [129]:
mtcars.groupBy("cyl").sum().collect()

[Row(cyl=6, sum(mpg)=138.2, sum(cyl)=42, sum(disp)=1283.2, sum(hp)=856, sum(drat)=25.099999999999998, sum(wt)=21.82, sum(qsec)=125.84, sum(vs)=4, sum(am)=3, sum(gear)=27, sum(carb)=24),
 Row(cyl=4, sum(mpg)=293.3, sum(cyl)=44, sum(disp)=1156.5, sum(hp)=909, sum(drat)=44.78, sum(wt)=25.143, sum(qsec)=210.50999999999996, sum(vs)=10, sum(am)=8, sum(gear)=45, sum(carb)=17),
 Row(cyl=8, sum(mpg)=211.40000000000003, sum(cyl)=112, sum(disp)=4943.4, sum(hp)=2929, sum(drat)=45.209999999999994, sum(wt)=55.98900000000001, sum(qsec)=234.81, sum(vs)=0, sum(am)=2, sum(gear)=46, sum(carb)=49)]

In [130]:
mtcars.groupBy("cyl").agg({"*":"count"}).show()

+---+--------+
|cyl|count(1)|
+---+--------+
|  6|       7|
|  4|      11|
|  8|      14|
+---+--------+



In [132]:
from pyspark.sql.functions import split

In [139]:
mtcars.select(split(mtcars.model," ")[0].alias('model')).groupBy("model").agg({"*":"count"}).show()

+--------+--------+
|   model|count(1)|
+--------+--------+
|  Hornet|       2|
|Maserati|       1|
|   Volvo|       1|
|   Honda|       1|
|Cadillac|       1|
| Pontiac|       1|
| Lincoln|       1|
|Chrysler|       1|
|  Camaro|       1|
|     AMC|       1|
|  Datsun|       1|
| Ferrari|       1|
|  Duster|       1|
| Porsche|       1|
| Valiant|       1|
|   Lotus|       1|
|  Toyota|       2|
|   Dodge|       1|
|   Mazda|       2|
|    Fiat|       2|
+--------+--------+
only showing top 20 rows



In [143]:
mtcars.select(split(mtcars.model," ")[0].alias('model')).show(4)

+------+
| model|
+------+
| Mazda|
| Mazda|
|Datsun|
|Hornet|
+------+
only showing top 4 rows



In [151]:
mtcars_1=mtcars.withColumn("Cars",split(mtcars.model," ")[0])

In [152]:
mtcars_1.show(3)

+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|        model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|  Cars|
+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|    Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4| Mazda|
|Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4| Mazda|
|   Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|Datsun|
+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
only showing top 3 rows



In [157]:
mtcars_1.filter(mtcars_1['cyl'] > 6).show(3)

+-----------------+----+---+-----+---+----+----+-----+---+---+----+----+------+
|            model| mpg|cyl| disp| hp|drat|  wt| qsec| vs| am|gear|carb|  Cars|
+-----------------+----+---+-----+---+----+----+-----+---+---+----+----+------+
|Hornet Sportabout|18.7|  8|360.0|175|3.15|3.44|17.02|  0|  0|   3|   2|Hornet|
|       Duster 360|14.3|  8|360.0|245|3.21|3.57|15.84|  0|  0|   3|   4|Duster|
|       Merc 450SE|16.4|  8|275.8|180|3.07|4.07| 17.4|  0|  0|   3|   3|  Merc|
+-----------------+----+---+-----+---+----+----+-----+---+---+----+----+------+
only showing top 3 rows



In [158]:
mtcars_1.filter(mtcars_1.Cars == "Hornet").show(3)

+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|            model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|  Cars|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|   Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|Hornet|
|Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|Hornet|
+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+



In [165]:
mtcars_1.show(3)

+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|        model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|  Cars|
+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|    Mazda RX4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4| Mazda|
|Mazda RX4 Wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4| Mazda|
|   Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|Datsun|
+-------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
only showing top 3 rows



In [166]:
mtcars_1.na.replace(160.0,161).show(4)

+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|         model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|  Cars|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
|     Mazda RX4|21.0|  6|161.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4| Mazda|
| Mazda RX4 Wag|21.0|  6|161.0|110| 3.9|2.875|17.02|  0|  1|   4|   4| Mazda|
|    Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|Datsun|
|Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|Hornet|
+--------------+----+---+-----+---+----+-----+-----+---+---+----+----+------+
only showing top 4 rows



In [167]:
mtcars_1.na.replace(['Mazda RX4','Mazda RX4 Wag'],['mazda rx4','mazda rx4 wag'], 'model').show()

+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+--------+
|              model| mpg|cyl| disp| hp|drat|   wt| qsec| vs| am|gear|carb|    Cars|
+-------------------+----+---+-----+---+----+-----+-----+---+---+----+----+--------+
|          mazda rx4|21.0|  6|160.0|110| 3.9| 2.62|16.46|  0|  1|   4|   4|   Mazda|
|      mazda rx4 wag|21.0|  6|160.0|110| 3.9|2.875|17.02|  0|  1|   4|   4|   Mazda|
|         Datsun 710|22.8|  4|108.0| 93|3.85| 2.32|18.61|  1|  1|   4|   1|  Datsun|
|     Hornet 4 Drive|21.4|  6|258.0|110|3.08|3.215|19.44|  1|  0|   3|   1|  Hornet|
|  Hornet Sportabout|18.7|  8|360.0|175|3.15| 3.44|17.02|  0|  0|   3|   2|  Hornet|
|            Valiant|18.1|  6|225.0|105|2.76| 3.46|20.22|  1|  0|   3|   1| Valiant|
|         Duster 360|14.3|  8|360.0|245|3.21| 3.57|15.84|  0|  0|   3|   4|  Duster|
|          Merc 240D|24.4|  4|146.7| 62|3.69| 3.19| 20.0|  1|  0|   4|   2|    Merc|
|           Merc 230|22.8|  4|140.8| 95|3.92| 3.15| 22.9|  1|  0|

### DataFrame to RDD
A **DataFrame** can be easily converted to an **RDD** by calling the `pyspark.sql.DataFrame.rdd()` function. Each element in the returned RDD is an *pyspark.sql.Row* object. An Row is a list of key-value pairs.



In [168]:
mtcars.rdd.take(2)

[Row(model='Mazda RX4', mpg=21.0, cyl=6, disp=160.0, hp=110, drat=3.9, wt=2.62, qsec=16.46, vs=0, am=1, gear=4, carb=4),
 Row(model='Mazda RX4 Wag', mpg=21.0, cyl=6, disp=160.0, hp=110, drat=3.9, wt=2.875, qsec=17.02, vs=0, am=1, gear=4, carb=4)]

With an RDD object, we can apply a set of mapping functions, such as **map, mapValues, flatMap, flatMapValues** and a lot of other methods that come from RDD.

In [171]:
mtcars_map = mtcars.rdd.map(lambda x:(x.model,x.mpg))
mtcars_map.take(3)

[('Mazda RX4', 21.0), ('Mazda RX4 Wag', 21.0), ('Datsun 710', 22.8)]

In [172]:
mtcars_map = mtcars.rdd.map(lambda x:(x['model'],x['mpg']))
mtcars_map.take(3)

[('Mazda RX4', 21.0), ('Mazda RX4 Wag', 21.0), ('Datsun 710', 22.8)]

In [173]:
mtcars_mapvalues = mtcars_map.mapValues(lambda x: [x, x * 10])
mtcars_mapvalues.take(5)

[('Mazda RX4', [21.0, 210.0]),
 ('Mazda RX4 Wag', [21.0, 210.0]),
 ('Datsun 710', [22.8, 228.0]),
 ('Hornet 4 Drive', [21.4, 214.0]),
 ('Hornet Sportabout', [18.7, 187.0])]

### RDD to DataFrame
To convert an RDD to a DataFrame, we can use the SparkSession.createDataFrame() function. Every element in the **RDD has be to an Row object.

In [174]:
rdd_raw = sc.textFile('../../data/mtcars.csv')
rdd_raw.take(5)

['model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb',
 'Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4',
 'Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4',
 'Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1',
 'Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1']

In [176]:
rdd_raw.map(lambda x: x.split(',')).take(1)

[['model',
  'mpg',
  'cyl',
  'disp',
  'hp',
  'drat',
  'wt',
  'qsec',
  'vs',
  'am',
  'gear',
  'carb']]

In [179]:
rdd_raw_split_map = rdd_raw.map(lambda x: x.split(','))

In [182]:
header = rdd_raw.map(lambda x: x.split(',')).filter(lambda x: x[1] == 'mpg').collect()[0]
rdd = rdd_raw_split_map.filter(lambda x:x[0] != 'model')
print("Header :", header)
print(rdd.take(1))

Header : ['model', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']
[['Mazda RX4', '21', '6', '160', '110', '3.9', '2.62', '16.46', '0', '1', '4', '4']]


#### Convert RDD elements to RDD Row objects
First we define a function which takes a list of column names and a list of values and create a Row of key-value pairs. Since keys in an Row object are variable names, we can’t simply pass a dictionary to the Row() function. We can think of a dictionary as an argument list and use the ** to unpack the argument list.

See an example.

In [183]:
from pyspark.sql import Row

In [185]:
abcd=string.ascii_lowercase
my_dict = dict(
                zip(
                    list(abcd), 
                    range(1,len(abcd)+1)
                    )
              )

In [187]:
Row(**my_dict)

Row(a=1, b=2, c=3, d=4, e=5, f=6, g=7, h=8, i=9, j=10, k=11, l=12, m=13, n=14, o=15, p=16, q=17, r=18, s=19, t=20, u=21, v=22, w=23, x=24, y=25, z=26)

In [188]:
def list_to_row(keys, values):
    row_dict = dict(zip(keys, values))
    return Row(**row_dict)

In [189]:
rdd_rows = rdd.map(lambda x:list_to_row(header,x))
rdd_rows.take(2)

[Row(am='1', carb='4', cyl='6', disp='160', drat='3.9', gear='4', hp='110', model='Mazda RX4', mpg='21', qsec='16.46', vs='0', wt='2.62'),
 Row(am='1', carb='4', cyl='6', disp='160', drat='3.9', gear='4', hp='110', model='Mazda RX4 Wag', mpg='21', qsec='17.02', vs='0', wt='2.875')]

In [190]:
df = spark.createDataFrame(rdd_rows)
df.show(5)

+---+----+---+----+----+----+---+-----------------+----+-----+---+-----+
| am|carb|cyl|disp|drat|gear| hp|            model| mpg| qsec| vs|   wt|
+---+----+---+----+----+----+---+-----------------+----+-----+---+-----+
|  1|   4|  6| 160| 3.9|   4|110|        Mazda RX4|  21|16.46|  0| 2.62|
|  1|   4|  6| 160| 3.9|   4|110|    Mazda RX4 Wag|  21|17.02|  0|2.875|
|  1|   1|  4| 108|3.85|   4| 93|       Datsun 710|22.8|18.61|  1| 2.32|
|  0|   1|  6| 258|3.08|   3|110|   Hornet 4 Drive|21.4|19.44|  1|3.215|
|  0|   2|  8| 360|3.15|   3|175|Hornet Sportabout|18.7|17.02|  0| 3.44|
+---+----+---+----+----+----+---+-----------------+----+-----+---+-----+
only showing top 5 rows



In [191]:
spark.createDataFrame(rdd, header).show(3)

+-------------+----+---+----+---+----+-----+-----+---+---+----+----+
|        model| mpg|cyl|disp| hp|drat|   wt| qsec| vs| am|gear|carb|
+-------------+----+---+----+---+----+-----+-----+---+---+----+----+
|    Mazda RX4|  21|  6| 160|110| 3.9| 2.62|16.46|  0|  1|   4|   4|
|Mazda RX4 Wag|  21|  6| 160|110| 3.9|2.875|17.02|  0|  1|   4|   4|
|   Datsun 710|22.8|  4| 108| 93|3.85| 2.32|18.61|  1|  1|   4|   1|
+-------------+----+---+----+---+----+-----+-----+---+---+----+----+
only showing top 3 rows



In [193]:
spark.createDataFrame(rdd, header).take(2)

[Row(model='Mazda RX4', mpg='21', cyl='6', disp='160', hp='110', drat='3.9', wt='2.62', qsec='16.46', vs='0', am='1', gear='4', carb='4'),
 Row(model='Mazda RX4 Wag', mpg='21', cyl='6', disp='160', hp='110', drat='3.9', wt='2.875', qsec='17.02', vs='0', am='1', gear='4', carb='4')]

### Merge multiple columns
We convert ***DataFrame to RDD*** and then apply the **map** ``function`` to merge values and convert elements to **Row** objects.

In [194]:
mtcars_rdd = mtcars.rdd.map(lambda x : Row(model=x[0], values = x[1:]))
mtcars_rdd.take(5)

[Row(model='Mazda RX4', values=(21.0, 6, 160.0, 110, 3.9, 2.62, 16.46, 0, 1, 4, 4)),
 Row(model='Mazda RX4 Wag', values=(21.0, 6, 160.0, 110, 3.9, 2.875, 17.02, 0, 1, 4, 4)),
 Row(model='Datsun 710', values=(22.8, 4, 108.0, 93, 3.85, 2.32, 18.61, 1, 1, 4, 1)),
 Row(model='Hornet 4 Drive', values=(21.4, 6, 258.0, 110, 3.08, 3.215, 19.44, 1, 0, 3, 1)),
 Row(model='Hornet Sportabout', values=(18.7, 8, 360.0, 175, 3.15, 3.44, 17.02, 0, 0, 3, 2))]

In [195]:
mtcars_df = spark.createDataFrame(mtcars_rdd)
mtcars_df.show(n=5, truncate=False)

+-----------------+-----------------------------------------------------+
|model            |values                                               |
+-----------------+-----------------------------------------------------+
|Mazda RX4        |[21.0, 6, 160.0, 110, 3.9, 2.62, 16.46, 0, 1, 4, 4]  |
|Mazda RX4 Wag    |[21.0, 6, 160.0, 110, 3.9, 2.875, 17.02, 0, 1, 4, 4] |
|Datsun 710       |[22.8, 4, 108.0, 93, 3.85, 2.32, 18.61, 1, 1, 4, 1]  |
|Hornet 4 Drive   |[21.4, 6, 258.0, 110, 3.08, 3.215, 19.44, 1, 0, 3, 1]|
|Hornet Sportabout|[18.7, 8, 360.0, 175, 3.15, 3.44, 17.02, 0, 0, 3, 2] |
+-----------------+-----------------------------------------------------+
only showing top 5 rows



### Split one column
We use the above DataFrame as our example data. Again, we need to convert the DataFrame to an RDD to achieve our goal.

Let's split the values column into two columns: x1 and x2. The first 4 values will be in column x1 and the remaining values will be in column x2.

In [198]:
mtcars_rdd_2 = mtcars.rdd.map(lambda x : Row(model=x[0], X = x[1:5], Y = x[5:] ))
mtcars_rdd_2.take(5)

[Row(X=(21.0, 6, 160.0, 110), Y=(3.9, 2.62, 16.46, 0, 1, 4, 4), model='Mazda RX4'),
 Row(X=(21.0, 6, 160.0, 110), Y=(3.9, 2.875, 17.02, 0, 1, 4, 4), model='Mazda RX4 Wag'),
 Row(X=(22.8, 4, 108.0, 93), Y=(3.85, 2.32, 18.61, 1, 1, 4, 1), model='Datsun 710'),
 Row(X=(21.4, 6, 258.0, 110), Y=(3.08, 3.215, 19.44, 1, 0, 3, 1), model='Hornet 4 Drive'),
 Row(X=(18.7, 8, 360.0, 175), Y=(3.15, 3.44, 17.02, 0, 0, 3, 2), model='Hornet Sportabout')]

In [199]:
mtcars_df_2 = spark.createDataFrame(mtcars_rdd_2)
mtcars_df_2.show(n=5, truncate=False)

+---------------------+--------------------------------+-----------------+
|X                    |Y                               |model            |
+---------------------+--------------------------------+-----------------+
|[21.0, 6, 160.0, 110]|[3.9, 2.62, 16.46, 0, 1, 4, 4]  |Mazda RX4        |
|[21.0, 6, 160.0, 110]|[3.9, 2.875, 17.02, 0, 1, 4, 4] |Mazda RX4 Wag    |
|[22.8, 4, 108.0, 93] |[3.85, 2.32, 18.61, 1, 1, 4, 1] |Datsun 710       |
|[21.4, 6, 258.0, 110]|[3.08, 3.215, 19.44, 1, 0, 3, 1]|Hornet 4 Drive   |
|[18.7, 8, 360.0, 175]|[3.15, 3.44, 17.02, 0, 0, 3, 2] |Hornet Sportabout|
+---------------------+--------------------------------+-----------------+
only showing top 5 rows

