# Data Frames to/from RDDs

In [9]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *

In [10]:
# Obtain the SparkSession used throughout this notebook; getOrCreate() reuses
# an already-running session instead of starting a second one.
spark = SparkSession.builder.appName("Word Count").getOrCreate()

In [11]:
# Load every .txt file matching the glob as text; the resulting DataFrame has
# a single string column named `value` (see the schema printed below).
lines = spark.read.text("../word-count/input/*.txt")

In [12]:
# Confirm the column layout of the freshly loaded DataFrame.
lines.printSchema()

root
 |-- value: string (nullable = true)



In [13]:
# Tokenise each line: trim surrounding whitespace, split on a single space,
# then explode so that every token gets its own row in a new `words` column
# (the original `value` column is kept alongside it).
# NOTE: a blank input line still yields one empty-string token.
words = lines.withColumn(
    "words",
    explode(split(trim(col("value")), " ")),
)

In [14]:
# Peek at the first two rows; long strings are truncated in the display.
words.show(2)

+--------------------+------+
|               value| words|
+--------------------+------+
|                    |      |
|***The Project Gu...|***The|
+--------------------+------+
only showing top 2 rows



In [15]:
# Count how many times each distinct token occurs; the result has the columns
# `words` and `count`.
wordcounts = (
    words
    .groupBy("words")
    .count()
)

In [18]:
# DataFrame -> RDD: the .rdd attribute exposes the DataFrame as an RDD of Row objects.
rdd1 = wordcounts.rdd

In [19]:
# Fetch the first 10 elements as a local Python list.
result = rdd1.take(10)

In [20]:
# Each element is a Row that still carries the field names `words` and `count`.
print(result)

[Row(words='By', count=1279), Row(words='Volume', count=13), Row(words='still', count=1617), Row(words='those', count=2165), Row(words='some', count=3835), Row(words='cit.', count=20), Row(words='waters', count=132), Row(words='reddish', count=21), Row(words='art', count=982), Row(words='connected', count=262)]


In [21]:
# Convert every Row into a plain tuple, dropping the field names.
rdd2 = wordcounts.rdd.map(tuple)

In [22]:
# Fetch the first 10 tuples as a local Python list.
result = rdd2.take(10)

In [23]:
# Elements are now (word, count) tuples rather than Row objects.
print(result)

[('By', 1279), ('Volume', 13), ('still', 1617), ('those', 2165), ('some', 3835), ('cit.', 20), ('waters', 132), ('reddish', 21), ('art', 982), ('connected', 262)]


In [24]:
# Same idea as the tuple conversion, but each Row becomes a list.
rdd3 = wordcounts.rdd.map(list)

In [25]:
# Fetch the first 10 lists as a local Python list.
result = rdd3.take(10)

In [26]:
# Elements are now [word, count] lists.
print(result)

[['By', 1279], ['Volume', 13], ['still', 1617], ['those', 2165], ['some', 3835], ['cit.', 20], ['waters', 132], ['reddish', 21], ['art', 982], ['connected', 262]]


We can also flatten the rows into a simple RDD of individual values (although that isn't useful in this example).

In [27]:
# flatMap concatenates the per-row lists, so the resulting RDD holds the words
# and their counts as alternating individual elements instead of one record per row.
rdd4 = wordcounts.rdd.flatMap(list)

In [28]:
# Ten flattened elements correspond to only five (word, count) pairs.
result = rdd4.take(10)

In [29]:
# Words and counts appear interleaved in a single flat list.
print(result)

['By', 1279, 'Volume', 13, 'still', 1617, 'those', 2165, 'some', 3835]


We can also create a DataFrame from an RDD, using either `.toDF()` (the schema is inferred)
or the `createDataFrame` method of the SparkSession.

In [31]:
# RDD -> DataFrame: rdd1 holds Row objects, and no explicit schema is passed,
# so the column names and types are inferred from the rows.
newDF = spark.createDataFrame(rdd1)

In [32]:
# Fetch the first 10 rows of the rebuilt DataFrame as a local Python list.
result = newDF.take(10)

In [33]:
# The rows carry the same `words` / `count` fields as the original DataFrame.
print(result)

[Row(words='By', count=1279), Row(words='Volume', count=13), Row(words='still', count=1617), Row(words='those', count=2165), Row(words='some', count=3835), Row(words='cit.', count=20), Row(words='waters', count=132), Row(words='reddish', count=21), Row(words='art', count=982), Row(words='connected', count=262)]
