In [1]:
import findspark
import pyspark
from pyspark.sql import Row
from pyspark.sql.functions import first

In [2]:
# findspark is initialised here, before the SparkContext is created.
# NOTE(review): this runs after `import pyspark` in the imports cell; findspark is
# usually initialised before pyspark is imported — confirm the intended order.
findspark.init()

# Reuse an already-running SparkContext instead of constructing a second one, so
# re-running this cell in a live kernel does not fail because a context already
# exists. On a fresh kernel this creates the same "local" context with the same
# application name as before.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster("local").setAppName("Spark Dataframe Union")
)

In [3]:
# SQL entry point bound to the SparkContext above; used later both to build
# DataFrames (createDataFrame) and to run SQL queries (sql).
# NOTE(review): on Spark 2.0+ SparkSession is the usual entry point — confirm
# whether SQLContext is kept here deliberately.
sqlContext = pyspark.SQLContext(sc)

In [4]:
def create_dataframe(l):
    """Build a DataFrame with ``name`` and ``age`` fields from a list of pairs.

    Each element of ``l`` is read positionally: item 0 becomes ``name`` and
    item 1 is passed through ``int()`` to become ``age``.

    The list is distributed with the notebook-level ``sc`` and the resulting
    rows are converted with the notebook-level ``sqlContext``.

    Returns whatever ``sqlContext.createDataFrame`` produces for those rows.
    """
    def to_row(pair):
        # One Row per input pair; age is coerced to int here.
        return Row(name=pair[0], age=int(pair[1]))

    return sqlContext.createDataFrame(sc.parallelize(l).map(to_row))

In [5]:
# First input set: (name, age) pairs, converted to a DataFrame by create_dataframe.
l1 = [('Molly',23),('Claire',22),('Jenna',20),('Heather',26)]
schemaGirls = create_dataframe(l1)

In [6]:
# Inspect the first DataFrame three ways: summary statistics per column,
# the inferred schema, and the rows themselves.
schemaGirls.describe().show()
schemaGirls.printSchema()
schemaGirls.show()

+-------+------------------+------+
|summary|               age|  name|
+-------+------------------+------+
|  count|                 4|     4|
|   mean|             22.75|  null|
| stddev|2.4999999999999996|  null|
|    min|                20|Claire|
|    max|                26| Molly|
+-------+------------------+------+

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

+---+-------+
|age|   name|
+---+-------+
| 23|  Molly|
| 22| Claire|
| 20|  Jenna|
| 26|Heather|
+---+-------+



In [7]:
# Second input set: (name, age) pairs, built with the same helper so both
# DataFrames come from the same Row construction.
l2 = [('Maxwell',25),('Connor',23),('Jake',21),('Hunter',27)]
schemaBoys = create_dataframe(l2)

In [8]:
# Same three views for the second DataFrame: summary statistics, schema, rows.
schemaBoys.describe().show()
schemaBoys.printSchema()
schemaBoys.show()

+-------+-----------------+-------+
|summary|              age|   name|
+-------+-----------------+-------+
|  count|                4|      4|
|   mean|             24.0|   null|
| stddev|2.581988897471611|   null|
|    min|               21| Connor|
|    max|               27|Maxwell|
+-------+-----------------+-------+

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

+---+-------+
|age|   name|
+---+-------+
| 25|Maxwell|
| 23| Connor|
| 21|   Jake|
| 27| Hunter|
+---+-------+



In [9]:
# Append the rows of schemaBoys to those of schemaGirls. Both frames were built by
# create_dataframe, so they have the same fields.
# NOTE(review): DataFrame.union is documented as resolving columns by position and
# keeping duplicate rows (SQL UNION ALL) — confirm that is the intent here.
allPeople = schemaGirls.union(schemaBoys)

In [10]:
# Verify the union: summary statistics, schema, and all rows of the combined frame.
allPeople.describe().show()
allPeople.printSchema()
allPeople.show()

+-------+-----------------+------+
|summary|              age|  name|
+-------+-----------------+------+
|  count|                8|     8|
|   mean|           23.375|  null|
| stddev|2.445841952609133|  null|
|    min|               20|Claire|
|    max|               27| Molly|
+-------+-----------------+------+

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

+---+-------+
|age|   name|
+---+-------+
| 23|  Molly|
| 22| Claire|
| 20|  Jenna|
| 26|Heather|
| 25|Maxwell|
| 23| Connor|
| 21|   Jake|
| 27| Hunter|
+---+-------+



In [11]:
# Project only the `name` column and display at most 5 rows.
allPeople.select('name').show(5)

+-------+
|   name|
+-------+
|  Molly|
| Claire|
|  Jenna|
|Heather|
|Maxwell|
+-------+
only showing top 5 rows



In [12]:
# Tuple of (number of age values, number of distinct age values); a difference
# between the two means at least one age occurs more than once.
allPeople.select('age').count(), allPeople.select('age').distinct().count()

(8, 7)

In [13]:
# Sort by age first, then by name to order rows that share the same age.
allPeople.orderBy('age', 'name').show()

+---+-------+
|age|   name|
+---+-------+
| 20|  Jenna|
| 21|   Jake|
| 22| Claire|
| 23| Connor|
| 23|  Molly|
| 25|Maxwell|
| 26|Heather|
| 27| Hunter|
+---+-------+



In [14]:
# Expose the combined DataFrame to SQL queries under the name `all_people_table`.
# createOrReplaceTempView is the replacement for registerTempTable, which has been
# deprecated since Spark 2.0; an existing view with this name is replaced, so the
# cell can be re-run safely.
allPeople.createOrReplaceTempView('all_people_table')
# Number of names recorded for each age.
sqlContext.sql('select age, count(name) from all_people_table group by age').show()

+---+-----------+
|age|count(name)|
+---+-----------+
| 26|          1|
| 22|          1|
| 25|          1|
| 27|          1|
| 21|          1|
| 23|          2|
| 20|          1|
+---+-----------+

