In [24]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col, lit, when, desc, asc, cast, like
from pyspark.sql.types import *

# Create (or reuse) the SparkSession for this notebook.
# `SparkSession.builder` is the documented entry point for configuring a
# session; instantiating the nested `Builder` class directly is not.
# getOrCreate() returns an already-running session if one exists, so
# re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName('unoin(), unionAll() & unoinByName() --> allowMissingColumns = True')
    .getOrCreate()
)

24/06/17 12:28:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [25]:
# Column names shared by both sample datasets.
schema = ['name', 'age', 'salary']

# First dataset of (name, age, salary) rows.
data1 = [
    ('Ajay', 23, 3000),
    ('Rohit', 27, 2000),
    ('Hema', 26, 2000),
    ('Huedsad', 26, 1233),
]

# Second dataset: the first two names differ from data1, while the last two
# rows are exact duplicates of rows in data1.
data2 = [
    ('Ajay1', 23, 3000),
    ('Rohit1', 27, 2000),
    ('Hema', 26, 2000),
    ('Huedsad', 26, 1233),
]

# Build one DataFrame per dataset (df1 first, then df2) using the same
# column names for both.
df1, df2 = (spark.createDataFrame(rows, schema) for rows in (data1, data2))

df1.show()
df2.show()

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|   Ajay| 23|  3000|
|  Rohit| 27|  2000|
|   Hema| 26|  2000|
|Huedsad| 26|  1233|
+-------+---+------+

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|  Ajay1| 23|  3000|
| Rohit1| 27|  2000|
|   Hema| 26|  2000|
|Huedsad| 26|  1233|
+-------+---+------+



In [7]:
# union() on its own keeps duplicate rows; chaining distinct() removes the
# rows that appear in both DataFrames before displaying the result.
(
    df1
    .union(df2)
    .distinct()
    .show()
)



+-------+---+------+
|   name|age|salary|
+-------+---+------+
|   Ajay| 23|  3000|
|  Rohit| 27|  2000|
|   Hema| 26|  2000|
|Huedsad| 26|  1233|
|  Ajay1| 23|  3000|
| Rohit1| 27|  2000|
+-------+---+------+



                                                                                

#### unionAll() is an alias for union() in PySpark

In [12]:
# Append the rows of df2 to df1 and display the result; no distinct() is
# applied here, so rows present in both DataFrames are shown twice.
(
    df1
    .unionAll(df2)
    .show()
)



+-------+---+------+
|   name|age|salary|
+-------+---+------+
|   Ajay| 23|  3000|
|  Rohit| 27|  2000|
|   Hema| 26|  2000|
|Huedsad| 26|  1233|
|  Ajay1| 23|  3000|
| Rohit1| 27|  2000|
|   Hema| 26|  2000|
|Huedsad| 26|  1233|
+-------+---+------+



                                                                                

In [22]:
# Count how many rows of df1 fall into each distinct `age` value.
(
    df1
    .groupBy('age')
    .count()
    .show()
)

+---+-----+
|age|count|
+---+-----+
| 23|    1|
| 27|    1|
| 26|    2|
+---+-----+



#### unionByName() --> unions DataFrames by column name; with allowMissingColumns=True the schemas may differ

In [26]:
# The two datasets share `name` and `age` but differ in their third column:
# data1 carries a numeric salary, data2 carries a department string.
schema1 = ['name', 'age', 'salary']
schema2 = ['name', 'age', 'dept']

data1 = [
    ('Ajay', 23, 3000),
    ('Rohit', 27, 2000),
    ('Hema', 26, 2000),
    ('Huedsad', 26, 1233),
]
data2 = [
    ('Ajay', 23, 'Data'),
    ('Rohit', 27, 'Data'),
    ('Hema', 26, 'HR'),
    ('Huedsad', 26, 'PayRoll'),
]

# Pair each dataset with its own column names and build the DataFrames
# (df1 first, then df2). This rebinds df1/df2 from the earlier cells.
df1, df2 = (
    spark.createDataFrame(rows, columns)
    for rows, columns in ((data1, schema1), (data2, schema2))
)

df1.show()
df2.show()

                                                                                

+-------+---+------+
|   name|age|salary|
+-------+---+------+
|   Ajay| 23|  3000|
|  Rohit| 27|  2000|
|   Hema| 26|  2000|
|Huedsad| 26|  1233|
+-------+---+------+



[Stage 11:>                                                         (0 + 7) / 7]

+-------+---+-------+
|   name|age|   dept|
+-------+---+-------+
|   Ajay| 23|   Data|
|  Rohit| 27|   Data|
|   Hema| 26|     HR|
|Huedsad| 26|PayRoll|
+-------+---+-------+



                                                                                

In [31]:
# Union by column name rather than by position. With allowMissingColumns=True
# a column that exists in only one DataFrame (salary / dept) is kept and
# filled with NULL for the rows coming from the other DataFrame.
df1.unionByName(df2, allowMissingColumns=True).show()

+-------+---+------+-------+
|   name|age|salary|   dept|
+-------+---+------+-------+
|   Ajay| 23|  3000|   NULL|
|  Rohit| 27|  2000|   NULL|
|   Hema| 26|  2000|   NULL|
|Huedsad| 26|  1233|   NULL|
|   Ajay| 23|  NULL|   Data|
|  Rohit| 27|  NULL|   Data|
|   Hema| 26|  NULL|     HR|
|Huedsad| 26|  NULL|PayRoll|
+-------+---+------+-------+



In [44]:
# Stop the SparkSession now that the notebook is done with it.
spark.stop()