In [1]:
import pyspark
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: getOrCreate() hands back the session
# that is already running, or starts a new one under this application name.
spark = (
    SparkSession.builder
    .appName("PySparkLearning")
    .getOrCreate()
)

In [2]:
# First DataFrame: four employee records, one tuple per row.
# The values in each tuple line up positionally with the names in `columns1`.

simpleData1 = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
]

columns1 = ["employee_name", "department", "state", "salary", "age", "bonus"]

# Only column names are supplied, so the column types are inferred from the data.
df = spark.createDataFrame(simpleData1, schema=columns1)

# Print the inferred schema, then the rows without truncating cell values.
df.printSchema()
df.show(truncate=False)

# Second DataFrame: same column layout as the first one.
# The James and Maria rows are identical to rows in `simpleData1`, which gives
# the union examples some duplicate records to work with.

simpleData2 = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]

columns2 = ["employee_name", "department", "state", "salary", "age", "bonus"]

# Only column names are supplied, so the column types are inferred from the data.
df2 = spark.createDataFrame(simpleData2, schema=columns2)

# Print the inferred schema, then the rows without truncating cell values.
df2.printSchema()
df2.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
+-------------+----------+-----+------+---+-----+

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----

### Merge two or more DataFrames using union
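The DataFrame `union()` method merges two DataFrames and returns a new DataFrame containing all rows from both, including any duplicate rows. Columns are matched by position rather than by name, so both DataFrames should have the same columns in the same order.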



In [3]:
# Append the rows of df2 to the rows of df; rows present in both are kept twice.
unionDF = df.union(df2)

# Sort by name so that repeated rows are displayed next to each other.
(
    unionDF
    .orderBy("employee_name")
    .show(truncate=False)
)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|James        |Sales     |NY   |90000 |34 |10000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
+-------------+----------+-----+------+---+-----+



### Merge DataFrames using unionAll
The DataFrame `unionAll()` method was deprecated in PySpark 2.0.0, and using `union()` instead is recommended. Both methods return the same result: all rows from both DataFrames, duplicates included.


In [4]:
# Same merge as before, done through unionAll() to show the older method name.
unionAllDF = df.unionAll(df2)

# Sort by name so that repeated rows are displayed next to each other.
(
    unionAllDF
    .orderBy("employee_name")
    .show(truncate=False)
)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|James        |Sales     |NY   |90000 |34 |10000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
+-------------+----------+-----+------+---+-----+



### Merge without Duplicates
Because the `union()` method keeps duplicate rows, we chain the `distinct()` method after it so that each duplicated record appears only once in the result.


In [5]:
# Merge both DataFrames, then keep a single copy of every row that repeats.
disDF = (
    df.union(df2)
    .distinct()
)

# Sort by name for a stable, readable display.
(
    disDF
    .orderBy("employee_name")
    .show(truncate=False)
)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
+-------------+----------+-----+------+---+-----+

