## 09-pyspark-join-two-dataframes.py

In [0]:
# 09-pyspark-join-two-dataframes.py
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every cell below. getOrCreate() hands back
# an already-running session when there is one, otherwise it builds a new one.
spark = (
    SparkSession.builder
    .appName('PySparkExamples')
    .getOrCreate()
)

# Employee DataFrame: each tuple is (emp_id, name, emp_dept_id).
empColumns = ["emp_id", "name", "emp_dept_id"]
empData = [
    (1, "Smita", 10),
    (2, "Rohit", 20),
    (3, "Alam", 10),
    (4, "Jaya", 30),
]
empDF = spark.createDataFrame(data=empData, schema=empColumns)

# One-line summary (column names, column count, row count), then the
# inferred schema and the rows themselves.
print(
    "DataFrame columns are:", empDF.columns,
    "with column count:", len(empDF.columns),
    "and with row count:", empDF.count(),
)
empDF.printSchema()
empDF.show()

DataFrame columns are: ['emp_id', 'name', 'emp_dept_id'] with column count: 3 and with row count: 4
root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- emp_dept_id: long (nullable = true)

+------+-----+-----------+
|emp_id| name|emp_dept_id|
+------+-----+-----------+
|     1|Smita|         10|
|     2|Rohit|         20|
|     3| Alam|         10|
|     4| Jaya|         30|
+------+-----+-----------+



In [0]:
# Department DataFrame: each tuple is (dept_name, dept_id).
deptColumns = ["dept_name", "dept_id"]
deptData = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
deptDF = spark.createDataFrame(data=deptData, schema=deptColumns)

# One-line summary (column names, column count, row count), then the
# inferred schema and the rows themselves.
print(
    "DataFrame columns are:", deptDF.columns,
    "with column count:", len(deptDF.columns),
    "and with row count:", deptDF.count(),
)
deptDF.printSchema()
deptDF.show()

DataFrame columns are: ['dept_name', 'dept_id'] with column count: 2 and with row count: 4
root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+



In [0]:
# Address DataFrame: each tuple is (emp_id, addline1, city, state).
addColumns = ["emp_id", "addline1", "city", "state"]
addData = [
    (1, "111 ABCD St", "City-1", "State-1"),
    (2, "222 PQRS St", "City-1", "State-2"),
    (3, "333 WXYZ Rd", "City-2", "State-3"),
    (4, "444 CDEF St", "City-3", "State-4"),
    (5, "555 MNOP Rd", "City-4", "State-1"),
]
addDF = spark.createDataFrame(data=addData, schema=addColumns)

# Show the inferred schema followed by the rows.
addDF.printSchema()
addDF.show()

root
 |-- emp_id: long (nullable = true)
 |-- addline1: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)

+------+-----------+------+-------+
|emp_id|   addline1|  city|  state|
+------+-----------+------+-------+
|     1|111 ABCD St|City-1|State-1|
|     2|222 PQRS St|City-1|State-2|
|     3|333 WXYZ Rd|City-2|State-3|
|     4|444 CDEF St|City-3|State-4|
|     5|555 MNOP Rd|City-4|State-1|
+------+-----------+------+-------+



In [0]:
# Join two DataFrames
# Inner join driven by an explicit column-equality expression. Because the
# condition is an expression (not a column name), the emp_id column of both
# inputs is carried into the result.
empDF.join(addDF, on=(empDF.emp_id == addDF.emp_id), how="inner").show()

+------+-----+-----------+------+-----------+------+-------+
|emp_id| name|emp_dept_id|emp_id|   addline1|  city|  state|
+------+-----+-----------+------+-----------+------+-------+
|     1|Smita|         10|     1|111 ABCD St|City-1|State-1|
|     2|Rohit|         20|     2|222 PQRS St|City-1|State-2|
|     3| Alam|         10|     3|333 WXYZ Rd|City-2|State-3|
|     4| Jaya|         30|     4|444 CDEF St|City-3|State-4|
+------+-----+-----------+------+-----------+------+-------+



In [0]:
# Drop duplicate column
# Joining on the column *name* performs an equi-join on emp_id and keeps a
# single emp_id column in the result instead of one per input.
empDF.join(addDF, on="emp_id").show()

+------+-----+-----------+-----------+------+-------+
|emp_id| name|emp_dept_id|   addline1|  city|  state|
+------+-----+-----------+-----------+------+-------+
|     1|Smita|         10|111 ABCD St|City-1|State-1|
|     2|Rohit|         20|222 PQRS St|City-1|State-2|
|     3| Alam|         10|333 WXYZ Rd|City-2|State-3|
|     4| Jaya|         30|444 CDEF St|City-3|State-4|
+------+-----+-----------+-----------+------+-------+



In [0]:
# Join Multiple DataFrames
# Chain two joins: employees -> addresses by shared column name, then the
# result -> departments by matching emp_dept_id against dept_id.
(
    empDF
    .join(addDF, on="emp_id")
    .join(deptDF, on=(empDF.emp_dept_id == deptDF.dept_id))
    .show()
)

+------+-----+-----------+-----------+------+-------+---------+-------+
|emp_id| name|emp_dept_id|   addline1|  city|  state|dept_name|dept_id|
+------+-----+-----------+-----------+------+-------+---------+-------+
|     3| Alam|         10|333 WXYZ Rd|City-2|State-3|  Finance|     10|
|     1|Smita|         10|111 ABCD St|City-1|State-1|  Finance|     10|
|     2|Rohit|         20|222 PQRS St|City-1|State-2|Marketing|     20|
|     4| Jaya|         30|444 CDEF St|City-3|State-4|    Sales|     30|
+------+-----+-----------+-----------+------+-------+---------+-------+



In [0]:
# Using Where for Join Condition
# Each join() is called without a condition; the matching predicate is
# supplied by the where() that immediately follows it.
(
    empDF
    .join(deptDF)
    .where(empDF.emp_dept_id == deptDF.dept_id)
    .join(addDF)
    .where(empDF.emp_id == addDF.emp_id)
    .show()
)

+------+-----+-----------+---------+-------+------+-----------+------+-------+
|emp_id| name|emp_dept_id|dept_name|dept_id|emp_id|   addline1|  city|  state|
+------+-----+-----------+---------+-------+------+-----------+------+-------+
|     1|Smita|         10|  Finance|     10|     1|111 ABCD St|City-1|State-1|
|     2|Rohit|         20|Marketing|     20|     2|222 PQRS St|City-1|State-2|
|     3| Alam|         10|  Finance|     10|     3|333 WXYZ Rd|City-2|State-3|
|     4| Jaya|         30|    Sales|     30|     4|444 CDEF St|City-3|State-4|
+------+-----+-----------+---------+-------+------+-----------+------+-------+



In [0]:
# SQL
# Register each DataFrame as a temporary view so the SQL text can refer to it
# by name; createOrReplaceTempView overwrites any view already using the name.
empDF.createOrReplaceTempView("EMP")
deptDF.createOrReplaceTempView("DEPT")
addDF.createOrReplaceTempView("ADD")

# Three-way join written as a comma-separated FROM list filtered by WHERE.
# The comparison uses the standard SQL '=' operator (Spark treats '==' as a
# synonym, but '=' is what other SQL engines accept). Adjacent string literals
# are concatenated by Python, so no '+' or line continuation is needed.
spark.sql(
    "select * from EMP e, DEPT d, ADD a "
    "where e.emp_dept_id = d.dept_id and e.emp_id = a.emp_id"
).show()

+------+-----+-----------+---------+-------+------+-----------+------+-------+
|emp_id| name|emp_dept_id|dept_name|dept_id|emp_id|   addline1|  city|  state|
+------+-----+-----------+---------+-------+------+-----------+------+-------+
|     1|Smita|         10|  Finance|     10|     1|111 ABCD St|City-1|State-1|
|     2|Rohit|         20|Marketing|     20|     2|222 PQRS St|City-1|State-2|
|     3| Alam|         10|  Finance|     10|     3|333 WXYZ Rd|City-2|State-3|
|     4| Jaya|         30|    Sales|     30|     4|444 CDEF St|City-3|State-4|
+------+-----+-----------+---------+-------+------+-----------+------+-------+



In [0]:
# Join on multiple conditions
# Two small frames with differently named columns.
df1 = spark.createDataFrame(
    data=[(1, "A"), (2, "B"), (3, "C")],
    schema=["A1", "A2"],
)
df2 = spark.createDataFrame(
    data=[(1, "F"), (2, "B")],
    schema=["B1", "B2"],
)
df1.show()
df2.show()

# A row pair is kept only when BOTH comparisons hold: A1 equals B1 and
# A2 equals B2. The two Column predicates are combined with '&'.
df = df1.join(
    df2,
    on=(df1["A1"] == df2["B1"]) & (df1["A2"] == df2["B2"]),
)
df.show()

+---+---+
| A1| A2|
+---+---+
|  1|  A|
|  2|  B|
|  3|  C|
+---+---+

+---+---+
| B1| B2|
+---+---+
|  1|  F|
|  2|  B|
+---+---+

+---+---+---+---+
| A1| A2| B1| B2|
+---+---+---+---+
|  2|  B|  2|  B|
+---+---+---+---+

