 Pivot() It is an aggregation where one of the grouping columns values is transposed into individual columns with distinct data.

# Syntax
`pivot_df = original_df.groupBy("grouping_column").pivot("pivot_column").agg({"agg_column": "agg_function"})`

In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("SP").getOrCreate()

In [7]:
# read csv file
file_path = "C:/Users/pcc/Desktop/HR_1.csv"
df=spark.read.csv(file_path,header=True,inferSchema=True)
df

DataFrame[Age: int, Attrition: string, BusinessTravel: string, DailyRate: int, Department: string, DistanceFromHome: int, Education: int, EducationField: string, EmployeeCount: int, EmployeeNumber: int, EnvironmentSatisfaction: int, Gender: string, HourlyRate: int, JobInvolvement: int, JobLevel: int, JobRole: string, JobSatisfaction: int, MaritalStatus: string]

In [8]:
df.show()

+---+---------+-----------------+---------+--------------------+----------------+---------+----------------+-------------+--------------+-----------------------+------+----------+--------------+--------+--------------------+---------------+-------------+
|Age|Attrition|   BusinessTravel|DailyRate|          Department|DistanceFromHome|Education|  EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|             JobRole|JobSatisfaction|MaritalStatus|
+---+---------+-----------------+---------+--------------------+----------------+---------+----------------+-------------+--------------+-----------------------+------+----------+--------------+--------+--------------------+---------------+-------------+
| 31|       No|       Non-Travel|      158|            Software|               7|        3|         Medical|            1|             1|                      3|  Male|        42|             2|       3|           Developer|           

In [9]:
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- DailyRate: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- DistanceFromHome: integer (nullable = true)
 |-- Education: integer (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- EmployeeCount: integer (nullable = true)
 |-- EmployeeNumber: integer (nullable = true)
 |-- EnvironmentSatisfaction: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- HourlyRate: integer (nullable = true)
 |-- JobInvolvement: integer (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- JobSatisfaction: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)



# Pivot PySpark DataFrame


In [18]:
# Applying pivot()
pivotDf=df.groupBy("Department").pivot("Gender").sum("Age")
pivotDf.printSchema()
pivotDf.show()

root
 |-- Department: string (nullable = true)
 |-- Female: long (nullable = true)
 |-- Male: long (nullable = true)

+--------------------+------+------+
|          Department|Female|  Male|
+--------------------+------+------+
|               Sales|162689|165794|
|Research & Develo...|164843|160354|
|            Software|163530|161543|
|             Support|158128|164641|
|            Hardware|160705|159191|
|     Human Resources|163646|163510|
+--------------------+------+------+



In [21]:
pivotDf=df.groupBy("Department","Gender").avg("Age").groupBy("Department").pivot("Gender").avg("avg(Age)")
pivotDf.show()

+--------------------+------------------+-----------------+
|          Department|            Female|             Male|
+--------------------+------------------+-----------------+
|               Sales|38.855743969429184|38.86404125644632|
|Research & Develo...| 39.16440959847945|39.01557177615572|
|            Software| 38.97283126787416|39.02004830917874|
|             Support| 38.82347164252394|38.90382797731569|
|            Hardware|39.359539554249324|38.96010768477729|
|     Human Resources| 39.02838063439065|38.70059171597633|
+--------------------+------------------+-----------------+



In [24]:
dep=['Sales','Hardware','Support']
pivotDf=df.groupBy("Gender").pivot("Department",dep).sum("Age")
pivotDf.printSchema()
pivotDf.show()

root
 |-- Gender: string (nullable = true)
 |-- Sales: long (nullable = true)
 |-- Hardware: long (nullable = true)
 |-- Support: long (nullable = true)

+------+------+--------+-------+
|Gender| Sales|Hardware|Support|
+------+------+--------+-------+
|Female|162689|  160705| 158128|
|  Male|165794|  159191| 164641|
+------+------+--------+-------+



# Unpivot PySpark DataFrame

 PySpark SQL doesn’t have unpivot function hence will use the stack() function.

In [25]:
# Applying unpivot()
from pyspark.sql.functions import expr
unpivotExpr="stack(3,'Support','Support','Software','Software','Hardware','Hardware') as (Department, Total)"
unpivotDF=pivotDf.select("Gender",expr(unpivotExpr)).where("Total is not null")
unpivotDF.show()


+------+----------+--------+
|Gender|Department|   Total|
+------+----------+--------+
|Female|   Support| Support|
|Female|  Software|Software|
|Female|  Hardware|Hardware|
|  Male|   Support| Support|
|  Male|  Software|Software|
|  Male|  Hardware|Hardware|
+------+----------+--------+

