## 10-pyspark-pivot-unpivot-1.py

In [0]:
# 10-pyspark-pivot-unpivot-1.py
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
# Obtain the notebook's Spark session (getOrCreate reuses an existing session
# when one is available), registered under the app name 'PySparkExamples'.
spark = (
    SparkSession.builder
    .appName('PySparkExamples')
    .getOrCreate()
)

In [0]:
# Load the sales CSV. header=True takes the column names from the first line;
# inferSchema=True asks Spark to derive column types from the data instead of
# reading everything as strings.
# Note: inferSchema is a CSV reader option; spark.read.json infers its schema
# from the data without needing it.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('dbfs:/FileStore/tables/sales_info.csv')
)

# Quick inspection: schema, column names / counts, first two rows, then the table.
df.printSchema()
column_names = df.columns
print("DataFrame columns are:", column_names, "with column count:", len(column_names), "and with row count:", df.count())
print(df.head(2))
df.show()

root
 |-- Company: string (nullable = true)
 |-- Branch: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Qty: integer (nullable = true)
 |-- Sales: integer (nullable = true)

DataFrame columns are: ['Company', 'Branch', 'Person', 'Qty', 'Sales'] with column count: 5 and with row count: 12
[Row(Company='GOOG', Branch='NY', Person='Sam', Qty=1000, Sales=200), Row(Company='GOOG', Branch='NY', Person='Charlie', Qty=4000, Sales=120)]
+-------+------+-------+----+-----+
|Company|Branch| Person| Qty|Sales|
+-------+------+-------+----+-----+
|   GOOG|    NY|    Sam|1000|  200|
|   GOOG|    NY|Charlie|4000|  120|
|   GOOG|    CA|  Frank|6000|  340|
|   MSFT|    CA|   Tina|7000|  600|
|   MSFT|    NY|    Amy|2000|  124|
|   MSFT|    CA|Vanessa|2500|  243|
|     FB|    CA|   Carl|9000|  870|
|     FB|    NY|  Sarah|3050|  350|
|   APPL|    CA|   John|7000|  250|
|   APPL|    NY|  Linda|9500|  130|
|   APPL|    NY|   Mike|1000|  750|
|   APPL|    CA|  Chris|7600|  350|
+-------+------+-------+----+-----+

In [0]:
# Pivot: one row per Company, one column per distinct Branch value, each cell
# holding the summed Sales for that (Company, Branch) pair.
# No explicit value list is passed to pivot(), so Spark determines the distinct
# Branch values itself before building the columns.
pivotDF = (
    df.groupBy("Company")
    .pivot("Branch")
    .sum("Sales")
)
pivotDF.printSchema()
pivotDF.show(truncate=False)

root
 |-- Company: string (nullable = true)
 |-- CA: long (nullable = true)
 |-- NY: long (nullable = true)

+-------+---+---+
|Company|CA |NY |
+-------+---+---+
|APPL   |600|880|
|GOOG   |340|320|
|FB     |870|350|
|MSFT   |843|124|
+-------+---+---+



In [0]:
# Pivot via two-phase aggregation:
#   1. total Sales per (Company, Branch);
#   2. pivot those per-branch totals into one column per Branch.
# The intermediate total is given an explicit alias so the second stage does
# not have to refer to Spark's auto-generated "sum(Sales)" column name.
# The resulting schema and values match the single-stage pivot.
pivotDF = (
    df.groupBy("Company", "Branch")
    .agg(expr("sum(Sales) as BranchSales"))
    .groupBy("Company")
    .pivot("Branch")
    .sum("BranchSales")
)
pivotDF.printSchema()
pivotDF.show(truncate=False)

root
 |-- Company: string (nullable = true)
 |-- CA: long (nullable = true)
 |-- NY: long (nullable = true)

+-------+---+---+
|Company|CA |NY |
+-------+---+---+
|APPL   |600|880|
|GOOG   |340|320|
|FB     |870|350|
|MSFT   |843|124|
+-------+---+---+



In [0]:
# Unpivot: stack(2, ...) emits two (Branch, Total) rows per input row, one
# from the CA column labelled 'California' and one from the NY column
# labelled 'New York'. Rows whose Total is null are then dropped.
unpivotExpr = "stack(2, 'California', CA, 'New York', NY) as (Branch, Total)"
unPivotDF = (
    pivotDF
    .selectExpr("Company", unpivotExpr)
    .filter("Total is not null")
)
unPivotDF.show(truncate=False)

+-------+----------+-----+
|Company|Branch    |Total|
+-------+----------+-----+
|APPL   |California|600  |
|APPL   |New York  |880  |
|GOOG   |California|340  |
|GOOG   |New York  |320  |
|FB     |California|870  |
|FB     |New York  |350  |
|MSFT   |California|843  |
|MSFT   |New York  |124  |
+-------+----------+-----+

