In [78]:
# Standard library
from functools import reduce

# Third-party
import pandas as pd
from pyspark.sql import SparkSession
# Import only the functions this notebook uses. A wildcard import of
# pyspark.sql.functions shadows Python builtins (sum, max, min, round, ...)
# and, on PySpark >= 3.5, also replaces functools.reduce with the SQL
# `reduce` column function, which breaks plain-Python reduce(...) calls.
from pyspark.sql.functions import col, expr, struct

# Establish the Spark connection: a local master using all available cores.
# getOrCreate() returns the already running session if there is one.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

### Case 1

Simple columns, one column per category

In [79]:
# Build the sample data in pandas, one tuple per animal
pdf = pd.DataFrame(
    [
        ('cat', 'meaow', 1, 1),
        ('dog', 'bark-bark', 1, 1),
        ('chick', 'cheep cheep', 12, 22),
    ],
    columns=[
        'animal',
        'voice',
        'farm1_count',  # Data from farm 1
        'farm2_count',  # Data from farm 2
    ],
)

# Hand the pandas frame over to Spark and display it
df = spark.createDataFrame(pdf)
df.show()

+------+-----------+-----------+-----------+
|animal|      voice|farm1_count|farm2_count|
+------+-----------+-----------+-----------+
|   cat|      meaow|          1|          1|
|   dog|  bark-bark|          1|          1|
| chick|cheep cheep|         12|         22|
+------+-----------+-----------+-----------+



In [80]:
# stack(n, label1, column1, label2, column2, ...) emits n rows per input row;
# here each row carries a farm label and the count taken from that farm's column.
unpivot_expression = '''stack(2, 'farm1', farm1_count, 'farm2', farm2_count) as (farm, count)'''

# Keep the identifying columns and append the two stacked columns
melted_df = df.select('animal', 'voice', expr(unpivot_expression))
melted_df.show()

+------+-----------+-----+-----+
|animal|      voice| farm|count|
+------+-----------+-----+-----+
|   cat|      meaow|farm1|    1|
|   cat|      meaow|farm2|    1|
|   dog|  bark-bark|farm1|    1|
|   dog|  bark-bark|farm2|    1|
| chick|cheep cheep|farm1|   12|
| chick|cheep cheep|farm2|   22|
+------+-----------+-----+-----+



### Case 2

Multiple columns per farm, still with hardcoded column names. We want to melt the table by farm, so that we end up with a single `count` column and a single `legs_count` column.

In [81]:
# Build the sample data in pandas from one dict per column group;
# the merged dict keeps insertion order, so the column order is
# animal, voice, farm 1 columns, farm 2 columns.
animal_data = {
    'animal': ['cat', 'dog', 'chick'],
    'voice': ['meaow', 'bark-bark', 'cheep cheep'],
}
farm1_data = {
    'farm1_count': [1, 1, 12],
    'farm1_legs_count': [4, 4, 24],
}
farm2_data = {
    'farm2_count': [1, 1, 22],
    'farm2_legs_count': [4, 4, 44],
}
pdf = pd.DataFrame({**animal_data, **farm1_data, **farm2_data})

# Hand the pandas frame over to Spark and display it
df = spark.createDataFrame(pdf)
df.show()


+------+-----------+-----------+----------------+-----------+----------------+
|animal|      voice|farm1_count|farm1_legs_count|farm2_count|farm2_legs_count|
+------+-----------+-----------+----------------+-----------+----------------+
|   cat|      meaow|          1|               4|          1|               4|
|   dog|  bark-bark|          1|               4|          1|               4|
| chick|cheep cheep|         12|              24|         22|              44|
+------+-----------+-----------+----------------+-----------+----------------+



In [85]:
# The stack() call below produces two new columns, "farm" and "data":
# * "farm" carries the farm label: farm1 or farm2
# * "data" carries the struct holding the columns that belong to that farm
unpivot_expression = '''stack(2, 'farm1', farm1, 'farm2', farm2) as (farm, data)'''

# Bundle the per-farm columns into one struct per farm. Both structs use the
# same field names so that they can be stacked into one "data" column.
farm1_struct = struct(
    col('farm1_count').alias('count'),
    col('farm1_legs_count').alias('legs_count'),
)
farm2_struct = struct(
    col('farm2_count').alias('count'),
    col('farm2_legs_count').alias('legs_count'),
)
with_structs = (
    df
    .withColumn('farm1', farm1_struct)
    .withColumn('farm2', farm2_struct)
)

# Stack the struct columns: one output row per (animal, farm) pair
stacked = with_structs.select('animal', 'voice', expr(unpivot_expression))

# Expand the struct fields into top-level columns, then drop the struct itself
stacked.select('*', 'data.*').drop('data').show()

+------+-----------+-----+-----+----------+
|animal|      voice| farm|count|legs_count|
+------+-----------+-----+-----+----------+
|   cat|      meaow|farm1|    1|         4|
|   cat|      meaow|farm2|    1|         4|
|   dog|  bark-bark|farm1|    1|         4|
|   dog|  bark-bark|farm2|    1|         4|
| chick|cheep cheep|farm1|   12|        24|
| chick|cheep cheep|farm2|   22|        44|
+------+-----------+-----+-----+----------+



In the above example, the columns belonging to the same farm are first pulled together into a struct.
These n struct columns are then melted into a single column.

Finally, the structs are unpacked into regular columns.

### Case 3

3 farms are present in the dataset:

In [83]:
# Build the sample data in pandas from one dict per column group;
# the merged dict keeps insertion order, so the column order is
# animal, voice, then the columns of farm 1, farm 2 and farm 3.
animal_data = {
    'animal': ['cat', 'dog', 'chick'],
    'voice': ['meaow', 'bark-bark', 'cheep cheep'],
}
farm1_data = {
    'farm1_count': [1, 1, 12],
    'farm1_legs_count': [4, 4, 24],
    'farm1_has_feathers': [False, False, True],
}
farm2_data = {
    'farm2_count': [1, 1, 22],
    'farm2_legs_count': [4, 4, 44],
    'farm2_has_feathers': [False, False, True],
}
farm3_data = {
    'farm3_count': [2, 5, 2],
    'farm3_legs_count': [8, 20, 4],
    'farm3_has_feathers': [False, False, True],
}
pdf = pd.DataFrame({**animal_data, **farm1_data, **farm2_data, **farm3_data})

# Hand the pandas frame over to Spark and display it
df = spark.createDataFrame(pdf)
df.show()

+------+-----------+-----------+----------------+------------------+-----------+----------------+------------------+-----------+----------------+------------------+
|animal|      voice|farm1_count|farm1_legs_count|farm1_has_feathers|farm2_count|farm2_legs_count|farm2_has_feathers|farm3_count|farm3_legs_count|farm3_has_feathers|
+------+-----------+-----------+----------------+------------------+-----------+----------------+------------------+-----------+----------------+------------------+
|   cat|      meaow|          1|               4|             false|          1|               4|             false|          2|               8|             false|
|   dog|  bark-bark|          1|               4|             false|          1|               4|             false|          5|              20|             false|
| chick|cheep cheep|         12|              24|              true|         22|              44|              true|          2|               4|              true|
+------+--

In [84]:
farms = ['farm1', 'farm2', 'farm3']  # This can be inferred from schema
columns = ['count', 'legs_count', 'has_feathers']  # This can be inferred from schema

# Generating the "unpivot" expression:
# stack(n, 'farm1', farm1, 'farm2', farm2, ...) as (farm, data)
stack_arguments = ', '.join(f"'{farm}', {farm}" for farm in farms)
unpivot_expression = f'''stack({len(farms)}, {stack_arguments}) as (farm, data)'''

# One struct column per farm, named after the farm. Every struct uses the same
# field names (the entries of `columns`), so they can share one "data" column.
farm_structs = [
    struct([col(f'{farm}_{field}').alias(field) for field in columns]).alias(farm)
    for farm in farms
]

# Add all struct columns in a single select instead of chaining withColumn
# once per farm: the Spark documentation warns that adding columns through
# repeated withColumn calls in a loop can generate very large query plans.
# This also keeps the cell independent of the name `reduce`.
res_df = df.select('*', *farm_structs)

(
    res_df
    # Stack the struct columns: one output row per (animal, farm) pair
    .select('animal', 'voice', expr(unpivot_expression))
    # Expand the struct fields into top-level columns, then drop the struct
    .select('*', 'data.*')
    .drop('data')
    .show()
)

+------+-----------+-----+-----+----------+------------+
|animal|      voice| farm|count|legs_count|has_feathers|
+------+-----------+-----+-----+----------+------------+
|   cat|      meaow|farm1|    1|         4|       false|
|   cat|      meaow|farm2|    1|         4|       false|
|   cat|      meaow|farm3|    2|         8|       false|
|   dog|  bark-bark|farm1|    1|         4|       false|
|   dog|  bark-bark|farm2|    1|         4|       false|
|   dog|  bark-bark|farm3|    5|        20|       false|
| chick|cheep cheep|farm1|   12|        24|        true|
| chick|cheep cheep|farm2|   22|        44|        true|
| chick|cheep cheep|farm3|    2|         4|        true|
+------+-----------+-----+-----+----------+------------+



In [11]:
import numpy as np

# 3x4 grid holding the upper-case letters A..L, filled row by row
betuk = np.array(list('abcdefghijkl'.upper())).reshape(3, 4)
betuk

array([['A', 'B', 'C', 'D'],
       ['E', 'F', 'G', 'H'],
       ['I', 'J', 'K', 'L']], dtype='<U1')

In [19]:
# Mirror the grid across its anti-diagonal (reverse both axes, then
# transpose) and drop the last row of the result.
betuk[::-1, ::-1].T[:-1]

array([['L', 'H', 'D'],
       ['K', 'G', 'C'],
       ['J', 'F', 'B']], dtype='<U1')