# SPLIT ARRAY INTO COLUMNS

In [1]:
import findspark
# Run findspark's setup before the next cell imports pyspark
# (presumably this is what makes pyspark importable here — confirm for your environment).
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every later cell via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/28 19:33:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Column layout: a string label plus an array of integers.
schema = "test STRING, value ARRAY<INTEGER>"

# Sample rows: the arrays hold between two and four elements, and one of them
# contains a None entry, so the later cells have ragged and missing data to handle.
data = [
    ("AM1", [1, 4, 19]),
    ("VM2", [5, 2, 10, 6]),
    ("PO3", [None, 0, 6]),
    ("PO3", [2, 8]),
]

In [4]:
# Build the DataFrame from the sample rows and the schema string, then preview it.
df = spark.createDataFrame(data, schema)
df.show()

                                                                                

+----+-------------+
|test|        value|
+----+-------------+
| AM1|   [1, 4, 19]|
| VM2|[5, 2, 10, 6]|
| PO3| [NULL, 0, 6]|
| PO3|       [2, 8]|
+----+-------------+



In [5]:
# Print df's columns with their types and nullability as a tree.
df.printSchema()

root
 |-- test: string (nullable = true)
 |-- value: array (nullable = true)
 |    |-- element: integer (containsNull = true)



## Split Array into separate columns

In [6]:
# Manual approach: pull each array position into its own, explicitly named column.
# Rows whose array is shorter than four elements show NULL for the missing positions
# (see the output below).
df.select(
    "test",
    df.value[0].alias("first_time"),
    df.value[1].alias("second_time"),
    df.value[2].alias("third_time"),
    df.value[3].alias("forth_time"),
).show()

+----+----------+-----------+----------+----------+
|test|first_time|second_time|third_time|forth_time|
+----+----------+-----------+----------+----------+
| AM1|         1|          4|        19|      NULL|
| VM2|         5|          2|        10|         6|
| PO3|      NULL|          0|         6|      NULL|
| PO3|         2|          8|      NULL|      NULL|
+----+----------+-----------+----------+----------+



## Automate the Solution

In [7]:
from pyspark.sql.functions import size, col

In [8]:
# Keep the original columns and add the length of each row's array.
df_v2 = df.select(
    "test",
    "value",
    size(col("value")).alias("num_elements"),
)
df_v2.show()

+----+-------------+------------+
|test|        value|num_elements|
+----+-------------+------------+
| AM1|   [1, 4, 19]|           3|
| VM2|[5, 2, 10, 6]|           4|
| PO3| [NULL, 0, 6]|           3|
| PO3|       [2, 8]|           2|
+----+-------------+------------+



In [11]:
# Largest array length across all rows: the aggregate yields a single row,
# and its only field is the maximum of num_elements.
max_num_elements = (
    df_v2
    .agg({"num_elements": "max"})
    .collect()[0][0]
)

print(max_num_elements)

4


In [12]:
def split_array(df, max_num, column="value"):
    """Expand the first ``max_num`` positions of an array column into columns.

    For every index ``i`` in ``range(max_num)`` a column named
    ``f"{column}_{i}"`` is added that holds ``df[column][i]``. The original
    columns, including the array column itself, are kept.

    Args:
        df: DataFrame that contains the array column.
        max_num: Number of leading array positions to extract. ``None`` (what
            a max aggregate returns for an empty DataFrame) and non-positive
            values add no columns.
        column: Name of the array column to split. Defaults to ``"value"``.

    Returns:
        ``df`` itself when there is nothing to add, otherwise a new DataFrame
        with the extra ``{column}_{i}`` columns appended.
    """
    # Guard against None so the function does not crash on range(None) when the
    # caller computed max_num from an empty DataFrame.
    if max_num is None or max_num <= 0:
        return df

    # Add all columns in one call: each withColumn call adds its own projection,
    # so calling it in a loop can produce very large query plans.
    extracted = {f"{column}_{i}": df[column][i] for i in range(max_num)}
    return df.withColumns(extracted)

In [13]:
# Apply the helper with the widest array length so every element gets a column.
df_new = split_array(df=df, max_num=max_num_elements)
df_new.show()

+----+-------------+-------+-------+-------+-------+
|test|        value|value_0|value_1|value_2|value_3|
+----+-------------+-------+-------+-------+-------+
| AM1|   [1, 4, 19]|      1|      4|     19|   NULL|
| VM2|[5, 2, 10, 6]|      5|      2|     10|      6|
| PO3| [NULL, 0, 6]|   NULL|      0|      6|   NULL|
| PO3|       [2, 8]|      2|      8|   NULL|   NULL|
+----+-------------+-------+-------+-------+-------+

