In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import explode,col

In [0]:
# Two sample people, each with a Python list of skills.
data = [
    (1, "Ram", ["python", "spark"]),
    (2, "Sham", ["java", "sql"]),
]
schema = ["id", "name", "skills"]

# Only column names are supplied, so the column types are inferred from the
# data (the printed schema shows the list becoming an array of strings).
df = spark.createDataFrame(data, schema=schema)
display(df)  # `display` is not imported here; presumably a notebook built-in
df.printSchema()

id,name,skills
1,Ram,"List(python, spark)"
2,Sham,"List(java, sql)"


root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [0]:
# explode() produces one output row per element of an array column; the
# other columns (including the original array) are repeated on each row.
df.show()  # before: one row per person
df1 = df.withColumn("skill", explode(df["skills"]))
df1.show()  # after: one row per (person, skill) pair

+---+----+---------------+
| id|name|         skills|
+---+----+---------------+
|  1| Ram|[python, spark]|
|  2|Sham|    [java, sql]|
+---+----+---------------+

+---+----+---------------+------+
| id|name|         skills| skill|
+---+----+---------------+------+
|  1| Ram|[python, spark]|python|
|  1| Ram|[python, spark]| spark|
|  2|Sham|    [java, sql]|  java|
|  2|Sham|    [java, sql]|   sql|
+---+----+---------------+------+



In [0]:
from pyspark.sql.functions import split


# Names are stored as a single comma-separated string.
data = [(1, "Saket,Jha"), (2, "Anuj,Jha")]
columns = ["id", "name"]

# split() turns the delimited string into an array column, kept alongside
# the original "name" column.
df = (
    spark.createDataFrame(data, columns)
    .withColumn("name_split", split(col("name"), ","))
)

df.show(truncate=False)


+---+---------+------------+
|id |name     |name_split  |
+---+---------+------------+
|1  |Saket,Jha|[Saket, Jha]|
|2  |Anuj,Jha |[Anuj, Jha] |
+---+---------+------------+



In [0]:
from pyspark.sql.functions import col

# Pull individual elements out of the array column by position:
# index 0 is the first name, index 1 the last name.
df = (
    df
    .withColumn("first_name", df["name_split"][0])
    .withColumn("last_name", df["name_split"][1])
)

df.show(truncate=False)


+---+---------+------------+----------+---------+
|id |name     |name_split  |first_name|last_name|
+---+---------+------------+----------+---------+
|1  |Saket,Jha|[Saket, Jha]|Saket     |Jha      |
|2  |Anuj,Jha |[Anuj, Jha] |Anuj      |Jha      |
+---+---------+------------+----------+---------+



In [0]:
from pyspark.sql.functions import array, lit

# Sample people as (name, age); name is a Python str and age a Python int.
data = [("Anuj", 25), ("panki", 30)]
columns = ["name", "age"]
df = spark.createDataFrame(data, columns)

# Combine the name and age columns into a single array column called "info".
# An array column has one element type, so the numeric age is cast to string
# explicitly instead of relying on Spark's implicit coercion between string
# and numeric inputs, which depends on the session's ANSI configuration.
df = df.withColumn("info", array(df["name"], df["age"].cast("string")))

df.show(truncate=False)


+-----+---+-----------+
|name |age|info       |
+-----+---+-----------+
|Anuj |25 |[Anuj, 25] |
|panki|30 |[panki, 30]|
+-----+---+-----------+



In [0]:
# Attach the same constant array to every row: lit() wraps each Python
# string as a literal column, and array() combines them into one column.
df = df.withColumn(
    "hobbies",
    array(*[lit(hobby) for hobby in ("Reading", "Traveling")]),
)
df.show(truncate=False)


+-----+---+-----------+--------------------+
|name |age|info       |hobbies             |
+-----+---+-----------+--------------------+
|Anuj |25 |[Anuj, 25] |[Reading, Traveling]|
|panki|30 |[panki, 30]|[Reading, Traveling]|
+-----+---+-----------+--------------------+



In [0]:
from pyspark.sql.functions import explode
# Use explode() inside select(): keep only "name" and emit one row per
# element of "hobbies", naming the generated column "hobby" via alias().
df_exploded = df.select(
    "name",
    explode(col("hobbies")).alias("hobby"),
)
df_exploded.show(truncate=False)


+-----+---------+
|name |hobby    |
+-----+---------+
|Anuj |Reading  |
|Anuj |Traveling|
|panki|Reading  |
|panki|Traveling|
+-----+---------+

