In [1]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every cell below (application name "Array").
spark = (
    SparkSession.builder
    .appName("Array")
    .getOrCreate()
)


Defining an Array Column

In [2]:
# Each row pairs a name with a Python list; the list becomes an array column.
data = [
    ("John", [1, 2, 3]),
    ("Jane", [4, 5, 6]),
]

# Only column names are supplied here (no explicit ArrayType), so the column
# types are inferred from the data: the array elements come out as long in
# the printed schema.
schema = ["name", "numbers"]

# Build the DataFrame from the rows and the column names
df = spark.createDataFrame(data, schema=schema)

df.show()
df.printSchema()

+----+---------+
|name|  numbers|
+----+---------+
|John|[1, 2, 3]|
|Jane|[4, 5, 6]|
+----+---------+

root
 |-- name: string (nullable = true)
 |-- numbers: array (nullable = true)
 |    |-- element: long (containsNull = true)



Creating an Array Column Using the `array` Function

In [3]:
from pyspark.sql.functions import array,col

# Three separate numeric columns per person, to be combined into one array.
data = [
    ("John", 1, 2, 3),
    ("Jane", 4, 5, 6),
]
schema = ["name", "num1", "num2", "num3"]

# Build the DataFrame and append the array column in a single chain.
# array() is given the source columns by name and packs them, in the order
# listed, into one array value per row.
df = spark.createDataFrame(data, schema=schema).withColumn(
    "numbers", array("num1", "num2", "num3")
)

df.show()
df.printSchema()

+----+----+----+----+---------+
|name|num1|num2|num3|  numbers|
+----+----+----+----+---------+
|John|   1|   2|   3|[1, 2, 3]|
|Jane|   4|   5|   6|[4, 5, 6]|
+----+----+----+----+---------+

root
 |-- name: string (nullable = true)
 |-- num1: long (nullable = true)
 |-- num2: long (nullable = true)
 |-- num3: long (nullable = true)
 |-- numbers: array (nullable = false)
 |    |-- element: long (containsNull = true)



ArrayType in Schema Definition

In [4]:
from pyspark.sql.types import StructType, StructField, ArrayType, IntegerType, StringType

# Sample data: a name plus a list of integers per row
data = [
    ("John", [1, 2, 3]),
    ("Jane", [4, 5, 6]),
]

# Explicit schema: "numbers" is declared as an array of IntegerType, so the
# element type is integer rather than whatever inference would pick.
schema = StructType(
    [
        StructField("name", StringType(), nullable=True),
        StructField("numbers", ArrayType(IntegerType()), nullable=True),
    ]
)

# Create the DataFrame with the declared schema
df = spark.createDataFrame(data, schema=schema)

df.show()
df.printSchema()

+----+---------+
|name|  numbers|
+----+---------+
|John|[1, 2, 3]|
|Jane|[4, 5, 6]|
+----+---------+

root
 |-- name: string (nullable = true)
 |-- numbers: array (nullable = true)
 |    |-- element: integer (containsNull = true)



Accessing Array Elements


In [5]:
from pyspark.sql.functions import col

# Copy the element at index 0 (the first one) of each array into its own
# column. df is reassigned on purpose: the following cell builds on this
# extended DataFrame.
df = df.withColumn("first_number", col("numbers").getItem(0))
df.show()


+----+---------+------------+
|name|  numbers|first_number|
+----+---------+------------+
|John|[1, 2, 3]|           1|
|Jane|[4, 5, 6]|           4|
+----+---------+------------+



Exploding Arrays


In [6]:
from pyspark.sql.functions import explode

# Turn each array element into its own row: every input row is repeated once
# per element of "numbers", with that element placed in the new "number" column.
df_exploded = df.withColumn("number", explode("numbers"))

df_exploded.show()
df_exploded.printSchema()


+----+---------+------------+------+
|name|  numbers|first_number|number|
+----+---------+------------+------+
|John|[1, 2, 3]|           1|     1|
|John|[1, 2, 3]|           1|     2|
|John|[1, 2, 3]|           1|     3|
|Jane|[4, 5, 6]|           4|     4|
|Jane|[4, 5, 6]|           4|     5|
|Jane|[4, 5, 6]|           4|     6|
+----+---------+------------+------+

root
 |-- name: string (nullable = true)
 |-- numbers: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- first_number: integer (nullable = true)
 |-- number: integer (nullable = true)

