In [29]:
# Notebook-wide display tweak: force <pre> blocks to keep their whitespace and
# never wrap, so the wide text tables printed by the cells below stay aligned.
# `HTML` and `display` are imported from the public IPython.display module
# rather than relying on the kernel injecting `display` into the namespace.
from IPython.display import HTML, display

display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [30]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession through the documented `builder` entry
# point. getOrCreate() hands back an existing session when one is already
# running, so re-running this cell does not start a second session.
spark = SparkSession.builder.appName('Array_Functions').getOrCreate()

In [31]:
from pyspark.sql.types import *
from pyspark.sql.functions import explode, array, split, array_contains, col, lit, trim

# Sample rows: (id, name, list of skill names).
data = [
    (1, 'Rohit', ['Python', 'PySpark', 'SAS', 'SQL', 'JavaScript']),
    (2, 'Ajay', ['Python', 'Computer Science', 'Power BI', 'LangChain', 'LLMs']),
    (3, 'Dhananjay', ['Python', 'Azure', 'PySpark', 'SQL', 'Hive']),
]

# Explicit schema so that `Skills` is typed as array<string>.
schema = StructType([
    StructField('id', IntegerType()),
    StructField('name', StringType()),
    StructField('Skills', ArrayType(StringType())),
])

df = spark.createDataFrame(data, schema)

# Print the full (untruncated) rows, then the resulting schema.
df.show(truncate=False)
df.printSchema()

+---+---------+-----------------------------------------------------+
|id |name     |Skills                                               |
+---+---------+-----------------------------------------------------+
|1  |Rohit    |[Python, PySpark, SAS, SQL, JavaScript]              |
|2  |Ajay     |[Python, Computer Science, Power BI, LangChain, LLMs]|
|3  |Dhananjay|[Python, Azure, PySpark, SQL, Hive]                  |
+---+---------+-----------------------------------------------------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- Skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



##### explode() function --> Creates a new row for each element of an array column

In [32]:
# One output row per element of `Skills`; id and name are repeated on each row.
df.select('id', 'name', explode(col('Skills')).alias('explodedCol')).show()

+---+---------+----------------+
| id|     name|     explodedCol|
+---+---------+----------------+
|  1|    Rohit|          Python|
|  1|    Rohit|         PySpark|
|  1|    Rohit|             SAS|
|  1|    Rohit|             SQL|
|  1|    Rohit|      JavaScript|
|  2|     Ajay|          Python|
|  2|     Ajay|Computer Science|
|  2|     Ajay|        Power BI|
|  2|     Ajay|       LangChain|
|  2|     Ajay|            LLMs|
|  3|Dhananjay|          Python|
|  3|Dhananjay|           Azure|
|  3|Dhananjay|         PySpark|
|  3|Dhananjay|             SQL|
|  3|Dhananjay|            Hive|
+---+---------+----------------+



In [45]:
# Sample rows: (id, name, skills as one comma-separated string).
data = [
    (1, 'Rohit', "Python,PySpark,SAS,SQL,JavaScript"),
    (2, 'Ajay', "Python,Computer Science,Power BI,LangChain,LLMs"),
    (3, 'Dhananjay', "Python,Azure,PySpark,SQL,Hive"),
]

# Here `Skills` is a plain string column, not an array.
schema = StructType([
    StructField('id', IntegerType()),
    StructField('name', StringType()),
    StructField('Skills', StringType()),
])

df = spark.createDataFrame(data, schema)

# Print the full (untruncated) rows, then the resulting schema.
df.show(truncate=False)
df.printSchema()

+---+---------+-----------------------------------------------+
|id |name     |Skills                                         |
+---+---------+-----------------------------------------------+
|1  |Rohit    |Python,PySpark,SAS,SQL,JavaScript              |
|2  |Ajay     |Python,Computer Science,Power BI,LangChain,LLMs|
|3  |Dhananjay|Python,Azure,PySpark,SQL,Hive                  |
+---+---------+-----------------------------------------------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- Skills: string (nullable = true)



##### split() function --> Splits a string column into an array

In [47]:
# Turn the comma-separated `Skills` string into an array column.
# The column is referenced by its declared name 'Skills'; the lowercase
# spelling only resolves while spark.sql.caseSensitive is false (the default).
df = df.withColumn('skillsArray', split(col('Skills'), ','))
df.show()
df.printSchema()

+---+---------+--------------------+--------------------+
| id|     name|              Skills|         skillsArray|
+---+---------+--------------------+--------------------+
|  1|    Rohit|Python,PySpark,SA...|[Python, PySpark,...|
|  2|     Ajay|Python,Computer S...|[Python, Computer...|
|  3|Dhananjay|Python,Azure,PySp...|[Python, Azure, P...|
+---+---------+--------------------+--------------------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- Skills: string (nullable = true)
 |-- skillsArray: array (nullable = true)
 |    |-- element: string (containsNull = false)



##### array() function --> Creates an array column from the values of multiple columns

In [52]:
# Combine the `name` and `Skills` string columns into one two-element array.
# Columns are referenced by their declared names so the lookup does not depend
# on Spark's case-insensitive resolution (spark.sql.caseSensitive=false).
df.withColumn('name_skills', array('name', 'Skills')).show(vertical=True, truncate=False)

-RECORD 0--------------------------------------------------------------
 id          | 1                                                       
 name        | Rohit                                                   
 Skills      | Python,PySpark,SAS,SQL,JavaScript                       
 skillsArray | [Python, PySpark, SAS, SQL, JavaScript]                 
 name_skills | [Rohit, Python,PySpark,SAS,SQL,JavaScript]              
-RECORD 1--------------------------------------------------------------
 id          | 2                                                       
 name        | Ajay                                                    
 Skills      | Python,Computer Science,Power BI,LangChain,LLMs         
 skillsArray | [Python, Computer Science, Power BI, LangChain, LLMs]   
 name_skills | [Ajay, Python,Computer Science,Power BI,LangChain,LLMs] 
-RECORD 2--------------------------------------------------------------
 id          | 3                                                

##### array_contains() function --> Checks whether an array contains a value

In [57]:
# Sample rows: (id, name, comma-separated skills). The last row has no skills
# (None) so the handling of NULL input can be observed.
data = [
    (1, 'Rohit', "Python,PySpark,SAS,SQL,JavaScript"),
    (2, 'Ajay', "Python,Computer Science,Power BI,LangChain,LLMs"),
    (3, 'Dhananjay', None),
]

schema = StructType([
    StructField('id', IntegerType()),
    StructField('name', StringType()),
    StructField('Skills', StringType()),
])

df = spark.createDataFrame(data=data, schema=schema)

# Split the string column into an array column. The column is referenced by
# its declared name 'Skills'; the lowercase spelling only resolves while
# spark.sql.caseSensitive is false (the default).
df = df.withColumn('skillsArray', split(col('Skills'), ','))
df.show(truncate=False)
df.printSchema()

+---+---------+-----------------------------------------------+-----------------------------------------------------+
|id |name     |Skills                                         |skillsArray                                          |
+---+---------+-----------------------------------------------+-----------------------------------------------------+
|1  |Rohit    |Python,PySpark,SAS,SQL,JavaScript              |[Python, PySpark, SAS, SQL, JavaScript]              |
|2  |Ajay     |Python,Computer Science,Power BI,LangChain,LLMs|[Python, Computer Science, Power BI, LangChain, LLMs]|
|3  |Dhananjay|NULL                                           |NULL                                                 |
+---+---------+-----------------------------------------------+-----------------------------------------------------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- Skills: string (nullable = true)
 |-- skillsArray: array (nullable = true)
 |    |-- element

In [58]:
# Add a boolean flag telling whether `skillsArray` contains 'Python'.
# Rows whose array is NULL get NULL rather than false.
(
    df
    .withColumn('HasPythonSkill', array_contains(col('skillsArray'), 'Python'))
    .show()
)

+---+---------+--------------------+--------------------+--------------+
| id|     name|              Skills|         skillsArray|HasPythonSkill|
+---+---------+--------------------+--------------------+--------------+
|  1|    Rohit|Python,PySpark,SA...|[Python, PySpark,...|          true|
|  2|     Ajay|Python,Computer S...|[Python, Computer...|          true|
|  3|Dhananjay|                NULL|                NULL|          NULL|
+---+---------+--------------------+--------------------+--------------+



In [61]:
# Add a boolean flag telling whether `skillsArray` contains 'LLMs'.
# Rows whose array is NULL get NULL rather than false.
(
    df
    .withColumn('HasLLMSkill', array_contains(col('skillsArray'), 'LLMs'))
    .show()
)

+---+---------+--------------------+--------------------+-----------+
| id|     name|              Skills|         skillsArray|HasLLMSkill|
+---+---------+--------------------+--------------------+-----------+
|  1|    Rohit|Python,PySpark,SA...|[Python, PySpark,...|      false|
|  2|     Ajay|Python,Computer S...|[Python, Computer...|       true|
|  3|Dhananjay|                NULL|                NULL|       NULL|
+---+---------+--------------------+--------------------+-----------+



In [62]:
# Shut down the SparkSession created at the top of the notebook; cells above
# cannot be re-run after this until the session cell is executed again.
spark.stop()