In [0]:
########## Run this code snippet when running for the first time and don't repeat it in future (else it will keep on downloading the same stuffs again and again and
########## result in redundant usage of memory)

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://apachemirror.wuchna.com/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os
import findspark
import pandas as pd
import random
import numpy as np

# Java and Spark locations. The Spark path assumes the setup cell unpacked the archive
# under /content (TODO confirm if running somewhere other than Colab).
os.environ["JAVA_HOME"]   = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"]  = "/content/spark-2.4.4-bin-hadoop2.7"

# Give findspark the same absolute path. findspark.init(path) writes `path` back into
# SPARK_HOME, so the relative directory name used previously replaced the absolute value
# set above and only resolved when the working directory was the unpack location.
findspark.init(os.environ["SPARK_HOME"])

# findspark.init() puts Spark's bundled pyspark/py4j on sys.path, so these imports
# must come after it.
from pyspark.sql import SparkSession
from pyspark.sql import *
# NOTE: this star import rebinds several Python builtins (e.g. sum, min, max, round)
# to Spark column functions for the rest of the notebook.
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Local Spark session; "local[*]" runs Spark in-process using all available cores.
spark                      = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
# Small pandas frame used as the running example: four ids, each paired with a list of
# five letters drawn from a fixed pool.
# A dedicated, seeded generator makes the arrays identical on every run without
# touching the global `random` state.
ARRAY_SEED  = 42
LETTER_POOL = ['a','b','c','d','z','y','m','p']
letter_rng  = random.Random(ARRAY_SEED)

pdf = pd.DataFrame({
    'Index':  [100, 101, 102, 103],
    'Arrays': [[letter_rng.choice(LETTER_POOL) for _ in range(5)] for _ in range(4)],
})

In [0]:
# Hand the pandas frame to Spark and print every row with untruncated cells so the
# full arrays stay visible.
df = spark.createDataFrame(data=pdf)
df.show(truncate=False)

+-----+---------------+
|Index|Arrays         |
+-----+---------------+
|100  |[y, p, p, d, y]|
|101  |[d, b, y, d, m]|
|102  |[p, b, d, z, d]|
|103  |[a, y, d, b, y]|
+-----+---------------+



**Value in array:-** To check whether an array contains a particular value, we use `array_contains`. Here we check whether each array contains the letter 'a'.

In [0]:
# Add a boolean column saying, per row, whether the letter "a" occurs in `Arrays`.
has_letter_a = array_contains(col('Arrays'), "a")
df = df.withColumn('Finding_value', has_letter_a)
df.show()

+-----+---------------+-------------+
|Index|         Arrays|Finding_value|
+-----+---------------+-------------+
|  100|[y, p, p, d, y]|        false|
|  101|[d, b, y, d, m]|        false|
|  102|[p, b, d, z, d]|        false|
|  103|[a, y, d, b, y]|         true|
+-----+---------------+-------------+



**Sorting in an array:-** We can sort an array in either ascending or descending order using `sort_array`.

In [0]:
# sort_array orders the elements inside each row's array; `asc` selects the direction.
# NOTE(review): 'Sorted_array+desc' is probably meant to be 'Sorted_array_desc'; the name
# is kept unchanged here in case other cells reference it.
df = (
    df
    .withColumn('Sorted_array_asc', sort_array(col('Arrays'), asc=True))
    .withColumn('Sorted_array+desc', sort_array(col('Arrays'), asc=False))
)
df.show()

+-----+---------------+-------------+----------------+-----------------+
|Index|         Arrays|Finding_value|Sorted_array_asc|Sorted_array+desc|
+-----+---------------+-------------+----------------+-----------------+
|  100|[y, p, p, d, y]|        false| [d, p, p, y, y]|  [y, y, p, p, d]|
|  101|[d, b, y, d, m]|        false| [b, d, d, m, y]|  [y, m, d, d, b]|
|  102|[p, b, d, z, d]|        false| [b, d, d, p, z]|  [z, p, d, d, b]|
|  103|[a, y, d, b, y]|         true| [a, b, d, y, y]|  [y, y, d, b, a]|
+-----+---------------+-------------+----------------+-----------------+



**Distributing an array to rows**

In [0]:
# explode emits one output row per element of the array and repeats the other columns.
# NOTE(review): `df` is overwritten with its exploded form, so running this cell a second
# time explodes the already-exploded frame and multiplies the row count.
sorted_letters = col('Sorted_array_asc')
df = df.withColumn('Explode', explode(sorted_letters))
df.show()

+-----+---------------+-------------+----------------+-----------------+-------+
|Index|         Arrays|Finding_value|Sorted_array_asc|Sorted_array+desc|Explode|
+-----+---------------+-------------+----------------+-----------------+-------+
|  100|[y, p, p, d, y]|        false| [d, p, p, y, y]|  [y, y, p, p, d]|      d|
|  100|[y, p, p, d, y]|        false| [d, p, p, y, y]|  [y, y, p, p, d]|      p|
|  100|[y, p, p, d, y]|        false| [d, p, p, y, y]|  [y, y, p, p, d]|      p|
|  100|[y, p, p, d, y]|        false| [d, p, p, y, y]|  [y, y, p, p, d]|      y|
|  100|[y, p, p, d, y]|        false| [d, p, p, y, y]|  [y, y, p, p, d]|      y|
|  101|[d, b, y, d, m]|        false| [b, d, d, m, y]|  [y, m, d, d, b]|      b|
|  101|[d, b, y, d, m]|        false| [b, d, d, m, y]|  [y, m, d, d, b]|      d|
|  101|[d, b, y, d, m]|        false| [b, d, d, m, y]|  [y, m, d, d, b]|      d|
|  101|[d, b, y, d, m]|        false| [b, d, d, m, y]|  [y, m, d, d, b]|      m|
|  101|[d, b, y, d, m]|     

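**Distributing a dictionary to columns:-** Here `dictionary_column` holds an array of dictionaries; `explode_outer` gives each dictionary its own row, placed in a new column named `col`.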

In [0]:
# Each record: (primary key, list of single-entry dictionaries, list of integers).
data1 = [
    (1, [{'aa': 10}, {'alpha': 12}], [1, 2, 3]),
    (2, [{'bb': 60}, {'delta': 11}], [1, 3, 4]),
    (3, [{'cc': 40}, {'beta': 15}],  [1, 1, 1]),
    (4, [{'dd': 20}, {'gamma': 23}], [1, 4, 6]),
]
column_names = ("primary_key", "dictionary_column", 'array_column')
df11 = spark.createDataFrame(data1, column_names)
df11.show()

# Keep the three original columns and add one row per element of `dictionary_column`.
exploded_dicts = explode_outer(df11['dictionary_column'])
df11 = df11.select('primary_key', 'dictionary_column', 'array_column', exploded_dicts)
df11.show()

+-----------+--------------------+------------+
|primary_key|   dictionary_column|array_column|
+-----------+--------------------+------------+
|          1|[[aa -> 10], [alp...|   [1, 2, 3]|
|          2|[[bb -> 60], [del...|   [1, 3, 4]|
|          3|[[cc -> 40], [bet...|   [1, 1, 1]|
|          4|[[dd -> 20], [gam...|   [1, 4, 6]|
+-----------+--------------------+------------+

+-----------+--------------------+------------+-------------+
|primary_key|   dictionary_column|array_column|          col|
+-----------+--------------------+------------+-------------+
|          1|[[aa -> 10], [alp...|   [1, 2, 3]|   [aa -> 10]|
|          1|[[aa -> 10], [alp...|   [1, 2, 3]|[alpha -> 12]|
|          2|[[bb -> 60], [del...|   [1, 3, 4]|   [bb -> 60]|
|          2|[[bb -> 60], [del...|   [1, 3, 4]|[delta -> 11]|
|          3|[[cc -> 40], [bet...|   [1, 1, 1]|   [cc -> 40]|
|          3|[[cc -> 40], [bet...|   [1, 1, 1]| [beta -> 15]|
|          4|[[dd -> 20], [gam...|   [1, 4, 6]|   [dd -> 

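**Explode with positions:-** `posexplode` works like `explode` but also returns the position of each element within its array (column `pos`) alongside the element itself (column `col`).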

In [0]:
# Current state of df11, for reference.
df11.show()

# posexplode on the dictionary array: each element comes with its position.
dict_positions = posexplode(df11['dictionary_column'])
df11.select('primary_key', 'array_column', dict_positions).show()

# Same idea on the integer array, using the *_outer variant.
array_positions = posexplode_outer("array_column")
df11.select("primary_key", "dictionary_column", array_positions).show()

+-----------+--------------------+------------+-------------+
|primary_key|   dictionary_column|array_column|          col|
+-----------+--------------------+------------+-------------+
|          1|[[aa -> 10], [alp...|   [1, 2, 3]|   [aa -> 10]|
|          1|[[aa -> 10], [alp...|   [1, 2, 3]|[alpha -> 12]|
|          2|[[bb -> 60], [del...|   [1, 3, 4]|   [bb -> 60]|
|          2|[[bb -> 60], [del...|   [1, 3, 4]|[delta -> 11]|
|          3|[[cc -> 40], [bet...|   [1, 1, 1]|   [cc -> 40]|
|          3|[[cc -> 40], [bet...|   [1, 1, 1]| [beta -> 15]|
|          4|[[dd -> 20], [gam...|   [1, 4, 6]|   [dd -> 20]|
|          4|[[dd -> 20], [gam...|   [1, 4, 6]|[gamma -> 23]|
+-----------+--------------------+------------+-------------+

+-----------+------------+---+-------------+
|primary_key|array_column|pos|          col|
+-----------+------------+---+-------------+
|          1|   [1, 2, 3]|  0|   [aa -> 10]|
|          1|   [1, 2, 3]|  1|[alpha -> 12]|
|          1|   [1, 2, 3]|  0|