In [1]:
# Initialise findspark before the pyspark imports in the following cells;
# it is used here to make the local Spark installation importable.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

from pyspark.sql.types import StructType, StructField,IntegerType,StringType

In [3]:
# Obtain the Spark session used by every cell below
# (getOrCreate returns the existing session if there is one, otherwise builds it).
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [4]:
# Sample rows: a string ID paired with a list of two numbers.
data = [
    ('abc', [1, 2]),
    ('xyz', [3, 4]),
    ('pqr', [5, 6]),
    ('efg', [7, 8]),
]

# Only column names are given here, so the column types are inferred from the data.
schema = ['ID', 'Numbers']

df = spark.createDataFrame(data, schema=schema)

# Show the inferred schema first, then the rows.
df.printSchema()
df.show()

root
 |-- ID: string (nullable = true)
 |-- Numbers: array (nullable = true)
 |    |-- element: long (containsNull = true)

+---+-------+
| ID|Numbers|
+---+-------+
|abc| [1, 2]|
|xyz| [3, 4]|
|pqr| [5, 6]|
|efg| [7, 8]|
+---+-------+



In [9]:
from pyspark.sql.types import ArrayType,StructType,StringType,IntegerType

# Explicit schema: declare the array elements as IntegerType instead of
# relying on inference (which reported them as long in the previous cell).
cust_schema = StructType([
    StructField('ID', StringType()),
    StructField('Numbers', ArrayType(IntegerType())),
])

# Same rows as before, now built against the hand-written schema.
df1 = spark.createDataFrame(data, schema=cust_schema)

df1.printSchema()
df1.show()

# display() on the DataFrame object shows its column names and types, not its rows.
display(df1)

root
 |-- ID: string (nullable = true)
 |-- Numbers: array (nullable = true)
 |    |-- element: integer (containsNull = true)

+---+-------+
| ID|Numbers|
+---+-------+
|abc| [1, 2]|
|xyz| [3, 4]|
|pqr| [5, 6]|
|efg| [7, 8]|
+---+-------+



DataFrame[ID: string, Numbers: array<int>]

In [10]:
# Index into the array column (attribute-style column access) to pull out
# its first element as a new column.
(
    df1
    .withColumn('FirstNumber', df1.Numbers[0])
    .show()
)

+---+-------+-----------+
| ID|Numbers|FirstNumber|
+---+-------+-----------+
|abc| [1, 2]|          1|
|xyz| [3, 4]|          3|
|pqr| [5, 6]|          5|
|efg| [7, 8]|          7|
+---+-------+-----------+



In [11]:
from pyspark.sql.functions import col

# Same extraction as the previous cell, but referencing the column through col().
(
    df1
    .withColumn('FirstNumber', col('Numbers')[0])
    .show()
)

+---+-------+-----------+
| ID|Numbers|FirstNumber|
+---+-------+-----------+
|abc| [1, 2]|          1|
|xyz| [3, 4]|          3|
|pqr| [5, 6]|          5|
|efg| [7, 8]|          7|
+---+-------+-----------+



In [13]:
from pyspark.sql.functions import col,array

# Two plain numeric columns; they are combined into an array in a later cell.
data = [
    (1, 2),
    (3, 4),
]

schema = ['num1', 'num2']

df = spark.createDataFrame(data, schema=schema)

df.printSchema()
df.show()


root
 |-- num1: long (nullable = true)
 |-- num2: long (nullable = true)

+----+----+
|num1|num2|
+----+----+
|   1|   2|
|   3|   4|
+----+----+



In [16]:
# array() packs num1 and num2 into a single array-typed column named 'numbers'.
df6 = df.withColumn(
    'numbers',
    array(col('num1'), col('num2')),
)

In [17]:
# Inspect the DataFrame produced by array(): schema first, then rows.
# The printed schema reports the new 'numbers' column as nullable = false.
df6.printSchema()
df6.show()

root
 |-- num1: long (nullable = true)
 |-- num2: long (nullable = true)
 |-- numbers: array (nullable = false)
 |    |-- element: long (containsNull = true)

+----+----+-------+
|num1|num2|numbers|
+----+----+-------+
|   1|   2| [1, 2]|
|   3|   4| [3, 4]|
+----+----+-------+



In [None]:
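# Array-related functions demonstrated in the cells below:
#   explode()        - one output row per array element
#   split()          - split a delimited string column into an array
#   array()          - combine several columns into one array column
#   array_contains() - check whether an array holds a given value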

In [18]:
# Each row carries an ID, a name and a list of skills.
data = [
    (1, 'srikanth', ['python', 'AWS']),
    (2, 'Manvith', ['Java', 'DWH']),
]

schema = ['ID', 'Name', 'Skills']

df7 = spark.createDataFrame(data, schema=schema)

df7.printSchema()
df7.show()

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+---+--------+-------------+
| ID|    Name|       Skills|
+---+--------+-------------+
|  1|srikanth|[python, AWS]|
|  2| Manvith|  [Java, DWH]|
+---+--------+-------------+



In [23]:
from pyspark.sql.functions import explode,col

# explode() yields one row per element of Skills; the remaining columns
# (including the original array) are repeated on each of those rows.
df8 = df7.withColumn(
    'Explode_Column',
    explode(col('Skills')),
)

df8.show()
df8.printSchema()

+---+--------+-------------+--------------+
| ID|    Name|       Skills|Explode_Column|
+---+--------+-------------+--------------+
|  1|srikanth|[python, AWS]|        python|
|  1|srikanth|[python, AWS]|           AWS|
|  2| Manvith|  [Java, DWH]|          Java|
|  2| Manvith|  [Java, DWH]|           DWH|
+---+--------+-------------+--------------+

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Skills: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Explode_Column: string (nullable = true)



In [31]:
from pyspark.sql.functions import split

# Skills are stored here as one comma-separated string per row.
data = [
    (1, 'sri', '.Net,Java,PHP'),
    (2, 'Manu', 'Python,AWS,Cloud'),
]

schema = ['ID', 'Name', 'Skills']

df9 = spark.createDataFrame(data, schema=schema)

df9.show()
df9.printSchema()


+---+----+----------------+
| ID|Name|          Skills|
+---+----+----------------+
|  1| sri|   .Net,Java,PHP|
|  2|Manu|Python,AWS,Cloud|
+---+----+----------------+

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Skills: string (nullable = true)



In [30]:
# split() breaks the comma-separated Skills string into an array of strings.
df10 = df9.withColumn(
    'SplitCol',
    split(col('Skills'), ','),
)

df10.show()
df10.printSchema()

+---+----+----------------+--------------------+
| ID|Name|          Skills|            SplitCol|
+---+----+----------------+--------------------+
|  1| sri|   .Net,Java,PHP|   [.Net, Java, PHP]|
|  2|Manu|Python,AWS,Cloud|[Python, AWS, Cloud]|
+---+----+----------------+--------------------+

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Skills: string (nullable = true)
 |-- SplitCol: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [32]:
from pyspark.sql.functions import split

# Primary and secondary skill live in separate string columns.
data = [
    (1, 'sri', '.Net', 'Java'),
    (2, 'Manu', 'Python', 'AWS'),
]

schema = ['ID', 'Name', 'PrimarySkill', 'SecondarySkill']

df11 = spark.createDataFrame(data, schema=schema)

df11.show()
df11.printSchema()

+---+----+------------+--------------+
| ID|Name|PrimarySkill|SecondarySkill|
+---+----+------------+--------------+
|  1| sri|        .Net|          Java|
|  2|Manu|      Python|           AWS|
+---+----+------------+--------------+

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- PrimarySkill: string (nullable = true)
 |-- SecondarySkill: string (nullable = true)



In [34]:
from pyspark.sql.functions import array

# Merge the two skill columns into a single array column.
df12 = df11.withColumn(
    'ArrayCol',
    array(col('PrimarySkill'), col('SecondarySkill')),
)

df12.show()
df12.printSchema()

+---+----+------------+--------------+-------------+
| ID|Name|PrimarySkill|SecondarySkill|     ArrayCol|
+---+----+------------+--------------+-------------+
|  1| sri|        .Net|          Java| [.Net, Java]|
|  2|Manu|      Python|           AWS|[Python, AWS]|
+---+----+------------+--------------+-------------+

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- PrimarySkill: string (nullable = true)
 |-- SecondarySkill: string (nullable = true)
 |-- ArrayCol: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [37]:
from pyspark.sql.functions import array_contains

# Rebuild the skills DataFrame (ID, name, list of skills).
data = [
    (1, 'srikanth', ['python', 'AWS']),
    (2, 'Manvith', ['Java', 'DWH']),
]

schema = ['ID', 'Name', 'Skills']

df13 = spark.createDataFrame(data, schema=schema)

# Boolean flag per row: does the Skills array contain the exact value 'Java'?
df14 = df13.withColumn(
    'ArrayContains',
    array_contains(col('Skills'), value='Java'),
)

df14.show()
df14.printSchema()

+---+--------+-------------+-------------+
| ID|    Name|       Skills|ArrayContains|
+---+--------+-------------+-------------+
|  1|srikanth|[python, AWS]|        false|
|  2| Manvith|  [Java, DWH]|         true|
+---+--------+-------------+-------------+

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Skills: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ArrayContains: boolean (nullable = true)



In [38]:
# Print the built-in documentation for array_contains (signature, parameters, examples).
help(array_contains)

Help on function array_contains in module pyspark.sql.functions:

array_contains(col, value)
    Collection function: returns null if the array is null, true if the array contains the
    given value, and false otherwise.
    
    .. versionadded:: 1.5.0
    
    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or str
        name of column containing array
    value :
        value or column to check for in array
    
    Examples
    --------
    >>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])
    >>> df.select(array_contains(df.data, "a")).collect()
    [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]
    >>> df.select(array_contains(df.data, lit("a"))).collect()
    [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]

