In [1]:
import pyspark
from pyspark.sql import SparkSession

# Entry point for the notebook: obtain a SparkSession registered under the
# application name 'PySparkLearning'.
spark = (
    SparkSession.builder
    .appName("PySparkLearning")
    .getOrCreate()
)

In [2]:
# Sample rows. Each row is laid out as:
#   ((firstname, middlename, lastname), [languages...], state, gender)
# An empty string stands in for a missing name part.
arrayStructureData = [
    (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
    (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
    (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F"),
    (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
    (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
    (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M"),
]
        

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Nested struct describing the three parts of a person's name.
nameStructType = StructType([
    StructField('firstname', StringType(), True),
    StructField('middlename', StringType(), True),
    StructField('lastname', StringType(), True),
])

# Top-level schema: a struct column, an array-of-strings column, and two
# plain string columns. Every field is nullable.
arrayStructureSchema = StructType([
    StructField('name', nameStructType, True),
    StructField('languages', ArrayType(StringType()), True),
    StructField('state', StringType(), True),
    StructField('gender', StringType(), True),
])

In [4]:
# Build the DataFrame from the in-memory rows using the explicit schema.
df = spark.createDataFrame(
    data=arrayStructureData,
    schema=arrayStructureSchema,
)

In [5]:
# Print the column tree: each field's name, type, and nullability.
df.printSchema()
# Render the rows as a text table; long cell values are shortened with "...".
df.show()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- state: string (nullable = true)
 |-- gender: string (nullable = true)

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    [James, , Smith]|[Java, Scala, C++]|   OH|     M|
|      [Anna, Rose, ]|[Spark, Java, C++]|   NY|     F|
| [Julia, , Williams]|      [CSharp, VB]|   OH|     F|
|[Maria, Anne, Jones]|      [CSharp, VB]|   NY|     M|
|  [Jen, Mary, Brown]|      [CSharp, VB]|   NY|     M|
|[Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



### DataFrame "filter()" with Column Condition

Pass a Column condition to `filter()` to select rows from a DataFrame. Referring to columns as `dfObject.colname` lets you express complex conditions.

In [6]:
# Single condition: keep only the rows whose state is 'OH'.
is_ohio = df.state == 'OH'
df.filter(is_ohio).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    [James, , Smith]|[Java, Scala, C++]|   OH|     M|
| [Julia, , Williams]|      [CSharp, VB]|   OH|     F|
|[Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



A similar filter can also be written as shown below. To use `col`, you first need to import it: `from pyspark.sql.functions import col`

In [7]:
from pyspark.sql.functions import col

# col() refers to the column by name rather than through the DataFrame object.
is_new_york = col('state') == 'NY'
df.where(is_new_york).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|      [Anna, Rose, ]|[Spark, Java, C++]|   NY|     F|
|[Maria, Anne, Jones]|      [CSharp, VB]|   NY|     M|
|  [Jen, Mary, Brown]|      [CSharp, VB]|   NY|     M|
+--------------------+------------------+-----+------+



In [8]:
# Multiple conditions: build each Column predicate separately, then combine
# them with & so a row must satisfy both.
is_ohio = df.state == 'OH'
is_male = df.gender == 'M'

df.filter(is_ohio & is_male).show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    [James, , Smith]|[Java, Scala, C++]|   OH|     M|
|[Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



### Filtering on Nested Struct columns

In [9]:
# Attribute access on the struct column reaches its nested 'lastname' field.
lastname_is_williams = df.name.lastname == 'Williams'
df.filter(lastname_is_williams).show()

+--------------------+------------+-----+------+
|                name|   languages|state|gender|
+--------------------+------------+-----+------+
| [Julia, , Williams]|[CSharp, VB]|   OH|     F|
|[Mike, Mary, Will...|[Python, VB]|   OH|     M|
+--------------------+------------+-----+------+



### DataFrame "filter()" with SQL Expression

If you are coming from a SQL background, you can use that knowledge in PySpark to filter DataFrame rows with SQL expressions.

In [10]:
# filter() also accepts a SQL expression string in place of a Column.
male_rows = df.filter("gender  == 'M'")
male_rows.show()

+--------------------+------------------+-----+------+
|                name|         languages|state|gender|
+--------------------+------------------+-----+------+
|    [James, , Smith]|[Java, Scala, C++]|   OH|     M|
|[Maria, Anne, Jones]|      [CSharp, VB]|   NY|     M|
|  [Jen, Mary, Brown]|      [CSharp, VB]|   NY|     M|
|[Mike, Mary, Will...|      [Python, VB]|   OH|     M|
+--------------------+------------------+-----+------+



### Filter on an Array column

When you want to filter rows of a DataFrame based on a value present in an array column, you can use `array_contains()` from the PySpark SQL functions. It checks whether the array contains the given value, returning true if the value is present and false otherwise.



In [11]:
from pyspark.sql.functions import array_contains

# Build the array-membership predicate once and apply it through both
# filter() and where(); the two calls print the same rows.
knows_java = array_contains(df.languages, 'Java')

df.filter(knows_java).show()
df.where(knows_java).show()

+----------------+------------------+-----+------+
|            name|         languages|state|gender|
+----------------+------------------+-----+------+
|[James, , Smith]|[Java, Scala, C++]|   OH|     M|
|  [Anna, Rose, ]|[Spark, Java, C++]|   NY|     F|
+----------------+------------------+-----+------+

+----------------+------------------+-----+------+
|            name|         languages|state|gender|
+----------------+------------------+-----+------+
|[James, , Smith]|[Java, Scala, C++]|   OH|     M|
|  [Anna, Rose, ]|[Spark, Java, C++]|   NY|     F|
+----------------+------------------+-----+------+

