In [1]:
from pyspark.sql import Row, SparkSession

In [6]:
# "local[*]" is a Spark master URL (run locally, one worker thread per core),
# not an application name, so it belongs in master(). appName() gets a readable
# label, which is what Spark shows in its web UI.
# NOTE(review): drop the master() call if this notebook is ever launched with an
# externally supplied master (e.g. via spark-submit --master ...).
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("rdd-to-dataframe-demo")
    .getOrCreate()
)
# Handle to the underlying SparkContext, used for the RDD API.
sc = spark.sparkContext

In [13]:
# Sample produce records as positional (unnamed) Rows; the values are listed in
# the order id, name, number, category, price.
row = [
    Row(*fields)
    for fields in (
        (1, "apple", 12345, "fruit", 3),
        (2, "strawberry", 32439, "fruit", 1),
        (3, "Celery", 766535, "vegetable", 4),
        (4, "potatoes", 89733, "vegetable", 10),
    )
]

In [9]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [12]:
# Explicit schema for the sample records; every column is declared nullable.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("number", IntegerType()),
        ("category", StringType()),
        ("price", IntegerType()),
    )
])

In [14]:
# Turn the local list of Rows into an RDD.
data = spark.sparkContext.parallelize(row)

In [18]:
# Build a DataFrame from the RDD of Rows, applying the explicit schema so the
# columns get their names and types, then print it.
dataFrame = spark.createDataFrame(data=data, schema=schema)
dataFrame.show()

+---+----------+------+---------+-----+
| id|      name|number| category|price|
+---+----------+------+---------+-----+
|  1|     apple| 12345|    fruit|    3|
|  2|strawberry| 32439|    fruit|    1|
|  3|    Celery|766535|vegetable|    4|
|  4|  potatoes| 89733|vegetable|   10|
+---+----------+------+---------+-----+



## Convert RDD into DataFrame

In [17]:
# Alternative route: convert the RDD directly with toDF(), using the same schema.
data.toDF(schema=schema).show()

+---+----------+------+---------+-----+
| id|      name|number| category|price|
+---+----------+------+---------+-----+
|  1|     apple| 12345|    fruit|    3|
|  2|strawberry| 32439|    fruit|    1|
|  3|    Celery|766535|vegetable|    4|
|  4|  potatoes| 89733|vegetable|   10|
+---+----------+------+---------+-----+



## Filter a DataFrame with `where`

In [34]:
# Each where() narrows the result of the previous one, so only rows matching
# both predicates are shown.
(
    dataFrame
    .where("price>=4")
    .where("category=='vegetable'")
    .show()
)

+---+--------+------+---------+-----+
| id|    name|number| category|price|
+---+--------+------+---------+-----+
|  3|  Celery|766535|vegetable|    4|
|  4|potatoes| 89733|vegetable|   10|
+---+--------+------+---------+-----+



## Filter with SQL

In [25]:
# Register the full, unfiltered DataFrame as the temp view so that the SQL query
# run against T_TEMP performs the filtering itself. Registering a pre-filtered
# frame would make the SQL WHERE clause redundant and the SQL demo meaningless.
dataFrame.createOrReplaceTempView("T_TEMP")

# tmp_df no longer backs the view; it stays bound to the same filtered frame
# (lazily evaluated, so this costs nothing) in case another cell references it.
tmp_df = dataFrame.where("price>=4").where("category=='vegetable'")

In [31]:
# Query the T_TEMP view with Spark SQL: keep rows with price >= 4 in the
# 'vegetable' category. Adjacent literals concatenate into a single query string.
filter_sql = spark.sql(
    "SELECT * FROM T_TEMP "
    "WHERE price>=4 AND category ='vegetable'"
)

In [32]:
# Print the SQL result; the arguments are show()'s defaults, spelled out.
filter_sql.show(n=20, truncate=True)

+---+--------+------+---------+-----+
| id|    name|number| category|price|
+---+--------+------+---------+-----+
|  3|  Celery|766535|vegetable|    4|
|  4|potatoes| 89733|vegetable|   10|
+---+--------+------+---------+-----+

