In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse, if one is already running) a local Spark session named "test".
# "local[*]" runs Spark inside this machine with one worker thread per core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
)

In [3]:
# Column names, listed in the same order as the fields of each record below.
columns = ["coursename", "location", "name", "gender", "coursefee"]

# Sample learner records: (coursename, location, name, gender, coursefee).
# NOTE: several course names carry a trailing space; they are kept exactly
# as-is because the recorded outputs further down show those values.
data = [
    ("Hadoop ", "Mumbai", "Lokesh", "M", 9000),
    ("Spark ", "Banglore", "Rahul", "M", 7000),
    ("Scala ", "Newyork", "Venkat", "M", 6000),
    ("Python ", "Hydrabad", "Jasmin", "F", 7000),
    ("Java", "Dubai", "Pooja", "F", 12000),
]

In [7]:
# Build the DataFrame from the in-memory records; only column names are given,
# so Spark infers each column's type from the Python values.
df = spark.createDataFrame(data, schema=columns)

In [9]:
# Print the rows (show()'s defaults spelled out: up to 20 rows, long values truncated)...
df.show(n=20, truncate=True)
# ...followed by the column names and inferred types.
df.printSchema()

+----------+--------+------+------+---------+
|coursename|location|  name|gender|coursefee|
+----------+--------+------+------+---------+
|   Hadoop |  Mumbai|Lokesh|     M|     9000|
|    Spark |Banglore| Rahul|     M|     7000|
|    Scala | Newyork|Venkat|     M|     6000|
|   Python |Hydrabad|Jasmin|     F|     7000|
|      Java|   Dubai| Pooja|     F|    12000|
+----------+--------+------+------+---------+

root
 |-- coursename: string (nullable = true)
 |-- location: string (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- coursefee: long (nullable = true)



In [10]:
# Save the DataFrame in Parquet format. "overwrite" replaces any output left by
# a previous run, so this cell can be executed repeatedly.
df.write.parquet("tmp/table/learner.parquet", mode="overwrite")

In [11]:
# Load the Parquet output written above back into a new DataFrame.
parquetDF = spark.read.format("parquet").load("tmp/table/learner.parquet")
# Expose it to Spark SQL under the name "ParquetTable" (replacing any earlier
# temporary view with the same name).
parquetDF.createOrReplaceTempView("ParquetTable")

In [12]:
# Select all the records where coursefee is at least 7000
# (the comparison is >=, so rows with exactly 7000 are included)
parquetSQLDF = spark.sql("select * from ParquetTable where coursefee >= 7000 ")
parquetSQLDF.show()

+----------+--------+------+------+---------+
|coursename|location|  name|gender|coursefee|
+----------+--------+------+------+---------+
|   Hadoop |  Mumbai|Lokesh|     M|     9000|
|    Spark |Banglore| Rahul|     M|     7000|
|   Python |Hydrabad|Jasmin|     F|     7000|
|      Java|   Dubai| Pooja|     F|    12000|
+----------+--------+------+------+---------+



In [14]:
# Save the data again, this time partitioned by gender and then coursefee.
# "overwrite" replaces any output left by a previous run.
df.write.parquet(
    "tmp/table/learner2.parquet",
    mode="overwrite",
    partitionBy=["gender", "coursefee"],
)

In [15]:
# Load the partitioned output from its root directory so that the partition
# columns (gender, coursefee) are part of the resulting DataFrame.
parquetDF2 = spark.read.format("parquet").load("tmp/table/learner2.parquet")

In [17]:
# Create another temporary view from the partitioned data
parquetDF2.createOrReplaceTempView("ParquetTable2")
# Create a DataFrame with only the rows where gender is 'M' and coursefee is
# at least 7000 (the comparison is >=, so rows with exactly 7000 are included)
df2 = spark.sql("select * from ParquetTable2  where gender='M' and coursefee >= 7000")
# Print the physical plan. In the recorded output both predicates appear under
# PartitionFilters (PushedFilters is empty), i.e. they are applied to the
# partition columns rather than to the file contents.
df2.explain()

== Physical Plan ==
*(1) FileScan parquet [coursename#67,location#68,name#69,gender#70,coursefee#71] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/mnt/c/projects/notebook/CRT020/Tips/tmp/table/learner2.parquet], PartitionCount: 2, PartitionFilters: [isnotnull(gender#70), isnotnull(coursefee#71), (gender#70 = M), (coursefee#71 >= 7000)], PushedFilters: [], ReadSchema: struct<coursename:string,location:string,name:string>


In [22]:
# Column names and types. In the recorded output coursefee is reported as
# integer here, whereas the original DataFrame's schema listed it as long.
df2.printSchema()
# display() on a Spark DataFrame renders its one-line column/type summary in
# the recorded output, not the rows themselves...
display(df2)
# ...so show() is used to print the rows (defaults spelled out: up to 20 rows,
# long values truncated).
df2.show(n=20, truncate=True)

root
 |-- coursename: string (nullable = true)
 |-- location: string (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- coursefee: integer (nullable = true)



DataFrame[coursename: string, location: string, name: string, gender: string, coursefee: int]

+----------+--------+------+------+---------+
|coursename|location|  name|gender|coursefee|
+----------+--------+------+------+---------+
|   Hadoop |  Mumbai|Lokesh|     M|     9000|
|    Spark |Banglore| Rahul|     M|     7000|
+----------+--------+------+------+---------+



In [4]:
# Load a single partition directory directly. As the recorded output shows, the
# result has no gender column because the read starts below the gender=...
# level, while coursefee is still present.
# NOTE(review): Spark's "basePath" read option, pointed at the table root, is
# the documented way to keep gender in the result - confirm if that is wanted.
df3 = spark.read.format("parquet").load("tmp/table/learner2.parquet/gender=M")
df3.show(n=20, truncate=True)

+----------+--------+------+---------+
|coursename|location|  name|coursefee|
+----------+--------+------+---------+
|   Hadoop |  Mumbai|Lokesh|     9000|
|    Spark |Banglore| Rahul|     7000|
|    Scala | Newyork|Venkat|     6000|
+----------+--------+------+---------+



In [8]:
# Persist df3 as a Parquet table named "data", bucketed into 4 buckets by
# coursefee, with its files stored under tmp/bucket.
# mode("overwrite") matches the other writes in this notebook: without it the
# default save mode raises an error once table "data" already exists, so the
# cell could not be re-run within the same session.
# The bucket column is spelled "coursefee" to match the DataFrame schema.
(
    df3.write
    .format("parquet")
    .mode("overwrite")
    .bucketBy(4, "coursefee")
    .option("path", "tmp/bucket")
    .saveAsTable("data")
)

In [9]:
# Load the files written for the bucketed table straight from their directory
# and print the rows (show()'s defaults spelled out).
spark.read.format("parquet").load("tmp/bucket").show(n=20, truncate=True)

+----------+--------+------+---------+
|coursename|location|  name|coursefee|
+----------+--------+------+---------+
|   Hadoop |  Mumbai|Lokesh|     9000|
|    Spark |Banglore| Rahul|     7000|
|    Scala | Newyork|Venkat|     6000|
+----------+--------+------+---------+



In [30]:
def print_data(data):
    """Print the ``name`` attribute of a single record.

    Args:
        data: Any object exposing a ``name`` attribute (in this notebook, one
            row of ``df3``).
    """
    learner_name = data.name
    print(learner_name)

# DataFrame.foreach runs its callback inside Spark worker processes, so
# anything the callback prints goes to the workers' stdout and is not shown in
# the notebook cell. Stream the rows back to the driver and print them here so
# the names actually appear in the cell output.
for row in df3.toLocalIterator():
    print_data(row)

In [32]:
# Fetch at most the first two rows to the driver as a list of Row objects and
# render that list.
display(df3.limit(2).collect())

[Row(coursename='Hadoop ', location='Mumbai', name='Lokesh', coursefee=9000),
 Row(coursename='Spark ', location='Banglore', name='Rahul', coursefee=7000)]