In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, BooleanType, StringType, ArrayType

In [2]:
# Build (or reuse, if one already exists) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Test7")
    .getOrCreate()
)

In [3]:
# Sample records; each tuple is (firstname, middlename, lastname, id, gender, Salary).
data = [
    ("James ", "", "Smith", "36636", "M", 3000),
    ("Michael ", "Rose", "", "40288", "M", 4000),
    ("Robert ", "", "Williams", "42114", "M", 4000),
    ("Maria ", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Column names, listed in the same order as the tuple fields above.
columns = ["firstname", "middlename", "lastname", "id", "gender", "Salary"]

# Build the DataFrame from the records (no explicit types are given, only names)
# and print the resulting schema.
df = spark.createDataFrame(data, schema=columns)
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- Salary: long (nullable = true)



In [4]:
# Print the rows of df as a text table to eyeball the sample data.
df.show()

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|Salary|
+---------+----------+--------+-----+------+------+
|   James |          |   Smith|36636|     M|  3000|
| Michael |      Rose|        |40288|     M|  4000|
|  Robert |          |Williams|42114|     M|  4000|
|   Maria |      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



In [5]:
# Write the DataFrame to a parquet directory on local disk.
# mode("overwrite") replaces any output left by a previous run; without it the
# default save mode raises an error when the target path already exists, so the
# cell could only ever be executed once.
df.write.mode("overwrite").parquet("D:\\LAB\\Spark\\jupyter_notebooks\\Output\\customer.parquet")

In [6]:
# Read the parquet output back from disk into a second DataFrame and print
# its schema so it can be compared with the schema of the original df.
parquet_path = "D:\\LAB\\Spark\\jupyter_notebooks\\Output\\customer.parquet"
df1 = spark.read.parquet(parquet_path)
df1.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- Salary: long (nullable = true)



In [7]:
# Print the rows read back from parquet. Note that the row order shown here
# differs from the order in which the records were originally created.
df1.show()

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|Salary|
+---------+----------+--------+-----+------+------+
|  Robert |          |Williams|42114|     M|  4000|
|   Maria |      Anne|   Jones|39192|     F|  4000|
| Michael |      Rose|        |40288|     M|  4000|
|   James |          |   Smith|36636|     M|  3000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



In [8]:
# Register df1 under the name "parquetTable" so it can be queried with Spark SQL
# (an existing view with that name is replaced).
df1.createOrReplaceTempView("parquetTable")

# Run a SQL query against the view and keep the result as a new DataFrame.
salary_query = "select * from parquetTable where Salary >= 4000"
var1 = spark.sql(salary_query)

In [9]:
# describe() returns another DataFrame (a "summary" column plus one string column
# per input column), so this cell only echoes that DataFrame's repr rather than
# any statistics. Chain .show() onto the call to print the actual values.
var1.describe()

DataFrame[summary: string, firstname: string, middlename: string, lastname: string, id: string, gender: string, Salary: string]

In [10]:
# Print the filtered rows with cell values shown in full (no truncation).
var1.show(truncate=False)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|Salary|
+---------+----------+--------+-----+------+------+
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Michael  |Rose      |        |40288|M     |4000  |
+---------+----------+--------+-----+------+------+



In [12]:
# Create a temporary view directly on top of the parquet files, without going
# through a DataFrame first.
# "create or replace" keeps the cell re-runnable: a plain "create temporary view"
# fails once a view named "person" already exists in the session.
spark.sql(
    "create or replace temporary view person "
    "using parquet options (path 'D:/LAB/Spark/jupyter_notebooks/Output/customer.parquet')"
)
spark.sql("select * from person").show()

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|Salary|
+---------+----------+--------+-----+------+------+
|  Robert |          |Williams|42114|     M|  4000|
|   Maria |      Anne|   Jones|39192|     F|  4000|
| Michael |      Rose|        |40288|     M|  4000|
|   James |          |   Smith|36636|     M|  3000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



In [13]:
# Write the DataFrame to a local folder, partitioned by gender and then Salary.
# Backslashes in the Windows path are escaped explicitly: the unescaped form
# ("\L", "\S", "\j", ...) only worked because those happen to be unrecognised
# escape sequences, which Python deprecates.
df.write.partitionBy("gender", "Salary").mode("overwrite").parquet(
    "D:\\LAB\\Spark\\jupyter_notebooks\\Output\\customer2.parquet"
)

In [14]:
# Load only the gender=F partition of the partitioned output.
# Because the partition sub-directory is read directly, the resulting DataFrame
# has no "gender" column (see the output below).
# Backslashes are escaped explicitly instead of relying on unrecognised escape
# sequences such as "\g" being passed through unchanged.
df2 = spark.read.parquet(
    "D:\\LAB\\Spark\\jupyter_notebooks\\Output\\customer2.parquet\\gender=F"
)
df2.show(truncate=False)

+---------+----------+--------+-----+------+
|firstname|middlename|lastname|id   |Salary|
+---------+----------+--------+-----+------+
|Maria    |Anne      |Jones   |39192|4000  |
|Jen      |Mary      |Brown   |     |-1    |
+---------+----------+--------+-----+------+



In [16]:
# Create a temporary view on top of the gender=F partition directory.
# "create or replace" keeps the cell re-runnable: a plain "create temporary view"
# fails once a view named "person2" already exists in the session.
spark.sql(
    "create or replace temporary view person2 "
    "using parquet options (path 'D:/LAB/Spark/jupyter_notebooks/Output/customer2.parquet/gender=F')"
)
# Bare expression: the notebook echoes only the DataFrame repr (column names and
# types), not the rows.
spark.sql("select * from person2")

DataFrame[firstname: string, middlename: string, lastname: string, id: string, Salary: int]

In [17]:
# Query the person2 view and print its rows (only the gender=F records).
spark.sql("select * from person2").show()

+---------+----------+--------+-----+------+
|firstname|middlename|lastname|   id|Salary|
+---------+----------+--------+-----+------+
|   Maria |      Anne|   Jones|39192|  4000|
|      Jen|      Mary|   Brown|     |    -1|
+---------+----------+--------+-----+------+

