In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Build (or reuse) the Spark session used by every cell in this notebook.
# The spark-avro package is requested because later cells use format("avro").
# NOTE(review): the artifact is pinned to Scala 2.12 / version 3.5.0 — confirm
# this matches the Spark build installed in the kernel environment.
spark = (
    SparkSession.builder
    .appName("data formats")
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.5.0")
    .getOrCreate()
)


In [5]:
# Five sample orders, one tuple per row, in the same order as `columns`.
# price is a Python int; order_date is kept as a plain string.
data = [
    ("ORD001", "Delhi", "Laptop", 45000, "2024-01-05"),
    ("ORD002", "Mumbai", "Mobile", 32000, "2024-01-06"),
    ("ORD003", "Bangalore", "Tablet", 30000, "2024-01-07"),
    ("ORD004", "Delhi", "Laptop", 55000, "2024-01-08"),
    ("ORD005", "Mumbai", "Tablet", 34000, "2024-01-09"),
]
columns = ["order_id", "city", "product", "price", "order_date"]

# Only column names are supplied here, so no explicit column types are declared.
df = spark.createDataFrame(data, schema=columns)
df.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_date|
+--------+---------+-------+-----+----------+
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
+--------+---------+-------+-----+----------+



In [6]:
# Persist the orders as Parquet, replacing any existing output at this path.
df.write.format("parquet").mode("overwrite").save("data/parquet/orders")

In [7]:
# Load the Parquet files from the orders directory and display the rows.
df_parquet = spark.read.format("parquet").load("data/parquet/orders")
df_parquet.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_date|
+--------+---------+-------+-----+----------+
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
+--------+---------+-------+-----+----------+



Parquet format

Parquet is a columnar storage format optimized for analytical workloads. It offers efficient compression and encoding, which reduce storage space and I/O, and it improves query performance through column projection (reading only the columns a query needs) and predicate pushdown (skipping data that cannot match a filter). It is widely used in big data processing frameworks such as Apache Spark and Hadoop.



In [8]:
# Persist the same orders as ORC, replacing any existing output at this path.
# NOTE(review): the original note here says ORC is more compressed than Parquet
# and is used for historical data (e.g. insurance, banking). Compression ratios
# depend on the data and codec — confirm before relying on this.
df.write.format("orc").mode("overwrite").save("data/orc/orders")

In [9]:
# Load the ORC files from the orders directory and display the rows.
df_orc = spark.read.format("orc").load("data/orc/orders")
df_orc.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_date|
+--------+---------+-------+-----+----------+
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
+--------+---------+-------+-----+----------+



In [10]:
# Persist the same orders as Avro, replacing any existing output at this path.
# The "avro" format is not selected via a dedicated writer method here, so it
# is named explicitly.
df.write.save("data/avro/orders", format="avro", mode="overwrite")

In [12]:
# Load the Avro files from the orders directory and display the rows.
df_avro = spark.read.load("data/avro/orders", format="avro")
df_avro.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_date|
+--------+---------+-------+-----+----------+
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
+--------+---------+-------+-----+----------+

