In [16]:

from pyspark.sql import SparkSession  #type: ignore 

In [17]:
# Build (or reuse) the notebook-wide SparkSession with Hive catalog support.
# enableHiveSupport() itself sets spark.sql.catalogImplementation to "hive",
# so the separate .config(...) call for that same key was redundant and has
# been removed; the resulting session configuration is unchanged.
spark = (
    SparkSession.builder
    .appName("datafram_writer_demo")
    .enableHiveSupport()
    .getOrCreate()
)

In [18]:
# Demonstration: a decorator that prints a timestamp each time the wrapped function is called

import datetime
import functools

def log_execution_time(func):
    """Decorate ``func`` so that every call first prints a timestamp.

    The printed line has the form
    ``Function <name> executed at: <datetime>`` and is emitted *before*
    ``func`` runs, so the timestamp marks the start of the call rather than
    its completion.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and returns its result unchanged. ``functools.wraps`` copies
        the wrapped function's metadata (such as ``__name__``) onto it.
    """

    @functools.wraps(func)
    def timestamped_call(*call_args, **call_kwargs):
        # Capture the time up front so it reflects when the call began.
        execution_time = datetime.datetime.now()
        print(f"Function {func.__name__} executed at: {execution_time}")
        outcome = func(*call_args, **call_kwargs)
        return outcome

    return timestamped_call


In [19]:
@log_execution_time
def my_spark_job():
    """Request a SparkSession named "Streaming Application" on master local[2].

    The session object is only held in a local variable and nothing is
    returned; the function exists to demonstrate the timestamp decorator.
    """
    from pyspark.sql import SparkSession

    # NOTE(review): getOrCreate() hands back an already-running session when
    # one exists, so the master setting here presumably has no effect if the
    # notebook-level session was created first — confirm intent.
    builder = (
        SparkSession.builder
        .appName("Streaming Application")
        .master("local[2]")
    )
    session = builder.getOrCreate()


my_spark_job()


Function my_spark_job executed at: 2025-03-12 13:16:45.464911


In [20]:
# DDL-style schema string for the orders data: column name followed by type.
orders_schema = (
    "order_id long , order_date string, "
    "customer_id long,order_status string"
)

In [21]:
# Load the orders CSV into a DataFrame, applying the explicit schema string
# held in orders_schema.
orders_df = (
    spark.read
    .schema(orders_schema)
    .format("csv")
    .load(r"C:\Users\sayed\Desktop\DDP\prac3\data\orders_1gb.csv")
)

In [22]:
# Preview the first 20 rows with long values truncated (show()'s defaults, spelled out).
orders_df.show(n=20, truncate=True)

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [23]:
# Display how many partitions back orders_df's underlying RDD.
orders_df.rdd.getNumPartitions()

9

In [24]:
# Write orders_df as CSV to the sparkwriterdemo1 directory; "overwrite" mode
# replaces any output already present there.
(
    orders_df.write
    .mode("overwrite")
    .format("csv")
    .save("C:\\Users\\sayed\\Desktop\\DDP\\prac3\\data\\sparkwriterdemo1")
)


In [25]:
# Write orders_df as CSV to the sparkwriterdemo2 directory; "overwrite" mode
# replaces any output already present there.
(
    orders_df.write
    .mode("overwrite")
    .format("csv")
    .save("C:\\Users\\sayed\\Desktop\\DDP\\prac3\\data\\sparkwriterdemo2")
)


In [26]:
# Write orders_df as JSON to the sparkwriterdemo3 directory; "overwrite" mode
# replaces any output already present there.
(
    orders_df.write
    .mode("overwrite")
    .format("json")
    .save("C:\\Users\\sayed\\Desktop\\DDP\\prac3\\data\\sparkwriterdemo3")
)

In [27]:
# Write orders_df as ORC to the sparkwriterdemo4 directory; "overwrite" mode
# replaces any output already present there.
(
    orders_df.write
    .mode("overwrite")
    .format("orc")
    .save("C:\\Users\\sayed\\Desktop\\DDP\\prac3\\data\\sparkwriterdemo4")
)

In [28]:
# Save-mode demo ("overwrite"): write JSON into the sparkwriterdemo4
# directory, replacing whatever output is already stored there.
(
    orders_df.write
    .mode("overwrite")
    .format("json")
    .save("C:\\Users\\sayed\\Desktop\\DDP\\prac3\\data\\sparkwriterdemo4")
)

In [29]:
# Save-mode demo ("ignore"): if the sparkwriterdemo4 directory already holds
# data, the write is silently skipped and the existing output is left as is.
(
    orders_df.write
    .mode("ignore")
    .format("json")
    .save("C:\\Users\\sayed\\Desktop\\DDP\\prac3\\data\\sparkwriterdemo4")
)

In [30]:
# Save-mode demo ("append"): add this DataFrame's JSON output to whatever is
# already stored in the sparkwriterdemo4 directory instead of replacing it.
(
    orders_df.write
    .mode("append")
    .format("json")
    .save("C:\\Users\\sayed\\Desktop\\DDP\\prac3\\data\\sparkwriterdemo4")
)