In [None]:
if 'spark' in locals() and isinstance(spark, SparkSession):
    print("Stopping existing SparkSession...")
    spark.stop()

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Sales Dataset Exploration").getOrCreate()
spark

In [None]:
#mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


1. DataFrame Creation and Inspection

In [105]:
# Load the CSV using Pandas, PySpark, and Dask.

# using spark
sales_df=spark.read.csv("/content/drive/MyDrive/Sales_Dataset.csv",header=True,inferSchema=True)

In [78]:
# Display the first 5 and last 5 records.
sales_df.show(5)
last_rows = sales_df.tail(5)
last_df = spark.createDataFrame(last_rows)
last_df.show()

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|   2022

In [79]:
# Print schema and check data types.
sales_df.printSchema()

root
 |-- OrderID: integer (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- ProductCategory: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- DeliveryStatus: string (nullable = true)
 |-- Discount: double (nullable = true)
 |-- City: string (nullable = true)
 |-- PaymentMode: string (nullable = true)
 |-- CustomerSince: date (nullable = true)



2.Selection, Renaming, and Filtering

In [80]:
# Select only OrderID , CustomerName , and Amount .
sales_df.select("OrderID","CustomerName","Amount").show()

+-------+------------------+------+
|OrderID|      CustomerName|Amount|
+-------+------------------+------+
|   2824|     Donald Walker|783.04|
|   7912|      Brandon Hall| 905.0|
|   4611|      Donald Booth|657.96|
|   3547|    Phillip Garcia|606.89|
|   8527|      Valerie Gray| 77.87|
|   4150|       Amber Perez|352.37|
|   5554|        Roy Martin|148.33|
|   2169|    Carolyn Daniel| 14.09|
|   6313|       Patty Perez| 79.83|
|   6155|Jonathan Wilkerson|882.68|
|   9830|       Kevin Hurst|870.55|
|   9085| Anthony Rodriguez|921.73|
|   2040|     Kyle Mcdonald|327.52|
|   6573|    Jeffrey Chavez|676.02|
|   2743|  Elizabeth Fowler| 47.06|
|   9837|     Tammy Sellers| 46.15|
|   6038|     David Bradley|348.51|
|   3060|       John Pierce|362.09|
|   4295|   Jennifer Powers|684.26|
|   5061|    George Chapman|251.89|
+-------+------------------+------+
only showing top 20 rows



In [81]:
# Rename Amount to OrderAmount
sales_df=sales_df.withColumnRenamed("Amount","OrderAmount")
sales_df.show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|   2020-08-08|
|   8527|      Valerie Gray|           Toys|    

In [82]:
# Filter orders where Amount > 500.
sales_df.filter(sales_df.OrderAmount > 500).show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|   2020-08-08|
|   6155|Jonathan Wilkerson|        Fashion|    

In [83]:
# Filter orders from a specific city using .query() or .filter().
sales_df.filter(sales_df.City == "New Barry").show()

+-------+-------------+---------------+-----------+----------+--------------+--------+---------+-----------+-------------+
|OrderID| CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|     City|PaymentMode|CustomerSince|
+-------+-------------+---------------+-----------+----------+--------------+--------+---------+-----------+-------------+
|   8239|Jessica Smith|          Books|     610.08|2023-07-31|     Delivered|    0.09|New Barry|        UPI|   2021-12-27|
+-------+-------------+---------------+-----------+----------+--------------+--------+---------+-----------+-------------+



3. Data Manipulation

In [84]:
# Drop CustomerSince column.
sales_df = sales_df.drop("CustomerSince")
sales_df.show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|
|   8527|      Valerie Gray|           Toys|      77.87|2024-08-04|     Delivered|    0.17|        Mariastad|       Cash|
|   4150|       Amber Pe

In [85]:
# Add a new column FinalAmount = Amount - (Amount * Discount).
sales_df = sales_df.withColumn("FinalAmount", sales_df.OrderAmount - (sales_df.OrderAmount * sales_df.Discount))
sales_df.show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+------------------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|       FinalAmount|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+------------------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|           665.584|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|            877.85|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|          651.3804|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|          515.8565|
|   8527|    

In [86]:
# Sort by FinalAmount descending.
sales_df = sales_df.sort("FinalAmount", ascending=False)
sales_df.show()

+-------+-----------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-----------------+
|OrderID|     CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|      FinalAmount|
+-------+-----------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-----------------+
|   5573|   Jordan Frazier|          Books|     981.05|2025-03-19|     Cancelled|    0.02|      Sheilaville|       Cash|          961.429|
|   8474|      Heidi Brown|    Electronics|     968.91|2023-11-23|     Cancelled|    0.02|       Riverafort|       Cash|         949.5318|
|   8889|      Karen Garza|          Books|      998.3|2024-10-17|     Cancelled|    0.06|       Johnsonton|Credit Card|938.4019999999999|
|   2127|     Jaclyn Moore|      Groceries|     933.32|2025-03-11|      Returned|    0.01|      Cherylhaven|       Cash|         923.9868|
|   9806|    Samantha Gill|

In [87]:
# Replace all “Cancelled” status with “Order Cancelled”.
from pyspark.sql.functions import regexp_replace
sales_df = sales_df.withColumn("DeliveryStatus",regexp_replace(sales_df.DeliveryStatus, "Cancelled", "Order Cancelled"))
sales_df.show()

+-------+-----------------+---------------+-----------+----------+---------------+--------+-----------------+-----------+-----------------+
|OrderID|     CustomerName|ProductCategory|OrderAmount| OrderDate| DeliveryStatus|Discount|             City|PaymentMode|      FinalAmount|
+-------+-----------------+---------------+-----------+----------+---------------+--------+-----------------+-----------+-----------------+
|   5573|   Jordan Frazier|          Books|     981.05|2025-03-19|Order Cancelled|    0.02|      Sheilaville|       Cash|          961.429|
|   8474|      Heidi Brown|    Electronics|     968.91|2023-11-23|Order Cancelled|    0.02|       Riverafort|       Cash|         949.5318|
|   8889|      Karen Garza|          Books|      998.3|2024-10-17|Order Cancelled|    0.06|       Johnsonton|Credit Card|938.4019999999999|
|   2127|     Jaclyn Moore|      Groceries|     933.32|2025-03-11|       Returned|    0.01|      Cherylhaven|       Cash|         923.9868|
|   9806|    Samanth

4. Aggregations and GroupBy

In [88]:
# Count of orders by DeliveryStatus.
sales_df.groupBy("DeliveryStatus").count().show()

+---------------+-----+
| DeliveryStatus|count|
+---------------+-----+
|       Returned|  117|
|      Delivered|  119|
|Order Cancelled|  149|
|        Pending|  115|
+---------------+-----+



In [89]:
# Average Amount by ProductCategory.
from pyspark.sql.functions import avg
sales_df.groupBy("ProductCategory").agg(avg("OrderAmount").alias("Average Amount")).show()

+---------------+------------------+
|ProductCategory|    Average Amount|
+---------------+------------------+
|        Fashion|500.63082352941205|
|      Groceries|459.51786407767014|
|    Electronics| 551.7450000000002|
|          Books| 568.6003773584907|
|           Toys| 534.2837499999999|
+---------------+------------------+



In [90]:
# Group by City and show total sales.
from pyspark.sql.functions import sum,col
from pyspark.sql.types import DoubleType
sales_df.groupBy("City").agg(sum(col("OrderAmount").cast(DoubleType())).alias("Total Sales")).show()

+----------------+-----------+
|            City|Total Sales|
+----------------+-----------+
|     Ramseymouth|     761.06|
|East Edwardshire|     291.26|
|    Lake Douglas|     975.09|
|      Thomasberg|     882.68|
| South Colinstad|     786.27|
|     Laurenville|     383.26|
|        Seanbury|     814.39|
|      Gordonport|     514.99|
|  West Dawnmouth|       12.8|
|   Williamsmouth|      10.78|
|     Sheilaville|     981.05|
|       Mollybury|     222.02|
|       Perezfort|     917.55|
| Lake Jerrymouth|     404.01|
|       Lisaville|      45.69|
|     Port Willie|     788.13|
|  South Samantha|     229.46|
|Port Nicoleshire|     133.78|
|Lake Rebeccabury|     891.66|
|      Valdezberg|     424.96|
+----------------+-----------+
only showing top 20 rows



Null Handling & Update


In [91]:
# Intentionally inject nulls in City column and handle them using fillna(),dropna().
from pyspark.sql.functions import when,col

# Intentionally inserting null values
sales_df = sales_df.withColumn("City", when(col("City").startswith("New"), None).otherwise(col("City")))
sales_df.where(col("City").isNull()).show()

# Using fillna() to fill null values
sales_df=sales_df.fillna({"City": "Unknown"})
sales_df.where(col('City')=='Unknown').show()

#Again intentionally inserting null values
sales_df = sales_df.withColumn("City", when(col("City").startswith("East"), None).otherwise(col("City")))
sales_df.where(col("City").isNull()).show()

# Using dropna() to drop null values
sales_df = sales_df.dropna(subset=["City"])
sales_df.where(col("City").isNull()).show()

+-------+---------------+---------------+-----------+----------+---------------+--------+----+-----------+------------------+
|OrderID|   CustomerName|ProductCategory|OrderAmount| OrderDate| DeliveryStatus|Discount|City|PaymentMode|       FinalAmount|
+-------+---------------+---------------+-----------+----------+---------------+--------+----+-----------+------------------+
|   9806|  Samantha Gill|      Groceries|     993.17|2024-11-12|Order Cancelled|    0.07|NULL|     Wallet|          923.6481|
|   8253|Kevin Patterson|          Books|     998.21|2024-08-17|Order Cancelled|    0.12|NULL|        UPI|          878.4248|
|   7912|   Brandon Hall|      Groceries|      905.0|2024-09-12|Order Cancelled|    0.03|NULL|     Wallet|            877.85|
|   5496|  Eric Ferguson|        Fashion|      865.0|2024-06-27|Order Cancelled|    0.03|NULL|Credit Card|            839.05|
|   9185|    Carol Moore|    Electronics|     766.67|2025-01-23|        Pending|    0.04|NULL|       Cash|          73

In [92]:
# Use .when().otherwise() in PySpark to tag high-value customers ( Amount >800 ).
from pyspark.sql.functions import when
sales_df = sales_df.withColumn("CustomerCategory", when(sales_df.OrderAmount > 800, "High-Value").otherwise("Low-Value"))
sales_df.show()

+-------+-----------------+---------------+-----------+----------+---------------+--------+-----------------+-----------+-----------------+----------------+
|OrderID|     CustomerName|ProductCategory|OrderAmount| OrderDate| DeliveryStatus|Discount|             City|PaymentMode|      FinalAmount|CustomerCategory|
+-------+-----------------+---------------+-----------+----------+---------------+--------+-----------------+-----------+-----------------+----------------+
|   5573|   Jordan Frazier|          Books|     981.05|2025-03-19|Order Cancelled|    0.02|      Sheilaville|       Cash|          961.429|      High-Value|
|   8474|      Heidi Brown|    Electronics|     968.91|2023-11-23|Order Cancelled|    0.02|       Riverafort|       Cash|         949.5318|      High-Value|
|   8889|      Karen Garza|          Books|      998.3|2024-10-17|Order Cancelled|    0.06|       Johnsonton|Credit Card|938.4019999999999|      High-Value|
|   2127|     Jaclyn Moore|      Groceries|     933.32|202

6. Date & Time Functions

In [95]:
# Extract year and month from OrderDate .
sales_df = sales_df.withColumn("OrderDate", sales_df.OrderDate.cast("date"))
sales_df = sales_df.withColumn("Year", sales_df.OrderDate.substr(1, 4))
sales_df = sales_df.withColumn("Month", sales_df.OrderDate.substr(6, 2))
sales_df.show()

+-------+-----------------+---------------+-----------+----------+---------------+--------+-----------------+-----------+-----------------+----------------+----+-----+
|OrderID|     CustomerName|ProductCategory|OrderAmount| OrderDate| DeliveryStatus|Discount|             City|PaymentMode|      FinalAmount|CustomerCategory|Year|Month|
+-------+-----------------+---------------+-----------+----------+---------------+--------+-----------------+-----------+-----------------+----------------+----+-----+
|   5573|   Jordan Frazier|          Books|     981.05|2025-03-19|Order Cancelled|    0.02|      Sheilaville|       Cash|          961.429|      High-Value|2025|   03|
|   8474|      Heidi Brown|    Electronics|     968.91|2023-11-23|Order Cancelled|    0.02|       Riverafort|       Cash|         949.5318|      High-Value|2023|   11|
|   8889|      Karen Garza|          Books|      998.3|2024-10-17|Order Cancelled|    0.06|       Johnsonton|Credit Card|938.4019999999999|      High-Value|2024

In [101]:
# Calculate customer loyalty in years = today - CustomerSince .
from pyspark.sql.functions import current_date, datediff,round
sales_df = sales_df.withColumn("CustomerSince", sales_df.CustomerSince.cast("date"))
sales_df = sales_df.withColumn("LoyaltyYears",round(datediff(current_date(), sales_df.CustomerSince) / 365,2))
sales_df.show()

+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|LoyaltyYears|
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|        4.64|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|        3.22|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|        3.82|
|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|   2020-08-08|      

7.Joins and Unions

In [106]:
# Create a second DataFrame with city-wise region mapping.
from pyspark.sql.functions import when, col
region_temp_df = sales_df.select("City").distinct().withColumn(
    "Region",
    when(col("City").startswith("North"), "North")
    .when(col("City").startswith("West"), "West")
    .when(col("City").startswith("East"), "East")
    .when(col("City").startswith("South"), "South")
    .otherwise("Unknown"))
region_temp_df.show()

+----------------+-------+
|            City| Region|
+----------------+-------+
|     Ramseymouth|Unknown|
|East Edwardshire|   East|
|      Thomasberg|Unknown|
|     Laurenville|Unknown|
| South Colinstad|  South|
|    Lake Douglas|Unknown|
|   Williamsmouth|Unknown|
|      Gordonport|Unknown|
|  West Dawnmouth|   West|
|        Seanbury|Unknown|
|     Sheilaville|Unknown|
|       Mollybury|Unknown|
|       Lisaville|Unknown|
| Lake Jerrymouth|Unknown|
|       Perezfort|Unknown|
|Port Nicoleshire|Unknown|
|  South Samantha|  South|
|     Port Willie|Unknown|
|     Waltersfort|Unknown|
|       Youngbury|Unknown|
+----------------+-------+
only showing top 20 rows



In [107]:
# Perform left join with the main dataset.
left_join_df = sales_df.join(region_temp_df, on="City", how="left")
left_join_df.show()

+-----------------+-------+------------------+---------------+------+----------+--------------+--------+-----------+-------------+-------+
|             City|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince| Region|
+-----------------+-------+------------------+---------------+------+----------+--------------+--------+-----------+-------------+-------+
|     Lake Joyside|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|Credit Card|   2020-10-15|Unknown|
|    New Jamesside|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|     Wallet|   2022-03-15|Unknown|
|     Lake Roberto|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Wallet|   2021-08-07|Unknown|
| West Melanieview|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|     Wallet|   2020-08-08|   West|
|        Mariastad|   8527|

In [108]:
#Perform inner join with the main dataset.
inner_join_df = sales_df.join(region_temp_df, on="City", how="inner")
inner_join_df.show()

+-----------------+-------+------------------+---------------+------+----------+--------------+--------+-----------+-------------+-------+
|             City|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince| Region|
+-----------------+-------+------------------+---------------+------+----------+--------------+--------+-----------+-------------+-------+
|     Lake Joyside|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|Credit Card|   2020-10-15|Unknown|
|    New Jamesside|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|     Wallet|   2022-03-15|Unknown|
|     Lake Roberto|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Wallet|   2021-08-07|Unknown|
| West Melanieview|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|     Wallet|   2020-08-08|   West|
|        Mariastad|   8527|

In [109]:
# Union two datasets: e.g., orders from 2023 and 2024.
from pyspark.sql.functions import year, to_date
sales_df = sales_df.withColumn("OrderDate", to_date("OrderDate"))
sales_2023 = sales_df.filter(year("OrderDate") == 2023)
sales_2024 = sales_df.filter(year("OrderDate") == 2024)
union_df = sales_2023.union(sales_2024)
union_df.show()


+-------+----------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+
|OrderID|    CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|
+-------+----------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+
|   2169|  Carolyn Daniel|    Electronics| 14.09|2023-10-07|     Delivered|    0.25|         Grayside|Credit Card|   2021-05-09|
|   6313|     Patty Perez|      Groceries| 79.83|2023-06-27|     Cancelled|    0.12|      Richardland|Credit Card|   2021-04-25|
|   2040|   Kyle Mcdonald|           Toys|327.52|2023-12-15|      Returned|    0.06|Lake Jenniferside|     Wallet|   2021-07-21|
|   6038|   David Bradley|        Fashion|348.51|2023-08-03|      Returned|    0.23|    Lake Toddland|        UPI|   2022-09-07|
|   3060|     John Pierce|           Toys|362.09|2023-12-25|      Returned|    0.03|       Brandt

8. Complex JSON Simulation (Advanced)

In [110]:
# Convert each order to a JSON string and load it back into a DataFrame.
from pyspark.sql.functions import to_json, struct, from_json
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

sales_json = sales_df.withColumn("json_order", to_json(struct("*")))
sales_json.select("json_order").show(truncate=False)

schema = sales_df.schema
sales_from_json = sales_json.select(from_json("json_order", schema).alias("data")).select("data.*")
sales_from_json.show()

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|json_order                                                                                                                                                                                                                                       |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"OrderID":2824,"CustomerName":"Donald Walker","ProductCategory":"Books","Amount":783.04,"OrderDate":"2024-12-26","DeliveryStatus":"Returned","Discount":0.15,"City":"Lake Joyside","PaymentMode":"Credit Card","CustomerSince":"2020-10-15"}    |
|{"OrderID":7912,"Custom

In [111]:
# Access nested fields using get_json_object and explode()
from pyspark.sql.functions import get_json_object
sales_json.select(get_json_object("json_order", "$.City").alias("CityFromJSON")).show()

+-----------------+
|     CityFromJSON|
+-----------------+
|     Lake Joyside|
|    New Jamesside|
|     Lake Roberto|
| West Melanieview|
|        Mariastad|
|  Port Jesseville|
|      Lake Joseph|
|         Grayside|
|      Richardland|
|       Thomasberg|
|      Jeffreyberg|
|        Port Erin|
|Lake Jenniferside|
|      Teresaburgh|
|   East Nathaniel|
|       Tracyville|
|    Lake Toddland|
|       Brandtside|
|         Lammouth|
|       North Chad|
+-----------------+
only showing top 20 rows



9. Applying Functions

In [112]:
# Create a function to tag orders: “Big”, “Medium”, “Small” based on Amount.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
def tag_order(amount):
    if amount >= 10000:
        return "Big"
    elif amount >= 5000:
        return "Medium"
    else:
        return "Small"
tag_order_udf = udf(tag_order, StringType())
sales_df = sales_df.withColumn("OrderTag", tag_order_udf(col("Amount")))
sales_df.select("Amount", "OrderTag").show()


+------+--------+
|Amount|OrderTag|
+------+--------+
|783.04|   Small|
| 905.0|   Small|
|657.96|   Small|
|606.89|   Small|
| 77.87|   Small|
|352.37|   Small|
|148.33|   Small|
| 14.09|   Small|
| 79.83|   Small|
|882.68|   Small|
|870.55|   Small|
|921.73|   Small|
|327.52|   Small|
|676.02|   Small|
| 47.06|   Small|
| 46.15|   Small|
|348.51|   Small|
|362.09|   Small|
|684.26|   Small|
|251.89|   Small|
+------+--------+
only showing top 20 rows



In [114]:
import pandas as pd
sales_df1=pd.read_csv("/content/drive/MyDrive/Sales_Dataset.csv")
def tag_order(amount):
    if amount >= 10000:
        return "Big"
    elif amount >= 5000:
        return "Medium"
    else:
        return "Small"
sales_df1["OrderTag"] = sales_df1["Amount"].apply(tag_order)
print(sales_df1[["Amount", "OrderTag"]])

     Amount OrderTag
0    783.04    Small
1    905.00    Small
2    657.96    Small
3    606.89    Small
4     77.87    Small
..      ...      ...
495  680.00    Small
496  285.32    Small
497  792.11    Small
498  578.49    Small
499  904.97    Small

[500 rows x 2 columns]
