**Initialize the Spark Session**

In [1]:
from pyspark.sql import SparkSession
# Build (or reuse, if one is already running) the Spark session that every
# later cell uses through the name `spark`.
session_builder = SparkSession.builder.appName("SalesData Analysis")
spark = session_builder.getOrCreate()

**DataFrame Creation and Inspection**

In [24]:
# 1. Load the CSV with PySpark (first line is the header, column types inferred).
CSV_PATH = "/content/Sales_Dataset__500_Records_.csv"
a = spark.read.csv(CSV_PATH, header=True, inferSchema=True)
a.show()

# Load the same CSV with pandas.
import pandas as pd
b = pd.read_csv(CSV_PATH)
# A bare `b.head()` in the middle of a cell is evaluated and thrown away;
# display() makes the preview actually render.
display(b.head())

# 2. Display first 5 and last 5 records
print("First five rows:")
a.show(5)
print("last five rows:")
# Sorting by OrderID descending shows the five largest IDs, not the last five
# records. tail() (Spark 3.0+) returns the final rows in DataFrame order as a
# list of Row objects; wrapping them in a DataFrame keeps the tabular output.
spark.createDataFrame(a.tail(5), schema=a.schema).show()

# 3. Print schema and data types
a.printSchema()

+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|
|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|   2020-08-08|
|   8527|      Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.

**Selection, Renaming, and Filtering**

In [7]:
# 1. Project only the order id, customer name and amount columns.
a.select(["OrderID", "CustomerName", "Amount"]).show()

# 2. Rename Amount -> OrderAmount on a separate DataFrame; `a` is not modified.
df_renamed = a.withColumnRenamed("Amount", "OrderAmount")
df_renamed.select(["OrderID", "OrderAmount"]).show()

# 3. Keep only the orders whose Amount is above 500.
a.where(a["Amount"] > 500).show()

# 4. Orders per city of interest, one table for each city.
for city_name in ("New York", "Port Erin"):
    a.where(a["City"] == city_name).show()

+-------+------------------+------+
|OrderID|      CustomerName|Amount|
+-------+------------------+------+
|   2824|     Donald Walker|783.04|
|   7912|      Brandon Hall| 905.0|
|   4611|      Donald Booth|657.96|
|   3547|    Phillip Garcia|606.89|
|   8527|      Valerie Gray| 77.87|
|   4150|       Amber Perez|352.37|
|   5554|        Roy Martin|148.33|
|   2169|    Carolyn Daniel| 14.09|
|   6313|       Patty Perez| 79.83|
|   6155|Jonathan Wilkerson|882.68|
|   9830|       Kevin Hurst|870.55|
|   9085| Anthony Rodriguez|921.73|
|   2040|     Kyle Mcdonald|327.52|
|   6573|    Jeffrey Chavez|676.02|
|   2743|  Elizabeth Fowler| 47.06|
|   9837|     Tammy Sellers| 46.15|
|   6038|     David Bradley|348.51|
|   3060|       John Pierce|362.09|
|   4295|   Jennifer Powers|684.26|
|   5061|    George Chapman|251.89|
+-------+------------------+------+
only showing top 20 rows

+-------+-----------+
|OrderID|OrderAmount|
+-------+-----------+
|   2824|     783.04|
|   7912|      905.0|


**Data Manipulation**

In [10]:
from pyspark.sql.functions import col,when
# 1. Drop CustomerSince. The result is kept under its own name, so `a` still
#    carries the column.
df_dropped = a.drop("CustomerSince")
df_dropped.show()

# 2. FinalAmount = Amount - (Amount * Discount).
#    `a` is rebound here, so cells that run afterwards see the extra column.
amount_col = col("Amount")
discount_value = amount_col * col("Discount")
a = a.withColumn("FinalAmount", amount_col - discount_value)
a.show()

# 3. Largest FinalAmount first.
a.sort(col("FinalAmount").desc()).show()

# 4. Relabel the "Cancelled" status as "Order Cancelled"; every other status
#    is passed through unchanged.
is_cancelled = col("DeliveryStatus") == "Cancelled"
relabelled_status = when(is_cancelled, "Order Cancelled").otherwise(col("DeliveryStatus"))
df_status_updated = a.withColumn("DeliveryStatus", relabelled_status)
df_status_updated.select("OrderID", "DeliveryStatus").show()


+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+------------------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|       FinalAmount|
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+------------------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|           665.584|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|            877.85|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|          651.3804|
|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|          515.8565|
|   8527|      Valerie Gray|           Toys| 77.

**Aggregations and GroupBy**

In [11]:
# Import the functions module under an alias rather than importing `sum` by
# name: `from pyspark.sql.functions import sum` rebinds Python's built-in
# sum() to the Spark column function for every later cell in the notebook.
from pyspark.sql import functions as F

# 1. Count of orders by DeliveryStatus
a.groupBy("DeliveryStatus").count().show()

# 2. Average Amount by ProductCategory
a.groupBy("ProductCategory").agg(F.avg("Amount").alias("AvgAmount")).show()

# 3. Total sales by City
a.groupBy("City").agg(F.sum("Amount").alias("TotalSales")).show()


+--------------+-----+
|DeliveryStatus|count|
+--------------+-----+
|      Returned|  117|
|     Cancelled|  149|
|     Delivered|  119|
|       Pending|  115|
+--------------+-----+

+---------------+------------------+
|ProductCategory|         AvgAmount|
+---------------+------------------+
|        Fashion| 500.6308235294116|
|      Groceries|459.51786407766957|
|    Electronics|           551.745|
|          Books| 568.6003773584907|
|           Toys| 534.2837499999999|
+---------------+------------------+

+----------------+----------+
|            City|TotalSales|
+----------------+----------+
|     Ramseymouth|    761.06|
|East Edwardshire|    291.26|
|      Thomasberg|    882.68|
|     Laurenville|    383.26|
| South Colinstad|    786.27|
|    Lake Douglas|    975.09|
|   Williamsmouth|     10.78|
|      Gordonport|    514.99|
|  West Dawnmouth|      12.8|
|        Seanbury|    814.39|
|     Sheilaville|    981.05|
|       Mollybury|    222.02|
|       Lisaville|     45.69|
|

**Null Handling & Update**

In [13]:
from pyspark.sql.functions import lit
import random
# Simulate missing data: City becomes null for every order whose OrderID is a
# multiple of 5; all other rows keep their City.
multiple_of_five = (col("OrderID") % 5) == 0
city_or_null = when(multiple_of_five, None).otherwise(col("City"))
df_null = a.withColumn("City", city_or_null)

# Fill nulls: replace the missing City values with the placeholder "Unknown".
filled = df_null.na.fill("Unknown", subset=["City"])

# Drop nulls: discard the rows whose City is missing.
no_nulls = df_null.na.drop(subset=["City"])

# Tag high-value customers: Amount above 800 is "High Value", otherwise "Regular".
customer_type = when(col("Amount") > 800, "High Value").otherwise("Regular")
tagged = a.withColumn("CustomerType", customer_type)
tagged.select(["OrderID", "Amount", "CustomerType"]).show()


+-------+------+------------+
|OrderID|Amount|CustomerType|
+-------+------+------------+
|   2824|783.04|     Regular|
|   7912| 905.0|  High Value|
|   4611|657.96|     Regular|
|   3547|606.89|     Regular|
|   8527| 77.87|     Regular|
|   4150|352.37|     Regular|
|   5554|148.33|     Regular|
|   2169| 14.09|     Regular|
|   6313| 79.83|     Regular|
|   6155|882.68|  High Value|
|   9830|870.55|  High Value|
|   9085|921.73|  High Value|
|   2040|327.52|     Regular|
|   6573|676.02|     Regular|
|   2743| 47.06|     Regular|
|   9837| 46.15|     Regular|
|   6038|348.51|     Regular|
|   3060|362.09|     Regular|
|   4295|684.26|     Regular|
|   5061|251.89|     Regular|
+-------+------+------------+
only showing top 20 rows



 **Date & Time Functions**

In [15]:
# `round` is reached through the F alias instead of being imported by name:
# `from pyspark.sql.functions import round` rebinds Python's built-in round()
# for every later cell in the notebook.
from pyspark.sql import functions as F
from pyspark.sql.functions import year, month, current_date, datediff

# 1. Extract year and month from OrderDate
(
    a.withColumn("OrderYear", year("OrderDate"))
    .withColumn("OrderMonth", month("OrderDate"))
    .select("OrderDate", "OrderYear", "OrderMonth")
    .show()
)

# 2. Calculate loyalty in years: days between today and CustomerSince, divided
#    by 365 (leap days are not accounted for) and rounded to 2 decimals.
days_as_customer = datediff(current_date(), col("CustomerSince"))
loyalty = a.withColumn("LoyaltyYears", F.round(days_as_customer / 365, 2))
loyalty.select("CustomerName", "CustomerSince", "LoyaltyYears").show()


+----------+---------+----------+
| OrderDate|OrderYear|OrderMonth|
+----------+---------+----------+
|2024-12-26|     2024|        12|
|2024-09-12|     2024|         9|
|2025-01-12|     2025|         1|
|2024-03-24|     2024|         3|
|2024-08-04|     2024|         8|
|2024-01-13|     2024|         1|
|2024-03-04|     2024|         3|
|2023-10-07|     2023|        10|
|2023-06-27|     2023|         6|
|2024-10-14|     2024|        10|
|2024-04-08|     2024|         4|
|2024-10-02|     2024|        10|
|2023-12-15|     2023|        12|
|2024-12-14|     2024|        12|
|2024-09-18|     2024|         9|
|2024-09-10|     2024|         9|
|2023-08-03|     2023|         8|
|2023-12-25|     2023|        12|
|2024-03-19|     2024|         3|
|2023-11-28|     2023|        11|
+----------+---------+----------+
only showing top 20 rows

+------------------+-------------+------------+
|      CustomerName|CustomerSince|LoyaltyYears|
+------------------+-------------+------------+
|     Donald W

**Joins and Unions**

In [17]:
from pyspark.sql import Row
# 1. Region mapping DataFrame: a small City -> Region lookup table.
city_region_pairs = [
    ("Lake Roberto", "East"),
    ("Port Jesseville", "West"),
    ("Grayside", "South"),
    ("Lake Joseph", "North"),
    ("Mariastad", "West"),
]
region_data = [Row(City=city, Region=area) for city, area in city_region_pairs]
region = spark.createDataFrame(region_data)

# 2. Inner join: only orders whose City appears in the lookup table.
inner_joined = a.join(region, on="City", how="inner")
inner_joined.show()

# Left join: every order is kept; Region is null where the City has no mapping.
left_joined = a.join(region, on="City", how="left")
left_joined.show()

# 3. Union of the orders placed in 2023 with the orders placed in 2024.
order_year = year("OrderDate")
df_2023 = a.filter(order_year == 2023)
df_2024 = a.filter(order_year == 2024)
union_df = df_2023.union(df_2024)
union_df.show()


+---------------+-------+--------------+---------------+------+----------+--------------+--------+-----------+-------------+------------------+------+
|           City|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|PaymentMode|CustomerSince|       FinalAmount|Region|
+---------------+-------+--------------+---------------+------+----------+--------------+--------+-----------+-------------+------------------+------+
|   Lake Roberto|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Wallet|   2021-08-07|          651.3804|  East|
|Port Jesseville|   4150|   Amber Perez|          Books|352.37|2024-01-13|     Cancelled|    0.24|       Cash|   2022-01-13|          267.8012|  West|
|       Grayside|   2169|Carolyn Daniel|    Electronics| 14.09|2023-10-07|     Delivered|    0.25|Credit Card|   2021-05-09|10.567499999999999| South|
|    Lake Joseph|   5554|    Roy Martin|           Toys|148.33|2024-03-04|     Cancelled|    0

**Complex JSON Simulation**

In [20]:
from pyspark.sql.functions import to_json, struct, from_json, schema_of_json
# 1. Convert to JSON string: pack every column of a row into a struct and
#    serialise it into a new json_data column.
sales_json = a.withColumn("json_data", to_json(struct(*a.columns)))
sales_json.select("json_data").show(truncate=False)

# 2. Load back to DataFrame.
#    Parse with the source DataFrame's own schema instead of a schema inferred
#    from one sample row. Inferring from the first row needs an extra Spark
#    action, misses any column that is null in that row (to_json leaves null
#    fields out by default), and does not restore the original column types.
json_schema = a.schema
df_loaded = (
    sales_json
    .select(from_json("json_data", json_schema).alias("data"))
    .select("data.*")
)
df_loaded.show()

# Save the data as JSON files under the "sales_json" directory, replacing any
# previous output.
a.write.mode("overwrite").json("sales_json")

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|json_data                                                                                                                                                                                                                                                                        |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"OrderID":2824,"CustomerName":"Donald Walker","ProductCategory":"Books","Amount":783.04,"OrderDate":"2024-12-26","DeliveryStatus":"Returned","Discount":0.15,"City":"Lake 

**Applying Functions**

In [22]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
# user defined function
def tag_order(amount):
    """Classify an order amount into a size label.

    Args:
        amount: Numeric order amount, or None when the value is missing
            (Spark hands null column values to Python UDFs as None).

    Returns:
        "Big" for amounts above 800, "Medium" for amounts above 400 up to and
        including 800, "Small" for everything else, and None when the amount
        itself is None.
    """
    # Comparing None with a number raises TypeError, which would fail the whole
    # Spark job on the first null amount; propagate the null instead.
    if amount is None:
        return None
    if amount > 800:
        return "Big"
    if amount > 400:
        return "Medium"
    return "Small"
# Wrap tag_order as a Spark UDF whose result column is a string.
order_udf = udf(tag_order, returnType=StringType())

# Apply the UDF to Amount and show the resulting tag next to each order.
amount_tags = order_udf(a["Amount"])
df_tagged = a.withColumn("OrderTag", amount_tags)
df_tagged.select(["OrderID", "Amount", "OrderTag"]).show()


+-------+------+--------+
|OrderID|Amount|OrderTag|
+-------+------+--------+
|   2824|783.04|  Medium|
|   7912| 905.0|     Big|
|   4611|657.96|  Medium|
|   3547|606.89|  Medium|
|   8527| 77.87|   Small|
|   4150|352.37|   Small|
|   5554|148.33|   Small|
|   2169| 14.09|   Small|
|   6313| 79.83|   Small|
|   6155|882.68|     Big|
|   9830|870.55|     Big|
|   9085|921.73|     Big|
|   2040|327.52|   Small|
|   6573|676.02|  Medium|
|   2743| 47.06|   Small|
|   9837| 46.15|   Small|
|   6038|348.51|   Small|
|   3060|362.09|   Small|
|   4295|684.26|  Medium|
|   5061|251.89|   Small|
+-------+------+--------+
only showing top 20 rows



In [25]:
!pip install dash

Collecting dash
  Downloading dash-3.0.4-py3-none-any.whl.metadata (10 kB)
Collecting Flask<3.1,>=1.0.4 (from dash)
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug<3.1 (from dash)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading dash-3.0.4-py3-none-any.whl (7.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading flask-3.0.3-py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading werkzeug-3.0.6-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.0/228.0 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: Werkzeug, retryi