In [1]:
from pyspark.sql import SparkSession

spark=SparkSession.builder\
      .appName("Colab PySpark Setup")\
      .getOrCreate()

spark

In [32]:
import os

file_path = '/content/drive/MyDrive/PysparkDemo/SalesDataset500Records.csv'
print(os.path.exists(file_path))

True


In [40]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

from pyspark.sql.functions import avg, max, sum, count, year, current_date, datediff, col


sales_df=spark.read.csv('/content/drive/MyDrive/PysparkDemo/SalesDataset500Records.csv',header=True,inferSchema=True)
sales_df.show()

Mounted at /content/drive
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|
|   3547|    Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|   2020-08-08|
|   8527|      Valerie Gray|           Toys| 77.87|2024-0

In [41]:
# 1. DataFrame Creation and Inspection
#  Load the CSV using Pandas, PySpark, and Dask.
#  Display the first 5 and last 5 records.
#  Print schema and check data types.

sales_df.show(5)
sales_df.tail(5)
sales_df.printSchema()

+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanieview|     Wallet|   2020-08-08|
|   8527|  Valerie Gray|           Toys| 77.87|2024-08-04|     Delivered|    0.17|       Mariastad|       Cash|   2022

In [42]:
#  2. Selection, Renaming, and Filtering
#  Select only OrderID, CustomerName, and Amount.
#  Rename Amount to OrderAmount.
#  Filter orders where Amount > 500.
#  Filter orders from a specific city using .query() or .filter()
sales_df.select("OrderID","CustomerName","Amount").withColumnRenamed("Amount","OrderAmount").show()
sales_df.filter((sales_df.Amount)>500).show()
sales_df.filter((sales_df.City)=="Mariastad").show()

+-------+------------------+-----------+
|OrderID|      CustomerName|OrderAmount|
+-------+------------------+-----------+
|   2824|     Donald Walker|     783.04|
|   7912|      Brandon Hall|      905.0|
|   4611|      Donald Booth|     657.96|
|   3547|    Phillip Garcia|     606.89|
|   8527|      Valerie Gray|      77.87|
|   4150|       Amber Perez|     352.37|
|   5554|        Roy Martin|     148.33|
|   2169|    Carolyn Daniel|      14.09|
|   6313|       Patty Perez|      79.83|
|   6155|Jonathan Wilkerson|     882.68|
|   9830|       Kevin Hurst|     870.55|
|   9085| Anthony Rodriguez|     921.73|
|   2040|     Kyle Mcdonald|     327.52|
|   6573|    Jeffrey Chavez|     676.02|
|   2743|  Elizabeth Fowler|      47.06|
|   9837|     Tammy Sellers|      46.15|
|   6038|     David Bradley|     348.51|
|   3060|       John Pierce|     362.09|
|   4295|   Jennifer Powers|     684.26|
|   5061|    George Chapman|     251.89|
+-------+------------------+-----------+
only showing top

In [43]:
#  3. Data Manipulation
#  Drop CustomerSince column.
#  Add a new column FinalAmount = Amount - (Amount * Discount).
#  Sort by FinalAmount descending.
#  Replace all “Cancelled” status with “Order Cancelled”.
from pyspark.sql.functions import col
sales_modified_df = sales_df.drop("CustomerSince")

sales_df = sales_df.withColumn("FinalAmount", col("Amount") - (col("Amount") * col("Discount")))
sales_df.orderBy(col("FinalAmount").desc()).show()



+-------+-----------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+-----------------+
|OrderID|     CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|      FinalAmount|
+-------+-----------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+-----------------+
|   5573|   Jordan Frazier|          Books|981.05|2025-03-19|     Cancelled|    0.02|      Sheilaville|       Cash|   2021-07-12|          961.429|
|   8474|      Heidi Brown|    Electronics|968.91|2023-11-23|     Cancelled|    0.02|       Riverafort|       Cash|   2023-03-19|         949.5318|
|   8889|      Karen Garza|          Books| 998.3|2024-10-17|     Cancelled|    0.06|       Johnsonton|Credit Card|   2020-12-17|938.4019999999999|
|   2127|     Jaclyn Moore|      Groceries|933.32|2025-03-11|      Returned|    0.01|      Cherylhaven|       Ca

In [44]:
# 4. Aggregations and GroupBy
#  Count of orders by DeliveryStatus.
#  Average Amount by ProductCategory.
#  Group by City and show total sales.
sales_df.groupBy("DeliveryStatus").count().show()
sales_df.groupBy("ProductCategory").avg("Amount").show()
sales_df.groupBy("City").sum("Amount").withColumnRenamed("sum(Amount)", "TotalSales").show()



+--------------+-----+
|DeliveryStatus|count|
+--------------+-----+
|      Returned|  117|
|     Cancelled|  149|
|     Delivered|  119|
|       Pending|  115|
+--------------+-----+

+---------------+------------------+
|ProductCategory|       avg(Amount)|
+---------------+------------------+
|        Fashion| 500.6308235294116|
|      Groceries|459.51786407766957|
|    Electronics|           551.745|
|          Books| 568.6003773584907|
|           Toys| 534.2837499999999|
+---------------+------------------+

+----------------+----------+
|            City|TotalSales|
+----------------+----------+
|     Ramseymouth|    761.06|
|East Edwardshire|    291.26|
|      Thomasberg|    882.68|
|     Laurenville|    383.26|
| South Colinstad|    786.27|
|    Lake Douglas|    975.09|
|   Williamsmouth|     10.78|
|      Gordonport|    514.99|
|  West Dawnmouth|      12.8|
|        Seanbury|    814.39|
|     Sheilaville|    981.05|
|       Mollybury|    222.02|
|       Lisaville|     45.69|
|

In [45]:
# 5. Null Handling & Update
#  Intentionally inject nulls in City column and handle them using fillna(),
#  dropna().
#  Use .when().otherwise() in PySpark to tag high-value customers (Amount >
#  800).
from pyspark.sql.functions import when
sales_df = sales_df.withColumn("City", when(col("City").isNull(), "Unknown").otherwise(col("City")))
sales_df = sales_df.na.drop(subset=["City"])
sales_df = sales_df.withColumn("CustomerType", when(col("Amount") > 800, "High-Value").otherwise("Regular"))
sales_df.show()

+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+------------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|       FinalAmount|CustomerType|
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+------------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|           665.584|     Regular|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|            877.85|  High-Value|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|          651.3804|     Regular|
|   3547|    Phillip G

In [47]:
#  6. Date & Time Functions
#  Extract year and month from OrderDate.
#  Calculate customer loyalty in years = today - CustomerSince.
from pyspark.sql.functions import year, month, datediff, current_date

sales_df = sales_df.withColumn("Year", year("OrderDate")).withColumn("Month", month("OrderDate"))
sales_df = sales_df.withColumn("LoyaltyYears", (datediff(current_date(), col("CustomerSince")) / 365).cast("int"))
sales_df.show()


+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+------------+----+-----+------------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|       FinalAmount|CustomerType|Year|Month|LoyaltyYears|
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+------------+----+-----+------------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|           665.584|     Regular|2024|   12|           4|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|            877.85|  High-Value|2024|    9|           3|
|   4611|      Donald Booth|        Fashion|657.96|2025-01-12|   

In [49]:
#  7. Joins and Unions
#  Create a second DataFrame with city-wise region mapping.
#  Perform inner and left joins with the main dataset.
#  Union two datasets: e.g., orders from 2023 and 2024.
o23 = sales_df.filter(year("OrderDate") == 2023)
o24 = sales_df.filter(year("OrderDate") == 2024)

combined_df = o23.union(o24)
combined_df.show()

+-------+----------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+------------+----+-----+------------+
|OrderID|    CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|       FinalAmount|CustomerType|Year|Month|LoyaltyYears|
+-------+----------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+------------+----+-----+------------+
|   2169|  Carolyn Daniel|    Electronics| 14.09|2023-10-07|     Delivered|    0.25|         Grayside|Credit Card|   2021-05-09|10.567499999999999|     Regular|2023|   10|           4|
|   6313|     Patty Perez|      Groceries| 79.83|2023-06-27|     Cancelled|    0.12|      Richardland|Credit Card|   2021-04-25|           70.2504|     Regular|2023|    6|           4|
|   2040|   Kyle Mcdonald|           Toys|327.52|2023-12-15|      Returned|

In [56]:
#  8. Complex JSON Simulation (Advanced)
#  Convert each order to a JSON string and load it back into a DataFrame.
#  Access nested fields using explode() and get_json_object().
from pyspark.sql.functions import to_json, from_json, schema_of_json, explode,struct

json_df = sales_df.withColumn("json_data", to_json(struct([col(c) for c in sales_df.columns])))
schema = schema_of_json(json_df.select("json_data").first()[0])

j_df = json_df.select(from_json("json_data", schema).alias("data"))
j_df.select("data.OrderID", "data.CustomerName").show()


+-------+------------------+
|OrderID|      CustomerName|
+-------+------------------+
|   2824|     Donald Walker|
|   7912|      Brandon Hall|
|   4611|      Donald Booth|
|   3547|    Phillip Garcia|
|   8527|      Valerie Gray|
|   4150|       Amber Perez|
|   5554|        Roy Martin|
|   2169|    Carolyn Daniel|
|   6313|       Patty Perez|
|   6155|Jonathan Wilkerson|
|   9830|       Kevin Hurst|
|   9085| Anthony Rodriguez|
|   2040|     Kyle Mcdonald|
|   6573|    Jeffrey Chavez|
|   2743|  Elizabeth Fowler|
|   9837|     Tammy Sellers|
|   6038|     David Bradley|
|   3060|       John Pierce|
|   4295|   Jennifer Powers|
|   5061|    George Chapman|
+-------+------------------+
only showing top 20 rows



In [55]:
#  9. Applying Functions
# Create a function to tag orders: “Big”, “Medium”, “Small” based on Amount.
#  Apply it using
# .apply() in Pandas, and UDF in PySpark

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def order(amount):
    if amount > 800:
        return "Big"
    elif amount > 300:
        return "Medium"
    else:
        return "Small"

order_udf = udf(order, StringType())
sales_df = sales_df.withColumn("OrderTag", order_udf("Amount"))
sales_df.show()

+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+------------+----+-----+------------+--------+
|OrderID|      CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|       FinalAmount|CustomerType|Year|Month|LoyaltyYears|OrderTag|
+-------+------------------+---------------+------+----------+--------------+--------+-----------------+-----------+-------------+------------------+------------+----+-----+------------+--------+
|   2824|     Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|           665.584|     Regular|2024|   12|           4|  Medium|
|   7912|      Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|            877.85|  High-Value|2024|    9|           3|     Big|
|   4611|      Donal