Import Statments

In [176]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, round, max, sum, rand, month, year,  current_date, datediff, col, when, min, udf
from pyspark.sql.functions import to_json, struct, from_json, get_json_object
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from google.colab import drive


Creating Spark session

In [151]:
spark = SparkSession.builder \
                    .appName("Sales data set") \
                    .getOrCreate()
spark

#Hands-On Exercises
## 1. DataFrame Creation and Inspection

In [152]:
drive.mount('/content/drive')

# Loading Data
sales_df = spark.read.csv('/content/drive/MyDrive/SalesData/SalesDataset.csv', header=True, inferSchema=True)

# Displaying first 5 and last 5 records
print("\n First 5 records")
sales_df.show(5)

print("\n Last 5 records")
sales_df.orderBy("OrderID", ascending=False).show(5)

# Print schema and check data types.
print("\n Displaying data schema")
sales_df.printSchema()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

 First 5 records
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|OrderID|  CustomerName|ProductCategory|Amount| OrderDate|DeliveryStatus|Discount|            City|PaymentMode|CustomerSince|
+-------+--------------+---------------+------+----------+--------------+--------+----------------+-----------+-------------+
|   2824| Donald Walker|          Books|783.04|2024-12-26|      Returned|    0.15|    Lake Joyside|Credit Card|   2020-10-15|
|   7912|  Brandon Hall|      Groceries| 905.0|2024-09-12|     Cancelled|    0.03|   New Jamesside|     Wallet|   2022-03-15|
|   4611|  Donald Booth|        Fashion|657.96|2025-01-12|      Returned|    0.01|    Lake Roberto|     Wallet|   2021-08-07|
|   3547|Phillip Garcia|        Fashion|606.89|2024-03-24|      Returned|    0.15|West Melanievie

## 2. Selection, Renaming, and Filtering

Select only OrderID, CustomerName, and Amount

In [153]:
sales_df.select("OrderID", "CustomerName", "Amount").show()

+-------+------------------+------+
|OrderID|      CustomerName|Amount|
+-------+------------------+------+
|   2824|     Donald Walker|783.04|
|   7912|      Brandon Hall| 905.0|
|   4611|      Donald Booth|657.96|
|   3547|    Phillip Garcia|606.89|
|   8527|      Valerie Gray| 77.87|
|   4150|       Amber Perez|352.37|
|   5554|        Roy Martin|148.33|
|   2169|    Carolyn Daniel| 14.09|
|   6313|       Patty Perez| 79.83|
|   6155|Jonathan Wilkerson|882.68|
|   9830|       Kevin Hurst|870.55|
|   9085| Anthony Rodriguez|921.73|
|   2040|     Kyle Mcdonald|327.52|
|   6573|    Jeffrey Chavez|676.02|
|   2743|  Elizabeth Fowler| 47.06|
|   9837|     Tammy Sellers| 46.15|
|   6038|     David Bradley|348.51|
|   3060|       John Pierce|362.09|
|   4295|   Jennifer Powers|684.26|
|   5061|    George Chapman|251.89|
+-------+------------------+------+
only showing top 20 rows



Rename Amount to OrderAmount

In [154]:
sales_df = sales_df.withColumnRenamed("Amount", "OrderAmount")
sales_df.show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|   2020-08-08|
|   8527|      Valerie Gray|           Toys|    

 Filter orders where Amount > 500.

In [155]:
sales_df.filter(sales_df.OrderAmount > 500).show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|CustomerSince|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-------------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|   2020-10-15|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|   2022-03-15|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|   2021-08-07|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|   2020-08-08|
|   6155|Jonathan Wilkerson|        Fashion|    

Filter orders from a specific city using .query() or .filter()

In [156]:
sales_df.filter(sales_df.City == "Lake Roberto").show()
sales_df.where(sales_df.City == "Lake Roberto").show()

+-------+------------+---------------+-----------+----------+--------------+--------+------------+-----------+-------------+
|OrderID|CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|        City|PaymentMode|CustomerSince|
+-------+------------+---------------+-----------+----------+--------------+--------+------------+-----------+-------------+
|   4611|Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|Lake Roberto|     Wallet|   2021-08-07|
+-------+------------+---------------+-----------+----------+--------------+--------+------------+-----------+-------------+

+-------+------------+---------------+-----------+----------+--------------+--------+------------+-----------+-------------+
|OrderID|CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|        City|PaymentMode|CustomerSince|
+-------+------------+---------------+-----------+----------+--------------+--------+------------+-----------+-------------+

## 3. Data Manipulation

 Drop CustomerSince column

In [157]:
sales_df = sales_df.drop("CustomerSince")
sales_df.show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|
|   8527|      Valerie Gray|           Toys|      77.87|2024-08-04|     Delivered|    0.17|        Mariastad|       Cash|
|   4150|       Amber Pe

Add a new column FinalAmount = Amount - (Amount * Discount)

In [158]:
sales_df = sales_df.withColumn("FinalAmount", round(sales_df.OrderAmount - (sales_df.OrderAmount * sales_df.Discount), 2))
sales_df.show()

+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-----------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate|DeliveryStatus|Discount|             City|PaymentMode|FinalAmount|
+-------+------------------+---------------+-----------+----------+--------------+--------+-----------------+-----------+-----------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|      Returned|    0.15|     Lake Joyside|Credit Card|     665.58|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|     Cancelled|    0.03|    New Jamesside|     Wallet|     877.85|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|      Returned|    0.01|     Lake Roberto|     Wallet|     651.38|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|      Returned|    0.15| West Melanieview|     Wallet|     515.86|
|   8527|      Valerie Gray|           Toys|      77.87|2024-0

 Sort by FinalAmount descending.

In [159]:
sales_df.orderBy("FinalAmount", ascending=False) \
        .select("OrderID", "CustomerName", "ProductCategory", "OrderAmount", "FinalAmount") \
        .show()

+-------+-----------------+---------------+-----------+-----------+
|OrderID|     CustomerName|ProductCategory|OrderAmount|FinalAmount|
+-------+-----------------+---------------+-----------+-----------+
|   5573|   Jordan Frazier|          Books|     981.05|     961.43|
|   8474|      Heidi Brown|    Electronics|     968.91|     949.53|
|   8889|      Karen Garza|          Books|      998.3|      938.4|
|   2127|     Jaclyn Moore|      Groceries|     933.32|     923.99|
|   9806|    Samantha Gill|      Groceries|     993.17|     923.65|
|   5593|   Kristy Johnson|        Fashion|     961.35|     913.28|
|   2120| Alejandra Santos|    Electronics|     948.84|     910.89|
|   5949|Dr. Michael Evans|           Toys|     918.14|     908.96|
|   1422|    Hunter Kramer|          Books|      973.2|     905.08|
|   2904|   Michelle Burns|        Fashion|     922.29|     903.84|
|   7566|    Jesus Houston|        Fashion|     899.31|     899.31|
|   7511|      Regina Diaz|           Toys|     

Replace all “Cancelled” status with “Order Cancelled”.

In [160]:
sales_df = sales_df.withColumn(
    "DeliveryStatus",
    when(col("DeliveryStatus") == "Cancelled", "Order Cancelled").otherwise(col("DeliveryStatus"))
)
sales_df.show()

+-------+------------------+---------------+-----------+----------+---------------+--------+-----------------+-----------+-----------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate| DeliveryStatus|Discount|             City|PaymentMode|FinalAmount|
+-------+------------------+---------------+-----------+----------+---------------+--------+-----------------+-----------+-----------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|       Returned|    0.15|     Lake Joyside|Credit Card|     665.58|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|Order Cancelled|    0.03|    New Jamesside|     Wallet|     877.85|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|       Returned|    0.01|     Lake Roberto|     Wallet|     651.38|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|       Returned|    0.15| West Melanieview|     Wallet|     515.86|
|   8527|      Valerie Gray|           Toys|      77.87

## 4.  Aggregations and GroupBy

 Count of orders by DeliveryStatus.

In [161]:
sales_df.groupBy(sales_df["DeliveryStatus"]) \
        .count().withColumnRenamed("count", "OrderCount") \
        .show()

+---------------+----------+
| DeliveryStatus|OrderCount|
+---------------+----------+
|       Returned|       117|
|      Delivered|       119|
|Order Cancelled|       149|
|        Pending|       115|
+---------------+----------+



Average Amount by ProductCategory

In [162]:
sales_df.groupBy(["ProductCategory"]) \
        .agg(round(avg("FinalAmount"), 2).alias("AvgFinalAmount")) \
        .show()

+---------------+--------------+
|ProductCategory|AvgFinalAmount|
+---------------+--------------+
|        Fashion|        422.53|
|      Groceries|        392.48|
|    Electronics|         472.5|
|          Books|        484.19|
|           Toys|        458.45|
+---------------+--------------+



Group by City and show total sales.

In [163]:
sales_df.groupBy("City") \
    .agg(round(sum("FinalAmount"), 2) \
    .alias("Total Sales")) \
    .show()

+----------------+-----------+
|            City|Total Sales|
+----------------+-----------+
|     Ramseymouth|     738.23|
|East Edwardshire|      230.1|
|      Thomasberg|     644.36|
|     Laurenville|     321.94|
| South Colinstad|     629.02|
|    Lake Douglas|     780.07|
|   Williamsmouth|       9.81|
|      Gordonport|     381.09|
|  West Dawnmouth|       10.5|
|        Seanbury|     627.08|
|     Sheilaville|     961.43|
|       Mollybury|     222.02|
|       Lisaville|      43.41|
| Lake Jerrymouth|     395.93|
|       Perezfort|     734.04|
|Port Nicoleshire|     111.04|
|  South Samantha|      169.8|
|     Port Willie|     780.25|
|     Waltersfort|     386.97|
|       Youngbury|     268.52|
+----------------+-----------+
only showing top 20 rows



## 5. Null Handling & Update

  Intentionally inject nulls in City column and handle them using fillna(), dropn().

In [164]:
# Filling Nulls in Column City
sales_df = sales_df.withColumn(
    "City",
    when(rand() < 0.2, None).otherwise(sales_df.City)
)
sales_df.show()

# Handling null with 'unknown'
sales_df = sales_df.fillna({"City": "Unknown"})
sales_df.show()

# Again injecting Null values
sales_df = sales_df.withColumn(
    "City",
    when(rand() < 0.2, None).otherwise(sales_df.City)
)
sales_df.show()

# Droping City names that contain null
sales_df = sales_df.dropna(subset=["City"])
sales_df.show()

+-------+------------------+---------------+-----------+----------+---------------+--------+-----------------+-----------+-----------+
|OrderID|      CustomerName|ProductCategory|OrderAmount| OrderDate| DeliveryStatus|Discount|             City|PaymentMode|FinalAmount|
+-------+------------------+---------------+-----------+----------+---------------+--------+-----------------+-----------+-----------+
|   2824|     Donald Walker|          Books|     783.04|2024-12-26|       Returned|    0.15|     Lake Joyside|Credit Card|     665.58|
|   7912|      Brandon Hall|      Groceries|      905.0|2024-09-12|Order Cancelled|    0.03|    New Jamesside|     Wallet|     877.85|
|   4611|      Donald Booth|        Fashion|     657.96|2025-01-12|       Returned|    0.01|     Lake Roberto|     Wallet|     651.38|
|   3547|    Phillip Garcia|        Fashion|     606.89|2024-03-24|       Returned|    0.15| West Melanieview|     Wallet|     515.86|
|   8527|      Valerie Gray|           Toys|      77.87

Use .when().otherwise() in PySpark to tag high-value customers (Amount >
 800).

In [165]:
sales_df = sales_df.withColumn("CustomerType",
                                when(col("FinalAmount") > 800, "High Value")
                                .otherwise("Regular")
                              )
sales_df.select("CustomerName", "CustomerType").show()

+------------------+------------+
|      CustomerName|CustomerType|
+------------------+------------+
|     Donald Walker|     Regular|
|      Brandon Hall|  High Value|
|      Donald Booth|     Regular|
|    Phillip Garcia|     Regular|
|      Valerie Gray|     Regular|
|       Amber Perez|     Regular|
|        Roy Martin|     Regular|
|    Carolyn Daniel|     Regular|
|       Patty Perez|     Regular|
|Jonathan Wilkerson|     Regular|
|       Kevin Hurst|     Regular|
|     Kyle Mcdonald|     Regular|
|    Jeffrey Chavez|     Regular|
|  Elizabeth Fowler|     Regular|
|     David Bradley|     Regular|
|       John Pierce|     Regular|
|    George Chapman|     Regular|
|      Taylor Heath|     Regular|
|    Timothy Duncan|  High Value|
|  Nicholas Mcbride|     Regular|
+------------------+------------+
only showing top 20 rows



## 6. Date & Time Functions

Extract year and month from OrderDate

In [166]:
sales_df = sales_df.withColumn("OrderMonth", month(sales_df.OrderDate)) \
                   .withColumn("OrderYear", year(sales_df.OrderDate))

sales_df.select("OrderID", "OrderDate", "OrderMonth", "OrderYear", "DeliveryStatus").show()

+-------+----------+----------+---------+---------------+
|OrderID| OrderDate|OrderMonth|OrderYear| DeliveryStatus|
+-------+----------+----------+---------+---------------+
|   2824|2024-12-26|        12|     2024|       Returned|
|   7912|2024-09-12|         9|     2024|Order Cancelled|
|   4611|2025-01-12|         1|     2025|       Returned|
|   3547|2024-03-24|         3|     2024|       Returned|
|   8527|2024-08-04|         8|     2024|      Delivered|
|   4150|2024-01-13|         1|     2024|Order Cancelled|
|   5554|2024-03-04|         3|     2024|Order Cancelled|
|   2169|2023-10-07|        10|     2023|      Delivered|
|   6313|2023-06-27|         6|     2023|Order Cancelled|
|   6155|2024-10-14|        10|     2024|Order Cancelled|
|   9830|2024-04-08|         4|     2024|      Delivered|
|   2040|2023-12-15|        12|     2023|       Returned|
|   6573|2024-12-14|        12|     2024|Order Cancelled|
|   2743|2024-09-18|         9|     2024|      Delivered|
|   6038|2023-

Calculate customer loyalty in years = today - CustomerSince.

In [167]:


customer_since_df = sales_df.groupBy("CustomerName").agg(min("OrderDate").alias("CustomerSince"))

sales_df = sales_df.join(customer_since_df, on="CustomerName", how="left")

sales_df = sales_df.withColumn(
    "CustomerLoyaltyYears",
    (datediff(current_date(), col("CustomerSince")) / 365).cast("int")
)

sales_df.select("CustomerName", "CustomerSince", "CustomerLoyaltyYears").show()


+------------------+-------------+--------------------+
|      CustomerName|CustomerSince|CustomerLoyaltyYears|
+------------------+-------------+--------------------+
|     Donald Walker|   2024-12-26|                   0|
|      Brandon Hall|   2024-09-12|                   0|
|      Donald Booth|   2025-01-12|                   0|
|    Phillip Garcia|   2024-03-24|                   1|
|      Valerie Gray|   2024-08-04|                   0|
|       Amber Perez|   2024-01-13|                   1|
|        Roy Martin|   2024-03-04|                   1|
|    Carolyn Daniel|   2023-10-07|                   1|
|       Patty Perez|   2023-06-27|                   1|
|Jonathan Wilkerson|   2024-10-14|                   0|
|       Kevin Hurst|   2024-04-08|                   1|
|     Kyle Mcdonald|   2023-12-15|                   1|
|    Jeffrey Chavez|   2024-12-14|                   0|
|  Elizabeth Fowler|   2024-09-18|                   0|
|     David Bradley|   2023-08-03|              

 ## 7. Joins and Unions

Create a second DataFrame with city-wise region mapping.

In [168]:
region_data = [
    ("Lake Joyside", "East"),
    ("New Jamesside", "West"),
    ("Lake Roberto", "Central")
]
region_df = spark.createDataFrame(region_data, ["City", "Region"])
region_df.show()

+-------------+-------+
|         City| Region|
+-------------+-------+
| Lake Joyside|   East|
|New Jamesside|   West|
| Lake Roberto|Central|
+-------------+-------+



Perform inner and left joins with the main dataset.

In [169]:
sales_df = sales_df.join(region_df, on="City", how="left")
sales_df.select("CustomerName", "City", "Region").show()

+------------------+-----------------+-------+
|      CustomerName|             City| Region|
+------------------+-----------------+-------+
|  Nicholas Mcbride|       East James|   NULL|
|       Amber Perez|  Port Jesseville|   NULL|
|    George Chapman|       North Chad|   NULL|
|     David Bradley|    Lake Toddland|   NULL|
|     Donald Walker|     Lake Joyside|   East|
|       John Pierce|       Brandtside|   NULL|
|      Taylor Heath|   West Elizabeth|   NULL|
|     Kyle Mcdonald|Lake Jenniferside|   NULL|
|        Roy Martin|          Unknown|   NULL|
|       Patty Perez|          Unknown|   NULL|
|Jonathan Wilkerson|          Unknown|   NULL|
|    Jeffrey Chavez|          Unknown|   NULL|
|  Elizabeth Fowler|          Unknown|   NULL|
|    Timothy Duncan|      Port Thomas|   NULL|
|      Donald Booth|     Lake Roberto|Central|
|     Donald Wright|South Lindseyside|   NULL|
|       Kevin Hurst|      Jeffreyberg|   NULL|
|    Phillip Garcia| West Melanieview|   NULL|
|      Brando

Union two datasets: e.g., orders from 2023 and 2024.

In [170]:
orders_2023 = [
    ("O001", "Alice", "2023-05-01", 250.0),
    ("O002", "Bob", "2023-06-15", 300.0)
]
orders_df_2023 = spark.createDataFrame(orders_2023, ["OrderID", "CustomerName", "OrderDate", "Amount"])

orders_2024 = [
    ("O101", "Charlie", "2024-03-20", 150.0),
    ("O102", "Diana", "2024-04-10", 500.0)
]
orders_df_2024 = spark.createDataFrame(orders_2024, ["OrderID", "CustomerName", "OrderDate", "Amount"])


combined_orders_df = orders_df_2023.union(orders_df_2024)


combined_orders_df.show()


+-------+------------+----------+------+
|OrderID|CustomerName| OrderDate|Amount|
+-------+------------+----------+------+
|   O001|       Alice|2023-05-01| 250.0|
|   O002|         Bob|2023-06-15| 300.0|
|   O101|     Charlie|2024-03-20| 150.0|
|   O102|       Diana|2024-04-10| 500.0|
+-------+------------+----------+------+



## 8. Complex JSON Simulation (Advanced)

Convert each order to a JSON string and load it back into a DataFrame.

In [171]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Use the Spark DataFrame, not the list
json_rdd = orders_df_2023.toJSON()

# Define schema for the JSON data
schema = StructType([
    StructField("OrderID", StringType(), True),
    StructField("CustomerName", StringType(), True),
    StructField("OrderDate", StringType(), True),
    StructField("Amount", DoubleType(), True)
])

# Loading data back to DataFrame
df_from_json = spark.read.schema(schema).json(json_rdd)

df_from_json.show()


+-------+------------+----------+------+
|OrderID|CustomerName| OrderDate|Amount|
+-------+------------+----------+------+
|   O001|       Alice|2023-05-01| 250.0|
|   O002|         Bob|2023-06-15| 300.0|
+-------+------------+----------+------+



Access nested fields using explode() and get_json_object().



In [172]:


data = [
    ('{"OrderID":"O001","CustomerName":"Alice","Amount":250.0}',),
    ('{"OrderID":"O002","CustomerName":"Bob","Amount":300.0}',)
]
json_df = spark.createDataFrame(data, ["order_json"])

json_df.select(
    get_json_object("order_json", "$.CustomerName").alias("Customer Name"),
    get_json_object("order_json", "$.Amount").alias("Amount")
).show()

+-------------+------+
|Customer Name|Amount|
+-------------+------+
|        Alice| 250.0|
|          Bob| 300.0|
+-------------+------+



 ## 9. Applying Functions

Create a function to tag orders: “Big”, “Medium”, “Small” based on Amount. (UDF Version)



In [174]:
def categorize_amount(amount):
    if amount > 600:
        return "Big"
    elif amount > 200:
        return "Medium"
    else:
        return "Small"

# Registering in PySpark UDF
categorize_amount_udf = udf(categorize_amount, StringType())

# Applying UDF
tagged_sales_df = sales_df.withColumn("AmountCategory", categorize_amount_udf(col("FinalAmount")))

tagged_sales_df.select("OrderID", "FinalAmount", "AmountCategory").show(truncate=False)


+-------+-----------+--------------+
|OrderID|FinalAmount|AmountCategory|
+-------+-----------+--------------+
|2612   |159.91     |Small         |
|4150   |267.8      |Medium        |
|5061   |183.88     |Small         |
|6038   |268.35     |Medium        |
|2824   |665.58     |Big           |
|3060   |351.23     |Medium        |
|1964   |572.72     |Medium        |
|2040   |307.87     |Medium        |
|5554   |108.28     |Small         |
|6313   |70.25      |Small         |
|6155   |644.36     |Big           |
|6573   |547.58     |Medium        |
|2743   |40.94      |Small         |
|9834   |878.43     |Big           |
|4611   |651.38     |Big           |
|3296   |434.99     |Medium        |
|9830   |679.03     |Big           |
|3547   |515.86     |Medium        |
|7912   |877.85     |Big           |
|2169   |10.57      |Small         |
+-------+-----------+--------------+
only showing top 20 rows



Create a function to tag orders: “Big”, “Medium”, “Small” based on Amount. (.apply() Version)

In [175]:
def categorize_amount_pandas(amount):
    if amount > 600:
        return "Big"
    elif amount > 200:
        return "Medium"
    else:
        return "Small"


sales_pd_df = sales_df.toPandas()
sales_pd_df['AmountCategory'] = sales_pd_df['FinalAmount'].apply(categorize_amount_pandas)

print(sales_pd_df[['OrderID', 'FinalAmount', 'AmountCategory']])


     OrderID  FinalAmount AmountCategory
0       8827       230.10         Medium
1       1046       321.94         Medium
2       5076       381.09         Medium
3       9095        10.50          Small
4       8935       222.02         Medium
..       ...          ...            ...
404     8700       411.72         Medium
405     3317       370.16         Medium
406     2341       114.99          Small
407     8155       244.57         Medium
408     6302       536.65         Medium

[409 rows x 3 columns]
