In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum, col

# Step 1: Create a SparkSession
# "local[*]" runs Spark in-process on the local machine; getOrCreate() returns
# the already-active session if there is one instead of building a second.
spark = SparkSession.builder \
    .appName("SQLCommandsOnDataFrame") \
    .master("local[*]") \
    .getOrCreate()

# All work that uses the session lives inside try/finally so the session is
# stopped even if DataFrame creation, the aggregation, or show() raises.
try:
    # Step 2: Create a DataFrame with sample data
    # Each tuple is one purchase: (Name, Category, Amount).
    data = [
        ("Alice", "Electronics", 200),
        ("Bob", "Electronics", 150),
        ("Alice", "Clothing", 100),
        ("Bob", "Clothing", 50),
        ("Alice", "Electronics", 300)
    ]
    columns = ["Name", "Category", "Amount"]
    df = spark.createDataFrame(data, columns)

    # Step 3: Execute SQL-style functions
    # Calculate total and average spending by category.
    # NOTE: `sum` and `avg` are the pyspark.sql.functions column aggregates
    # imported at the top of the file; `sum` shadows Python's builtin here.
    result_df = df.groupBy("Category") \
        .agg(
            sum("Amount").alias("TotalSpent"),
            avg("Amount").alias("AverageSpent")
        )

    # Step 4: Show the result
    # show() is the action that actually triggers the computation and prints
    # the aggregated rows as a text table.
    result_df.show()
finally:
    # Stop the SparkSession (runs on both the success and the failure path)
    spark.stop()

+-----------+----------+------------------+
|   Category|TotalSpent|      AverageSpent|
+-----------+----------+------------------+
|Electronics|       650|216.66666666666666|
|   Clothing|       150|              75.0|
+-----------+----------+------------------+

