In [None]:
# Setup
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *

# Obtain the SparkSession used by every cell below (getOrCreate reuses an
# already-running session instead of starting a second one)
spark = (
    SparkSession.builder
    .appName("Aggregations and GroupBy")
    .getOrCreate()
)

# Sample orders, one tuple per row:
# (order_id, date, customer, category, product, quantity, price, region)
sales_data = [
    (1, "2024-01-15", "Alice", "Electronics", "Laptop", 2, 1200.00, "North"),
    (2, "2024-01-15", "Bob", "Books", "Python Guide", 1, 45.00, "South"),
    (3, "2024-01-16", "Alice", "Electronics", "Mouse", 3, 25.00, "North"),
    (4, "2024-01-16", "Charlie", "Electronics", "Keyboard", 1, 75.00, "East"),
    (5, "2024-01-17", "Bob", "Books", "Data Science", 2, 60.00, "South"),
    (6, "2024-01-17", "Diana", "Clothing", "Shirt", 4, 30.00, "West"),
    (7, "2024-01-18", "Alice", "Electronics", "Monitor", 1, 300.00, "North"),
    (8, "2024-01-18", "Eve", "Clothing", "Pants", 2, 50.00, "West"),
    (9, "2024-01-19", "Charlie", "Books", "ML Handbook", 1, 80.00, "East"),
    (10, "2024-01-19", "Diana", "Electronics", "Tablet", 1, 400.00, "West"),
]

# Build the DataFrame from the tuples, then derive two columns:
#   - total_amount: the line total, quantity * price
#   - date: the "yyyy-MM-dd" string parsed into a date, replacing the original column
df = (
    spark.createDataFrame(
        sales_data,
        ["order_id", "date", "customer", "category", "product", "quantity", "price", "region"],
    )
    .withColumn("total_amount", F.col("quantity") * F.col("price"))
    .withColumn("date", F.to_date(F.col("date"), "yyyy-MM-dd"))
)

print("Sample sales data:")
df.show()
df.printSchema()


In [None]:
# Whole-table aggregations: without a groupBy, each agg() call returns one row
print("=== BASIC AGGREGATIONS ===")

# Column handle shared by all the numeric aggregates below
amount = F.col("total_amount")

# Counts, totals and extremes
print("1. Basic statistics:")
summary_exprs = [
    F.count("*").alias("total_orders"),
    F.sum(amount).alias("total_revenue"),
    F.avg(amount).alias("avg_order_value"),
    F.min(amount).alias("min_order"),
    F.max(amount).alias("max_order"),
]
df.agg(*summary_exprs).show()

# Distinct counts and spread; stddev/variance are the sample versions
# (aliases of stddev_samp / var_samp)
print("\n2. More aggregate functions:")
spread_exprs = [
    F.countDistinct("customer").alias("unique_customers"),
    F.countDistinct("category").alias("unique_categories"),
    F.stddev(amount).alias("std_deviation"),
    F.variance(amount).alias("variance"),
]
df.agg(*spread_exprs).show()

# Approximate quantiles plus the shape of the distribution
print("\n3. Statistical functions:")
shape_exprs = [
    F.percentile_approx(amount, 0.5).alias("median"),
    F.percentile_approx(amount, 0.25).alias("q1"),
    F.percentile_approx(amount, 0.75).alias("q3"),
    F.skewness(amount).alias("skewness"),
    F.kurtosis(amount).alias("kurtosis"),
]
df.agg(*shape_exprs).show()

# Gather values into arrays: collect_list keeps duplicates, collect_set drops
# them; element order is not guaranteed in either case
print("\n4. String aggregations:")
name_exprs = [
    F.collect_list("customer").alias("all_customers"),
    F.collect_set("customer").alias("unique_customers_list"),
]
df.agg(*name_exprs).show(truncate=False)


In [None]:
# Grouped aggregations: one output row per distinct key (or key combination)
print("=== GROUPBY OPERATIONS ===")

# Column handle shared by the revenue aggregates below
amount = F.col("total_amount")

print("1. Group by single column:")
per_category = df.groupBy("category")
per_category.agg(
    F.count("*").alias("order_count"),
    F.sum(amount).alias("total_revenue"),
    F.avg(amount).alias("avg_order_value"),
).show()

# Two grouping keys -> one row per (category, region) pair present in the data
print("\n2. Group by multiple columns:")
per_category_region = df.groupBy("category", "region").agg(
    F.count("*").alias("order_count"),
    F.sum(amount).alias("total_revenue"),
)
per_category_region.orderBy("category", "region").show()

# Per-customer profile, biggest spender first
print("\n3. Group by customer analysis:")
customer_stats = (
    df.groupBy("customer")
    .agg(
        F.count("*").alias("order_count"),
        F.sum(amount).alias("total_spent"),
        F.avg(amount).alias("avg_order_value"),
        F.countDistinct("category").alias("categories_purchased"),
        F.max(amount).alias("largest_order"),
    )
    .orderBy(F.col("total_spent").desc())
)

customer_stats.show()

print("\n4. Date-based grouping:")
per_day = df.groupBy("date").agg(
    F.count("*").alias("daily_orders"),
    F.sum(amount).alias("daily_revenue"),
)
per_day.orderBy("date").show()

# Conditional sums: each revenue column only adds rows of its own category
# (other rows contribute 0), which gives a per-region breakdown in one pass
print("\n5. Advanced grouping with conditions:")
category_revenue_exprs = [
    F.sum(
        F.when(F.col("category") == category_name, F.col("total_amount")).otherwise(0)
    ).alias(f"{category_name.lower()}_revenue")
    for category_name in ("Electronics", "Books", "Clothing")
]
df.groupBy("region").agg(
    F.count("*").alias("total_orders"),
    *category_revenue_exprs,
).show()


In [None]:
# Window functions: compute a value for every row over a set of related rows,
# keeping all input rows (unlike groupBy, which collapses them)
print("=== WINDOW FUNCTIONS ===")

# Within each category, order rows from largest to smallest order amount.
# Rows that tie on total_amount share the same rank/dense_rank, while
# row_number still hands out distinct numbers in an unspecified order.
print("1. Ranking functions:")
window_spec = Window.partitionBy("category").orderBy(F.col("total_amount").desc())

ranked = df.select(
    "order_id",
    "customer",
    "category",
    "total_amount",
    F.row_number().over(window_spec).alias("row_number"),
    F.rank().over(window_spec).alias("rank"),
    F.dense_rank().over(window_spec).alias("dense_rank"),
)
ranked.show()

# Unordered partition: every row sees the aggregate over its whole category
print("\n2. Aggregate window functions:")
window_category = Window.partitionBy("category")

with_category_totals = df.select(
    "order_id",
    "customer",
    "category",
    "total_amount",
    F.sum("total_amount").over(window_category).alias("category_total"),
    F.avg("total_amount").over(window_category).alias("category_avg"),
    F.count("*").over(window_category).alias("category_count"),
)
with_category_totals.show()

# Each customer's orders in chronological order; reused for the running
# totals and for lag/lead below
window_customer = Window.partitionBy("customer").orderBy("date")

print("\n3. Running totals and moving averages:")
# Frame from the customer's first order up to and including the current row
up_to_current = window_customer.rowsBetween(Window.unboundedPreceding, Window.currentRow)
# Frame of previous, current and next row (fewer rows at the partition edges)
neighbours = window_customer.rowsBetween(-1, 1)

running = df.select(
    "order_id",
    "customer",
    "date",
    "total_amount",
    F.sum("total_amount").over(up_to_current).alias("running_total"),
    F.avg("total_amount").over(neighbours).alias("moving_avg_3"),
)
running.orderBy("customer", "date").show()

# lag/lead look one row back/ahead inside the customer's timeline and
# return null where no such row exists
print("\n4. Lag and Lead functions:")
shifted = df.select(
    "order_id",
    "customer",
    "date",
    "total_amount",
    F.lag("total_amount", 1).over(window_customer).alias("prev_order_amount"),
    F.lead("total_amount", 1).over(window_customer).alias("next_order_amount"),
)
shifted.orderBy("customer", "date").show()

# No partitionBy: the whole DataFrame forms a single ordered window. That is
# fine for this small sample, but it pulls all rows into one partition.
print("\n5. Percentile functions:")
window_all = Window.orderBy("total_amount")

percentiles = df.select(
    "order_id",
    "customer",
    "total_amount",
    F.percent_rank().over(window_all).alias("percent_rank"),
    F.ntile(4).over(window_all).alias("quartile"),
)
percentiles.orderBy("total_amount").show()


In [None]:
# Employee sample for the exercises, one tuple per row:
# (id, name, department, salary, hire_date, level)
employee_data = [
    (1, "Alice", "Engineering", 75000, "2020-01-15", "Manager"),
    (2, "Bob", "Sales", 65000, "2019-03-20", "Associate"),
    (3, "Charlie", "Engineering", 80000, "2018-06-10", "Senior"),
    (4, "Diana", "Marketing", 70000, "2021-02-28", "Manager"),
    (5, "Eve", "Sales", 68000, "2017-11-05", "Senior"),
    (6, "Frank", "Engineering", 82000, "2020-09-12", "Senior"),
    (7, "Grace", "Marketing", 72000, "2019-07-01", "Associate"),
    (8, "Henry", "Sales", 63000, "2021-04-15", "Associate"),
]

# Parse hire_date into a date, then derive tenure in (fractional) years.
# years_of_service is measured against current_date(), so its values change
# depending on the day the notebook is run.
emp_df = (
    spark.createDataFrame(
        employee_data,
        ["id", "name", "department", "salary", "hire_date", "level"],
    )
    .withColumn("hire_date", F.to_date(F.col("hire_date"), "yyyy-MM-dd"))
    .withColumn(
        "years_of_service",
        F.datediff(F.current_date(), F.col("hire_date")) / 365.25,
    )
)

print("Employee data for exercises:")
emp_df.show()

# Each exercise prompt is printed one string per line (sep="\n")
print(
    "\n=== EXERCISE 1: Department Analysis ===",
    "TODO: Create a department analysis report with:",
    "1. Number of employees per department",
    "2. Average salary per department",
    "3. Min and max salary per department",
    "4. Standard deviation of salaries per department",
    sep="\n",
)

# Your code here:
# dept_analysis = emp_df.groupBy("department").agg(...)

print(
    "\n=== EXERCISE 2: Salary Bands ===",
    "TODO: Create salary bands and analyze distribution:",
    "1. Create bands: <60k, 60-70k, 70-80k, >80k",
    "2. Count employees in each band",
    "3. Show average years of service per band",
    sep="\n",
)

# Your code here:
# salary_bands = emp_df.withColumn("salary_band", ...)

print(
    "\n=== EXERCISE 3: Window Functions ===",
    "TODO: Use window functions to:",
    "1. Rank employees by salary within each department",
    "2. Calculate running average salary by hire date",
    "3. Find salary percentile for each employee",
    sep="\n",
)

# Your code here:
# window_dept = Window.partitionBy("department").orderBy(F.desc("salary"))

print(
    "\n=== EXERCISE 4: Advanced Grouping ===",
    "TODO: Create a pivot table showing:",
    "1. Departments as rows",
    "2. Levels as columns",
    "3. Average salary as values",
    sep="\n",
)

# Your code here:
# pivot_table = emp_df.groupBy("department").pivot("level").agg(...)

print(
    "\n=== EXERCISE 5: Time-based Analysis ===",
    "TODO: Analyze hiring trends:",
    "1. Group by hire year and count employees",
    "2. Calculate average salary by hire year",
    "3. Show cumulative hiring count over time",
    sep="\n",
)

# Your code here:
# emp_with_year = emp_df.withColumn("hire_year", F.year("hire_date"))
