# Spark Data Processing Example

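This notebook demonstrates a basic PySpark workflow for large-scale data processing: starting a Spark session, building a DataFrame, aggregating it, and (optionally) saving the results.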

In [None]:
from {{PROJECT_NAME}}.spark_utils import SparkSessionManager, get_local_spark_session
from {{PROJECT_NAME}}.data_utils import read_data, write_data

from pyspark.sql import functions as F
from pyspark.sql.types import *

## Initialize Spark Session

In [None]:
# Option 1: session scoped to a `with` block via the project's context manager
# NOTE(review): presumably the manager tears the session down on exit — confirm in spark_utils.
with SparkSessionManager(app_name="{{PROJECT_NAME}}_notebook") as spark:
    # One print call, one line per fact about the active session.
    print(
        f"Spark version: {spark.version}",
        f"Spark master: {spark.sparkContext.master}",
        sep="\n",
    )

In [None]:
# Option 2: Direct session creation
spark = get_local_spark_session("{{PROJECT_NAME}}_analysis")
# Report the UI address the running context actually bound to instead of
# assuming port 4040: Spark falls back to 4041, 4042, ... when that port is taken.
print(f"Spark UI: {spark.sparkContext.uiWebUrl}")

## Load and Process Data

In [None]:
# To work on a real dataset instead, swap in the project reader:
# df = read_data('../data/raw/your_data.parquet', spark=spark)

# Small in-memory stand-in dataset; each row is (id, name, age, department).
sample_data = [
    (1, "Alice", 25, "Engineering"),
    (2, "Bob", 30, "Marketing"),
    (3, "Charlie", 35, "Engineering"),
    (4, "Diana", 28, "Sales"),
]

# Explicit schema: every column is nullable and the field order mirrors the
# tuple layout of the rows above.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("age", IntegerType()),
        ("department", StringType()),
    )
])

df = spark.createDataFrame(sample_data, schema=schema)
df.show()

## Data Transformations

In [None]:
# Per-department summary: headcount plus average, minimum and maximum age.
result = (
    df
    .groupBy("department")
    .agg(
        F.count("*").alias("employee_count"),
        F.avg(F.col("age")).alias("avg_age"),
        F.min(F.col("age")).alias("min_age"),
        F.max(F.col("age")).alias("max_age"),
    )
)

result.show()

## Save Results

In [None]:
# Save processed data.
# The call below is left disabled, so running this cell writes nothing to disk.
# Uncomment it to persist the aggregated `result` DataFrame:
# write_data(result, '../data/processed/department_stats.parquet')

## Cleanup

In [None]:
# Stop the Spark session bound to `spark`.
# Re-run the session-creation cell before executing any Spark cell again.
spark.stop()