In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType

# Create SparkSession (only needed if running outside Databricks)
spark = SparkSession.builder.appName("PySparkTables").getOrCreate()

# Define schema
schema = StructType([
    StructField("account_id", IntegerType(), True),
    StructField("income", IntegerType(), True)
])

# Create DataFrame
data = [
    (3, 108939),
    (2, 12747),
    (8, 87709),
    (6, 91796)
]

df = spark.createDataFrame(data, schema=schema)

# Create a temporary view for SQL queries
df.createOrReplaceTempView("AccountsView")

print("Tables and views created successfully.")


Tables and views created successfully.


In [8]:
spark.sql("""
    WITH cte AS (
    SELECT *,
        CASE
            WHEN income < 20000 THEN 'Low Salary'
            WHEN income BETWEEN 20000 AND 50000 THEN 'Average Salary'
            WHEN income > 50000 THEN 'High Salary'
        END AS category
    FROM AccountsView)
""").show()

AnalysisException: grouping expressions sequence is empty, and 'cte.category' is not an aggregate function. Wrap '(count(1) AS `count(1)`)' in windowing function(s) or wrap 'cte.category' in first() (or first_value) if you don't care which value you get.;
WithCTE
:- CTERelationDef 3, false
:  +- SubqueryAlias cte
:     +- Project [account_id#0, income#1, CASE WHEN (income#1 < 20000) THEN Low Salary WHEN ((income#1 >= 20000) AND (income#1 <= 50000)) THEN Average Salary WHEN (income#1 > 50000) THEN High Salary END AS category#63]
:        +- SubqueryAlias accountsview
:           +- View (`AccountsView`, [account_id#0,income#1])
:              +- LogicalRDD [account_id#0, income#1], false
+- Aggregate [category#63, count(1) AS count(1)#65L]
   +- Filter (category#63 = High Salary)
      +- SubqueryAlias cte
         +- CTERelationRef 3, true, [account_id#0, income#1, category#63]
