In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import *

# Obtain (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("SurveyData")
    .getOrCreate()
)

# Column names, in the same order as the fields of each record below
columns = ["name", "job_satisfaction", "country"]

# Survey records: (respondent name, job satisfaction score, country as entered).
# Note that the country is spelled several ways ("USA", "US", "United States").
data = [
    ("Alex", 4, "USA"),
    ("Saurabh", 5, "US"),
    ("Mark", 4, "United States"),
    ("Shane", 4, "USA"),
    ("Kim", 5, "United States"),
    ("Joe", 5, "USA"),
    ("Mira", 5, "United States"),
    ("John", 3, "USA"),
    ("Jane", 4, "United States"),
    ("Sam", 3, "US"),
    ("Sara", 4, "USA"),
    ("Luis", 5, "United States"),
    ("Carlos", 4, "US"),
    ("Anna", 3, "USA"),
    ("Maria", 5, "United States"),
]

# Build the DataFrame; only names are supplied, so Spark infers the column types
survey_df = spark.createDataFrame(data, schema=columns)

# Display the survey rows
survey_df.show()


+-------+----------------+-------------+
|   name|job_satisfaction|      country|
+-------+----------------+-------------+
|   Alex|               4|          USA|
|Saurabh|               5|           US|
|   Mark|               4|United States|
|  Shane|               4|          USA|
|    Kim|               5|United States|
|    Joe|               5|          USA|
|   Mira|               5|United States|
|   John|               3|          USA|
|   Jane|               4|United States|
|    Sam|               3|           US|
|   Sara|               4|          USA|
|   Luis|               5|United States|
| Carlos|               4|           US|
|   Anna|               3|          USA|
|  Maria|               5|United States|
+-------+----------------+-------------+



In [0]:
# Goal: for every job_satisfaction score, report the country value that was
# entered most often together with the total number of responses for that score.
#
# NOTE: `count`, `sum`, `max` and `col` used below are the
# pyspark.sql.functions column functions pulled in by the wildcard import at
# the top of the notebook -- `sum` and `max` are NOT Python's builtins here.

# Step 1: Create the CTE equivalent by counting respondents per
# (job_satisfaction, country) pair
cte_survey = survey_df.groupBy("job_satisfaction", "country").agg(
    count("*").alias("number_of_respondents")
)

# Step 2: Add the total_response and max_response columns using window
# functions computed over all rows sharing the same job_satisfaction
window_spec = Window.partitionBy("job_satisfaction")

survey_with_responses = (
    cte_survey
    .withColumn("total_response", sum("number_of_respondents").over(window_spec))
    .withColumn("max_response", max("number_of_respondents").over(window_spec))
)

# Step 3: Keep only the rows whose count equals the per-score maximum.
# If several country values tie for the maximum, all of them are kept.
#
# The sort is applied BEFORE the projection because `number_of_respondents`
# is not part of the final column list; sorting after `select` would depend on
# Spark implicitly re-adding the dropped column. `job_satisfaction` and
# `country` act as tie-breakers so the row order is deterministic.
result_df = (
    survey_with_responses
    .filter(col("max_response") == col("number_of_respondents"))
    .orderBy("number_of_respondents", "job_satisfaction", "country")
    .select("job_satisfaction", "country", "total_response")
)

# Show the final result
result_df.show()


+----------------+-------------+--------------+
|job_satisfaction|      country|total_response|
+----------------+-------------+--------------+
|               3|          USA|             3|
|               4|          USA|             6|
|               5|United States|             6|
+----------------+-------------+--------------+

