In [None]:
# Reference: https://www.youtube.com/watch?v=P6kNMyqKD0A

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Start (or reuse) the Spark session named "PySparkTables".
spark = SparkSession.builder.appName("PySparkTables").getOrCreate()

# Column layout of the entries data: every field is nullable, and only
# `floor` is an integer -- the rest are strings.
entries_schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in [
            ("name", StringType()),
            ("address", StringType()),
            ("email", StringType()),
            ("floor", IntegerType()),
            ("resources", StringType()),
        ]
    ]
)

# Sample rows in schema order: (name, address, email, floor, resources).
entries_data = [
    ("A", "Bangalore", "A@gmail.com", 1, "CPU"),
    ("A", "Bangalore", "A1@gmail.com", 1, "CPU"),
    ("A", "Bangalore", "A2@gmail.com", 1, "DESKTOP"),
    ("B", "Bangalore", "B@gmail.com", 2, "DESKTOP"),
    ("B", "Bangalore", "B1@gmail.com", 2, "DESKTOP"),
    ("B", "Bangalore", "B2@gmail.com", 1, "MONITOR"),
]

# Build the DataFrame against the explicit schema above.
entries_df = spark.createDataFrame(entries_data, schema=entries_schema)

# Register the DataFrame under the name "entries" so spark.sql() can query it.
entries_df.createOrReplaceTempView("entries")

# NOTE(review): no persistent (e.g. Delta) table is written in this cell --
# only the temp view exists, even though the message below mentions a table.
print("entries table and view created successfully.")


entries table and view created successfully.


In [15]:
# Spark SQL summary over the "entries" temp view. Two CTEs are declared:
#   cte  - visit count per (name, floor), ranked within each name by that count
#   cte2 - total rows per name plus the distinct resources joined with ", "
# NOTE(review): only cte2 is selected from; cte is currently unused by the final SELECT.
entries_summary_sql = """
with cte as (
    select name, floor, count(floor) as floor_visit_count,
    rank() over (partition by name order by count(floor) desc) as rn
    from entries
    group by name,floor),
    
    cte2 as (
    select name, count(*) as total_visits,
    ARRAY_JOIN(COLLECT_SET(resources), ', ') AS resource_list 
    from entries group by name)
    
    select * from cte2
    

"""

spark.sql(entries_summary_sql).show()

+----+------------+----------------+
|name|total_visits|   resource_list|
+----+------------+----------------+
|   A|           3|    DESKTOP, CPU|
|   B|           3|DESKTOP, MONITOR|
+----+------------+----------------+



In [26]:
from pyspark.sql.functions import col, count, collect_set, array_join, rank
from pyspark.sql.window import Window
# Step 1: count how many entries each person has on each floor.
floor_visit_counts = entries_df.groupBy("name", "floor").agg(
    count("floor").alias("floor_visit_count")
)

# Step 2: within each name, rank floors from most to least visited
# (rank() gives tied counts the same rank).
window_spec = Window.partitionBy("name").orderBy(col("floor_visit_count").desc())

floor_visit_counts.withColumn("rn", rank().over(window_spec)).show()



+----+-----+-----------------+---+
|name|floor|floor_visit_count| rn|
+----+-----+-----------------+---+
|   A|    1|                3|  1|
|   B|    2|                2|  1|
|   B|    1|                1|  2|
+----+-----+-----------------+---+



In [27]:
# Per-name summary: number of rows (visits) and the distinct resources
# collapsed into a single ", "-separated string.
visit_total = count("*").alias("total_visits")
resource_names = array_join(collect_set("resources"), ", ").alias("resource_list")  # Unique resources

cte2 = entries_df.groupBy("name").agg(visit_total, resource_names)

In [28]:
# Display the per-name summary (total_visits, resource_list) built in the previous cell.
cte2.show()

+----+------------+----------------+
|name|total_visits|   resource_list|
+----+------------+----------------+
|   A|           3|    DESKTOP, CPU|
|   B|           3|DESKTOP, MONITOR|
+----+------------+----------------+

