In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, desc

# Step 1: Create a SparkSession
spark = SparkSession.builder \
    .appName("MostViewedURL") \
    .master("local[*]") \
    .getOrCreate()

# Step 2: Create a DataFrame with sample log data
data = [
    (1, "http://example.com/home"),
    (2, "http://example.com/about"),
    (3, "http://example.com/home"),
    (4, "http://example.com/contact"),
    (5, "http://example.com/home"),
    (6, "http://example.com/about"),
    (7, "http://example.com/contact"),
    (8, "http://example.com/home")
]
columns = ["UserID", "URL"]
logs_df = spark.createDataFrame(data, columns)

# Step 3: Count the number of views for each URL
url_counts_df = logs_df.groupBy("URL").agg(count("URL").alias("ViewCount"))

# Step 4: Sort the URLs by the number of views in descending order
sorted_url_counts_df = url_counts_df.orderBy(desc("ViewCount"))

# Step 5: Show the most viewed URLs
sorted_url_counts_df.show()

# Stop the SparkSession
spark.stop()

+--------------------+---------+
|                 URL|ViewCount|
+--------------------+---------+
|http://example.co...|        4|
|http://example.co...|        2|
|http://example.co...|        2|
+--------------------+---------+

