**Intialize the SparkSession**

In [0]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
      .appName("web-traffic")\
      .getOrCreate()
spark

**Create Dataframe**

In [0]:
from pyspark.sql import Row
web_data = [
    Row(UserID=1, Page="Home", Timestamp="2024-04-10 10:00:00", Duration=35, Device="Mobile", Country="India"),
    Row(UserID=2, Page="Products", Timestamp="2024-04-10 10:02:00", Duration=120, Device="Desktop", Country="USA"),
    Row(UserID=3, Page="Cart", Timestamp="2024-04-10 10:05:00", Duration=45, Device="Tablet", Country="UK"),
    Row(UserID=1, Page="Checkout", Timestamp="2024-04-10 10:08:00", Duration=60, Device="Mobile", Country="India"),
    Row(UserID=4, Page="Home", Timestamp="2024-04-10 10:10:00", Duration=15, Device="Mobile", Country="Canada"),
    Row(UserID=2, Page="Contact", Timestamp="2024-04-10 10:15:00", Duration=25, Device="Desktop", Country="USA"),
    Row(UserID=5, Page="Products", Timestamp="2024-04-10 10:20:00", Duration=90, Device="Desktop", Country="India"),
]
df_web = spark.createDataFrame(web_data)
df_web.show(truncate=False)
df_web.printSchema()

+------+--------+-------------------+--------+-------+-------+
|UserID|Page    |Timestamp          |Duration|Device |Country|
+------+--------+-------------------+--------+-------+-------+
|1     |Home    |2024-04-10 10:00:00|35      |Mobile |India  |
|2     |Products|2024-04-10 10:02:00|120     |Desktop|USA    |
|3     |Cart    |2024-04-10 10:05:00|45      |Tablet |UK     |
|1     |Checkout|2024-04-10 10:08:00|60      |Mobile |India  |
|4     |Home    |2024-04-10 10:10:00|15      |Mobile |Canada |
|2     |Contact |2024-04-10 10:15:00|25      |Desktop|USA    |
|5     |Products|2024-04-10 10:20:00|90      |Desktop|India  |
+------+--------+-------------------+--------+-------+-------+

root
 |-- UserID: long (nullable = true)
 |-- Page: string (nullable = true)
 |-- Timestamp: string (nullable = true)
 |-- Duration: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Country: string (nullable = true)



**Data Exploration & Preparation**

In [0]:
#1.Display the schema
print("schema of web traffic:")
df_web.printSchema()
#2.Convert Timestamp to proper timestamp type
from pyspark.sql.functions import to_timestamp,minute
df_web = df_web.withColumn("Timestamp", to_timestamp("Timestamp", "yyyy-MM-dd HH:mm:ss"))
df_web.printSchema()
#3.Add SessionMinute column
df_web = df_web.withColumn("SessionMinute", minute("Timestamp"))
df_web.select("UserID", "Page", "Timestamp", "SessionMinute").show()

schema of web traffic:
root
 |-- UserID: long (nullable = true)
 |-- Page: string (nullable = true)
 |-- Timestamp: string (nullable = true)
 |-- Duration: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Country: string (nullable = true)

root
 |-- UserID: long (nullable = true)
 |-- Page: string (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- Duration: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Country: string (nullable = true)

+------+--------+-------------------+-------------+
|UserID|    Page|          Timestamp|SessionMinute|
+------+--------+-------------------+-------------+
|     1|    Home|2024-04-10 10:00:00|            0|
|     2|Products|2024-04-10 10:02:00|            2|
|     3|    Cart|2024-04-10 10:05:00|            5|
|     1|Checkout|2024-04-10 10:08:00|            8|
|     4|    Home|2024-04-10 10:10:00|           10|
|     2| Contact|2024-04-10 10:15:00|           15|
|     5|Products|2024-04-10 10:20:00| 

**Filtering and Conditions**

In [0]:
#4.Filter users who used a "Mobile" device and visited the "Checkout" page
from pyspark.sql.functions import col
print("Mobile users who visited the Checkout page:")
df_web.filter((col("Device") == "Mobile") & (col("Page") == "Checkout")).show()
#5.Show all entries with a Duration greater than 60 seconds
print("Entries with a Duration greater than 60 seconds:")
df_web.filter(col("Duration") > 60).show()
#6.Find all users from India who visited the "Products" page
print("Users from India who visited the Products page:")
df_web.filter((col("Country") == "India") & (col("Page") == "Products")).show()

Mobile users who visited the Checkout page:
+------+--------+-------------------+--------+------+-------+-------------+
|UserID|    Page|          Timestamp|Duration|Device|Country|SessionMinute|
+------+--------+-------------------+--------+------+-------+-------------+
|     1|Checkout|2024-04-10 10:08:00|      60|Mobile|  India|            8|
+------+--------+-------------------+--------+------+-------+-------------+

Entries with a Duration greater than 60 seconds:
+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|
+------+--------+-------------------+--------+-------+-------+-------------+
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|            2|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|           20|
+------+--------+-------------------+--------+-------+-------+-------------+

Users from India who visited the Products page:
+------+-------

**Aggregation and Grouping**

In [0]:
#7.Get the average duration per device type
from pyspark.sql.functions import avg
print("Average duration per device type:")
df_web.groupBy("Device").agg(avg("Duration").alias("AverageDuration")).show()
#8.Count the number of sessions per country
from pyspark.sql.functions import countDistinct
print("Number of sessions per country:")
df_web.groupBy("Country").agg(countDistinct("UserID").alias("Sessions")).show()
#9.Find the most visited page overall
from pyspark.sql.functions import desc
print("Most visited page overall:")
df_web.groupBy("Page").count().orderBy(desc("count")).limit(1).show()

Average duration per device type:
+-------+------------------+
| Device|   AverageDuration|
+-------+------------------+
| Mobile|36.666666666666664|
| Tablet|              45.0|
|Desktop| 78.33333333333333|
+-------+------------------+

Number of sessions per country:
+-------+--------+
|Country|Sessions|
+-------+--------+
|  India|       2|
|    USA|       1|
|     UK|       1|
| Canada|       1|
+-------+--------+

Most visited page overall:
+----+-----+
|Page|count|
+----+-----+
|Home|    2|
+----+-----+



**Window Functions**

In [0]:
#10.Rank each user’s pages by timestamp (oldest to newest)
from pyspark.sql.functions import dense_rank
from pyspark.sql.window import Window
w= Window.partitionBy("UserID").orderBy("Timestamp")
ranked = df_web.withColumn("PageRank", dense_rank().over(w))
print("Ranked pages by timestamp:")
ranked.select("UserID", "Page", "Timestamp", "PageRank").show()
#11.Find the total duration of all sessions per user using groupBy
print("Total duration of all sessions per user:")
df_web.groupBy("UserID").sum("Duration").withColumnRenamed("sum(Duration)", "TotalDuration").show()

Ranked pages by timestamp:
+------+--------+-------------------+--------+
|UserID|    Page|          Timestamp|PageRank|
+------+--------+-------------------+--------+
|     1|    Home|2024-04-10 10:00:00|       1|
|     1|Checkout|2024-04-10 10:08:00|       2|
|     2|Products|2024-04-10 10:02:00|       1|
|     2| Contact|2024-04-10 10:15:00|       2|
|     3|    Cart|2024-04-10 10:05:00|       1|
|     4|    Home|2024-04-10 10:10:00|       1|
|     5|Products|2024-04-10 10:20:00|       1|
+------+--------+-------------------+--------+

Total duration of all sessions per user:
+------+-------------+
|UserID|TotalDuration|
+------+-------------+
|     1|           95|
|     3|           45|
|     2|          145|
|     4|           15|
|     5|           90|
+------+-------------+



**Spark SQL Tasks**

In [0]:
#12.Create a temporary view called traffic_view
df_web.createOrReplaceTempView("traffic_view")
#13.Write a SQL query to get the top 2 longest sessions by duration
print("Top 2 longest sessions by duration:")
spark.sql("""
SELECT * FROM (
  SELECT *, DENSE_RANK() OVER (ORDER BY Duration DESC) as rank
  FROM traffic_view
) WHERE rank <= 2
""").show()
#14.Get the number of unique users per page using SQL
print("Number of unique users per page:") 
spark.sql("SELECT Page, COUNT(DISTINCT UserID) AS UniqueUsers FROM traffic_view GROUP BY Page").show()

Top 2 longest sessions by duration:
+------+--------+-------------------+--------+-------+-------+-------------+----+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|rank|
+------+--------+-------------------+--------+-------+-------+-------------+----+
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|            2|   1|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|           20|   2|
+------+--------+-------------------+--------+-------+-------+-------------+----+

Number of unique users per page:
+--------+-----------+
|    Page|UniqueUsers|
+--------+-----------+
|    Cart|          1|
|    Home|          2|
|Checkout|          1|
|Products|          2|
| Contact|          1|
+--------+-----------+



**Export & Save**

In [0]:
#15.Save the final DataFrame to CSV
df_web.write.mode("overwrite").option("header", True).csv("/mnt/csv/web_traffic_output")
#16.Save partitioned by Country in Parquet format
df_web.write.mode("overwrite").partitionBy("Country").parquet("/mnt/parquet/web_traffic_partitioned")

In [0]:
#diaplay the saved file
display(dbutils.fs.ls("/mnt/csv/web_traffic_output"))
display(dbutils.fs.ls("/mnt/parquet/web_traffic_partitioned"))

path,name,size,modificationTime
dbfs:/mnt/csv/web_traffic_output/_SUCCESS,_SUCCESS,0,1749703513000
dbfs:/mnt/csv/web_traffic_output/_committed_4489892982662196564,_committed_4489892982662196564,380,1749703513000
dbfs:/mnt/csv/web_traffic_output/_started_4489892982662196564,_started_4489892982662196564,0,1749703513000
dbfs:/mnt/csv/web_traffic_output/part-00000-tid-4489892982662196564-ed2806e8-e133-44fd-aef6-cf1e640d39e2-191-1-c000.csv,part-00000-tid-4489892982662196564-ed2806e8-e133-44fd-aef6-cf1e640d39e2-191-1-c000.csv,110,1749703513000
dbfs:/mnt/csv/web_traffic_output/part-00001-tid-4489892982662196564-ed2806e8-e133-44fd-aef6-cf1e640d39e2-192-1-c000.csv,part-00001-tid-4489892982662196564-ed2806e8-e133-44fd-aef6-cf1e640d39e2-192-1-c000.csv,161,1749703513000
dbfs:/mnt/csv/web_traffic_output/part-00002-tid-4489892982662196564-ed2806e8-e133-44fd-aef6-cf1e640d39e2-193-1-c000.csv,part-00002-tid-4489892982662196564-ed2806e8-e133-44fd-aef6-cf1e640d39e2-193-1-c000.csv,166,1749703513000
dbfs:/mnt/csv/web_traffic_output/part-00003-tid-4489892982662196564-ed2806e8-e133-44fd-aef6-cf1e640d39e2-194-1-c000.csv,part-00003-tid-4489892982662196564-ed2806e8-e133-44fd-aef6-cf1e640d39e2-194-1-c000.csv,169,1749703513000


path,name,size,modificationTime
dbfs:/mnt/parquet/web_traffic_partitioned/Country=Canada/,Country=Canada/,0,1749703514000
dbfs:/mnt/parquet/web_traffic_partitioned/Country=India/,Country=India/,0,1749703513000
dbfs:/mnt/parquet/web_traffic_partitioned/Country=UK/,Country=UK/,0,1749703513000
dbfs:/mnt/parquet/web_traffic_partitioned/Country=USA/,Country=USA/,0,1749703513000
dbfs:/mnt/parquet/web_traffic_partitioned/_SUCCESS,_SUCCESS,0,1749703514000
