In [0]:
from pyspark.sql import SparkSession
# Obtain the notebook's Spark session: reuse an existing one if present,
# otherwise create one under the application name "Exercise_1".
spark = (
    SparkSession.builder
    .appName("Exercise_1")
    .getOrCreate()
)
# Bare expression so the notebook renders the session object.
spark

<pyspark.sql.connect.session.SparkSession at 0x7f1257ee39d0>

# Dataset: web_traffic_data

In [0]:
from datetime import datetime 
from pyspark.sql import Row 
# Field names shared by every sample record, in the order they are displayed.
_web_fields = ("UserID", "Page", "Timestamp", "Duration", "Device", "Country")

# One tuple per page visit. Timestamp is still a plain string at this point;
# it is converted to a timestamp type in a later cell.
_web_records = [
    (1, "Home", "2024-04-10 10:00:00", 35, "Mobile", "India"),
    (2, "Products", "2024-04-10 10:02:00", 120, "Desktop", "USA"),
    (3, "Cart", "2024-04-10 10:05:00", 45, "Tablet", "UK"),
    (1, "Checkout", "2024-04-10 10:08:00", 60, "Mobile", "India"),
    (4, "Home", "2024-04-10 10:10:00", 15, "Mobile", "Canada"),
    (2, "Contact", "2024-04-10 10:15:00", 25, "Desktop", "USA"),
    (5, "Products", "2024-04-10 10:20:00", 90, "Desktop", "India"),
]

# Build keyword-argument Rows so the column names travel with the data.
web_data = [Row(**dict(zip(_web_fields, record))) for record in _web_records]

web_traffic_df = spark.createDataFrame(web_data)
web_traffic_df.show(truncate=False)

+------+--------+-------------------+--------+-------+-------+
|UserID|Page    |Timestamp          |Duration|Device |Country|
+------+--------+-------------------+--------+-------+-------+
|1     |Home    |2024-04-10 10:00:00|35      |Mobile |India  |
|2     |Products|2024-04-10 10:02:00|120     |Desktop|USA    |
|3     |Cart    |2024-04-10 10:05:00|45      |Tablet |UK     |
|1     |Checkout|2024-04-10 10:08:00|60      |Mobile |India  |
|4     |Home    |2024-04-10 10:10:00|15      |Mobile |Canada |
|2     |Contact |2024-04-10 10:15:00|25      |Desktop|USA    |
|5     |Products|2024-04-10 10:20:00|90      |Desktop|India  |
+------+--------+-------------------+--------+-------+-------+



## PySpark Exercises – Set 5 (Web Traffic Data)
### Data Exploration & Preparation

1. Display the schema of web_traffic_data.

In [0]:
# Print each column's name, inferred type and nullability as a tree.
web_traffic_df.printSchema()

root
 |-- UserID: long (nullable = true)
 |-- Page: string (nullable = true)
 |-- Timestamp: string (nullable = true)
 |-- Duration: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Country: string (nullable = true)



2. Convert the Timestamp column to a proper timestamp type.

In [0]:
from pyspark.sql.functions import to_timestamp, col
# Parse the string-valued Timestamp column into a timestamp type and
# overwrite the column under the same name.
parsed_timestamp = to_timestamp(col("Timestamp"))
web_traffic_df = web_traffic_df.withColumn("Timestamp", parsed_timestamp)

# Confirm the column type changed.
web_traffic_df.printSchema()

root
 |-- UserID: long (nullable = true)
 |-- Page: string (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- Duration: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Country: string (nullable = true)



3. Add a new column SessionMinute by extracting the minute from the Timestamp

In [0]:
from pyspark.sql.functions import minute
# Minute-of-hour component of each visit's Timestamp.
session_minute = minute(col("Timestamp"))

# Attach it as a new SessionMinute column and display the result.
web_traffic_df = web_traffic_df.withColumn("SessionMinute", session_minute)
web_traffic_df.show()

+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|
+------+--------+-------------------+--------+-------+-------+-------------+
|     1|    Home|2024-04-10 10:00:00|      35| Mobile|  India|            0|
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|            2|
|     3|    Cart|2024-04-10 10:05:00|      45| Tablet|     UK|            5|
|     1|Checkout|2024-04-10 10:08:00|      60| Mobile|  India|            8|
|     4|    Home|2024-04-10 10:10:00|      15| Mobile| Canada|           10|
|     2| Contact|2024-04-10 10:15:00|      25|Desktop|    USA|           15|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|           20|
+------+--------+-------------------+--------+-------+-------+-------------+



### Filtering and Conditions

4.  Filter users who used a "Mobile" device and visited the "Checkout" page.

In [0]:
# Row-level predicates, named so the filter reads like the exercise text.
on_mobile = col("Device") == "Mobile"
at_checkout = col("Page") == "Checkout"

# Keep only visits satisfying both predicates and show the identifying columns.
(
    web_traffic_df
    .where(on_mobile & at_checkout)
    .select("UserID", "Timestamp", "Duration", "Country")
    .show()
)

+------+-------------------+--------+-------+
|UserID|          Timestamp|Duration|Country|
+------+-------------------+--------+-------+
|     1|2024-04-10 10:08:00|      60|  India|
+------+-------------------+--------+-------+



5. Show all entries with a Duration greater than 60 seconds.

In [0]:
from  pyspark.sql.functions import second
# Exclusive lower bound on Duration: rows equal to the threshold are dropped.
duration_threshold = 60
web_traffic_df.where(col("Duration") > duration_threshold).show()

+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|
+------+--------+-------------------+--------+-------+-------+-------------+
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|            2|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|           20|
+------+--------+-------------------+--------+-------+-------+-------------+



6. Find all users from India who visited the "Products" page

In [0]:
# Row-level predicates for the two conditions in the exercise.
from_india = col("Country") == "India"
on_products_page = col("Page") == "Products"

# Visits matching both conditions, with the columns that identify them.
(
    web_traffic_df
    .where(from_india & on_products_page)
    .select("UserID", "Timestamp", "Duration", "Device")
    .show()
)

+------+-------------------+--------+-------+
|UserID|          Timestamp|Duration| Device|
+------+-------------------+--------+-------+
|     5|2024-04-10 10:20:00|      90|Desktop|
+------+-------------------+--------+-------+



### Aggregation and Grouping

7. Get the average duration per device type.

In [0]:
from pyspark.sql.functions import avg, round
# Mean Duration rounded to two decimals.
# NOTE: `round` is the Spark column function imported in this cell, which
# shadows Python's builtin `round` from here on.
mean_duration = round(avg(col("Duration")), 2).alias("AverageDuration")

# One output row per device type.
(
    web_traffic_df
    .groupBy(col("Device"))
    .agg(mean_duration)
    .show()
)

+-------+---------------+
| Device|AverageDuration|
+-------+---------------+
| Mobile|          36.67|
|Desktop|          78.33|
| Tablet|           45.0|
+-------+---------------+



8. Count the number of sessions per country.

In [0]:
# Count rows (one row = one session) per country.
#
# GroupedData.count() names its output column "count" (lower-case).
# withColumnRenamed silently does nothing when the source name does not match,
# so rename by the exact name rather than relying on case-insensitive
# column resolution being enabled.
sessions_per_country = (
    web_traffic_df
    .groupBy(col("Country"))
    .count()
    .withColumnRenamed("count", "TotalSessions")
)
sessions_per_country.show()

+-------+-------------+
|Country|TotalSessions|
+-------+-------------+
|  India|            3|
|    USA|            2|
|     UK|            1|
| Canada|            1|
+-------+-------------+



9.  Find the most visited page overall.

In [0]:
from pyspark.sql.functions import count
# Number of visits recorded for each page.
page_visit_counts = (
    web_traffic_df
    .groupBy("Page")
    .agg(count("*").alias("VisitCount"))
)

# Show the single most visited page.
# Several pages can share the highest count; sorting on VisitCount alone would
# then return an arbitrary one of them. Page name is used as a secondary key
# so the row shown is the same on every run.
(
    page_visit_counts
    .orderBy(col("VisitCount").desc(), col("Page").asc())
    .withColumnRenamed("Page", "MostVisitedPage")
    .show(1)
)

+---------------+----------+
|MostVisitedPage|VisitCount|
+---------------+----------+
|           Home|         2|
+---------------+----------+
only showing top 1 row


###  Window Functions

10.  Rank each user’s pages by timestamp (oldest to newest).

In [0]:
from pyspark.sql import Window
from pyspark.sql.functions import rank
# Window specification (not a DataFrame, despite the name): one partition per
# user, rows ordered by Timestamp ascending.
user_df = (
    Window
    .partitionBy("UserID")
    .orderBy("Timestamp")
)

# Rank each visit within its user's partition; rank 1 is the earliest visit.
rank_df = web_traffic_df.withColumn("TimeStampRank", rank().over(user_df))

# Columns to display, in output order.
ranked_columns = ["UserID", "Page", "Timestamp", "Duration", "TimeStampRank", "Country"]
rank_df.select(*ranked_columns).show()

+------+--------+-------------------+--------+-------------+-------+
|UserID|    Page|          Timestamp|Duration|TimeStampRank|Country|
+------+--------+-------------------+--------+-------------+-------+
|     1|    Home|2024-04-10 10:00:00|      35|            1|  India|
|     1|Checkout|2024-04-10 10:08:00|      60|            2|  India|
|     2|Products|2024-04-10 10:02:00|     120|            1|    USA|
|     2| Contact|2024-04-10 10:15:00|      25|            2|    USA|
|     3|    Cart|2024-04-10 10:05:00|      45|            1|     UK|
|     4|    Home|2024-04-10 10:10:00|      15|            1| Canada|
|     5|Products|2024-04-10 10:20:00|      90|            1|  India|
+------+--------+-------------------+--------+-------------+-------+



11. Find the total duration of all sessions per user using groupBy.

In [0]:
from pyspark.sql.functions import sum
# Sum of Duration across a user's visits.
# NOTE: `sum` is the Spark aggregate imported in this cell, which shadows
# Python's builtin `sum` from here on.
total_duration = sum("Duration").alias("TotalDuration")

# One output row per user.
(
    web_traffic_df
    .groupBy(col("UserID"))
    .agg(total_duration)
    .show()
)

+------+-------------+
|UserID|TotalDuration|
+------+-------------+
|     1|           95|
|     2|          145|
|     3|           45|
|     4|           15|
|     5|           90|
+------+-------------+



### Spark SQL Tasks

12. Create a temporary view called traffic_view.

In [0]:
# Register the DataFrame as the temporary view "traffic_view" so the SQL cells
# below can query it; any existing view with that name is replaced.
web_traffic_df.createOrReplaceTempView("traffic_view")

13.  Write a SQL query to get the top 2 longest sessions by duration.

In [0]:
# Top 2 sessions by Duration.
#
# The ORDER BY inside RANK() OVER(...) only defines how the rank is computed;
# it does not order the result set. Without a query-level ORDER BY, LIMIT 2
# may return any two rows, so the result is sorted explicitly before limiting.
top_sessions_query = """
            SELECT 
                UserID,
                Timestamp,
                Duration, 
                Country,
                RANK() OVER(ORDER BY Duration DESC) AS LongestDuration
            FROM traffic_view
            ORDER BY Duration DESC
            LIMIT 2
"""
spark.sql(top_sessions_query).show()

+------+-------------------+--------+-------+---------------+
|UserID|          Timestamp|Duration|Country|LongestDuration|
+------+-------------------+--------+-------+---------------+
|     2|2024-04-10 10:02:00|     120|    USA|              1|
|     5|2024-04-10 10:20:00|      90|  India|              2|
+------+-------------------+--------+-------+---------------+



14. Get the number of unique users per page using SQL

In [0]:
# Distinct users seen on each page, pages with the most users first.
unique_users_query = """
            SELECT
              Page,
              COUNT(DISTINCT UserID) AS UniqueUsers
            FROM traffic_view
            GROUP BY Page
            ORDER BY UniqueUsers DESC
"""
unique_users_df = spark.sql(unique_users_query)
unique_users_df.show()

+--------+-----------+
|    Page|UniqueUsers|
+--------+-----------+
|Products|          2|
|    Home|          2|
|Checkout|          1|
| Contact|          1|
|    Cart|          1|
+--------+-----------+



###  Export & Save

15. Save the final DataFrame to CSV.

In [0]:
# Configure the writer: include a header row and replace any existing output.
csv_writer = (
    web_traffic_df.write
    .option("Header", True)
    .mode("Overwrite")
)

# The target is written as a directory (it appears as "web_traffic.csv/" in
# the FileStore listing), not as a single CSV file.
csv_writer.csv("dbfs:/FileStore/web_traffic.csv")

16.  Save partitioned by Country in Parquet format

In [0]:
# Configure the writer: partition the output by Country and replace any
# existing output at the target path.
parquet_writer = (
    web_traffic_df.write
    .partitionBy("Country")
    .mode("overwrite")
)
parquet_writer.parquet("dbfs:/FileStore/web_traffic_parquet")

View the saved files

In [0]:
# List the entries under the FileStore root to confirm both outputs exist.
filestore_entries = dbutils.fs.ls("dbfs:/FileStore/")
display(filestore_entries)

path,name,size,modificationTime
dbfs:/FileStore/employee_data_csv/,employee_data_csv/,0,1749632743000
dbfs:/FileStore/exploded_parquet/,exploded_parquet/,0,1749703797000
dbfs:/FileStore/joined_data_parquet/,joined_data_parquet/,0,1749632828000
dbfs:/FileStore/performance_csv/,performance_csv/,0,1749633311000
dbfs:/FileStore/web_traffic.csv/,web_traffic.csv/,0,1749706929000
dbfs:/FileStore/web_traffic_parquet/,web_traffic_parquet/,0,1749706989000
