In [1]:
from pyspark.sql import SparkSession

spark =SparkSession.builder.appName("june12set2").enableHiveSupport().getOrCreate()
spark

In [9]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)


Mounted at /content/drive


In [2]:
from pyspark.sql import Row
web_data = [
    Row(UserID=1, Page="Home", Timestamp="2024-04-10 10:00:00", Duration=35, Device="Mobile", Country="India"),
    Row(UserID=2, Page="Products", Timestamp="2024-04-10 10:02:00", Duration=120, Device="Desktop", Country="USA"),
    Row(UserID=3, Page="Cart", Timestamp="2024-04-10 10:05:00", Duration=45, Device="Tablet", Country="UK"),
    Row(UserID=1, Page="Checkout", Timestamp="2024-04-10 10:08:00", Duration=60, Device="Mobile", Country="India"),
    Row(UserID=4, Page="Home", Timestamp="2024-04-10 10:10:00", Duration=15, Device="Mobile", Country="Canada"),
    Row(UserID=2, Page="Contact", Timestamp="2024-04-10 10:15:00", Duration=25, Device="Desktop", Country="USA"),
    Row(UserID=5, Page="Products", Timestamp="2024-04-10 10:20:00", Duration=90, Device="Desktop", Country="India"),
]

df_web = spark.createDataFrame(web_data)
df_web.show()


+------+--------+-------------------+--------+-------+-------+
|UserID|    Page|          Timestamp|Duration| Device|Country|
+------+--------+-------------------+--------+-------+-------+
|     1|    Home|2024-04-10 10:00:00|      35| Mobile|  India|
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|
|     3|    Cart|2024-04-10 10:05:00|      45| Tablet|     UK|
|     1|Checkout|2024-04-10 10:08:00|      60| Mobile|  India|
|     4|    Home|2024-04-10 10:10:00|      15| Mobile| Canada|
|     2| Contact|2024-04-10 10:15:00|      25|Desktop|    USA|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|
+------+--------+-------------------+--------+-------+-------+



In [10]:
# Data Exploration & Preparation
# 1. Display the schema of web_traffic_data .
# 2. Convert the Timestamp column to a proper timestamp type.
# 3. Add a new column SessionMinute by extracting the minute from the Timestamp .
from pyspark.sql.functions import *
df_web.printSchema()
df_web.withColumn("Timestamp",to_timestamp("Timestamp"))
df_web.withColumn("SessionMinute",minute("Timestamp"))


# Filtering and Conditions
# 4. Filter users who used a "Mobile" device and visited the "Checkout" page.
# 5. Show all entries with a Duration greater than 60 seconds.
# 6. Find all users from India who visited the "Products" page.
df_web.filter((col("Device")=="Mobile") & (col("Page")=="Checkout")).show()
df_web.filter(col("Duration")>60).show()
# Aggregation and Grouping
# 7. Get the average duration per device type.
# 8. Count the number of sessions per country.
# 9. Find the most visited page overall.
df_web.groupBy("Device").agg(avg("Duration")).show()
df_web.groupBy("Country").agg(count("UserID")).show()
df_web.groupBy("Page").count().orderBy(desc("count")).limit(1).show()
# Window Functions
# 10. Rank each user’s pages by timestamp (oldest to newest).
# 11. Find the total duration of all sessions per user using groupBy .
from pyspark.sql.window import Window
windowval = Window.partitionBy("UserID").orderBy("Timestamp")
df_web.withColumn("Rank", row_number().over(windowval)).select("UserID", "Page", "Timestamp", "Rank").show()
df_web.groupBy("UserID").agg(sum("Duration")).show()
# Spark SQL Tasks
# 12. Create a temporary view called traffic_view .
# 13. Write a SQL query to get the top 2 longest sessions by duration.
# 14. Get the number of unique users per page using SQL.
df_web.createOrReplaceTempView("traffic_view")
spark.sql("SELECT Page, Duration FROM traffic_view ORDER BY Duration DESC LIMIT 2").show()
spark.sql("SELECT Page, COUNT(DISTINCT UserID) AS UniqueUsers FROM traffic_view GROUP BY Page").show()

# Export & Save
# 15. Save the final DataFrame to CSV.
# 16. Save partitioned by Country in Parquet format.
df_web.write.mode("overwrite").option("header", "true").csv("/content/drive/MyDrive/pysparkout/csv_output")
df_web.write.mode("overwrite").partitionBy("Country").parquet("/content/drive/MyDrive/pysparkout/parquet_output")


root
 |-- UserID: long (nullable = true)
 |-- Page: string (nullable = true)
 |-- Timestamp: string (nullable = true)
 |-- Duration: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Country: string (nullable = true)

+------+--------+-------------------+--------+------+-------+
|UserID|    Page|          Timestamp|Duration|Device|Country|
+------+--------+-------------------+--------+------+-------+
|     1|Checkout|2024-04-10 10:08:00|      60|Mobile|  India|
+------+--------+-------------------+--------+------+-------+

+------+--------+-------------------+--------+-------+-------+
|UserID|    Page|          Timestamp|Duration| Device|Country|
+------+--------+-------------------+--------+-------+-------+
|     2|Products|2024-04-10 10:02:00|     120|Desktop|    USA|
|     5|Products|2024-04-10 10:20:00|      90|Desktop|  India|
+------+--------+-------------------+--------+-------+-------+

+-------+------------------+
| Device|     avg(Duration)|
+-------+---------