### Analysiis Log

In [35]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Log Analysis") \
    .getOrCreate()

# Load the CSV file with schema inference
df = spark.read.csv("data/web_server_logs.csv", header=True, inferSchema=True)

# Add a new column that extracts the date from the timestamp
df_with_date = df.withColumn("date", to_date(col("timestamp")))


# Group by IP and count number of requests
ip_request_counts = df.groupBy("ip") \
    .count() \
    .withColumnRenamed("count", "request_count") \
    .orderBy(col("request_count").desc())

print("Top 10 active IP addresses:")
ip_request_counts.show(10)


# Group by HTTP method and count the number of requests
group_by_method = df.groupBy("method") \
    .count() \
    .withColumnRenamed("count", "method_count") \
    .orderBy(col("method_count"))

print("Requests count by HTTP method:")
group_by_method.show(10)


# Filter and count the number of requests with a 404 response code.
requests = df.filter(col("response_code") == 404) \
    .count()

print(f"Number of 404 response code: {requests}") 


# Group the data by date and sum the response sizes, then sort by date
daily_stats = df_with_date.groupBy("date") \
    .sum("response_size") \
    .withColumnRenamed("sum(response_size)", "total_response_size") \
    .orderBy(col("date"))

print("Total response size by day:")
daily_stats.show(20)

Top 10 active IP addresses:
+---------------+-------------+
|             ip|request_count|
+---------------+-------------+
|  180.37.222.54|            1|
|  90.234.175.93|            1|
| 137.134.34.254|            1|
|   120.24.47.74|            1|
|    9.42.193.82|            1|
| 20.226.236.156|            1|
|  82.164.150.52|            1|
|  70.216.116.59|            1|
|137.131.115.117|            1|
| 42.175.154.228|            1|
+---------------+-------------+
only showing top 10 rows

Requests count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|   GET|       24822|
|   PUT|       25016|
|  POST|       25021|
|DELETE|       25141|
+------+------------+

Number of 404 response code: 24945
Total response size by day:
+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2025-01-01|            3880741|
|2025-01-02|            3898155|
|2025-01-03|            4022114|
|2025-01-04|            37909