In [8]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


# Настройка Spark

spark = SparkSession.builder.appName("pyspark_exam").getOrCreate()

logs = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load('/content/web_server_logs.csv')

top10_ip = logs.groupBy("ip").agg(F.count("url").alias("request_count")).sort(F.col("request_count").desc()).limit(10)
print("top 10 active IP:")
top10_ip.show()

http_counts = logs.groupBy("method").agg(F.count("method").alias("method_count"))
print("Request count by methods:")
http_counts.show()

print("Number of 404 response: ", logs.filter('response_code = 404').count())

date_statistic = logs.withColumn('date', F.to_date(F.col("timestamp")))
print("total responze size by date:")
date_statistic.groupBy("date").agg(F.sum("response_size").alias("total_response_size")).sort(F.col("date")).show()

spark.stop()


top 10 active IP:
+--------------+-------------+
|            ip|request_count|
+--------------+-------------+
|   12.21.29.78|            2|
|  25.109.44.28|            1|
| 104.239.87.63|            1|
|118.180.221.28|            1|
|153.11.180.123|            1|
|46.208.205.165|            1|
| 52.166.106.62|            1|
| 160.245.31.85|            1|
| 131.125.93.43|            1|
| 61.194.71.253|            1|
+--------------+-------------+

Request count by methods:
+------+------------+
|method|method_count|
+------+------------+
|  POST|       24812|
|DELETE|       25105|
|   PUT|       25061|
|   GET|       25022|
+------+------------+

Number of 404 response:  24985
total responze size by date:
+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2025-01-01|            4547557|
|2025-01-02|            4484407|
|2025-01-03|            4657385|
|2025-01-04|            4783012|
|2025-01-05|            4420930|
|2025-01-06|        

In [5]:
logs.printSchema()

root
 |-- ip: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- method: string (nullable = true)
 |-- url: string (nullable = true)
 |-- response_code: integer (nullable = true)
 |-- response_size: integer (nullable = true)

