In [50]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count

# Создание сессии Spark
spark = SparkSession.builder.appName("Weather").getOrCreate()

# Чтение CSV файла в DataFrame
df = spark.read.csv("/home/jovyan/work/PySpark_test/config/web_server_logs.csv", header=True, inferSchema=True)

# Создание временной представления для SQL запросов
df.createOrReplaceTempView("web_server_logs")

# Запрос для получения 10 самых активных IP-адресов
top10_ip = spark.sql("""
SELECT ip, COUNT(ip) AS request_count
FROM web_server_logs
GROUP BY ip
ORDER BY request_count DESC
LIMIT 10
""")

# Группировка по HTTP методам с подсчетом количества запросов
count_http = df.groupBy("method").agg(count("method").alias("method_count"))

# Подсчет числа 404 ответов
code_404 = df.filter(df["response_code"] == 404).agg(count("response_code"))

# Запрос для получения общего размера ответов по дням
total_size = spark.sql("""
SELECT CAST(timestamp AS date) AS date, SUM(response_size) AS total_response_size
FROM web_server_logs
GROUP BY date 
ORDER BY date 
""")

# Вывод результатов
print("Top 10 active IP addresses:")
top10_ip.show()
print("Request count by HTTP method:")
count_http.show()
print("Number of 404 response codes:", code_404.collect()[0][0])
print("Total response size by day:")
total_size.show()

# Остановка сессии Spark
spark.stop()

Top 10 active IP addresses:
+---------------+-------------+
|             ip|request_count|
+---------------+-------------+
| 14.226.125.170|            1|
|121.124.114.243|            1|
|163.102.177.155|            1|
|181.132.185.191|            1|
| 139.161.147.74|            1|
|  176.73.26.141|            1|
|   13.5.107.209|            1|
|151.255.128.177|            1|
| 117.109.237.24|            1|
|    6.192.11.79|            1|
+---------------+-------------+

Request count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|  POST|       24836|
|DELETE|       25074|
|   PUT|       25010|
|   GET|       25080|
+------+------------+

Number of 404 response codes: 25006
Total response size by day:
+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2024-01-01|            2574720|
|2024-01-02|            2384346|
|2024-01-03|            2451687|
|2024-01-04|            2582353|
|2024-01-05|         