In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count

# Obtain (or reuse) the Spark session for this analysis.
# NOTE(review): the application name is "Weather" although the data loaded
# below is a web server log — confirm whether the name is intentional.
spark = (
    SparkSession.builder
    .appName("Weather")
    .getOrCreate()
)

# Load the web server log CSV: the first row supplies the column names and
# the column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/home/jovyan/work/PySpark_test/config/web_server_logs.csv")
)

# Print a preview of the loaded rows.
df.show()

# Expose the DataFrame to SQL queries under the name "web_server_logs".
df.createOrReplaceTempView("web_server_logs")

# The 10 IP addresses with the most requests, most active first.
# Ties on request_count are broken by ip so that the rows surviving LIMIT 10
# are the same on every run; sorting on the count alone leaves the order of
# equally active addresses unspecified.
top10_ip = spark.sql("""
SELECT ip, COUNT(ip) AS request_count
FROM web_server_logs
GROUP BY ip
ORDER BY request_count DESC, ip ASC
LIMIT 10
""")

# Number of requests per HTTP method, exposed as column "method_count".
count_http = (
    df.groupBy("method")
    .agg(count("method").alias("method_count"))
)

# Number of responses whose status code is 404, as a one-value aggregate
# (the commented-out output section reads it via collect()[0][0]).
code_404 = df.where(df.response_code == 404).agg(count("response_code"))

# Total response size per calendar day, in chronological order.
# The grouping key is written as the full CAST expression instead of the
# SELECT alias, so the query does not depend on Spark resolving aliases in
# GROUP BY (the spark.sql.groupByAliases setting). The output columns are
# still named "date" and "total_response_size".
total_size = spark.sql("""
SELECT CAST(timestamp AS date) AS date, SUM(response_size) AS total_response_size
FROM web_server_logs
GROUP BY CAST(timestamp AS date)
ORDER BY date
""")

# Output of the results (currently disabled).
# NOTE(review): none of top10_ip, count_http, code_404 or total_size is shown
# or collected before the session is stopped below — confirm whether this
# output section should be re-enabled.
# print("Top 10 active IP addresses:")
# top10_ip.show()
# print("Request count by HTTP method:")
# count_http.show()
# print("Number of 404 response codes:", code_404.collect()[0][0])
# print("Total response size by day:")
# total_size.show()

# Stop the Spark session.
spark.stop()

+---------------+--------------------+------+--------------------+-------------+-------------+
|             ip|           timestamp|method|                 url|response_code|response_size|
+---------------+--------------------+------+--------------------+-------------+-------------+
|172.106.214.246|2024-01-07 04:26:...|  POST|categories/posts/...|          200|         5582|
|   176.72.6.231|2024-07-02 08:49:...|   PUT|category/wp-conte...|          200|         2412|
|135.104.133.136|2024-04-23 21:51:...|  POST|     wp-content/main|          404|          818|
| 122.109.79.170|2024-02-18 22:00:...|   PUT|                tags|          404|         7354|
|   84.174.51.56|2024-07-07 08:41:...|   GET| categories/category|          200|          374|
|  73.78.144.101|2024-07-11 21:47:...|   PUT|     explore/tag/tag|          404|         8202|
| 116.98.100.139|2024-03-15 03:34:...|DELETE|          posts/main|          500|         5782|
|  44.241.213.41|2024-03-13 19:31:...|  POST|     