In [None]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col, max, row_number, desc
from pyspark.sql.window import Window
from pyspark.sql import functions as F

In [2]:
spark = SparkSession.builder.appName("covid_analysis").getOrCreate()

In [None]:
df = spark.read.csv("covid-data.csv", header = True)
df.show()

In [None]:
result_df = (df.filter(F.col("date").between("2021-03-25", "2021-03-31"))
               .withColumn("Число", F.date_format(col("date"), "dd"))
               .withColumn("Страны", F.col("location"))
               .withColumn("кол-во новых случаев", F.col("new_cases")))
result_df = (result_df.groupBy("Страны","Число")
                      .agg(F.sum("кол-во новых случаев").alias("Кол-во новых случаев")))
windowSpec = Window.partitionBy("Страны").orderBy(col("Кол-во новых случаев").desc())
result_df = result_df.withColumn("number", F.row_number().over(windowSpec))
result_df.show()

In [5]:
result = (result_df.filter(F.col("number") == 1)
                   .filter(~F.col("Страны").isin("World", "Europe", "European Union", "Asia","South America","North America", "United States"))
                   .select("Число", "Страны", "Кол-во новых случаев")
                   .orderBy(F.col("Кол-во новых случаев").desc())
                   .limit(10))

result.show()


+-----+-------+--------------------+
|Число| Страны|Кол-во новых случаев|
+-----+-------+--------------------+
|   25| Brazil|            100158.0|
|   31|  India|             72330.0|
|   31| France|             59054.0|
|   31| Turkey|             39302.0|
|   26| Poland|             35145.0|
|   31|Germany|             25014.0|
|   26|  Italy|             24076.0|
|   25|   Peru|             19206.0|
|   26| Africa|             18544.0|
|   26|Ukraine|             18226.0|
+-----+-------+--------------------+

