<a href="https://colab.research.google.com/github/Benji8bit/dpro_spark/blob/main/spark_hw_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Шаг 1: Импорт необходимых библиотек и создание Spark Session

In [2]:
pip install pyspark



In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, sum, datediff, to_date, when

# Создаем Spark Session
spark = SparkSession.builder.appName("COVID-19 Analysis").getOrCreate()

### Шаг 2: Загрузка данных
У нас есть CSV-файл с данными о COVID-19. Загрузим его в DataFrame:

In [4]:
# Файл называется 'covid-data.csv' и находится в той же директории
df = spark.read.csv('covid-data.csv', header=True, inferSchema=True)

In [5]:
df.show(5)

+--------+---------+-----------+----------+-----------+---------+------------------+------------+----------+-------------------+-----------------------+---------------------+------------------------------+------------------------+----------------------+-------------------------------+-----------------+------------+------------------------+-------------+-------------------------+---------------------+---------------------------------+----------------------+----------------------------------+---------+-----------+------------------------+----------------------+------------------+-------------------------------+-------------+--------------+-----------+------------------+-----------------+-----------------------+----------------+-------------------------+------------------------------+-----------------------------+-----------------------------------+-------------------------------------+----------------+-----------+------------------+----------+-------------+-------------+--------------+--

In [6]:
df.printSchema()

root
 |-- iso_code: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- location: string (nullable = true)
 |-- date: date (nullable = true)
 |-- total_cases: double (nullable = true)
 |-- new_cases: double (nullable = true)
 |-- new_cases_smoothed: double (nullable = true)
 |-- total_deaths: double (nullable = true)
 |-- new_deaths: double (nullable = true)
 |-- new_deaths_smoothed: double (nullable = true)
 |-- total_cases_per_million: double (nullable = true)
 |-- new_cases_per_million: double (nullable = true)
 |-- new_cases_smoothed_per_million: double (nullable = true)
 |-- total_deaths_per_million: double (nullable = true)
 |-- new_deaths_per_million: double (nullable = true)
 |-- new_deaths_smoothed_per_million: double (nullable = true)
 |-- reproduction_rate: double (nullable = true)
 |-- icu_patients: double (nullable = true)
 |-- icu_patients_per_million: double (nullable = true)
 |-- hosp_patients: double (nullable = true)
 |-- hosp_patients_per_million: 

###Шаг 3: Подготовка данных
Отфильтруем датасет по дате

In [7]:
df.count()

82289

In [8]:
filtered_df = df.filter(col("date") == lit("2021-03-31")).filter("iso_code NOT LIKE '%OWID%'")
filtered_df.count()

202

In [9]:
filtered_df.select("iso_code", "location", "date", "new_cases").orderBy(col("new_cases").desc()).show()

+--------+-------------+----------+---------+
|iso_code|     location|      date|new_cases|
+--------+-------------+----------+---------+
|     BRA|       Brazil|2021-03-31|  90638.0|
|     IND|        India|2021-03-31|  72330.0|
|     USA|United States|2021-03-31|  67039.0|
|     FRA|       France|2021-03-31|  59054.0|
|     TUR|       Turkey|2021-03-31|  39302.0|
|     POL|       Poland|2021-03-31|  32891.0|
|     DEU|      Germany|2021-03-31|  25014.0|
|     ITA|        Italy|2021-03-31|  23887.0|
|     ARG|    Argentina|2021-03-31|  16056.0|
|     PER|         Peru|2021-03-31|  15686.0|
|     UKR|      Ukraine|2021-03-31|  11345.0|
|     IRN|         Iran|2021-03-31|  10330.0|
|     CAN|       Canada|2021-03-31|   8728.0|
|     CZE|      Czechia|2021-03-31|   8664.0|
|     COL|     Colombia|2021-03-31|   8646.0|
|     ESP|        Spain|2021-03-31|   8534.0|
|     SWE|       Sweden|2021-03-31|   8441.0|
|     NLD|  Netherlands|2021-03-31|   8221.0|
|     RUS|       Russia|2021-03-31

###Шаг 4: Выбор 15 стран с наибольшим процентом переболевших

In [10]:
total_cases_top_15_df = filtered_df.sort(filtered_df.total_cases.desc()).limit(15)

In [11]:
total_cases_top_15_df.select("iso_code", "location", "population", "total_cases").show()

+--------+--------------+-------------+-----------+
|iso_code|      location|   population|total_cases|
+--------+--------------+-------------+-----------+
|     USA| United States| 3.31002647E8| 3.046221E7|
|     BRA|        Brazil| 2.12559409E8|1.2748747E7|
|     IND|         India|1.380004385E9|1.2221665E7|
|     FRA|        France|  6.8147687E7|  4705068.0|
|     RUS|        Russia|  1.4593446E8|  4494234.0|
|     GBR|United Kingdom|  6.7886004E7|  4359982.0|
|     ITA|         Italy|  6.0461828E7|  3584899.0|
|     TUR|        Turkey|  8.4339067E7|  3317182.0|
|     ESP|         Spain|  4.6754783E7|  3284353.0|
|     DEU|       Germany|  8.3783945E7|  2843644.0|
|     COL|      Colombia|  5.0882884E7|  2406377.0|
|     ARG|     Argentina|  4.5195777E7|  2348821.0|
|     POL|        Poland|  3.7846605E7|  2321717.0|
|     MEX|        Mexico| 1.28932753E8|  2238887.0|
|     IRN|          Iran|  8.3992953E7|  1885564.0|
+--------+--------------+-------------+-----------+



In [12]:
cases_percent_df = total_cases_top_15_df.withColumn("cases_percent", (total_cases_top_15_df.total_cases * 100)/total_cases_top_15_df.population)

In [13]:
cases_percent_df = cases_percent_df.select("iso_code", "location", "cases_percent").orderBy(cases_percent_df.cases_percent.desc()).show(15)

+--------+--------------+------------------+
|iso_code|      location|     cases_percent|
+--------+--------------+------------------+
|     USA| United States| 9.203010995860707|
|     ESP|         Spain|  7.02463531912874|
|     FRA|        France| 6.904222589388837|
|     GBR|United Kingdom| 6.422504998232037|
|     POL|        Poland| 6.134544961166266|
|     BRA|        Brazil| 5.997733555986693|
|     ITA|         Italy| 5.929193870883295|
|     ARG|     Argentina|  5.19699218800907|
|     COL|      Colombia| 4.729246479032124|
|     TUR|        Turkey|3.9331499837435953|
|     DEU|       Germany| 3.394020178925688|
|     RUS|        Russia|  3.07962492203692|
|     IRN|          Iran| 2.244907379313119|
|     MEX|        Mexico|1.7364765336236945|
|     IND|         India|0.8856250844449309|
+--------+--------------+------------------+



###Шаг 5: Выбор 10 стран с максимальным зафиксированным количеством новых случаев за последнюю неделю марта 2021

In [14]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, max

In [15]:
# Фильтруем данные за последнюю неделю марта 2021
df_march_21 = df.filter((col("date") >= "2021-03-23") & (col("date") <= "2021-03-31"))
df_march_21.count()

1927

In [16]:
# Определение окна, которое делит данные по location
window_spec = Window.partitionBy("location")

# Добавление новой колонки с максимальным значением в каждой группе
df_top_10_locs = df_march_21.withColumn("max_new_cases", max("new_cases").over(window_spec))

# Вывод результатов
df_top_10_locs = df_top_10_locs.select("date", "location", "max_new_cases").filter("iso_code NOT LIKE '%OWID%'")
# Выше также отфильтровали iso_code 'OWID_', предоставляющий данные в разрезе континентального и мирового масштаба
df_top_10_locs = df_top_10_locs.filter(col("new_cases") == col("max_new_cases")).orderBy(df_top_10_locs.max_new_cases.desc())
df_top_10_locs.show(10)

+----------+-------------+-------------+
|      date|     location|max_new_cases|
+----------+-------------+-------------+
|2021-03-25|       Brazil|     100158.0|
|2021-03-24|United States|      86960.0|
|2021-03-31|        India|      72330.0|
|2021-03-24|       France|      65392.0|
|2021-03-31|       Turkey|      39302.0|
|2021-03-26|       Poland|      35145.0|
|2021-03-31|      Germany|      25014.0|
|2021-03-26|        Italy|      24076.0|
|2021-03-25|         Peru|      19206.0|
|2021-03-26|      Ukraine|      18226.0|
+----------+-------------+-------------+
only showing top 10 rows



###Шаг 6: Изменение случаев относительно предыдущего дня в России за последнюю неделю марта 2021

In [17]:
# Группируем данные по дате и подсчитываем новые случаи
daily_cases = df.filter(col("iso_code") == "RUS").groupBy("iso_code", "date").agg(sum("new_cases").alias("cases"))
daily_cases.show(5)

+--------+----------+-------+
|iso_code|      date|  cases|
+--------+----------+-------+
|     RUS|2021-03-24| 8769.0|
|     RUS|2020-02-04|    0.0|
|     RUS|2020-09-17| 5667.0|
|     RUS|2020-04-28| 6411.0|
|     RUS|2020-11-06|20368.0|
+--------+----------+-------+
only showing top 5 rows



In [18]:
# Преобразуем данные в DataFrame с колонками для каждого дня
daily_cases_df = daily_cases.withColumn("prev_day", lag(col("cases"), 1).over(Window.orderBy("date")))
daily_cases_df.show(5)

+--------+----------+-----+--------+
|iso_code|      date|cases|prev_day|
+--------+----------+-----+--------+
|     RUS|2020-01-31|  2.0|    NULL|
|     RUS|2020-02-01|  0.0|     2.0|
|     RUS|2020-02-02|  0.0|     0.0|
|     RUS|2020-02-03|  0.0|     0.0|
|     RUS|2020-02-04|  0.0|     0.0|
+--------+----------+-----+--------+
only showing top 5 rows



In [19]:
# Вычисляем изменение случаев
delta_cases_rus = daily_cases_df.withColumn("delta", col("cases") - col("prev_day")).filter((col("date") >= "2021-03-23") & (col("date") <= "2021-03-31"))
delta_cases_rus.show(5)

+--------+----------+------+--------+------+
|iso_code|      date| cases|prev_day| delta|
+--------+----------+------+--------+------+
|     RUS|2021-03-23|8369.0|  9195.0|-826.0|
|     RUS|2021-03-24|8769.0|  8369.0| 400.0|
|     RUS|2021-03-25|9128.0|  8769.0| 359.0|
|     RUS|2021-03-26|9073.0|  9128.0| -55.0|
|     RUS|2021-03-27|8783.0|  9073.0|-290.0|
+--------+----------+------+--------+------+
only showing top 5 rows



In [20]:
# Выводим результат
delta_cases_rus.select("date", "cases", "delta").orderBy("date").show()

+----------+------+------+
|      date| cases| delta|
+----------+------+------+
|2021-03-23|8369.0|-826.0|
|2021-03-24|8769.0| 400.0|
|2021-03-25|9128.0| 359.0|
|2021-03-26|9073.0| -55.0|
|2021-03-27|8783.0|-290.0|
|2021-03-28|8979.0| 196.0|
|2021-03-29|8589.0|-390.0|
|2021-03-30|8162.0|-427.0|
|2021-03-31|8156.0|  -6.0|
+----------+------+------+

