<center>

# $\textbf{Covid-19 - until 2024-02-04}$

<center>

### $\textbf{Code}$

In [320]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, row_number
from pyspark.sql.window import Window
from pyspark.sql.functions import when

In [321]:
spark = SparkSession.builder.appName('Covid').master("local").enableHiveSupport().getOrCreate()
spark

In [322]:
# Creating dataframe from the csv file and infering the schema
df = spark.read.load("Files/Covid.csv", format="csv", sep=",", inferschema="true", header="true")

                                                                                

In [323]:
# Select the columns you're interested in
df = df.select("location","date","total_cases")

In [324]:
# Rename the columns "location" and "total_cases"
df = df.withColumnRenamed("location","country")
df = df.withColumnRenamed("total_cases","covid_cases")

In [325]:
# Group by country and year, and select the last date of each year for each country

# Define a window specification to partition by year and location and order by date
window_spec = Window.partitionBy(year("date"), "country").orderBy("date")

# Add row numbers to each partition
df = df.withColumn("row_number", row_number().over(window_spec))

# Filter rows where row_number = 1
df = df.filter(col("row_number") == 1).drop("row_number")

In [326]:
# Extract the year from the 'date' column and store it in a new column
df = df.withColumn("Year", year("Date"))

In [327]:
# Drop the 'date' and 'continent' columns, and rename columns
df = df.drop("date")
df = df.orderBy("location")

In [328]:
#Storing this dataframe in parquet
df.write.mode("overwrite").parquet("FilesParquet/Covid.parquet" )
spark.read.parquet("FilesParquet/Covid.parquet").show()
spark.stop()

                                                                                

+-----------+-----------+----+
|    country|covid_cases|Year|
+-----------+-----------+----+
|Afghanistan|       null|2020|
|Afghanistan|    51848.0|2021|
|Afghanistan|   157902.0|2022|
|Afghanistan|   207579.0|2023|
|Afghanistan|   230375.0|2024|
|     Africa|       null|2020|
|     Africa|  2662452.0|2021|
|     Africa|  9634415.0|2022|
|     Africa|1.3001383E7|2023|
|     Africa|1.3130839E7|2024|
|    Albania|       null|2020|
|    Albania|    55380.0|2021|
|    Albania|   207221.0|2022|
|    Albania|   332794.0|2023|
|    Albania|   334596.0|2024|
|    Algeria|       null|2020|
|    Algeria|    97857.0|2021|
|    Algeria|   216376.0|2022|
|    Algeria|   271217.0|2023|
|    Algeria|   272010.0|2024|
+-----------+-----------+----+
only showing top 20 rows

