<center>

# $\textbf{Covid-19 - until 2024-02-04}$

<center>

### $\textbf{Code}$

In [63]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, year, sum, col, when
from pyspark.sql.window import Window

In [64]:
spark = SparkSession.builder.appName('Covid').master("local").enableHiveSupport().getOrCreate()
spark

In [65]:
# Creating dataframe from the csv file and infering the schema
df = spark.read.load("Files/Covid.csv", format="csv", sep=",", inferschema="true", header="true")

                                                                                

In [66]:
# Select the columns you're interested in
df = df.select("location","date","total_cases")

In [67]:
# Rename the columns "location" and "total_cases"
df = df.withColumnRenamed("location","country")
df = df.withColumnRenamed("total_cases","covid_cases")

In [68]:
# Extract year from 'day' column
df = df.withColumn("year", year("date"))
df = df.drop("date")

# Replace null values in 'total_cases' column with 0
df = df.withColumn("covid_cases", when(df["covid_cases"].isNull(), 0).otherwise(df["covid_cases"]))

# Replace null values in 'total_cases' column with 0
#df = df.na.fill({'total_cases': 0})

# Remove non-numeric characters from 'total_cases' column
df = df.withColumn("covid_cases", regexp_replace("covid_cases", "[^0-9]", ""))

# Cast 'total_cases' column to integer type
df = df.withColumn("covid_cases", col("covid_cases").cast("int"))

# Filter out any null values after cleaning
df = df.filter(col("covid_cases").isNotNull())

# Calculate total cases per year
df = df.groupBy("country", "year").agg(sum("covid_cases"))

# Order by country and then by year
df = df.orderBy("country", "year")

# Rename 'sum(covid_cases)' to 'covid_cases'
df = df.withColumnRenamed("sum(covid_cases)","covid_cases")

In [69]:
#Storing this dataframe in parquet
df.write.mode("overwrite").parquet("FilesParquet/Covid.parquet" )
spark.read.parquet("FilesParquet/Covid.parquet").show()
spark.stop()

                                                                                

+-----------+----+-----------+
|    country|year|covid_cases|
+-----------+----+-----------+
|Afghanistan|2020|   82849920|
|Afghanistan|2021|  391774200|
|Afghanistan|2022|  675763230|
|Afghanistan|2023|  802611370|
|Afghanistan|2024|   80812150|
|     Africa|2020| 2850575310|
|     Africa|2021|22223520660|
|     Africa|2022|40143651429|
|     Africa|2023|46949658745|
|     Africa|2024| 4596780605|
|    Albania|2020|   34448550|
|    Albania|2021|  503635430|
|    Albania|2022| 1072461670|
|    Albania|2023| 1218905130|
|    Albania|2024|  117160410|
|    Algeria|2020|  101885690|
|    Algeria|2021|  567017170|
|    Algeria|2022|  963218050|
|    Algeria|2023|  991880860|
|    Algeria|2024|   95203500|
+-----------+----+-----------+
only showing top 20 rows

