In [1]:
!pip install -q pyspark


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("BigDataTask1").getOrCreate()
spark


In [6]:
# Download and load the dataset
!wget -O covid.csv https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/latest/owid-covid-latest.csv

# Load CSV using PySpark
df = spark.read.option("header", True).csv("covid.csv", inferSchema=True)

# Preview the data
df.show(5)


--2025-06-24 17:30:16--  https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/latest/owid-covid-latest.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 55437 (54K) [text/plain]
Saving to: ‘covid.csv’


2025-06-24 17:30:16 (932 KB/s) - ‘covid.csv’ saved [55437/55437]

+--------+---------+--------------+-----------------+-----------+---------+------------------+------------+----------+-------------------+-----------------------+---------------------+------------------------------+------------------------+----------------------+-------------------------------+-----------------+------------+------------------------+-------------+-------------------------+---------------------+---------------------------------+----------------------+--------

In [7]:
# See column names
df.columns

# View schema (data types of each column)
df.printSchema()


root
 |-- iso_code: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- location: string (nullable = true)
 |-- last_updated_date: date (nullable = true)
 |-- total_cases: double (nullable = true)
 |-- new_cases: double (nullable = true)
 |-- new_cases_smoothed: double (nullable = true)
 |-- total_deaths: double (nullable = true)
 |-- new_deaths: double (nullable = true)
 |-- new_deaths_smoothed: double (nullable = true)
 |-- total_cases_per_million: double (nullable = true)
 |-- new_cases_per_million: double (nullable = true)
 |-- new_cases_smoothed_per_million: double (nullable = true)
 |-- total_deaths_per_million: double (nullable = true)
 |-- new_deaths_per_million: double (nullable = true)
 |-- new_deaths_smoothed_per_million: double (nullable = true)
 |-- reproduction_rate: string (nullable = true)
 |-- icu_patients: double (nullable = true)
 |-- icu_patients_per_million: double (nullable = true)
 |-- hosp_patients: double (nullable = true)
 |-- hosp_patients_

In [8]:
#top 10 countries by total covid cases
df.orderBy(df["total_cases"].desc()).select("location", "total_cases").show(10)


+--------------------+------------+
|            location| total_cases|
+--------------------+------------+
|               World|7.75866783E8|
|High-income count...|4.29044049E8|
|                Asia|3.01499099E8|
|              Europe|2.52916868E8|
|Upper-middle-inco...|2.51753518E8|
| European Union (27)|1.85822587E8|
|       North America|1.24492666E8|
|       United States|1.03436829E8|
|               China| 9.9373219E7|
|Lower-middle-inco...|   9.19544E7|
+--------------------+------------+
only showing top 10 rows



In [9]:
# Calculate death rate = total_deaths / total_cases
from pyspark.sql.functions import col, round

df = df.withColumn("death_rate", round(col("total_deaths") / col("total_cases"), 4))
df.select("location", "total_cases", "total_deaths", "death_rate").orderBy(col("death_rate").desc()).show(10)


+--------------------+-----------+------------+----------+
|            location|total_cases|total_deaths|death_rate|
+--------------------+-----------+------------+----------+
|               Yemen|    11945.0|      2159.0|    0.1807|
|               Sudan|    63993.0|      5046.0|    0.0789|
|               Syria|    57423.0|      3163.0|    0.0551|
|             Somalia|    27334.0|      1361.0|    0.0498|
|                Peru|  4526977.0|    220975.0|    0.0488|
|               Egypt|   516023.0|     24830.0|    0.0481|
|              Mexico|  7619458.0|    334551.0|    0.0439|
|Bosnia and Herzeg...|   403666.0|     16392.0|    0.0406|
|             Liberia|     7930.0|       294.0|    0.0371|
|         Afghanistan|   235214.0|      7998.0|     0.034|
+--------------------+-----------+------------+----------+
only showing top 10 rows



In [10]:
#Top 10 Countries by Vaccination Rate
# Create new column: vaccination_rate = people_vaccinated / population
df = df.withColumn("vaccination_rate", round(col("people_vaccinated") / col("population"), 4))

# Show top 10 countries by vaccination_rate
df.select("location", "people_vaccinated", "population", "vaccination_rate")\
  .orderBy(col("vaccination_rate").desc())\
  .show(10)


+--------------------+-----------------+-------------+----------------+
|            location|people_vaccinated|   population|vaccination_rate|
+--------------------+-----------------+-------------+----------------+
|           Hong Kong|        6920120.0|    7488863.0|          0.9241|
|Upper-middle-inco...|    2.109015426E9|  2.5259213E9|          0.8349|
|            Malaysia|      2.8138569E7|  3.3938216E7|          0.8291|
|High-income count...|     9.98723024E8|  1.2505146E9|          0.7986|
|                Asia|    3.689438947E9| 4.72138337E9|          0.7814|
|       North America|     4.58563506E8| 6.00323657E8|          0.7639|
| European Union (27)|     3.38119562E8| 4.50146793E8|          0.7511|
|               India|    1.027438924E9| 1.41717312E9|           0.725|
|           Lithuania|        1958300.0|    2750058.0|          0.7121|
|               World|    5.631263739E9|7.975105024E9|          0.7061|
+--------------------+-----------------+-------------+----------

In [11]:
# Total Cases per Million People
# Create new column: cases_per_million = total_cases / population * 1,000,000
df = df.withColumn("cases_per_million", round((col("total_cases") / col("population")) * 1000000, 2))

# Show top 10 countries
df.select("location", "total_cases", "population", "cases_per_million")\
  .orderBy(col("cases_per_million").desc())\
  .show(10)


+-------------+-----------+-----------+-----------------+
|     location|total_cases| population|cases_per_million|
+-------------+-----------+-----------+-----------------+
|       Cyprus|   696410.0|   896007.0|        777237.23|
|       Brunei|   347723.0|   449002.0|         774435.3|
|   San Marino|    25292.0|    33690.0|        750727.22|
|      Austria|  6082444.0|  8939617.0|        680392.01|
|  South Korea|3.4571873E7|5.1815808E7|        667207.06|
|Faroe Islands|    34658.0|    53117.0|        652484.14|
|     Slovenia|  1356582.0|  2119843.0|        639944.56|
|    Gibraltar|    20550.0|    32677.0|         628882.7|
|   Martinique|   230354.0|   367512.0|        626793.14|
|   Luxembourg|   393542.0|   647601.0|        607692.08|
+-------------+-----------+-----------+-----------------+
only showing top 10 rows



In [12]:
#Does Higher Vaccination Lower Death Rate?
# Filter countries where both vaccination_rate and death_rate are not null
df_filtered = df.filter((col("vaccination_rate").isNotNull()) & (col("death_rate").isNotNull()))

# Select relevant columns
df_filtered.select("location", "vaccination_rate", "death_rate")\
  .orderBy(col("vaccination_rate").desc())\
  .show(15)


+--------------------+----------------+----------+
|            location|vaccination_rate|death_rate|
+--------------------+----------------+----------+
|Upper-middle-inco...|          0.8349|    0.0112|
|            Malaysia|          0.8291|     0.007|
|High-income count...|          0.7986|     0.007|
|                Asia|          0.7814|    0.0054|
|       North America|          0.7639|    0.0134|
| European Union (27)|          0.7511|    0.0068|
|               India|           0.725|    0.0118|
|           Lithuania|          0.7121|    0.0072|
|               World|          0.7061|    0.0091|
|              Europe|          0.7033|    0.0083|
|             Czechia|          0.6653|    0.0091|
|Lower-middle-inco...|          0.6649|    0.0129|
|             Estonia|          0.6562|    0.0049|
|             Oceania|           0.643|    0.0022|
+--------------------+----------------+----------+



In [None]:
#Deeper Insights:

#Countries like HONG KONG, Malaysia have vaccinated a high proportion of their population.

#Adjusting for population,countries like Cyprus,Brunei had highest infection densities.

#There appears to be an inverse pattern between vaccination rate and death rate, suggesting vaccinations reduce mortality.

#These findings show the power of PySpark in analyzing large public health datasets quickly and at scale.

