In [None]:
import kagglehub
import os
import pandas as pd

# Fetch the most recent published version of the Kaggle dataset; kagglehub
# returns the local directory that holds the extracted files.
dataset_handle = "imdevskp/corona-virus-report"
path = kagglehub.dataset_download(dataset_handle)

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/imdevskp/corona-virus-report?dataset_version_number=166...


100%|██████████| 19.0M/19.0M [00:00<00:00, 57.8MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/imdevskp/corona-virus-report/versions/166


# Exercise 4

This exercise focuses on data visualization and interpretation using a real-world COVID-19 dataset. The dataset contains daily records of confirmed cases, deaths, recoveries, and active cases across countries and regions, along with temporal and geographic information.
The goal of this exercise is not only to create charts, but to choose appropriate visualizations, apply correct data aggregation, and draw meaningful insights from the data. You will work with time-based, categorical, numerical, and geographic variables, and you are expected to think critically about how design choices affect interpretation.

Your visualizations should follow good practices:
- Use clear titles, axis labels, and legends
- Choose chart types appropriate to the data and question
- Avoid misleading scales or cluttered designs
- Clearly explain patterns, trends, or anomalies you observe

Unless stated otherwise, you may filter, aggregate, or group the data as needed.

<img src="https://d3i6fh83elv35t.cloudfront.net/static/2020/03/Screen-Shot-2020-03-05-at-6.29.29-PM-1024x574.png"/>

In [None]:
# Sanity check: the download step should have produced a directory.
if os.path.isdir(path):
    print(True)

# os.listdir() returns entries in arbitrary order, so picking contents[0]
# may load a different CSV from one machine to the next. Select the file by
# name instead. full_grouped.csv is the file whose schema matches what the
# rest of the notebook uses (Date, Country/Region, Confirmed, ..., WHO Region)
# -- NOTE(review): confirm against the dataset listing.
contents = sorted(os.listdir(path))
dataset_file = "full_grouped.csv"
if dataset_file not in contents:
    raise FileNotFoundError(
        f"{dataset_file!r} not found in {path}; available files: {contents}"
    )

mydataset = os.path.join(path, dataset_file)

df = pd.read_csv(mydataset)

True


In [None]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
# Note that 'Date' is read as object (plain strings), not as a datetime.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35156 entries, 0 to 35155
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Date            35156 non-null  object
 1   Country/Region  35156 non-null  object
 2   Confirmed       35156 non-null  int64 
 3   Deaths          35156 non-null  int64 
 4   Recovered       35156 non-null  int64 
 5   Active          35156 non-null  int64 
 6   New cases       35156 non-null  int64 
 7   New deaths      35156 non-null  int64 
 8   New recovered   35156 non-null  int64 
 9   WHO Region      35156 non-null  object
dtypes: int64(7), object(3)
memory usage: 2.7+ MB


In [None]:
# Preview the first five records of the frame
df.head(n=5)

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,WHO Region
0,2020-01-22,Afghanistan,0,0,0,0,0,0,0,Eastern Mediterranean
1,2020-01-22,Albania,0,0,0,0,0,0,0,Europe
2,2020-01-22,Algeria,0,0,0,0,0,0,0,Africa
3,2020-01-22,Andorra,0,0,0,0,0,0,0,Europe
4,2020-01-22,Angola,0,0,0,0,0,0,0,Africa


In [None]:
# All daily records for one country, selected with a boolean mask
df.loc[df['Country/Region'] == 'Philippines']

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,WHO Region
133,2020-01-22,Philippines,0,0,0,0,0,0,0,Western Pacific
320,2020-01-23,Philippines,0,0,0,0,0,0,0,Western Pacific
507,2020-01-24,Philippines,0,0,0,0,0,0,0,Western Pacific
694,2020-01-25,Philippines,0,0,0,0,0,0,0,Western Pacific
881,2020-01-26,Philippines,0,0,0,0,0,0,0,Western Pacific
...,...,...,...,...,...,...,...,...,...,...
34354,2020-07-23,Philippines,74390,1871,24383,48136,2121,28,760,Western Pacific
34541,2020-07-24,Philippines,76444,1879,24502,50063,2054,8,119,Western Pacific
34728,2020-07-25,Philippines,78412,1897,25752,50763,1968,18,1250,Western Pacific
34915,2020-07-26,Philippines,80448,1932,26110,52406,2036,35,358,Western Pacific


In [None]:
# Distinct country/region names, in order of first appearance
pd.unique(df['Country/Region'])

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Brunei', 'Bulgaria', 'Burkina Faso', 'Burma', 'Burundi',
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
       'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia',
       'Comoros', 'Congo (Brazzaville)', 'Congo (Kinshasa)', 'Costa Rica',
       "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Denmark',
       'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gabon',
       'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Greenland',
       'Grenada', 'Guatemala', 'Guinea', 'G

## A. Time-Based Visualizations

1. Global Trend `(5 pts)`

Aggregate the data by Date and create a line chart showing the global number of confirmed COVID-19 cases over time.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 'Date' is loaded as plain strings, so it has to be converted before any
# datetime handling (the .dt accessor raises AttributeError on object columns).
# 'Confirmed' is a running total per country, so the global figure for a given
# day is the sum across countries on that day. Summing it across all the days
# of a month would count the same cases many times over.
grouped = (
    df.assign(Date=pd.to_datetime(df['Date']))
      .groupby('Date', as_index=False)['Confirmed']
      .sum()
)

# Line chart of the global cumulative total over time
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=grouped, x="Date", y="Confirmed", color="blue", ax=ax)
ax.set_title("Global Number of Confirmed COVID-19 Cases Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed Cases (cumulative)")
ax.tick_params(axis="x", rotation=45)
fig.tight_layout()
plt.show()

AttributeError: Can only use .dt accessor with datetimelike values

2. Country-Level Trends `(5 pts)`

Select three countries and visualize their confirmed case counts over time on the same plot.

In [None]:
# Countries compared in this chart: Angola, Uzbekistan, Poland
selected_countries = ['Angola', 'Uzbekistan', 'Poland']

# .assign() builds a new frame, which avoids writing a column onto a filtered
# slice of df (the source of SettingWithCopyWarning).
df_selected = (
    df[df['Country/Region'].isin(selected_countries)]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

# 'Confirmed' is already a running total, so the daily values are plotted
# directly; adding up every day of a month would inflate the counts.
grouped = (
    df_selected
    .groupby(['Country/Region', 'Date'], as_index=False)['Confirmed']
    .sum()
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=grouped, x='Date', y='Confirmed', hue='Country/Region', ax=ax)
ax.set_title('Number of Confirmed Cases Over Time for Angola, Poland, and Uzbekistan')
ax.set_xlabel('Date')
ax.set_ylabel('Confirmed Cases (cumulative)')
ax.tick_params(axis='x', rotation=45)
ax.legend(title='Country')
fig.tight_layout()
plt.show()

3. Active vs Recovered `(5 pts)`

For a selected country, create a line chart showing Active and Recovered cases over time.

In [None]:
# Daily Active and Recovered counts for a single country.
selected_country = 'France'

# .assign() returns a new frame, so the date conversion never writes onto a
# filtered slice of df (which would trigger SettingWithCopyWarning).
df_selected = (
    df[df['Country/Region'] == selected_country]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
grouped = df_selected.groupby('Date')[['Active', 'Recovered']].sum().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=grouped, x='Date', y='Active', label='Active', color='blue', ax=ax)
sns.lineplot(data=grouped, x='Date', y='Recovered', label='Recovered', color='green', ax=ax)
ax.set_title('Active and Recovered Cases Over Time for France')
ax.set_xlabel('Date')
ax.set_ylabel('Count')
ax.legend()
fig.tight_layout()
plt.show()

## B. Comparative Visualizations

4. Country Comparison `(5 pts)`

Using data from a single date, create a bar chart showing the top 10 countries by confirmed cases.

In [None]:
# value_counts() only counts how many rows each country has, which says
# nothing about case numbers. The exercise asks for confirmed cases on a single
# date, so take the most recent date and rank countries by 'Confirmed' there.
snapshot_date = df['Date'].max()

grouped_countries = (
    df[df['Date'] == snapshot_date]
    .groupby('Country/Region', as_index=False)['Confirmed']
    .sum()
    .nlargest(10, 'Confirmed')
    .reset_index(drop=True)
)
grouped_countries.columns = ['Country', 'Confirmed Cases']

fig, ax = plt.subplots(figsize=(12, 6))
# A single colour is used because colour encodes no extra variable here
# (passing `palette` without `hue` is deprecated in recent seaborn releases).
sns.barplot(data=grouped_countries, x='Country', y='Confirmed Cases', color='steelblue', ax=ax)
ax.set_title(f'Top 10 Countries by Confirmed Cases on {snapshot_date}')
ax.set_xlabel('Country')
ax.set_ylabel('Confirmed Cases (cumulative)')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

5. WHO Region Comparison `(5 pts)`

Aggregate confirmed cases by WHO Region and visualize the result using a bar chart.

In [None]:
# 'Confirmed' is a running total, so adding it up over every date would count
# the same cases once per day. Aggregate a single snapshot (the latest date)
# across the countries of each WHO Region instead.
snapshot_date = df['Date'].max()

grouped_region = (
    df[df['Date'] == snapshot_date]
    .groupby('WHO Region', as_index=False)['Confirmed']
    .sum()
    .sort_values('Confirmed', ascending=False)
    .reset_index(drop=True)
)

fig, ax = plt.subplots(figsize=(12, 6))
# A single colour is used because colour encodes no extra variable here
# (passing `palette` without `hue` is deprecated in recent seaborn releases).
sns.barplot(data=grouped_region, x='WHO Region', y='Confirmed', color='steelblue', ax=ax)
ax.set_title(f'Confirmed Cases by WHO Region on {snapshot_date}')
ax.set_xlabel('WHO Region')
ax.set_ylabel('Confirmed Cases (cumulative)')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

## C. Geographic Visualization

6. Geographic Spread `(10 pts)`

Using Latitude and Longitude, create a map-based visualization showing confirmed cases for a selected date.

In [None]:
import plotly.express as px
import pandas as pd

# The frame loaded earlier has no coordinate columns (see the df.info() output),
# so grouping it by 'Lat'/'Long' raises KeyError. When the coordinates are
# missing, fall back to the per-location file from the same download.
# NOTE(review): covid_19_clean_complete.csv is assumed to ship with this dataset
# and to carry Lat, Long, Date, Country/Region and Confirmed -- confirm.
if {'Lat', 'Long'} <= set(df.columns):
    geo_source = df
else:
    geo_source = pd.read_csv(os.path.join(path, 'covid_19_clean_complete.csv'))

select_date = '2020-07-07'
# Compare as timestamps so the filter works whatever the textual date format is.
date_mask = pd.to_datetime(geo_source['Date']) == pd.Timestamp(select_date)
df_date = geo_source[date_mask]

# One marker per reported location on the selected date
geo_vis = df_date.groupby(['Lat', 'Long', 'Country/Region'])['Confirmed'].max().reset_index()

fig = px.scatter_geo(geo_vis,
                     lat='Lat',
                     lon='Long',
                     color='Confirmed',
                     hover_name='Country/Region',
                     size='Confirmed',
                     projection='natural earth',
                     title=f'Geographic Spread of Confirmed COVID-19 Cases on {select_date}',
                     color_continuous_scale=px.colors.sequential.Viridis)

fig.show()

7. Regional Clustering `(15 pts)`

Create a visualization that shows how confirmed cases are distributed geographically within a single WHO Region.

In [None]:
# The frame loaded earlier has no coordinate columns (see the df.info() output),
# so grouping it by 'Lat'/'Long' raises KeyError. When the coordinates are
# missing, fall back to the per-location file from the same download.
# NOTE(review): covid_19_clean_complete.csv is assumed to ship with this dataset
# and to carry Lat, Long, WHO Region, Country/Region and Confirmed -- confirm.
if {'Lat', 'Long'} <= set(df.columns):
    geo_source = df
else:
    geo_source = pd.read_csv(os.path.join(path, 'covid_19_clean_complete.csv'))

select_region = 'Americas'
df_region = geo_source[geo_source['WHO Region'] == select_region]

# 'Confirmed' is a running total, so the maximum per location is in effect the
# most recent cumulative count reported for that location.
geo_vis_2 = df_region.groupby(['Lat', 'Long', 'Country/Region'])['Confirmed'].max().reset_index()

fig = px.scatter_geo(geo_vis_2,
                     lat='Lat',
                     lon='Long',
                     color='Confirmed',
                     hover_name='Country/Region',
                     size='Confirmed',
                     projection='natural earth',
                     title=f'Regional Clustering of Confirmed COVID-19 Cases in {select_region}',
                     color_continuous_scale=px.colors.sequential.Viridis)

# Zoom the map to the plotted markers so the selected region fills the view
fig.update_geos(fitbounds='locations')

fig.show()