# COVID-19 Global Data Tracker

A clean, end-to-end notebook to explore and visualize global COVID-19 data.

**Contents**
1. Setup & Imports
2. Load Data
3. Clean & Select
4. Quick EDA
5. Global Trends
6. Top Countries
7. Per-Capita Comparisons
8. Vaccination Overview
9. Save Outputs (optional)

In [None]:
# 1) Setup & Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pathlib import Path

# Matplotlib defaults shared by every static figure in this notebook.
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'axes.grid': True,
})
sns.set_context('talk')

# Directory that receives the saved figures and CSV exports.
OUT_DIR = Path('outputs')
OUT_DIR.mkdir(exist_ok=True)
print('Environment ready.')

In [None]:
# 2) Load Data (from Our World in Data) with basic error handling
URL = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
try:
    # Only the download/parse step is guarded; nothing else belongs in here.
    df = pd.read_csv(URL)
except Exception as e:
    # Broad catch is deliberate: network, HTTP and parser failures all mean
    # "no data", and every later cell depends on `df`. Chain the cause so the
    # underlying error stays attached to the exit.
    raise SystemExit(f'Failed to load data from {URL}: {e}') from e
else:
    print('Loaded rows:', len(df))

In [None]:
# 3) Clean & Select
# Columns the rest of the notebook works with. Any that the loaded file does
# not provide are skipped rather than raising.
cols = [
    'location', 'continent', 'date', 'population',
    'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
    'total_cases_per_million', 'total_deaths_per_million',
    'new_cases_smoothed', 'new_deaths_smoothed',
    'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
]
available = [name for name in cols if name in df.columns]

# Parse dates (unparseable values become NaT), drop rows without a usable
# date, then order rows by location and date.
data = (
    df.loc[:, available]
    .assign(date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'))
    .dropna(subset=['date'])
    .sort_values(by=['location', 'date'])
)
data.head()

In [None]:
# Utility functions (reusable)
def latest_snapshot(frame: pd.DataFrame) -> pd.DataFrame:
    """Pick, for every location, the row with its most recent date.

    Args:
        frame: Table with at least ``'date'`` and ``'location'`` columns.

    Returns:
        One row per location (all original columns kept), in ascending
        date order.
    """
    chronological = frame.sort_values(by='date')
    per_location = chronological.groupby('location', as_index=False)
    # After the date sort, the last row of each group is its newest one.
    return per_location.tail(1)

def top_n(frame: pd.DataFrame, col: str, n: int = 10) -> pd.DataFrame:
    """Return the top ``n`` locations ranked by their latest reported ``col``.

    Rows where ``col`` is missing are discarded *before* the per-location
    snapshot is taken. A location whose newest row has no value for ``col``
    is therefore ranked by its most recent reported value instead of
    silently dropping out of the ranking.

    Args:
        frame: Table with ``'location'``, ``'date'`` and ``col`` columns.
        col: Name of the numeric column to rank by.
        n: Maximum number of rows to return.

    Returns:
        Up to ``n`` rows, one per location, sorted by ``col`` descending.

    Raises:
        KeyError: If ``col`` is not a column of ``frame``.
    """
    reported = frame.dropna(subset=[col])
    snap = latest_snapshot(reported)
    return snap.sort_values(col, ascending=False).head(n)

In [None]:
# 4) Quick EDA
# Number of distinct locations (None when the column is absent) and the
# first/last date covered by the cleaned table.
if 'location' in data.columns:
    countries = data['location'].nunique()
else:
    countries = None
date_min, date_max = data['date'].min(), data['date'].max()
print(f'Countries/regions: {countries}\nDate range: {date_min.date()} → {date_max.date()}')

# Summary statistics, one row per column, showing the first 12 columns.
data.describe(include='all').T.head(12)

In [None]:
# 5) Global Trends: total cases and deaths over time
# The OWID file lists aggregate rows ('World', continents, income groups)
# next to individual countries, so summing every location counts the same
# cases several times. Use the published 'World' series when present;
# otherwise sum country-level rows only.
# NOTE(review): the fallback relies on the OWID convention that aggregate
# rows have an empty `continent` - confirm against the dataset codebook.
trend_cols = ['total_cases', 'total_deaths']
world_rows = data[data['location'] == 'World']
if not world_rows.empty:
    global_daily = (
        world_rows[['date'] + trend_cols]
        .sort_values('date')
        .reset_index(drop=True)
    )
else:
    country_rows = data[data['continent'].notna()] if 'continent' in data.columns else data
    global_daily = country_rows.groupby('date', as_index=False)[trend_cols].sum()

fig, ax = plt.subplots()
ax.plot(global_daily['date'], global_daily['total_cases'], label='Total cases')
ax.plot(global_daily['date'], global_daily['total_deaths'], label='Total deaths')
ax.set_title('Global COVID-19 Totals Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Cumulative count')
ax.legend()
fig.tight_layout()
# Save before show(): some backends clear the figure once it is displayed.
fig.savefig(OUT_DIR / 'global_trends_matplotlib.png', dpi=150)
plt.show()

In [None]:
# Interactive view (Plotly)
# Same two cumulative series as the static chart, drawn as one line each.
fig = px.line(
    data_frame=global_daily,
    x='date',
    y=['total_cases', 'total_deaths'],
    title='Global COVID-19 Totals Over Time (Interactive)',
)
fig.show()

In [None]:
# 6) Top countries by cumulative cases (latest snapshot)
# The chart is titled "Countries", but the table also holds aggregate rows
# ('World', continents, income groups) that would take the top slots.
# Rank only rows that belong to a continent.
# NOTE(review): relies on the OWID convention that aggregate rows have an
# empty `continent` - confirm against the dataset codebook.
if 'continent' in data.columns:
    country_rows = data[data['continent'].notna()]
else:
    country_rows = data
top10_cases = top_n(country_rows, 'total_cases', n=10)

fig = px.bar(top10_cases, x='location', y='total_cases',
             title='Top 10 Countries by Total Cases (Latest)', text='total_cases')
fig.update_traces(texttemplate='%{text:.0f}', textposition='outside')
fig.update_layout(xaxis_title='Country', yaxis_title='Total cases')
fig.show()

# Columns are optional in the cleaning step, so show only those present.
summary_cols = [c for c in ['location', 'total_cases', 'total_deaths', 'population']
                if c in top10_cases.columns]
top10_cases[summary_cols].reset_index(drop=True)

In [None]:
# 7) Per-capita comparisons (cases/deaths per million)
percap_cols = [c for c in ['total_cases_per_million', 'total_deaths_per_million'] if c in data.columns]
# The chart ranks by cases per million, so that specific column must exist.
# Checking only "any per-capita column" would raise KeyError when the file
# provides the deaths column alone.
rank_col = 'total_cases_per_million'
if rank_col in percap_cols:
    snap = latest_snapshot(data)
    # Drop rows missing the plotted metric only; a missing deaths figure
    # must not remove a location from the cases ranking.
    snap = snap.dropna(subset=[rank_col])
    top_cases_pm = snap.sort_values(rank_col, ascending=False).head(15)
    fig = px.bar(top_cases_pm, x='location', y=rank_col,
                 title='Top 15 by Total Cases per Million (Latest)')
    fig.show()
else:
    print('Per-capita columns not found in dataset.')

In [None]:
# 8) Vaccination overview (if present)
# The vaccination columns are cumulative and not reported on every date, and
# the table mixes countries with aggregate rows ('World', continents, ...).
# A plain per-date sum over all locations is therefore both inflated and
# jagged. Use the published 'World' series when present; otherwise carry each
# location's last reported value forward and sum country-level rows only.
# NOTE(review): the fallback relies on the OWID convention that aggregate
# rows have an empty `continent` - confirm against the dataset codebook.
vacc_cols = [c for c in ['people_vaccinated', 'people_fully_vaccinated', 'total_boosters'] if c in data.columns]
if vacc_cols:
    vacc = data.dropna(subset=['date'])
    world_only = vacc[vacc['location'] == 'World']
    if not world_only.empty:
        world_vacc = (
            world_only[['date'] + vacc_cols]
            .dropna(subset=vacc_cols, how='all')  # trim dates with no data at all
            .sort_values('date')
            .reset_index(drop=True)
        )
    else:
        if 'continent' in vacc.columns:
            vacc = vacc[vacc['continent'].notna()]
        vacc = vacc[['location', 'date'] + vacc_cols].sort_values(['location', 'date'])
        # Forward-fill within each location so gaps in reporting do not
        # register as drops in the global total.
        vacc[vacc_cols] = vacc.groupby('location')[vacc_cols].ffill()
        world_vacc = vacc.groupby('date', as_index=False)[vacc_cols].sum()
    fig = px.line(world_vacc, x='date', y=vacc_cols, title='Global Vaccination Progress')
    fig.show()
else:
    print('Vaccination columns not found in dataset.')

In [None]:
# 9) Save the latest per-location snapshot as a timestamped CSV
snap = latest_snapshot(data)
# Timestamp.utcnow() is deprecated in recent pandas releases; now(tz='UTC')
# gives the same timezone-aware UTC timestamp.
# The stamp has minute resolution, so re-running within the same minute
# overwrites the previous file.
stamp = pd.Timestamp.now(tz='UTC').strftime('%Y%m%dT%H%MZ')
csv_path = OUT_DIR / f'latest_snapshot_{stamp}.csv'
snap.to_csv(csv_path, index=False)
print('Saved:', csv_path)