# Baseline (observed) dataset: COVID-19 cases vs time (England, UKHSA)

This notebook downloads a **pre-existing real dataset** from the UKHSA dashboard API and produces a simple **time vs infected** plot.

We use **confirmed COVID-19 cases by day** as the closest available proxy to ‘infected’.

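Outputs:
- `data/processed/observed/ukhsa_covid19_cases_by_day_england.csv`
- Two figures written to the directory returned by `figures_dir(final=True)`: `ukhsa_observed_baseline_covid_cases_by_day_england.png` and `ukhsa_observed_baseline_covid_cases_by_day_england_7dmean.png`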


In [None]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt

from epidemiology_project.paths import processed_data_dir, figures_dir
from epidemiology_project.ukhsa import UksHaMetricQuery, fetch_metric


## Download from UKHSA API


In [None]:
# Filters identifying the metric to request: the by-day COVID-19 cases
# series for England at nation level.
metric_filters = {
    'theme': 'infectious_disease',
    'sub_theme': 'respiratory',
    'topic': 'COVID-19',
    'geography_type': 'Nation',
    'geography': 'England',
    'metric': 'COVID-19_cases_casesByDay',
}
q = UksHaMetricQuery(**metric_filters)

# Fetch the raw response, then preview only the two columns used downstream.
df_raw = fetch_metric(q)
df_raw.loc[:, ['date', 'metric_value']].head()


## Keep only the time series we need


In [None]:
# Reduce the raw response to a tidy (date, cases) time series.
#
# `date` is coerced to a real datetime dtype so that sorting is chronological
# and matplotlib draws a proper time axis instead of treating every date
# string as its own category. The conversion is a no-op if `fetch_metric`
# already returns datetimes.
# NOTE(review): assumes one row per date for this metric — confirm against
# the API response if stratified rows (age/sex) can appear.
df = (
    df_raw[['date', 'metric_value']]
    .dropna()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .rename(columns={'metric_value': 'cases'})
)
df.head()


## Save to data/processed (observed baseline)


In [None]:
# Destination folder for the observed baseline; created on first run.
out_dir = processed_data_dir().joinpath('observed')
out_dir.mkdir(parents=True, exist_ok=True)

# Write the cleaned series without the pandas index, then show where it went.
out_path = out_dir.joinpath('ukhsa_covid19_cases_by_day_england.csv')
df.to_csv(out_path, index=False)
out_path


## Plot: time vs infected (proxy)


In [None]:
# Daily confirmed cases over time (used here as a proxy for "infected").
fig, ax = plt.subplots(figsize=(11, 5))
ax.plot(df['date'], df['cases'], linewidth=1)
ax.set_title('England: confirmed COVID-19 cases by day (UKHSA)')
ax.set_xlabel('Date')
ax.set_ylabel('Cases')
ax.grid(True, alpha=0.3)
fig.tight_layout()

fig_path = figures_dir(final=True) / 'ukhsa_observed_baseline_covid_cases_by_day_england.png'
# Make sure the target folder exists before saving, mirroring the CSV export
# cell; this is a no-op when the directory is already there.
fig_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(fig_path, dpi=200)
fig_path


## Optional: smoother curve (7-day mean)


In [None]:
# Smooth the daily series with a trailing 7-row rolling mean.
# `set_index` already returns a new frame, so `df` is left untouched.
# The first 6 values are NaN because a full window is required by default.
# NOTE(review): `rolling(7)` counts rows, not calendar days, so this is a true
# 7-day mean only if the series has exactly one row per day with no gaps —
# confirm against the downloaded data.
df2 = df.set_index('date').assign(
    cases_7d_mean=lambda frame: frame['cases'].rolling(7).mean()
)

fig, ax = plt.subplots(figsize=(11, 5))
ax.plot(df2.index, df2['cases_7d_mean'], linewidth=1.5)
ax.set_title('England: confirmed COVID-19 cases (7-day mean)')
ax.set_xlabel('Date')
ax.set_ylabel('Cases (7-day mean)')
ax.grid(True, alpha=0.3)
fig.tight_layout()

fig_path = figures_dir(final=True) / 'ukhsa_observed_baseline_covid_cases_by_day_england_7dmean.png'
# Make sure the target folder exists before saving, mirroring the CSV export
# cell; this is a no-op when the directory is already there.
fig_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(fig_path, dpi=200)
fig_path
