# Analysis of Percentage Tests PCR vs Antigen

This notebook contains a quick analysis of the percentage of COVID-19 tests that are PCR (lab-based) versus antigen, using HHS Protect data.

1. Data: HHS Protect Unified State Testing Metrics (Timeseries); see the HHS Protect [README](https://protect.hhs.gov/workspace/report/ri.report.main.report.90f21a15-7376-4fd8-b2c9-0a88619ed9c5)
    1. Note, this data is also public via [healthdata.gov](https://healthdata.gov/dataset/COVID-19-Diagnostic-Laboratory-Testing-PCR-Testing/j8mb-icvb)
    1. From the README: The metric fields are calculated separately for NAAT (PCR) and Antigen testing. Note that fields without a specification are NAAT definitions. Antigen fields have "antigen_" explicitly labelled as a prefix for the field. The following convention is utilized for naming of metric fields: 

In [None]:
import datetime as dt

import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import seaborn as sns

# Let pandas render up to 1000 rows before truncating displayed frames/series.
pd.options.display.max_rows = 1000

In [None]:
# Machine-specific location of a CSV downloaded from HHS Protect (hacky: edit this
# path when running elsewhere). To refresh the data, re-download from the links above.
HHS_TESTING_CSV = r'C:\Users\ryy0\inventory\unified_reporting_results_states_historical.csv'

# Read the file and convert the 'date' column to datetimes in one chained step.
df = pd.read_csv(HHS_TESTING_CSV).assign(date=lambda frame: pd.to_datetime(frame['date']))

In [None]:
# Frequency of each distinct value in the 'daily_sources' column.
df['daily_sources'].value_counts()

In [None]:
# Sanity check: each (state, date) pair should appear exactly once.
# Count rows per pair (keeping missing keys as their own group) and list the largest
# counts first -- any count above 1 at the top signals duplicated rows.
rows_per_state_date = df.groupby(['state', 'date'], dropna=False).size()
rows_per_state_date.sort_values(ascending=False)

In [None]:
# Daily totals: sum the NAAT (PCR) and antigen result counts over every row that
# shares a date, producing one row per date (the index is 'date').
# Uses the built-in groupby sum rather than passing np.sum to .agg(), which pandas
# has deprecated (it emits a FutureWarning and asks for 'sum' instead).
test_count_columns = ['new_test_results_reported', 'antigen_new_test_results_reported']
df_grouped = df.groupby('date')[test_count_columns].sum()

In [None]:
# For each daily count column, add a companion column holding its sum over a
# time-based 7-day rolling window ('7D') on the date index.
rolling_targets = {
    'new_test_results_reported': 'new_test_results_reported_7_day_rolling',
    'antigen_new_test_results_reported': 'antigen_new_test_results_reported_7_day_rolling',
}
for source_col, rolling_col in rolling_targets.items():
    df_grouped[rolling_col] = df_grouped[source_col].rolling('7D').sum()

In [None]:
# Share of results that are PCR = PCR / (PCR + antigen), stored as a fraction
# (0-1), computed once from the daily counts and once from the 7-day rolling sums.
pcr_daily = df_grouped['new_test_results_reported']
antigen_daily = df_grouped['antigen_new_test_results_reported']
df_grouped['percentage_pcr_daily'] = pcr_daily / (pcr_daily + antigen_daily)

pcr_rolling = df_grouped['new_test_results_reported_7_day_rolling']
antigen_rolling = df_grouped['antigen_new_test_results_reported_7_day_rolling']
df_grouped['percentage_pcr_7_day_rolling'] = pcr_rolling / (pcr_rolling + antigen_rolling)

In [None]:
# Inspect the per-date frame: daily counts, 7-day rolling sums, and PCR shares.
df_grouped

In [None]:
# Plot the daily and 7-day rolling share of COVID-19 tests that are PCR for
# calendar year 2021, annotate both lines, and save the figure as a PNG.
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
sns.lineplot(x=df_grouped.index, y=df_grouped['percentage_pcr_daily'], ax=ax, alpha=0.4)
sns.lineplot(x=df_grouped.index, y=df_grouped['percentage_pcr_7_day_rolling'], ax=ax)

ax.set_title('Percentage of COVID-19 Tests that are PCR', fontsize=26)
ax.set_ylabel('Percentage of COVID-19 Tests that are PCR', fontsize=18)
ax.set_xlabel('Date', fontsize=18)
ax.set_xlim(dt.date(2021, 1, 1), dt.date(2021, 12, 31))
ax.set_ylim(0.6, 1)
# The plotted values are fractions (0-1); xmax=1 makes the formatter show them as 0%-100%.
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1, decimals=0))
ax.tick_params(axis='y', labelsize=14)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

# Points the annotation arrows attach to. Each date must be present in
# df_grouped's index, otherwise the .loc lookup raises KeyError.
rolling_anchor_date = dt.datetime(2021, 12, 7)
daily_anchor_date = dt.datetime(2021, 8, 2)
# Single .loc[row, column] lookups instead of chained .loc[row][column].
rolling_anchor_value = df_grouped.loc[rolling_anchor_date, 'percentage_pcr_7_day_rolling']
daily_anchor_value = df_grouped.loc[daily_anchor_date, 'percentage_pcr_daily']

ax.annotate(
    '7-Day Rolling Average of \nPercentage COVID-19 Tests \nthat are PCR',
    xy=(rolling_anchor_date, rolling_anchor_value),
    xytext=(dt.date(2021, 9, 7), rolling_anchor_value + 0.1),
    fontsize=16,
    color='darkorange',
    arrowprops=dict(arrowstyle="->", color='darkorange')
)

# The label offset is taken from the same point the arrow targets (2021-08-02).
# Previously the text height was read from the 2021-08-01 value while the arrow
# pointed at 2021-08-02, which was inconsistent with the annotation above.
ax.annotate(
    'Daily Percentage \nof COVID-19 Tests \nthat are PCR',
    xy=(daily_anchor_date, daily_anchor_value),
    xytext=(dt.date(2021, 5, 1), daily_anchor_value - 0.17),
    fontsize=16,
    color='Blue',
    arrowprops=dict(arrowstyle="->", color='Blue')
)

# Save through the figure handle rather than pyplot's implicit "current figure".
# The output path is machine-specific; edit it when running elsewhere.
fig.savefig(r'C:\Users\ryy0\inventory\percentage_pcr.png', facecolor='white')

In [None]:
# Display the per-date frame again for reference next to the figure.
df_grouped