# Explore state reporting statistics

[CovidTracking.com](https://covidtracking.com/api) collects daily COVID-19 testing data from US states and territories. The quality of data and what data can be collected varies from state to state. This leads to questions: What data has been collected? How does data vary from state to state? How has this changed over time?

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.colors as colors
from datetime import date

In [None]:
# Lift pandas' display limits: show every column and row, and never truncate cell text.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

# Import Data

In [None]:
# Download the per-state daily history; `date` and `dateChecked` are parsed as datetimes.
daily = pd.read_csv(
    "https://covidtracking.com/api/states/daily.csv",
    parse_dates=['date', 'dateChecked'],
)

In [None]:
# Print the column names, non-null counts and dtypes of the downloaded table.
daily.info()

### Drop some deprecated and calculated columns

Some statistics are deprecated and some are calculated from previous and current values. Since our interest is in data that is collected from each state, we will drop many of these columns. For a full explanation of these statistics, see [https://covidtracking.com/api#states-historical-data](https://covidtracking.com/api#states-historical-data)

In [None]:
# Columns treated as deprecated in this notebook.
deprecated = [
    'total',
    'posNeg',
    'hash',
]
# Columns treated as calculated from other values rather than reported directly.
calculated = [
    'totalTestResults',
    'deathIncrease',
    'hospitalizedIncrease',
    'negativeIncrease',
    'positiveIncrease',
    'totalTestResultsIncrease',
]

In [None]:
# Drop, in place, the deprecated and calculated columns plus two bookkeeping columns.
# 'hash' is already a member of `deprecated`, so it is not listed a second time.
daily.drop(columns=[*deprecated, *calculated, 'fips', 'dateChecked'], inplace=True)

In [None]:
# Preview the first rows of the trimmed table.
daily.head()

# Create a new DataFrame where rows reference a date and columns represent a count of the number of states that have reported for that given statistic

In [None]:
# Every column from the third onward is a statistic to aggregate.
# NOTE(review): assumes the first two columns are the date and state keys; confirm against the CSV.
aggregated_columns = daily.columns[2:].tolist()

In [None]:
# For each date, count the non-null entries of every statistic, i.e. how many rows
# reported it. Selecting the columns on the GroupBy and calling count() gives the same
# table as apply() with a per-group lambda, but is vectorised and keeps the grouping
# column out of the computation.
data_completeness = daily.groupby('date')[aggregated_columns].count()

In [None]:
# Add each index date's ordinal day number as a plain numeric column.
data_completeness['ordinal'] = [day.toordinal() for day in data_completeness.index]
data_completeness.tail()

# Explore the frequency of reporting over time

Since [some statistics are sparsely reported](https://covidtracking.com/data), it is helpful to understand the reporting frequency of some statistics over time.

In [None]:
# Draw five stacked panels, one per group of statistics. Each panel shows, per date,
# the count stored in `data_completeness` for every statistic in the group.
plt.close('all')
plt.style.use('default')
plt.style.use('ggplot')

fig, axes = plt.subplots(
    nrows=5, ncols=1, figsize=(10, 6), dpi=150,
    sharex=True, sharey=True, tight_layout=True,
)

_dc = data_completeness.copy()

x = _dc['ordinal']
y = _dc['positive']

# Each panel plots one positional slice of `aggregated_columns`.
# NOTE(review): the slice bounds depend on the column order of the downloaded data.
group_slices = [(0, 3), (9, 12), (3, 5), (5, 7), (7, 9)]
label_groups = [aggregated_columns[start:stop] for start, stop in group_slices]

x_ticks = np.linspace(min(x), max(x), num=8)
y_ticks = np.linspace(0, max(y), num=5)

for ax, group in zip(fig.axes, label_groups):
    labels = list(group)
    width = 1 / len(labels)
    # Bars belonging to the same date sit side by side inside a unit-wide slot.
    bars = [
        ax.bar(x + slot * width, _dc[column], width, label=column)
        for slot, column in enumerate(labels)
    ]
    ax.set_xticks(x_ticks)
    ax.set_yticks(y_ticks)
    # Tick positions are date ordinals; convert them back into readable dates.
    new_labels = [
        date.fromordinal(int(tick)).strftime("%b %d, %Y")
        for tick in ax.get_xticks()
    ]
    ax.set_xticklabels(new_labels)
    ax.xaxis.set_tick_params(rotation=25, labelsize=7)
    ax.yaxis.set_tick_params(labelsize=6)
    ax.legend(handles=bars, loc='upper left', framealpha=0.9, fontsize=6)

# Shared y-axis caption, placed just left of the panels.
fig.text(-0.05, 0.5, 'Number of States and Territories Reporting', va='center', rotation='vertical')

plt.show()

# Examine relationships between reporting counts and grade

[State reporting grades are calculated](https://covidtracking.com/about-data#state-data-quality-grades) using a variety of factors. Let's see if there are any relationships between grade and certain statistics.

In [None]:
from pandas.api.types import CategoricalDtype

# Import the current state data

This data contains the grade and other statistics. For a full explanation see [https://covidtracking.com/api#states-current-values](https://covidtracking.com/api#states-current-values)

In [None]:
# Download the current per-state values, which include the data quality grade.
current = pd.read_csv(
    'https://covidtracking.com/api/v1/states/current.csv',
)

### Convert grade to a numeric

In [None]:
# List the distinct grade values found in the data, in sorted order.
current['dataQualityGrade'].value_counts().sort_index().index.tolist()

In [None]:
# List the grades best-first, then reverse, because we want F to rank lowest and A+ highest.
categories = list(reversed(['A+', 'A', 'B', 'C', 'D', 'F']))
categories

In [None]:
# Build an ordered categorical dtype from `categories`, then cast the
# `dataQualityGrade` column to it so grades compare and sort by rank.
cat_type = CategoricalDtype(categories, ordered=True)
grade_column = current['dataQualityGrade']
current['dataQualityGrade'] = grade_column.astype(cat_type)

In [None]:
# Create a new dataframe with just state and grade, sorted by grade
quality = current.loc[:, ['state', 'dataQualityGrade']].sort_values(by='dataQualityGrade')
# Use the categorical's own integer codes, so that code i always refers to the i-th
# category of the dtype. pd.factorize numbers only the grades that actually occur,
# in order of appearance, so a grade with no states shifted every code above it.
# Rows without a grade get code -1. Requires `dataQualityGrade` to be categorical.
grade_codes = quality['dataQualityGrade'].cat.codes.astype('int64')
# Codes -> grade mapping: the position in this index is the code
grade_uniques = quality['dataQualityGrade'].cat.categories
# Create new column containing grades as integers
quality['dataQualityCode'] = grade_codes

In [None]:
# Show the last rows of the sorted grade table.
quality.tail()

# Create a new DataFrame that aggregates and counts all reported statistics by state

In [None]:
# For each state, count the non-null entries of every statistic across all dates.
# Selecting the columns on the GroupBy and calling count() gives the same table as
# apply() with a per-group lambda, but is vectorised and keeps the grouping column
# out of the computation.
states = daily.groupby('state')[aggregated_columns].count()

### Create a sum column that sums all of the report counts

In [None]:
# Total number of reported values per state. Summing only the statistic columns keeps
# the cell safe to re-run: a plain states.sum(axis=1) would fold an existing 'sum'
# column into the new total.
states['sum'] = states[aggregated_columns].sum(axis=1)

### Join DataFrames containing state reporting counts and quality grades

In [None]:
# Attach each state's grade and grade code to its report counts, then order by grade code.
all_data = (
    states
    .join(quality.set_index('state'), on='state')
    .sort_values(by='dataQualityCode')
)

# Look for correlations between grade and other reported statistics

In [None]:
# Two-colour linear gradient used to shade table cells by value.
cm = colors.LinearSegmentedColormap.from_list(
    'my_map',
    ['#951556', '#35013f'],
)

In [None]:
def getBackground(t):
    """Return a CSS declaration that fills a cell with colour `t` and uses light text.

    `t` is converted to a hex string with `matplotlib.colors.to_hex`.
    """
    text_colour = '#f6f5f5'
    fill_colour = colors.to_hex(t)
    return 'background-color: %s; color: %s;' % (fill_colour, text_colour)

In [None]:
def highlight_cells(x):
    """Return one CSS style string per entry of `x`.

    Each value is mapped through the `cm` colormap and the resulting colour is
    turned into a cell style by `getBackground`.
    """
    styles = []
    for _, value in x.items():
        styles.append(getBackground(cm(value)))
    return styles

In [None]:
# Correlate the numeric columns only. `all_data` also carries the categorical
# `dataQualityGrade` column, which cannot be converted to float; pandas 2.0+ raises on
# it instead of silently skipping it as older versions did.
correlations = all_data.select_dtypes(include='number').corr()
# Shade every cell of the correlation table according to its value.
correlations.style.apply(highlight_cells)

# Data quality grade correlations sorted:

In [None]:
# Correlation of every column with the numeric grade code, largest value first.
correlations['dataQualityCode'].sort_values(ascending=False)

# Plot states by grade

This is perhaps not very useful, but here is a plot of states sorted by their grade.

In [None]:
# Horizontal bar chart with one bar per row of `all_data`; bar length is the grade code.
fig, ax = plt.subplots(figsize=(6, 6), dpi=150)
y_pos = np.arange(0, len(all_data), 1)
x_grade = all_data['dataQualityCode'].values
# x_sum = np.interp(all_data['sum'].values, np.linspace(0, all_data['sum'].max(), num=6), np.arange(0,len(categories),1) )
# x_icu = np.interp(all_data['inIcuCurrently'].values, np.linspace(0, all_data['inIcuCurrently'].max(), num=6), np.arange(0,6,1) )

width = 0.5

ax.barh(y_pos, x_grade, width, align='center')
# ax.barh(y_pos - width/2, x_sum, width, align='center')
# ax.barh(y_pos + width/2, x_icu, width, align='center')
ax.set_yticks(y_pos)
# Pin the x ticks to the integer grade codes before labelling them. Calling
# set_xticklabels on its own attaches the labels to whatever ticks the automatic
# locator picked, which puts the grade letters at the wrong positions.
# NOTE(review): assumes code i corresponds to categories[i]; confirm against how
# `dataQualityCode` is built.
ax.set_xticks(np.arange(len(categories)))
ax.set_xticklabels(categories)
ax.set_yticklabels(list(all_data.index))
ax.yaxis.set_tick_params(labelsize=5)
plt.show()