In [3]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
from datetime import datetime

In [5]:
dtypes = {
    'aggregate': 'object',
    'cases': 'float64',
    'city': 'object',
    'population': 'float64',
    'deaths': 'float64',
    'country': 'object',
    'state': 'object',
    'date': 'object'
}

In [7]:
df = dd.read_csv("timeseries.csv",dtype=dtypes)

In [9]:
# Filter for US states only and state-level data
us_df = df[(df['country'] == 'United States') & (df['aggregate'] == 'state')]

In [11]:
# Define date range
start_date = '2020-01-01'
end_date = '2021-02-28'
mask = (us_df['date'] >= start_date) & (us_df['date'] <= end_date)
period_df = us_df[mask]

In [13]:
state_metrics = period_df.groupby('state').agg({
    'deaths': 'sum',
    'population': 'mean'
}).compute()

state_metrics['per_capita_mortality'] = state_metrics['deaths'] / state_metrics['population']
state_metrics['mortality_rank'] = state_metrics['per_capita_mortality'].rank(ascending=False)

print(state_metrics.sort_values('mortality_rank'))

Empty DataFrame
Columns: [deaths, population, per_capita_mortality, mortality_rank]
Index: []


In [15]:
period_df = period_df.compute()
period_df['date'] = pd.to_datetime(period_df['date'])
period_df['month_year'] = period_df['date'].dt.strftime('%Y-%m')
monthly_metrics = period_df.groupby(['state', 'month_year']).agg({
    'deaths': 'last',
    'cases': 'last'
}).reset_index()

# Calculate CFR (Case Fatality Rate)
monthly_metrics['cfr'] = monthly_metrics['deaths'] / monthly_metrics['cases']
cfr_matrix = monthly_metrics.pivot(index='state', columns='month_year', values='cfr')
print(cfr_matrix)

  df = reader(bio, **kwargs)


ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+-----------+---------+----------+
| Column    | Found   | Expected |
+-----------+---------+----------+
| county    | object  | float64  |
| recovered | float64 | int64    |
+-----------+---------+----------+

The following columns also raised exceptions on conversion:

- county
  ValueError("could not convert string to float: 'Grant County'")

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'county': 'object',
       'recovered': 'float64'}

to the call to `read_csv`/`read_table`.

In [None]:
cfr_changes = cfr_matrix.diff(axis=1)
total_change = cfr_changes.abs().sum(axis=1)
cfr_rankings = total_change.rank(ascending=True)
print(cfr_rankings.sort_values())