In [31]:
import dask.dataframe as dd
import pandas as pd
import numpy as np

# Explicit column dtypes handed to the CSV reader: text-like columns are read
# as 'object' and the numeric measures as 'float64'.
dtypes = {
    # 'date' is read as text here and converted to datetime after loading.
    **dict.fromkeys(['country', 'state', 'city', 'aggregate', 'date'], 'object'),
    **dict.fromkeys(['deaths', 'population', 'cases'], 'float64'),
}

# Read the CSV into a Dask DataFrame, applying the explicit `dtypes` mapping.
df = dd.read_csv('timeseries.csv', dtype=dtypes)

# Replace the textual 'date' column with parsed datetimes, then derive a
# 'YYYY-MM' label column from the parsed values.
df = df.assign(date=dd.to_datetime(df['date']))
df = df.assign(year_month=df['date'].dt.strftime('%Y-%m'))



In [32]:
# Keep only United States rows that carry a state value.
is_us_state_row = (df['country'] == 'United States') & (df['state'].notnull())
df = df[is_us_state_row]

# Keep only rows dated 2020-01-01 through 2021-02-28 (both bounds inclusive).
in_study_period = (df['date'] >= '2020-01-01') & (df['date'] <= '2021-02-28')
df = df[in_study_period]

# Per-state aggregates over the selected rows: summed deaths and mean population.
# NOTE(review): this adds up the 'deaths' value of every matching row. If the
# file stores running totals, or lists a state at several aggregation levels,
# the sum will overstate deaths -- TODO confirm against the data schema.
state_aggregations = {'deaths': 'sum', 'population': 'mean'}
state_stats = df.groupby('state').agg(state_aggregations).compute()

  df = reader(bio, **kwargs)


In [33]:
# Per-state ratio of aggregated deaths to aggregated population.
state_stats['per_capita_mortality'] = state_stats['deaths'].div(state_stats['population'])

# Order states from highest to lowest per-capita mortality and print the table.
ranked_states = state_stats.sort_values('per_capita_mortality', ascending=False)
print(ranked_states)

                                 deaths    population  per_capita_mortality
state                                                                      
New York                      3852431.0  6.023470e+05              6.395701
Michigan                       880814.0  2.748462e+05              3.204753
Louisiana                      423430.0  1.548027e+05              2.735288
Illinois                       765763.0  3.746646e+05              2.043863
New Jersey                    1710535.0  8.421302e+05              2.031200
Georgia                        309920.0  1.529650e+05              2.026085
Pennsylvania                   758770.0  4.142725e+05              1.831572
Virginia                       186685.0  1.373630e+05              1.359063
Mississippi                    104608.0  7.710448e+04              1.356705
Indiana                        217660.0  1.608219e+05              1.353423
Ohio                           308006.0  2.908617e+05              1.058943
Iowa        

In [34]:
# Month label ('YYYY-MM') used as the second grouping key.
df['year_month'] = df['date'].dt.strftime('%Y-%m')

# Total deaths and cases for every (state, month) pair.
monthly_stats = df.groupby(['state', 'year_month']).agg({'deaths': 'sum', 'cases': 'sum'}).compute()

# Case fatality ratio = deaths / cases.
# The division yields NaN for 0/0 and +/-inf when deaths are non-zero but
# cases are zero. fillna() only replaces NaN, so infinities are mapped to NaN
# first; every undefined ratio then ends up as 0 instead of leaking inf into
# later computations.
monthly_stats['CFR'] = (
    (monthly_stats['deaths'] / monthly_stats['cases'])
    .replace([np.inf, -np.inf], np.nan)
)

monthly_stats = monthly_stats.fillna(0)

# Sort by the (state, year_month) index so months appear in order within a state.
monthly_stats = monthly_stats.sort_index()
monthly_stats

  df = reader(bio, **kwargs)


Unnamed: 0_level_0,Unnamed: 1_level_0,deaths,cases,CFR
state,year_month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,2020-03,64.0,12023.0,0.005323
Alabama,2020-04,7073.0,249850.0,0.028309
Alabama,2020-05,28789.0,740216.0,0.038893
Alabama,2020-06,47377.0,1599004.0,0.029629
Alabama,2020-07,9836.0,412970.0,0.023818
...,...,...,...,...
Wyoming,2020-03,0.0,1540.0,0.000000
Wyoming,2020-04,81.0,20451.0,0.003961
Wyoming,2020-05,354.0,45419.0,0.007794
Wyoming,2020-06,1021.0,67428.0,0.015142


In [35]:
# Change in CFR from one row to the next within each state (the first row of
# every state has no predecessor and therefore gets NaN).
cfr_by_state = monthly_stats.groupby('state')['CFR']
monthly_stats['CFR_change'] = cfr_by_state.diff()

# Sum the per-row changes for each state and list states from largest to smallest.
# NOTE(review): summing signed consecutive differences telescopes to
# (last CFR - first CFR) per state; if total month-to-month movement was the
# goal, confirm whether absolute changes should be summed instead.
state_cfr_variation = (
    monthly_stats.groupby('state')['CFR_change']
    .sum()
    .sort_values(ascending=False)
)
state_cfr_variation

Unnamed: 0_level_0,CFR_change
state,Unnamed: 1_level_1
New Jersey,0.078497
Connecticut,0.07686
Massachusetts,0.07438
Pennsylvania,0.063375
New Hampshire,0.06168
Michigan,0.061298
Rhode Island,0.057596
New York,0.051587
Illinois,0.048028
Maryland,0.043883
