In [91]:
import numpy as np
import pandas as pd
import datetime as dt
import plotly.express as px

In [79]:
# Load the Stage 1 "super" COVID dataset (one row per county) and preview it.
super_covid = pd.read_csv(
    '../../Team/STAGE1/superCovidDS.CSV',
)
super_covid.head(n=5)

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2020-01-22_x,2020-01-23_x,2020-01-24_x,2020-01-25_x,2020-01-26_x,2020-01-27_x,...,2023-01-08_y,2023-01-09_y,2023-01-10_y,2023-01-11_y,2023-01-12_y,2023-01-13_y,2023-01-14_y,2023-01-15_y,2023-01-16_y,population
0,1001,Autauga County,AL,1,0,0,0,0,0,0,...,230,230,230,230,230,230,230,230,230,55869
1,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,719,719,719,719,721,721,721,721,721,223234
2,1005,Barbour County,AL,1,0,0,0,0,0,0,...,103,103,103,103,103,103,103,103,103,24686
3,1007,Bibb County,AL,1,0,0,0,0,0,0,...,108,108,108,108,108,108,108,108,108,22394
4,1009,Blount County,AL,1,0,0,0,0,0,0,...,260,260,260,260,261,261,261,261,261,57826


In [80]:
# Replace the space in the 'County Name' label with an underscore.
# DataFrame.rename ignores labels that are not present, so re-running this
# cell after the rename has already happened is a harmless no-op (a
# list.index('County Name') lookup would raise ValueError on a second run).
super_covid = super_covid.rename(columns={'County Name': 'County_Name'})
super_covid_column_names = list(super_covid.columns)

In [81]:
# Empty frame that declares the column layout of the long-format table:
# date keys, county attributes, then case and death counts.
transformed_super_covid = pd.DataFrame(
    columns=[
        'Date', 'Week',
        'countyFIPS', 'County_Name', 'State', 'StateFIPS', 'population',
        'Cases', 'New_Cases',
        'Deaths', 'New_Deaths',
    ]
)
transformed_super_covid.head()

Unnamed: 0,Date,Week,countyFIPS,County_Name,State,StateFIPS,population,Cases,New_Cases,Deaths,New_Deaths


In [82]:
# Reshape the wide per-county table into a long table with one row per
# (day, county) for every day from 2022-06-01 to 2022-12-31 inclusive.
start_date = dt.datetime(2022, 6, 1)
end_date = dt.datetime(2022, 12, 31)
date_series = pd.date_range(start_date, end_date, freq='d')
date_delta = dt.timedelta(days=1)

# County attributes that are repeated unchanged on every daily slice.
county_columns = ['countyFIPS', 'County_Name', 'State', 'StateFIPS', 'population']
county_info = super_covid[county_columns].reset_index(drop=True)

# Build one vectorised frame per day and concatenate once at the end.
# Iterating over rows and concatenating onto the accumulator inside the loop
# re-copies the growing frame on every iteration, and appending to a
# pre-existing frame duplicates rows when the cell is re-run.
daily_frames = []
for date in date_series:
    previous_date = date - date_delta
    # '<date>_x' columns are read as case counts and '<date>_y' as death
    # counts; subtracting the previous day's column gives the new counts.
    cases_column = date.strftime('%Y-%m-%d_x')
    deaths_column = date.strftime('%Y-%m-%d_y')
    # .to_numpy() keeps the arithmetic positional, independent of the index.
    cases = super_covid[cases_column].to_numpy()
    deaths = super_covid[deaths_column].to_numpy()
    previous_cases = super_covid[previous_date.strftime('%Y-%m-%d_x')].to_numpy()
    previous_deaths = super_covid[previous_date.strftime('%Y-%m-%d_y')].to_numpy()

    daily = county_info.copy()
    daily.insert(0, 'Date', date)
    daily.insert(1, 'Week', date.isocalendar().week)  # ISO week number
    daily['Cases'] = cases
    daily['New_Cases'] = cases - previous_cases
    daily['Deaths'] = deaths
    daily['New_Deaths'] = deaths - previous_deaths
    daily_frames.append(daily)

# The index restarts at 0 for every day, exactly as it did when each day's
# rows were appended without ignore_index.
transformed_super_covid = pd.concat(daily_frames)
transformed_super_covid.head()

Unnamed: 0,Date,Week,countyFIPS,County_Name,State,StateFIPS,population,Cases,New_Cases,Deaths,New_Deaths
0,2022-06-01,22,1001,Autauga County,AL,1,55869,15969,6,216,0
1,2022-06-01,22,1003,Baldwin County,AL,1,223234,56580,68,683,0
2,2022-06-01,22,1005,Barbour County,AL,1,24686,5710,3,99,0
3,2022-06-01,22,1007,Bibb County,AL,1,22394,6508,8,105,0
4,2022-06-01,22,1009,Blount County,AL,1,57826,15077,4,244,0


In [83]:
# Sanity check: the row count should equal (number of days in the range) x
# (number of rows in super_covid), with 11 columns.
transformed_super_covid.shape

(672388, 11)

In [84]:
# Aggregating values across US
aggregated_super_covid = transformed_super_covid.groupby(by=['Date','Week']).sum().reset_index()
aggregated_super_covid.drop(columns=['State','countyFIPS', 'StateFIPS', 'County_Name'], inplace=True)
aggregated_super_covid.head()

Unnamed: 0,Date,Week,population,Cases,New_Cases,Deaths,New_Deaths
0,2022-06-01,22,328239523,81427445,158949,946824,483
1,2022-06-02,22,328239523,81494654,67209,947016,192
2,2022-06-03,22,328239523,81701504,206850,947235,219
3,2022-06-04,22,328239523,81712058,10554,947279,44
4,2022-06-05,22,328239523,81737066,25008,947279,0


In [95]:
from statistics import mode

# Per-week mean / median / mode of the US-wide daily new cases and deaths.
# The result has two-level column labels: (measure, statistic).
# NOTE(review): statistics.mode returns a single value; on Python 3.8+ it is
# the first value encountered when several are equally frequent, so for a
# week whose daily counts are all distinct the "mode" is simply the first
# day's value (compare week 22 with 2022-06-01) - confirm this is intended.
aggregations = ['mean', 'median', mode]
super_covid_statistics = (
    aggregated_super_covid
    .groupby(by='Week')
    .agg({measure: aggregations for measure in ('New_Cases', 'New_Deaths')})
    .reset_index()
)
super_covid_statistics.head()

Unnamed: 0_level_0,Week,New_Cases,New_Cases,New_Cases,New_Deaths,New_Deaths,New_Deaths
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mode,mean,median,mode
0,22,93714.0,67209.0,158949,187.6,192.0,483
1,23,86726.571429,83367.0,150752,294.571429,165.0,376
2,24,102962.571429,72445.0,75974,253.0,260.0,217
3,25,74973.0,72909.0,52290,220.0,135.0,105
4,26,102170.0,71155.0,71155,354.0,330.0,134


In [96]:
def get_week_range_string(weekNumber, year=2022, start_date=None, end_date=None):
    """Return the date span of an ISO week as a ``'Mon-DD to Mon-DD'`` label.

    The week is resolved with ISO-8601 numbering (Monday through Sunday),
    the same numbering ``date.isocalendar().week`` produces. The ``%W``
    strptime directive counts weeks from the first Monday of the year,
    which only coincides with ISO numbering in some years.

    Parameters
    ----------
    weekNumber : int
        ISO week number (any integer-like value, e.g. a numpy integer).
    year : int, default 2022
        ISO year the week belongs to.
    start_date, end_date : datetime.datetime, optional
        Bounds of the reporting window; the returned span is clipped to
        them. They default to June 1 and December 31 of ``year``.

    Returns
    -------
    str
        For example ``'Jun-01 to Jun-05'``.

    Raises
    ------
    ValueError
        If ``weekNumber`` is not a valid ISO week of ``year``.
    """
    week = int(weekNumber)
    week_start = dt.datetime.fromisocalendar(year, week, 1)  # Monday
    week_end = dt.datetime.fromisocalendar(year, week, 7)    # Sunday

    if start_date is None:
        start_date = dt.datetime(year, 6, 1)
    if end_date is None:
        end_date = dt.datetime(year, 12, 31)

    # Clip partial weeks at either edge of the reporting window.
    week_start = max(week_start, start_date)
    week_end = min(week_end, end_date)

    output_format = '%b-%d'
    return ' to '.join([week_start.strftime(output_format), week_end.strftime(output_format)])

# Flatten the two-level (measure, statistic) column labels into single
# strings such as 'New_Cases_mean'. The guard makes the cell safe to re-run:
# joining labels that are already flat strings would split them into
# characters ('W_e_e_k_...').
if isinstance(super_covid_statistics.columns, pd.MultiIndex):
    super_covid_statistics.columns = ['_'.join(col) for col in super_covid_statistics.columns.values]

# ('Week', '') flattens to 'Week_'; give it a readable name. rename ignores
# a missing label, so a second run does not raise.
super_covid_statistics = super_covid_statistics.rename(columns={'Week_': 'Week_Number'})
cols = list(super_covid_statistics.columns)

# Human-readable date span for each week number.
super_covid_statistics['Week_Dates'] = super_covid_statistics['Week_Number'].apply(get_week_range_string)

In [111]:
# Attach the US-wide population to each weekly row, matched by week number.
# aggregated_super_covid has one row per *day* while super_covid_statistics
# has one row per *week*; assigning the daily Series directly aligns on the
# row index, so weekly row i would receive the population of day i, which
# is only right while every day carries the same value.
weekly_population = aggregated_super_covid.groupby('Week')['population'].first()
super_covid_statistics['Population'] = super_covid_statistics['Week_Number'].map(weekly_population)
super_covid_statistics.head()

Unnamed: 0,Week_Number,New_Cases_mean,New_Cases_median,New_Cases_mode,New_Deaths_mean,New_Deaths_median,New_Deaths_mode,Week_Dates,Population
0,22,93714.0,67209.0,158949,187.6,192.0,483,Jun-01 to Jun-05,328239523
1,23,86726.571429,83367.0,150752,294.571429,165.0,376,Jun-06 to Jun-12,328239523
2,24,102962.571429,72445.0,75974,253.0,260.0,217,Jun-13 to Jun-19,328239523
3,25,74973.0,72909.0,52290,220.0,135.0,105,Jun-20 to Jun-26,328239523
4,26,102170.0,71155.0,71155,354.0,330.0,134,Jun-27 to Jul-03,328239523


In [112]:
# Weekly mean of the US-wide daily new-case counts. The title and axis
# labels let the figure stand on its own when the notebook is skimmed.
px.line(
    super_covid_statistics,
    x='Week_Dates',
    y='New_Cases_mean',
    title='US COVID-19: mean daily new cases per week (Jun-Dec 2022)',
    labels={'Week_Dates': 'Week', 'New_Cases_mean': 'Mean daily new cases'},
)

In [113]:
# Weekly mean of the US-wide daily new-death counts. The title and axis
# labels let the figure stand on its own when the notebook is skimmed.
px.line(
    super_covid_statistics,
    x='Week_Dates',
    y='New_Deaths_mean',
    title='US COVID-19: mean daily new deaths per week (Jun-Dec 2022)',
    labels={'Week_Dates': 'Week', 'New_Deaths_mean': 'Mean daily new deaths'},
)

In [114]:
# Write the weekly US-level statistics next to the notebook, without the
# row index.
super_covid_statistics.to_csv(
    'US_level_covid_statistics.csv',
    index=False,
)