In [1]:
import numpy as np
import pandas as pd
import datetime as dt

In [2]:
# Reading super covid dataset
super_covid = pd.read_csv('../../Team/STAGE1/superCovidDS.CSV')
super_covid.head()

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2020-01-22_x,2020-01-23_x,2020-01-24_x,2020-01-25_x,2020-01-26_x,2020-01-27_x,...,2023-01-08_y,2023-01-09_y,2023-01-10_y,2023-01-11_y,2023-01-12_y,2023-01-13_y,2023-01-14_y,2023-01-15_y,2023-01-16_y,population
0,1001,Autauga County,AL,1,0,0,0,0,0,0,...,230,230,230,230,230,230,230,230,230,55869
1,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,719,719,719,719,721,721,721,721,721,223234
2,1005,Barbour County,AL,1,0,0,0,0,0,0,...,103,103,103,103,103,103,103,103,103,24686
3,1007,Bibb County,AL,1,0,0,0,0,0,0,...,108,108,108,108,108,108,108,108,108,22394
4,1009,Blount County,AL,1,0,0,0,0,0,0,...,260,260,260,260,261,261,261,261,261,57826


In [3]:
super_covid_column_names = list(super_covid.columns)
super_covid_column_names[super_covid_column_names.index('County Name')] = "County_Name"
super_covid.columns=super_covid_column_names

In [4]:
transformed_df = pd.DataFrame(columns=['Date','Week','countyFIPS','County_Name', 'State', 'StateFIPS', 'population', 'Cases', 'New_Cases', 'Deaths', 'New_Deaths'])
transformed_df.head()

Unnamed: 0,Date,Week,countyFIPS,County_Name,State,StateFIPS,population,Cases,New_Cases,Deaths,New_Deaths


In [5]:
start_date = dt.datetime(2022,6,1)
end_date = dt.datetime(2022,12,31)
date_series = pd.date_range(start_date, end_date, freq='d')
date_delta = dt.timedelta(days=1)
for date in date_series:
    data = []
    for _ , row in super_covid.iterrows():
        temp = [date, date.isocalendar().week, getattr(row, 'countyFIPS'), getattr(row, 'County_Name'),
                getattr(row, 'State'), getattr(row, 'StateFIPS'), getattr(row, 'population')]
        cases_column = date.strftime('%Y-%m-%d_x')
        temp.append(getattr(row, cases_column))
        temp.append(getattr(row, cases_column) - getattr(row, (date-date_delta).strftime('%Y-%m-%d_x')))
        deaths_column = date.strftime('%Y-%m-%d_y')
        temp.append(getattr(row, deaths_column))
        temp.append(getattr(row, deaths_column) - getattr(row, (date-date_delta).strftime('%Y-%m-%d_y')))
        data.append(temp)
    transformed_df = pd.concat([transformed_df, pd.DataFrame(data, columns=transformed_df.columns)])
transformed_df.head()

Unnamed: 0,Date,Week,countyFIPS,County_Name,State,StateFIPS,population,Cases,New_Cases,Deaths,New_Deaths
0,2022-06-01,22,1001,Autauga County,AL,1,55869,15969,6,216,0
1,2022-06-01,22,1003,Baldwin County,AL,1,223234,56580,68,683,0
2,2022-06-01,22,1005,Barbour County,AL,1,24686,5710,3,99,0
3,2022-06-01,22,1007,Bibb County,AL,1,22394,6508,8,105,0
4,2022-06-01,22,1009,Blount County,AL,1,57826,15077,4,244,0


In [6]:
transformed_df.shape

(672388, 11)

In [7]:
# Filtering new cases and deaths across California state
CA_Covid = transformed_df.query("State=='CA'").copy()
CA_Covid.head()

Unnamed: 0,Date,Week,countyFIPS,County_Name,State,StateFIPS,population,Cases,New_Cases,Deaths,New_Deaths
186,2022-06-01,22,6001,Alameda County,CA,6,1671329,285709,658,1870,0
187,2022-06-01,22,6003,Alpine County,CA,6,1129,128,0,0,0
188,2022-06-01,22,6005,Amador County,CA,6,39752,8820,3,87,0
189,2022-06-01,22,6007,Butte County,CA,6,219186,34122,17,427,0
190,2022-06-01,22,6009,Calaveras County,CA,6,45905,7522,8,121,0


In [8]:
# Aggregating values since this is a state level analysis
CA_aggregate_df = CA_Covid.groupby(by=['State','Date','Week']).sum().reset_index()
CA_aggregate_df.drop(columns=['countyFIPS', 'StateFIPS', 'County_Name'], inplace=True)
CA_aggregate_df.head()

Unnamed: 0,State,Date,Week,County_Name,population,Cases,New_Cases,Deaths,New_Deaths
0,CA,2022-06-01,22,Alameda County Alpine County Amador County But...,39512223,8984553,11487,90627,1
1,CA,2022-06-02,22,Alameda County Alpine County Amador County But...,39512223,8986351,1798,90627,0
2,CA,2022-06-03,22,Alameda County Alpine County Amador County But...,39512223,8986351,0,90627,0
3,CA,2022-06-04,22,Alameda County Alpine County Amador County But...,39512223,8986351,0,90627,0
4,CA,2022-06-05,22,Alameda County Alpine County Amador County But...,39512223,8986351,0,90627,0


In [19]:
aggregations = ['mean', 'median', pd.Series.mode]
CA_aggregate_df.groupby(by='Week').agg({'New_Cases': aggregations, 'New_Deaths': aggregations}).reset_index()

Unnamed: 0_level_0,Week,New_Cases,New_Cases,New_Cases,New_Deaths,New_Deaths,New_Deaths
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mode,mean,median,mode
0,22,2657.0,0.0,0,0.2,0.0,0
1,23,25480.285714,0.0,0,41.571429,0.0,0
2,24,15981.571429,8637.0,"[335, 1023, 3448, 8637, 21846, 27525, 49057]",43.142857,4.0,0
3,25,13680.142857,10147.0,"[1867, 5252, 6898, 10147, 12709, 14015, 44873]",15.428571,1.0,"[0, 1, 3]"
4,26,21643.285714,17789.0,"[0, 1368, 11346, 17789, 18047, 19695, 83258]",132.857143,33.0,0
5,27,17704.142857,20082.0,"[9724, 9866, 13353, 20082, 21038, 22869, 26997]",40.285714,42.0,"[29, 31, 41, 42, 43, 44, 52]"
6,28,16451.714286,10948.0,0,-40.714286,3.0,0
7,29,14650.428571,10009.0,"[1857, 4883, 8370, 10009, 11403, 17856, 48175]",20.857143,2.0,0
8,30,15832.857143,10608.0,"[0, 1449, 4625, 10608, 11406, 14757, 67985]",41.428571,0.0,0
9,31,12790.285714,8442.0,"[1108, 1679, 4087, 8442, 8889, 31804, 33523]",43.0,2.0,0


In [None]:
stats_df = pd.DataFrame(columns=['Week', 'State', 'New_Cases_Mean','New_Cases_Median','New_Cases_Mode',
                                    'New_Deaths_Mean', 'New_Deaths_Median', 'New_Deaths_Mode'])
stats_df.head()