In [1]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path

# Relative path of the raw CSV inside the Resources folder
raw_file_path = Path("Resources/Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv")

# Load the CSV into a DataFrame, then preview its first five rows
# (five is the default row count for head())
covid_data = pd.read_csv(raw_file_path)
covid_data.head()

Unnamed: 0,Data As Of,Start Date,End Date,Group,Year,Month,State,Condition Group,Condition,ICD10_codes,Age Group,COVID-19 Deaths,Number of Mentions,Flag
0,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,1569.0,1647.0,
1,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,25-34,5804.0,6029.0,
2,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,35-44,15080.0,15699.0,
3,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,45-54,37414.0,38878.0,
4,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,55-64,82668.0,85708.0,


In [2]:
# Inspect the column labels of the raw DataFrame before trimming it
covid_data.columns

Index(['Data As Of', 'Start Date', 'End Date', 'Group', 'Year', 'Month',
       'State', 'Condition Group', 'Condition', 'ICD10_codes', 'Age Group',
       'COVID-19 Deaths', 'Number of Mentions', 'Flag'],
      dtype='object')

In [3]:
# Keep only the columns used by the later cells; the date stamps
# ("Data As Of", "Start Date", "End Date"), "ICD10_codes" and "Flag" are dropped.
columns_to_keep = [
    "Year",
    "Month",
    "Group",
    "State",
    "Condition Group",
    "Condition",
    "Age Group",
    "COVID-19 Deaths",
    "Number of Mentions",
]
covid_data = covid_data[columns_to_keep]
covid_data.head()

Unnamed: 0,Year,Month,Group,State,Condition Group,Condition,Age Group,COVID-19 Deaths,Number of Mentions
0,,,By Total,United States,Respiratory diseases,Influenza and pneumonia,0-24,1569.0,1647.0
1,,,By Total,United States,Respiratory diseases,Influenza and pneumonia,25-34,5804.0,6029.0
2,,,By Total,United States,Respiratory diseases,Influenza and pneumonia,35-44,15080.0,15699.0
3,,,By Total,United States,Respiratory diseases,Influenza and pneumonia,45-54,37414.0,38878.0
4,,,By Total,United States,Respiratory diseases,Influenza and pneumonia,55-64,82668.0,85708.0


In [4]:
# Count the non-null entries in each column to see where values are missing.
# The result is left as the cell's last expression (instead of print()) so the
# notebook displays it the same way as the later count cells do.
column_value_counts = covid_data.count()
column_value_counts

Year                  608580
Month                 558900
Group                 621000
State                 621000
Condition Group       621000
Condition             621000
Age Group             621000
COVID-19 Deaths       437551
Number of Mentions    443423
dtype: int64


In [5]:
# Keep only the rows of the monthly breakdown (Group == "By Month")
# that belong to the years 2020 and 2021
monthly_rows = covid_data["Group"].eq("By Month")
target_year_rows = covid_data["Year"].isin([2020, 2021])
filtered_covid_data = covid_data.loc[monthly_rows & target_year_rows]
filtered_covid_data

Unnamed: 0,Year,Month,Group,State,Condition Group,Condition,Age Group,COVID-19 Deaths,Number of Mentions
62100,2020.0,1.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
62101,2020.0,2.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
62102,2020.0,3.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,0-24,9.0,9.0
62103,2020.0,4.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,0-24,27.0,30.0
62104,2020.0,5.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,0-24,19.0,19.0
...,...,...,...,...,...,...,...,...,...
620974,2021.0,8.0,By Month,Puerto Rico,COVID-19,COVID-19,All Ages,314.0,314.0
620975,2021.0,9.0,By Month,Puerto Rico,COVID-19,COVID-19,All Ages,269.0,269.0
620976,2021.0,10.0,By Month,Puerto Rico,COVID-19,COVID-19,All Ages,79.0,79.0
620977,2021.0,11.0,By Month,Puerto Rico,COVID-19,COVID-19,All Ages,28.0,28.0


In [6]:
# Non-null count per column of the filtered frame, to see which columns still have gaps
column_value_counts = filtered_covid_data.notna().sum()
column_value_counts

Year                  298080
Month                 298080
Group                 298080
State                 298080
Condition Group       298080
Condition             298080
Age Group             298080
COVID-19 Deaths       210583
Number of Mentions    213085
dtype: int64

In [7]:
# Discard every row whose "COVID-19 Deaths" value is missing
has_death_count = filtered_covid_data["COVID-19 Deaths"].notna()
filtered_covid_data = filtered_covid_data.loc[has_death_count]
filtered_covid_data

Unnamed: 0,Year,Month,Group,State,Condition Group,Condition,Age Group,COVID-19 Deaths,Number of Mentions
62100,2020.0,1.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
62101,2020.0,2.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
62102,2020.0,3.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,0-24,9.0,9.0
62103,2020.0,4.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,0-24,27.0,30.0
62104,2020.0,5.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,0-24,19.0,19.0
...,...,...,...,...,...,...,...,...,...
620974,2021.0,8.0,By Month,Puerto Rico,COVID-19,COVID-19,All Ages,314.0,314.0
620975,2021.0,9.0,By Month,Puerto Rico,COVID-19,COVID-19,All Ages,269.0,269.0
620976,2021.0,10.0,By Month,Puerto Rico,COVID-19,COVID-19,All Ages,79.0,79.0
620977,2021.0,11.0,By Month,Puerto Rico,COVID-19,COVID-19,All Ages,28.0,28.0


In [8]:
# Re-check the per-column non-null counts now that rows without a death count are gone
column_value_counts = filtered_covid_data.notna().sum()
column_value_counts

Year                  210583
Month                 210583
Group                 210583
State                 210583
Condition Group       210583
Condition             210583
Age Group             210583
COVID-19 Deaths       210583
Number of Mentions    210583
dtype: int64

In [9]:
# Version 2: keep only the rows whose Age Group is "All Ages", i.e. without the
# per-age-band breakdown. This dataset can be combined with other factors such
# as Medicaid users, TSA travel numbers by state, etc.
all_ages_rows = filtered_covid_data["Age Group"].eq("All Ages")
filtered_covid_data_wo_age_group = filtered_covid_data.loc[all_ages_rows]
filtered_covid_data_wo_age_group.head(n=10)

Unnamed: 0,Year,Month,Group,State,Condition Group,Condition,Age Group,COVID-19 Deaths,Number of Mentions
62505,2020.0,1.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,All Ages,2.0,2.0
62506,2020.0,2.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,All Ages,8.0,8.0
62507,2020.0,3.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,All Ages,3229.0,3256.0
62508,2020.0,4.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,All Ages,27501.0,27725.0
62509,2020.0,5.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,All Ages,14756.0,14969.0
62510,2020.0,6.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,All Ages,7104.0,7241.0
62511,2020.0,7.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,All Ages,13767.0,13986.0
62512,2020.0,8.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,All Ages,14016.0,14303.0
62513,2020.0,9.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,All Ages,8697.0,8916.0
62514,2020.0,10.0,By Month,United States,Respiratory diseases,Influenza and pneumonia,All Ages,10805.0,11060.0


In [10]:
# Write both filtered DataFrames to Excel workbooks in the Resources folder,
# leaving the DataFrame index out of the files.
# NOTE(review): filtered_covid_data still contains the "All Ages" rows next to the
# individual age bands; if "All Ages" is an aggregate of those bands, summing the
# first workbook would double count. Confirm with the consumers of the file.
exports = {
    "Resources/filtered_covid_data.xlsx": filtered_covid_data,
    "Resources/filtered_covid_data_without_age_group.xlsx": filtered_covid_data_wo_age_group,
}
for output_path, frame in exports.items():
    frame.to_excel(output_path, index=False)