In [1]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path

# Path of the raw CSV, relative to the notebook's working directory
raw_file_path = Path("Resources/Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv")

# Load the CSV into a DataFrame and preview its first five rows
# (head() defaults to n=5)
covid_data = pd.read_csv(raw_file_path)
covid_data.head()

Unnamed: 0,Data As Of,Start Date,End Date,Group,Year,Month,State,Condition Group,Condition,ICD10_codes,Age Group,COVID-19 Deaths,Number of Mentions,Flag
0,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,1569.0,1647.0,
1,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,25-34,5804.0,6029.0,
2,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,35-44,15080.0,15699.0,
3,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,45-54,37414.0,38878.0,
4,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,55-64,82668.0,85708.0,


In [2]:
# list the column labels of the raw dataframe
covid_data.columns

Index(['Data As Of', 'Start Date', 'End Date', 'Group', 'Year', 'Month',
       'State', 'Condition Group', 'Condition', 'ICD10_codes', 'Age Group',
       'COVID-19 Deaths', 'Number of Mentions', 'Flag'],
      dtype='object')

In [3]:
# Narrow the dataframe to the nine columns listed below; every other column of
# the raw frame is dropped, and Year/Month are moved to the front.
columns_to_keep = [
    "Year",
    "Month",
    "Group",
    "State",
    "Condition Group",
    "Condition",
    "Age Group",
    "COVID-19 Deaths",
    "Number of Mentions",
]
covid_data = covid_data[columns_to_keep]
covid_data.head()

Unnamed: 0,Year,Month,Group,State,Condition Group,Condition,Age Group,COVID-19 Deaths,Number of Mentions
0,,,By Total,United States,Respiratory diseases,Influenza and pneumonia,0-24,1569.0,1647.0
1,,,By Total,United States,Respiratory diseases,Influenza and pneumonia,25-34,5804.0,6029.0
2,,,By Total,United States,Respiratory diseases,Influenza and pneumonia,35-44,15080.0,15699.0
3,,,By Total,United States,Respiratory diseases,Influenza and pneumonia,45-54,37414.0,38878.0
4,,,By Total,United States,Respiratory diseases,Influenza and pneumonia,55-64,82668.0,85708.0


In [4]:
# count() returns the number of non-null entries in each column, so a column
# whose total is below the row count contains missing values
column_value_counts = covid_data.count()
print(column_value_counts)

Year                  608580
Month                 558900
Group                 621000
State                 621000
Condition Group       621000
Condition             621000
Age Group             621000
COVID-19 Deaths       437551
Number of Mentions    443423
dtype: int64


In [5]:
# Keep only the monthly rows (Group == "By Month") for the years 2020 and 2021,
# and exclude the "United States", "District of Columbia" and "Puerto Rico" rows.
# NOTE(review): any other non-state value present in "State" passes this filter —
# confirm the exclusion list is complete.
is_monthly = covid_data["Group"] == "By Month"
in_target_years = covid_data["Year"].isin([2020, 2021])
is_excluded_state = covid_data["State"].isin(["United States", "District of Columbia", "Puerto Rico"])

filtered_covid_data = covid_data[is_monthly & in_target_years & ~is_excluded_state]
filtered_covid_data

Unnamed: 0,Year,Month,Group,State,Condition Group,Condition,Age Group,COVID-19 Deaths,Number of Mentions
72450,2020.0,1.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
72451,2020.0,2.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
72452,2020.0,3.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
72453,2020.0,4.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
72454,2020.0,5.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
...,...,...,...,...,...,...,...,...,...
610624,2021.0,8.0,By Month,Wyoming,COVID-19,COVID-19,All Ages,79.0,79.0
610625,2021.0,9.0,By Month,Wyoming,COVID-19,COVID-19,All Ages,174.0,174.0
610626,2021.0,10.0,By Month,Wyoming,COVID-19,COVID-19,All Ages,210.0,210.0
610627,2021.0,11.0,By Month,Wyoming,COVID-19,COVID-19,All Ages,158.0,158.0


In [6]:
# Non-null counts per column of the filtered dataframe; columns with a smaller
# total than the others still contain missing values
column_value_counts = filtered_covid_data.count()
column_value_counts

Year                  281520
Month                 281520
Group                 281520
State                 281520
Condition Group       281520
Condition             281520
Age Group             281520
COVID-19 Deaths       197462
Number of Mentions    199919
dtype: int64

In [7]:
# Keep only the rows that have a value in the "COVID-19 Deaths" column
# (rows where it is null are dropped)
has_death_count = filtered_covid_data["COVID-19 Deaths"].notna()
filtered_covid_data = filtered_covid_data[has_death_count]
filtered_covid_data

Unnamed: 0,Year,Month,Group,State,Condition Group,Condition,Age Group,COVID-19 Deaths,Number of Mentions
72450,2020.0,1.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
72451,2020.0,2.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
72452,2020.0,3.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
72453,2020.0,4.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
72454,2020.0,5.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,0-24,0.0,0.0
...,...,...,...,...,...,...,...,...,...
610624,2021.0,8.0,By Month,Wyoming,COVID-19,COVID-19,All Ages,79.0,79.0
610625,2021.0,9.0,By Month,Wyoming,COVID-19,COVID-19,All Ages,174.0,174.0
610626,2021.0,10.0,By Month,Wyoming,COVID-19,COVID-19,All Ages,210.0,210.0
610627,2021.0,11.0,By Month,Wyoming,COVID-19,COVID-19,All Ages,158.0,158.0


In [8]:
# Re-check the non-null counts per column after the rows with a missing
# "COVID-19 Deaths" value were dropped
column_value_counts = filtered_covid_data.count()
column_value_counts

Year                  197462
Month                 197462
Group                 197462
State                 197462
Condition Group       197462
Condition             197462
Age Group             197462
COVID-19 Deaths       197462
Number of Mentions    197462
dtype: int64

In [9]:
# version 2 - keep only the rows whose "Age Group" is "All Ages", i.e. data not
# split by age group (this dataset can be combined with other factors such as
# Medicaid users, TSA travel numbers by state, etc.)
is_all_ages = filtered_covid_data["Age Group"].eq("All Ages")
filtered_covid_data_wo_age_group = filtered_covid_data.loc[is_all_ages]
filtered_covid_data_wo_age_group.head(10)

Unnamed: 0,Year,Month,Group,State,Condition Group,Condition,Age Group,COVID-19 Deaths,Number of Mentions
72855,2020.0,1.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,All Ages,0.0,0.0
72856,2020.0,2.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,All Ages,0.0,0.0
72857,2020.0,3.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,All Ages,11.0,11.0
72858,2020.0,4.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,All Ages,92.0,92.0
72859,2020.0,5.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,All Ages,128.0,131.0
72860,2020.0,6.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,All Ages,125.0,125.0
72861,2020.0,7.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,All Ages,264.0,267.0
72862,2020.0,8.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,All Ages,318.0,324.0
72863,2020.0,9.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,All Ages,188.0,190.0
72864,2020.0,10.0,By Month,Alabama,Respiratory diseases,Influenza and pneumonia,All Ages,183.0,185.0


In [10]:
# Write both filtered dataframes to CSV; index=False leaves the row index out
# of the output files
for frame, csv_path in (
    (filtered_covid_data, "Resources/filtered_covid_data.csv"),
    (filtered_covid_data_wo_age_group, "Resources/filtered_covid_data_without_age_group.csv"),
):
    frame.to_csv(csv_path, index=False)