In [12]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path
import numpy as np

# File to Load

raw_file_path = Path("Resources/Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv")

# Read the raw data file and store into pandas dataframe

raw_covid_data = pd.read_csv(raw_file_path)
raw_covid_data.head(5)

Unnamed: 0,Data As Of,Start Date,End Date,Group,Year,Month,State,Condition Group,Condition,ICD10_codes,Age Group,COVID-19 Deaths,Number of Mentions,Flag
0,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,1569.0,1647.0,
1,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,25-34,5804.0,6029.0,
2,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,35-44,15080.0,15699.0,
3,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,45-54,37414.0,38878.0,
4,09/24/2023,01/01/2020,09/23/2023,By Total,,,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,55-64,82668.0,85708.0,


In [13]:
# Show the overview of the columns

raw_covid_data.columns

Index(['Data As Of', 'Start Date', 'End Date', 'Group', 'Year', 'Month',
       'State', 'Condition Group', 'Condition', 'ICD10_codes', 'Age Group',
       'COVID-19 Deaths', 'Number of Mentions', 'Flag'],
      dtype='object')

In [14]:
# Creata a dataframe with the necessary columns to be used for data analysis

filtered_covid_data = raw_covid_data[['Year', 'Month', 'Group', 'State', 'Condition', 'Age Group', 'COVID-19 Deaths']]
filtered_covid_data

Unnamed: 0,Year,Month,Group,State,Condition,Age Group,COVID-19 Deaths
0,,,By Total,United States,Influenza and pneumonia,0-24,1569.0
1,,,By Total,United States,Influenza and pneumonia,25-34,5804.0
2,,,By Total,United States,Influenza and pneumonia,35-44,15080.0
3,,,By Total,United States,Influenza and pneumonia,45-54,37414.0
4,,,By Total,United States,Influenza and pneumonia,55-64,82668.0
...,...,...,...,...,...,...,...
620995,2023.0,5.0,By Month,Puerto Rico,COVID-19,All Ages,67.0
620996,2023.0,6.0,By Month,Puerto Rico,COVID-19,All Ages,122.0
620997,2023.0,7.0,By Month,Puerto Rico,COVID-19,All Ages,114.0
620998,2023.0,8.0,By Month,Puerto Rico,COVID-19,All Ages,78.0


In [16]:
# Filter the csv based on these parameters:
# Condition: COVID-19
# Age Group: Filter out "All Ages" and "Not stated"
# Group: Show data "By Month"
# States: Filter out "New York City", "District of Columbia", and "Puerto Rico"

further_filtered_covid_data = filtered_covid_data[
    (filtered_covid_data["Condition"] == "COVID-19") &
    (filtered_covid_data["Group"] == "By Month") &
    ~(filtered_covid_data["State"].isin(["District of Columbia", "New York City", "Puerto Rico", "United States"])) &
    ~(filtered_covid_data["Age Group"].isin(["All Ages", "Not stated"]))
]

further_filtered_covid_data

Unnamed: 0,Year,Month,Group,State,Condition,Age Group,COVID-19 Deaths
72000,2020.0,1.0,By Month,United States,COVID-19,0-24,0.0
72001,2020.0,2.0,By Month,United States,COVID-19,0-24,0.0
72002,2020.0,3.0,By Month,United States,COVID-19,0-24,19.0
72003,2020.0,4.0,By Month,United States,COVID-19,0-24,93.0
72004,2020.0,5.0,By Month,United States,COVID-19,0-24,67.0
...,...,...,...,...,...,...,...
610555,2023.0,5.0,By Month,Wyoming,COVID-19,85+,
610556,2023.0,6.0,By Month,Wyoming,COVID-19,85+,
610557,2023.0,7.0,By Month,Wyoming,COVID-19,85+,0.0
610558,2023.0,8.0,By Month,Wyoming,COVID-19,85+,


In [18]:
# Now that the data is showing only COVID-19 related deaths, we can delete that column as it is repeating

further_filtered_wo_condition = further_filtered_covid_data.drop("Condition", axis = 1)
further_filtered_wo_condition

Unnamed: 0,Year,Month,Group,State,Age Group,COVID-19 Deaths
72000,2020.0,1.0,By Month,United States,0-24,0.0
72001,2020.0,2.0,By Month,United States,0-24,0.0
72002,2020.0,3.0,By Month,United States,0-24,19.0
72003,2020.0,4.0,By Month,United States,0-24,93.0
72004,2020.0,5.0,By Month,United States,0-24,67.0
...,...,...,...,...,...,...
610555,2023.0,5.0,By Month,Wyoming,85+,
610556,2023.0,6.0,By Month,Wyoming,85+,
610557,2023.0,7.0,By Month,Wyoming,85+,0.0
610558,2023.0,8.0,By Month,Wyoming,85+,


In [19]:
# dropna the values in covid-19 deaths column that do not show a value

further_filtered_wo_condition = further_filtered_wo_condition.dropna(subset = ['COVID-19 Deaths'])
further_filtered_wo_condition

Unnamed: 0,Year,Month,Group,State,Age Group,COVID-19 Deaths
72000,2020.0,1.0,By Month,United States,0-24,0.0
72001,2020.0,2.0,By Month,United States,0-24,0.0
72002,2020.0,3.0,By Month,United States,0-24,19.0
72003,2020.0,4.0,By Month,United States,0-24,93.0
72004,2020.0,5.0,By Month,United States,0-24,67.0
...,...,...,...,...,...,...
610539,2022.0,1.0,By Month,Wyoming,85+,16.0
610540,2022.0,2.0,By Month,Wyoming,85+,27.0
610544,2022.0,6.0,By Month,Wyoming,85+,11.0
610549,2022.0,11.0,By Month,Wyoming,85+,12.0


In [20]:
# Change the datatypes of the dataframe without the condition column

# Change the float64 datatypes to integers

further_filtered_wo_condition['Year'] = further_filtered_wo_condition['Year'].astype(np.int64)
further_filtered_wo_condition['Month'] = further_filtered_wo_condition['Month'].astype(np.int64)
further_filtered_wo_condition['COVID-19 Deaths'] = further_filtered_wo_condition['COVID-19 Deaths'].astype(np.int64)

print(further_filtered_wo_condition.dtypes)

Year                int64
Month               int64
Group              object
State              object
Age Group          object
COVID-19 Deaths     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  further_filtered_wo_condition['Year'] = further_filtered_wo_condition['Year'].astype(np.int64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  further_filtered_wo_condition['Month'] = further_filtered_wo_condition['Month'].astype(np.int64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  further_filte

In [21]:
# Export this new dataframe without the conditons column and the blank values within COVID-19 Deaths

further_filtered_wo_condition.to_csv("Resources/cleaned_covid_data.csv", index = False)

In [22]:
# Add the populations for each state in into the dataset

In [None]:
# Create dataframe to use for heatmap
# The heatmap will show the top and bottom states based on their covid deaths