In [1]:
#Code written by Victoria Dunkley

# Understanding the Reach and Impact of the Centers for Disease Control and Prevention’s Women’s Health Research, 2018–2023
#### Code Notebook Goal: Analyze 2018-2021 National Vital Statistics System (NVSS) mortality data with a focus on discerning which conditions result in high relative mortality risk for US females. CDC-auhtored publications will be identified via systematic search for contitions with high relative mortality risk for US females
#### Initial Data pulled from :
   1. CDC Wonder Query for sex-stratified age-adjusted dataset (df_nvss): https://wonder.cdc.gov/controller/saved/D158/D450F314
   2. CDC Wonder Query for overarching sex-stratified 2018-2021 mortality (CDC_WONDER_TOTALS ): https://wonder.cdc.gov/controller/saved/D158/D402F797 ( this populates table 1)

#### In this notebook this analysis will:
1. Import NVSS mortality data from CDC Wonder
2. Restructure sex-stratified mortality data from long to wide format
3. Calculate relative mortality risk ( female age-adjusted death rate/male age-adjusted death rate) for US females and additional relevant mortality metrics
4. Export the dataset that will be made into a supplemental data table (CDC_WONDER_NCHS_MERGE)




## Load libraries and data

In [2]:
## import modules (base kernel is Python 3.11.7)
import numpy as np
import pandas as pd
import os
from IPython.display import Image
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
## Enable multiple outputs from jupyter cells
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## disable the Pandas "setting a copy of a slice" warning
pd.options.mode.chained_assignment = None

## set default number of DataFrame rows printed to 20
pd.set_option('display.max_rows', 20)

In [4]:
#get working directory
os.getcwd()

'c:\\Users\\utu2\\OneDrive - CDC\\OS-OSQ-DataAnalytics - Documents\\Portfolio Analytics\\JWH Manuscript\\202509_REPO_for_GitHub_Share\\Code'

In [5]:
# Change working directory to one folder up
os.chdir('..')
os.getcwd()

'c:\\Users\\utu2\\OneDrive - CDC\\OS-OSQ-DataAnalytics - Documents\\Portfolio Analytics\\JWH Manuscript\\202509_REPO_for_GitHub_Share'

In [6]:
# Import NVSS 2018-2021 mortality data and get the general shape of the data
df_nvss=pd.read_excel("Data/NVSS_2018_2021_Sex_Stratified_NCHS_Grouping_AgeAdjusted.xlsx") 
df_nvss.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257 entries, 0 to 256
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Notes                       0 non-null      float64
 1   Sex                         257 non-null    object 
 2   Sex Code                    257 non-null    object 
 3   ICD-10 113 Cause List       257 non-null    object 
 4   ICD-10 113 Cause List Code  257 non-null    object 
 5   Deaths                      257 non-null    int64  
 6   Population                  257 non-null    int64  
 7   Crude Rate                  257 non-null    object 
 8   Age Adjusted Rate           257 non-null    object 
dtypes: float64(1), int64(2), object(6)
memory usage: 18.2+ KB


In [7]:
## Import data with deaths per year per sex and rate info (overarching 2018-2021 mortality numbers)
CDC_WONDER_TOTALS = pd.read_excel("Data/CDC_WONDER_4_YEAR_TOTAL.xlsx")  
CDC_WONDER_TOTALS.head(20)

Unnamed: 0,Notes,Gender,Gender Code,Year,Year Code,Deaths,Population,Crude Rate
0,,Female,F,2018.0,2018.0,1380736.0,166038800.0,831.6
1,,Female,F,2019.0,2019.0,1381015.0,166582200.0,829.0
2,,Female,F,2020.0,2020.0,1613845.0,167227900.0,965.1
3,,Female,F,2021.0,2021.0,1626123.0,167509000.0,970.8
4,Total,Female,F,,,6001719.0,667357900.0,899.3
5,,Male,M,2018.0,2018.0,1458469.0,161128700.0,905.2
6,,Male,M,2019.0,2019.0,1473823.0,161657300.0,911.7
7,,Male,M,2020.0,2020.0,1769884.0,162256200.0,1090.8
8,,Male,M,2021.0,2021.0,1838108.0,164384700.0,1118.2
9,Total,Male,M,,,6540284.0,649426900.0,1007.1


## Data Restructuring

In [8]:
# parse out 2018-2021 sex-stratified total deaths numbers
female_deaths = CDC_WONDER_TOTALS.loc[4,'Deaths']
male_deaths = CDC_WONDER_TOTALS.loc[9,'Deaths']

In [9]:
#Ensure unreliable and "" are NA
df_nvss= df_nvss.replace(['', 'Unreliable'], np.nan)

#check to make sure unreliable has been made NA.. should return empty
unreliable_rows = df_nvss.loc[(df_nvss['Crude Rate'] == 'Unreliable') | (df_nvss['Age Adjusted Rate'] == 'Unreliable')]


# check... should return empty rows
unreliable_rows

Unnamed: 0,Notes,Sex,Sex Code,ICD-10 113 Cause List,ICD-10 113 Cause List Code,Deaths,Population,Crude Rate,Age Adjusted Rate


In [10]:
# create sex-stratified datasets... data is currently in long format and we want to restructure it to wide
df_female=df_nvss.loc[(df_nvss['Sex']=='Female')]
df_female.head(5)

df_male=df_nvss.loc[(df_nvss['Sex']=='Male')]
df_male.head(5)

Unnamed: 0,Notes,Sex,Sex Code,ICD-10 113 Cause List,ICD-10 113 Cause List Code,Deaths,Population,Crude Rate,Age Adjusted Rate
0,,Female,F,#Salmonella infections (A01-A02),GR113-001,104,667357878,0.0,0.0
1,,Female,F,"#Shigellosis and amebiasis (A03,A06)",GR113-002,17,667357878,,
2,,Female,F,"Certain other intestinal infections (A04,A07-A09)",GR113-003,17177,667357878,2.6,1.9
3,,Female,F,#Tuberculosis (A16-A19),GR113-004,805,667357878,0.1,0.1
4,,Female,F,Respiratory tuberculosis (A16),GR113-005,516,667357878,0.1,0.1


Unnamed: 0,Notes,Sex,Sex Code,ICD-10 113 Cause List,ICD-10 113 Cause List Code,Deaths,Population,Crude Rate,Age Adjusted Rate
130,,Male,M,#Salmonella infections (A01-A02),GR113-001,137,649426947,0.0,0.0
131,,Male,M,"#Shigellosis and amebiasis (A03,A06)",GR113-002,11,649426947,,
132,,Male,M,"Certain other intestinal infections (A04,A07-A09)",GR113-003,11908,649426947,1.8,1.7
133,,Male,M,#Tuberculosis (A16-A19),GR113-004,1465,649426947,0.2,0.2
134,,Male,M,Respiratory tuberculosis (A16),GR113-005,1027,649426947,0.2,0.1


In [11]:
#sort based on Cause of death Code prior to re-merging
df_female= df_female.sort_values(["ICD-10 113 Cause List"])

df_male= df_male.sort_values(["ICD-10 113 Cause List"])

In [12]:
# Remerge data to format data from long to wide
CDC_WONDER_NCHS_MERGE=\
    pd.merge(df_female,
             df_male,
             on=["ICD-10 113 Cause List"],
             how='outer',
            suffixes=('_F', '_M')) # designates what comes from df_female v. df_male

In [13]:
# view data shape and data presentation
CDC_WONDER_NCHS_MERGE.info()
CDC_WONDER_NCHS_MERGE.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Notes_F                       0 non-null      float64
 1   Sex_F                         130 non-null    object 
 2   Sex Code_F                    130 non-null    object 
 3   ICD-10 113 Cause List         134 non-null    object 
 4   ICD-10 113 Cause List Code_F  130 non-null    object 
 5   Deaths_F                      130 non-null    float64
 6   Population_F                  130 non-null    float64
 7   Crude Rate_F                  127 non-null    float64
 8   Age Adjusted Rate_F           127 non-null    float64
 9   Notes_M                       0 non-null      float64
 10  Sex_M                         127 non-null    object 
 11  Sex Code_M                    127 non-null    object 
 12  ICD-10 113 Cause List Code_M  127 non-null    object 
 13  Death

Unnamed: 0,Notes_F,Sex_F,Sex Code_F,ICD-10 113 Cause List,ICD-10 113 Cause List Code_F,Deaths_F,Population_F,Crude Rate_F,Age Adjusted Rate_F,Notes_M,Sex_M,Sex Code_M,ICD-10 113 Cause List Code_M,Deaths_M,Population_M,Crude Rate_M,Age Adjusted Rate_M
0,,Female,F,"#Accidents (unintentional injuries) (V01-X59,Y...",GR113-112,262646.0,667357878.0,39.4,34.6,,Male,M,GR113-112,503411.0,649426947.0,77.5,76.2
1,,Female,F,#Acute bronchitis and bronchiolitis (J20-J21),GR113-080,427.0,667357878.0,0.1,0.0,,Male,M,GR113-080,344.0,649426947.0,0.1,0.0
2,,Female,F,#Alzheimer disease (G30),GR113-052,342971.0,667357878.0,51.4,35.0,,Male,M,GR113-052,154188.0,649426947.0,23.7,24.4
3,,Female,F,#Anemias (D50-D64),GR113-045,12160.0,667357878.0,1.8,1.3,,Male,M,GR113-045,9976.0,649426947.0,1.5,1.5
4,,Female,F,#Aortic aneurysm and dissection (I71),GR113-073,15960.0,667357878.0,2.4,1.8,,Male,M,GR113-073,23221.0,649426947.0,3.6,3.3


In [14]:
CDC_WONDER_NCHS_MERGE['Population_F']=female_deaths
CDC_WONDER_NCHS_MERGE['Population_M']=male_deaths

## Analysis to discern metrics by sex and condition:
- Total deaths
- Relative Mortality Risk
- Mortality Burden


In [15]:
# sum male and female deaths for total deaths due to a specific cause
# if row is NA fill with 0 to get accurate numeric total death
CDC_WONDER_NCHS_MERGE['Total_deaths'] = CDC_WONDER_NCHS_MERGE['Deaths_F'].fillna(0) + CDC_WONDER_NCHS_MERGE['Deaths_M'].fillna(0)
CDC_WONDER_NCHS_MERGE.head()

Unnamed: 0,Notes_F,Sex_F,Sex Code_F,ICD-10 113 Cause List,ICD-10 113 Cause List Code_F,Deaths_F,Population_F,Crude Rate_F,Age Adjusted Rate_F,Notes_M,Sex_M,Sex Code_M,ICD-10 113 Cause List Code_M,Deaths_M,Population_M,Crude Rate_M,Age Adjusted Rate_M,Total_deaths
0,,Female,F,"#Accidents (unintentional injuries) (V01-X59,Y...",GR113-112,262646.0,6001719.0,39.4,34.6,,Male,M,GR113-112,503411.0,6540284.0,77.5,76.2,766057.0
1,,Female,F,#Acute bronchitis and bronchiolitis (J20-J21),GR113-080,427.0,6001719.0,0.1,0.0,,Male,M,GR113-080,344.0,6540284.0,0.1,0.0,771.0
2,,Female,F,#Alzheimer disease (G30),GR113-052,342971.0,6001719.0,51.4,35.0,,Male,M,GR113-052,154188.0,6540284.0,23.7,24.4,497159.0
3,,Female,F,#Anemias (D50-D64),GR113-045,12160.0,6001719.0,1.8,1.3,,Male,M,GR113-045,9976.0,6540284.0,1.5,1.5,22136.0
4,,Female,F,#Aortic aneurysm and dissection (I71),GR113-073,15960.0,6001719.0,2.4,1.8,,Male,M,GR113-073,23221.0,6540284.0,3.6,3.3,39181.0


In [16]:
# sort by total death to get top causes of death (aligns with literature)
CDC_WONDER_NCHS_MERGE.sort_values(['Total_deaths'], ascending=[False]).head(20)

Unnamed: 0,Notes_F,Sex_F,Sex Code_F,ICD-10 113 Cause List,ICD-10 113 Cause List Code_F,Deaths_F,Population_F,Crude Rate_F,Age Adjusted Rate_F,Notes_M,Sex_M,Sex Code_M,ICD-10 113 Cause List Code_M,Deaths_M,Population_M,Crude Rate_M,Age Adjusted Rate_M,Total_deaths
80,,Female,F,Major cardiovascular diseases (I00-I78),GR113-053,1712252.0,6001719.0,256.6,181.2,,Male,M,GR113-053,1870625.0,6540284.0,288.0,268.2,3582877.0
18,,Female,F,"#Diseases of heart (I00-I09,I11,I13,I20-I51)",GR113-054,1227104.0,6001719.0,183.9,129.9,,Male,M,GR113-054,1479827.0,6540284.0,227.9,211.5,2706931.0
29,,Female,F,#Malignant neoplasms (C00-C97),GR113-019,1138608.0,6001719.0,170.6,126.7,,Male,M,GR113-019,1267830.0,6540284.0,195.2,172.9,2406438.0
56,,Female,F,All other diseases (Residual),GR113-111,842954.0,6001719.0,126.3,90.5,,Male,M,GR113-111,647363.0,6540284.0,99.7,94.7,1490317.0
78,,Female,F,Ischemic heart diseases (I20-I25),GR113-058,602205.0,6001719.0,90.2,63.6,,Male,M,GR113-058,882735.0,6540284.0,135.9,124.7,1484940.0
118,,Female,F,Other forms of chronic ischemic heart disease ...,GR113-061,419598.0,6001719.0,62.9,44.0,,Male,M,GR113-061,616089.0,6540284.0,94.9,87.8,1035687.0
119,,Female,F,Other heart diseases (I26-I51),GR113-064,472859.0,6001719.0,70.9,50.2,,Male,M,GR113-064,448771.0,6540284.0,69.1,65.7,921630.0
107,,Female,F,Other and unspecified infectious and parasitic...,GR113-018,357278.0,6001719.0,53.5,39.4,,Male,M,GR113-018,450310.0,6540284.0,69.3,63.6,807588.0
8,,Female,F,#COVID-19 (U07.1),GR113-137,338602.0,6001719.0,50.7,37.3,,Male,M,GR113-137,429122.0,6540284.0,66.1,60.6,767724.0
0,,Female,F,"#Accidents (unintentional injuries) (V01-X59,Y...",GR113-112,262646.0,6001719.0,39.4,34.6,,Male,M,GR113-112,503411.0,6540284.0,77.5,76.2,766057.0


In [17]:
#ensure age-adjusted death rates are numeric
CDC_WONDER_NCHS_MERGE["Age Adjusted Rate_F"] = pd.to_numeric(CDC_WONDER_NCHS_MERGE["Age Adjusted Rate_F"], errors="coerce")
CDC_WONDER_NCHS_MERGE["Age Adjusted Rate_M"] = pd.to_numeric(CDC_WONDER_NCHS_MERGE["Age Adjusted Rate_M"], errors="coerce")

In [18]:
#calculate relative mortality risk for females using age-adjusted death rates
CDC_WONDER_NCHS_MERGE["Relative Mortality Risk_F"] =CDC_WONDER_NCHS_MERGE["Age Adjusted Rate_F"]/CDC_WONDER_NCHS_MERGE["Age Adjusted Rate_M"]

In [19]:
#calculate relative mortality risk for males using age-adjusted death rates
CDC_WONDER_NCHS_MERGE["Relative Mortality Risk_M"]=CDC_WONDER_NCHS_MERGE["Age Adjusted Rate_M"]/CDC_WONDER_NCHS_MERGE["Age Adjusted Rate_F"]

In [20]:
#of deaths due to this cause in females/ total # of deaths in females ( Mortality burden)
CDC_WONDER_NCHS_MERGE["Female Mortality Burden"] = CDC_WONDER_NCHS_MERGE["Deaths_F"] /female_deaths

CDC_WONDER_NCHS_MERGE["Male Mortality Burden"]=CDC_WONDER_NCHS_MERGE["Deaths_M"] /male_deaths
CDC_WONDER_NCHS_MERGE

Unnamed: 0,Notes_F,Sex_F,Sex Code_F,ICD-10 113 Cause List,ICD-10 113 Cause List Code_F,Deaths_F,Population_F,Crude Rate_F,Age Adjusted Rate_F,Notes_M,...,ICD-10 113 Cause List Code_M,Deaths_M,Population_M,Crude Rate_M,Age Adjusted Rate_M,Total_deaths,Relative Mortality Risk_F,Relative Mortality Risk_M,Female Mortality Burden,Male Mortality Burden
0,,Female,F,"#Accidents (unintentional injuries) (V01-X59,Y...",GR113-112,262646.0,6001719.0,39.4,34.6,,...,GR113-112,503411.0,6540284.0,77.5,76.2,766057.0,0.454068,2.202312,0.043762,0.076971
1,,Female,F,#Acute bronchitis and bronchiolitis (J20-J21),GR113-080,427.0,6001719.0,0.1,0.0,,...,GR113-080,344.0,6540284.0,0.1,0.0,771.0,,,0.000071,0.000053
2,,Female,F,#Alzheimer disease (G30),GR113-052,342971.0,6001719.0,51.4,35.0,,...,GR113-052,154188.0,6540284.0,23.7,24.4,497159.0,1.434426,0.697143,0.057145,0.023575
3,,Female,F,#Anemias (D50-D64),GR113-045,12160.0,6001719.0,1.8,1.3,,...,GR113-045,9976.0,6540284.0,1.5,1.5,22136.0,0.866667,1.153846,0.002026,0.001525
4,,Female,F,#Aortic aneurysm and dissection (I71),GR113-073,15960.0,6001719.0,2.4,1.8,,...,GR113-073,23221.0,6540284.0,3.6,3.3,39181.0,0.545455,1.833333,0.002659,0.003550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,,Female,F,"Water, air and space, and other and unspecifie...",GR113-116,1300.0,6001719.0,0.2,0.2,,...,GR113-116,5395.0,6540284.0,0.8,0.8,6695.0,0.250000,4.000000,0.000217,0.000825
130,,,,#Hyperplasia of prostate (N40),,,6001719.0,,,,...,GR113-103,2636.0,6540284.0,0.4,0.4,2636.0,,,,0.000403
131,,,,#Malaria (B50-B54),,,6001719.0,,,,...,GR113-017,14.0,6540284.0,,,14.0,,,,0.000002
132,,,,"#Operations of war and their sequelae (Y36,Y89.1)",,,6001719.0,,,,...,GR113-134,47.0,6540284.0,0.0,0.0,47.0,,,,0.000007


### Export final dataset
Export dataset to discern conditions with high relative mortality risk amongst females

In [21]:
# the following notebook will reformat  and better organize this information to generate a table 1
CDC_WONDER_NCHS_MERGE.to_excel("Results/RelativeMortalityInitialOutput.xlsx")