In [6]:
#Code written by Victoria Dunkley

# Understanding the Reach and Impact of the Centers for Disease Control and Prevention’s Women’s Health Research, 2018–2023
#### Goal: Create a results table that will populate table 2 and the figure 1 in the publication

#### Data:
- Import: Datasets with indicator information created in the previous code notebook (3_Merge Indicator Information to SciClips)
- Export: __Bibliometric_Indicator_Table.xlsx__ ( this creates the base of table 2 )
  
#### In this notebook we will:
   1. Create a table shell.
   2. Create functions that calculate indicators for each subset of publications on conditions of interest.
   3. Populate the table shell
   4. Export table

In [7]:
## import modules
import numpy as np
import pandas as pd
import os
from IPython.display import Image
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
## Enable multiple outputs from jupyter cells: every bare expression in a cell is
## displayed, not only the last one. Later cells rely on this to show several values.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## disable the Pandas "setting a copy of a slice" warning
## NOTE(review): this silences the warning for the whole session, not just this notebook's filters.
pd.options.mode.chained_assignment = None

## set default number of DataFrame rows printed to 20
pd.set_option('display.max_rows', 20)

In [9]:
# Set the working directory to the repository root so the relative "Results/..."
# paths used below resolve.
# The notebook starts in the Code/ sub-folder, so we step up one level -- but only
# when the Results folder is not already visible. This keeps the cell idempotent:
# re-running it no longer climbs one directory further each time.
os.getcwd()
if not os.path.isdir('Results'):
    os.chdir('..')
os.getcwd()

'c:\\Users\\utu2\\OneDrive - CDC\\OS-OSQ-DataAnalytics - Documents\\Portfolio Analytics\\JWH Manuscript\\202509_REPO_for_GitHub_Share\\Code'

'c:\\Users\\utu2\\OneDrive - CDC\\OS-OSQ-DataAnalytics - Documents\\Portfolio Analytics\\JWH Manuscript\\202509_REPO_for_GitHub_Share'

In [10]:
# SciClips: publications plus indicators (row 'All CDC Publications' in the results table)
df_sciclips = pd.read_excel(
    io="Results/Bibliometrics/SciClips Publication plus Indicators/20250116_2018_2023_SciClips_Publications_SciClips_Indicators.xlsx"
)
# Column names, non-null counts and dtypes of the loaded data
df_sciclips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20178 entries, 0 to 20177
Data columns (total 27 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         20178 non-null  int64         
 1   Authors                            20178 non-null  object        
 2   CDC Authors                        20178 non-null  object        
 3   Title                              20178 non-null  object        
 4   Abstract                           20178 non-null  object        
 5   Year                               20178 non-null  int64         
 6   Journal                            20178 non-null  object        
 7   Volume                             19790 non-null  object        
 8   Issue                              17213 non-null  object        
 9   Pages                              18521 non-null  object        
 10  PMID                              

In [11]:
# Alzheimer's disease: publications plus indicators
df_alz = pd.read_excel(
    io="Results/Bibliometrics/SciClips Publication plus Indicators/20250116_2018_2023_Alzheimer_Publications_SciClips_Indicators.xlsx"
)
# Column names, non-null counts and dtypes of the loaded data
df_alz.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         35 non-null     int64         
 1   Authors                            35 non-null     object        
 2   CDC Authors                        35 non-null     object        
 3   Title_x                            35 non-null     object        
 4   Abstract                           35 non-null     object        
 5   Year                               35 non-null     int64         
 6   Journal                            35 non-null     object        
 7   Volume                             35 non-null     object        
 8   Issue                              29 non-null     object        
 9   Pages                              33 non-null     object        
 10  PMID                               35 no

In [12]:
# Breast cancer: publications plus indicators
df_breastcancer = pd.read_excel(
    io="Results/Bibliometrics/SciClips Publication plus Indicators/20250116_2018_2023_BreastCancer_Publications_SciClips_Indicators.xlsx"
)
# Column names, non-null counts and dtypes of the loaded data
df_breastcancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         146 non-null    int64         
 1   Authors                            146 non-null    object        
 2   CDC Authors                        146 non-null    object        
 3   Title_x                            146 non-null    object        
 4   Abstract                           146 non-null    object        
 5   Year                               146 non-null    int64         
 6   Journal                            146 non-null    object        
 7   Volume                             146 non-null    object        
 8   Issue                              117 non-null    object        
 9   Pages                              141 non-null    object        
 10  PMID                               145

In [13]:
# Infections of kidney: publications plus indicators
df_kidney = pd.read_excel(
    io="Results/Bibliometrics/SciClips Publication plus Indicators/20250116_2018_2023_Kidney_Publications_SciClips_Indicators.xlsx"
)
# Column names, non-null counts and dtypes of the loaded data
df_kidney.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         24 non-null     int64         
 1   Authors                            24 non-null     object        
 2   CDC Authors                        24 non-null     object        
 3   Title_x                            24 non-null     object        
 4   Abstract                           24 non-null     object        
 5   Year                               24 non-null     int64         
 6   Journal                            24 non-null     object        
 7   Volume                             24 non-null     int64         
 8   Issue                              21 non-null     float64       
 9   Pages                              22 non-null     object        
 10  PMID                               23 no

In [14]:
# Rheumatic fever / rheumatic heart disease: publications plus indicators
df_rhu = pd.read_excel(
    io="Results/Bibliometrics/SciClips Publication plus Indicators/20250116_2018_2023_Rheumatic_Publications_SciClips_Indicators.xlsx"
)
# Column names, non-null counts and dtypes of the loaded data
df_rhu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         14 non-null     int64         
 1   Authors                            14 non-null     object        
 2   CDC Authors                        14 non-null     object        
 3   Title_x                            14 non-null     object        
 4   Abstract                           14 non-null     object        
 5   Year                               14 non-null     int64         
 6   Journal                            14 non-null     object        
 7   Volume                             14 non-null     int64         
 8   Issue                              14 non-null     int64         
 9   Pages                              14 non-null     object        
 10  PMID                               14 no

In [None]:


# Create initial table shell: one row per condition, one empty column per indicator.

# Row labels, in the row order (0-4) that the populate cells below address with .loc
disease_conditions = [
    "Alzheimer's Disease",
    'Breast Cancer',
    'Acute Rheumatic Fever and Chronic Rheumatic Heart Diseases',
    'Infections of Kidney',
    'All CDC Publications',
]

# Indicator columns, each listed exactly once (the original dict literal repeated
# the policy-citation key, which Python silently collapses into a single entry).
indicator_columns = [
    'Percent with any Media Attention, Altmetric',
    'Median Media Attention, Altmetric',
    '25th_Percentile Media Attention, Altmetric',
    '75th_Percentile Media Attention, Altmetric',
    'Percent with any Academic Citation, Altmetric',
    'Median Academic Citation, Altmetric',
    '25th_Percentile Academic Citation, Altmetric',
    '75th_Percentile Academic Citation, Altmetric',
    'Percent with any Policy Citation, Altmetric, BMJ',
    'Number of Publications, 2018-2023',
    'Number of Publications, 2018-2020',
]

# Placeholder columns are sized from the condition list instead of a hard-coded 5,
# so adding a condition cannot leave the columns with mismatched lengths.
bib_results_dict = {'Disease Condition': disease_conditions}
bib_results_dict.update(
    {column: [None] * len(disease_conditions) for column in indicator_columns}
)

# Create the DataFrame from the dictionary
bib_results = pd.DataFrame(bib_results_dict)

# Display the created DataFrame
bib_results

Unnamed: 0,Disease Condition,"Percent with any Media Attention, Altmetric","Median Media Attention, Altmetric","25th_Percentile Media Attention, Altmetric","75th_Percentile Media Attention, Altmetric","Percent with any Academic Citation, Altmetric","Median Academic Citation, Altmetric","25th_Percentile Academic Citation, Altmetric","75th_Percentile Academic Citation, Altmetric","Percent with any Policy Citation, Altmetric, BMJ","Number of Publications, 2018-2023","Number of Publications, 2018-2020"
0,Alzheimer's Disease,,,,,,,,,,,
1,Breast Cancer,,,,,,,,,,,
2,Acute Rheumatic Fever and Chronic Rheumatic He...,,,,,,,,,,,
3,Infections of Kidney,,,,,,,,,,,
4,All CDC Publications,,,,,,,,,,,


In [16]:
# Show (rows, columns) of each loaded dataset. Each bare `shape` line is displayed
# separately because ast_node_interactivity is set to "all" above.
# Output order: SciClips, Alzheimer's, breast cancer, kidney, rheumatic.
shape=df_sciclips.shape
shape
shape=df_alz.shape
shape
shape=df_breastcancer.shape
shape
shape=df_kidney.shape
shape
shape=df_rhu.shape
shape

(20178, 27)

(35, 25)

(146, 25)

(24, 25)

(14, 25)

In [17]:
#function for the number of publications on each topic from 2018-2023
def numpub20182023(condition_df):
    """Return the number of rows (one per publication) in ``condition_df``.

    No year filtering is done here; the caller is expected to pass the
    dataset that covers the intended period.
    """
    return len(condition_df)


In [18]:
# Populate the table with Number of Publications, 2018-2023.
# The tuple order matches the table's row order (0-4):
# Alzheimer's, breast cancer, rheumatic, kidney, all CDC publications.
for row_idx, condition_df in enumerate(
    (df_alz, df_breastcancer, df_rhu, df_kidney, df_sciclips)
):
    bib_results.loc[row_idx, 'Number of Publications, 2018-2023'] = numpub20182023(condition_df)

In [19]:
# Display the table after the 2018-2023 publication counts have been filled in.
bib_results

Unnamed: 0,Disease Condition,"Percent with any Media Attention, Altmetric","Median Media Attention, Altmetric","25th_Percentile Media Attention, Altmetric","75th_Percentile Media Attention, Altmetric","Percent with any Academic Citation, Altmetric","Median Academic Citation, Altmetric","25th_Percentile Academic Citation, Altmetric","75th_Percentile Academic Citation, Altmetric","Percent with any Policy Citation, Altmetric, BMJ","Number of Publications, 2018-2023","Number of Publications, 2018-2020"
0,Alzheimer's Disease,,,,,,,,,,35,
1,Breast Cancer,,,,,,,,,,146,
2,Acute Rheumatic Fever and Chronic Rheumatic He...,,,,,,,,,,14,
3,Infections of Kidney,,,,,,,,,,24,
4,All CDC Publications,,,,,,,,,,20178,


In [None]:
# create a function that calculates the percent of publications (by condition) that received any (greater than 0) media attention (parsed from the Altmetric Attention Score)
def any_attention(condition_df, column_name='Altmetric Attention Score'):
    """Return the share of rows whose ``column_name`` value is greater than zero.

    The result is a proportion between 0 and 1 (not multiplied by 100),
    rounded to two decimals. Every row of ``condition_df`` counts in the
    denominator, including rows where the comparison is false.
    An empty frame yields 0.
    """
    has_attention = condition_df[column_name] > 0
    n_with_attention = has_attention.sum()
    n_publications = len(condition_df)

    # Guard against dividing by zero when there are no publications at all.
    if n_publications == 0:
        return 0

    share = n_with_attention / n_publications
    return round(share, 2)

# create a function that calculates median attention (by condition)
def median_attention(condition_df, column_name='Altmetric Attention Score'):
    """Return the median of the values in ``condition_df[column_name]``."""
    scores = condition_df[column_name]
    return scores.median()
# create a function that calculates 25th percentile attention (by condition)
def percentile25_attention(condition_df, column_name='Altmetric Attention Score'):
    """Return the 25th percentile (0.25 quantile) of ``condition_df[column_name]``."""
    scores = condition_df[column_name]
    return scores.quantile(0.25)

# create a function that calculates 75th percentile attention  (by condition)
def percentile75_attention(condition_df, column_name='Altmetric Attention Score'):
    """Return the 75th percentile (0.75 quantile) of ``condition_df[column_name]``."""
    scores = condition_df[column_name]
    return scores.quantile(0.75)

In [None]:
# Populate the share of publications with any media attention.
# The tuple order matches the table's row order (0-4):
# Alzheimer's, breast cancer, rheumatic, kidney, all CDC publications.
for row_idx, condition_df in enumerate(
    (df_alz, df_breastcancer, df_rhu, df_kidney, df_sciclips)
):
    bib_results.loc[row_idx, 'Percent with any Media Attention, Altmetric'] = any_attention(condition_df)

In [None]:
# Populate the median media attention.
# The tuple order matches the table's row order (0-4):
# Alzheimer's, breast cancer, rheumatic, kidney, all CDC publications.
for row_idx, condition_df in enumerate(
    (df_alz, df_breastcancer, df_rhu, df_kidney, df_sciclips)
):
    bib_results.loc[row_idx, 'Median Media Attention, Altmetric'] = median_attention(condition_df)

In [None]:
# Populate the 25th percentile of media attention.
# The tuple order matches the table's row order (0-4):
# Alzheimer's, breast cancer, rheumatic, kidney, all CDC publications.
for row_idx, condition_df in enumerate(
    (df_alz, df_breastcancer, df_rhu, df_kidney, df_sciclips)
):
    bib_results.loc[row_idx, '25th_Percentile Media Attention, Altmetric'] = percentile25_attention(condition_df)

In [None]:
# Populate the 75th percentile of media attention.
# The tuple order matches the table's row order (0-4):
# Alzheimer's, breast cancer, rheumatic, kidney, all CDC publications.
for row_idx, condition_df in enumerate(
    (df_alz, df_breastcancer, df_rhu, df_kidney, df_sciclips)
):
    bib_results.loc[row_idx, '75th_Percentile Media Attention, Altmetric'] = percentile75_attention(condition_df)

In [25]:
# Display the table after the media attention indicators have been filled in.
bib_results

Unnamed: 0,Disease Condition,"Percent with any Media Attention, Altmetric","Median Media Attention, Altmetric","25th_Percentile Media Attention, Altmetric","75th_Percentile Media Attention, Altmetric","Percent with any Academic Citation, Altmetric","Median Academic Citation, Altmetric","25th_Percentile Academic Citation, Altmetric","75th_Percentile Academic Citation, Altmetric","Percent with any Policy Citation, Altmetric, BMJ","Number of Publications, 2018-2023","Number of Publications, 2018-2020"
0,Alzheimer's Disease,0.91,14.0,6.0,69.5,,,,,,35,
1,Breast Cancer,0.85,6.5,1.0,38.25,,,,,,146,
2,Acute Rheumatic Fever and Chronic Rheumatic He...,0.93,14.5,3.5,97.0,,,,,,14,
3,Infections of Kidney,0.83,4.0,1.0,28.75,,,,,,24,
4,All CDC Publications,0.85,4.0,1.0,19.0,,,,,,20178,


### Academic and Policy indicators must be calculated using older publications so they have had time to garner citations. The following indicators will be calculated using a subset of publications (2018–2020).

In [None]:
# Academic indicators+ Year filter

# create a function that filters year
def year_subset(condition_df, column_name='Year', start_year=2018, end_year=2020):
    """Return the rows of ``condition_df`` whose year lies in a closed interval.

    Parameters
    ----------
    condition_df : pandas.DataFrame
        Publications to filter.
    column_name : str, default 'Year'
        Column holding the publication year.
    start_year, end_year : int, default 2018 and 2020
        Inclusive bounds of the window. The defaults reproduce the 2018-2020
        subset used for the academic and policy indicators.

    Returns
    -------
    pandas.DataFrame
        The matching rows; the input frame itself is not modified.
    """
    years = condition_df[column_name]
    # Both bounds are inclusive.
    in_window = (years >= start_year) & (years <= end_year)
    return condition_df[in_window]
# create a function that calculates the % of publications with any academic citations (parsed from Dimensions citations)
def academic_citation(condition_df, column_name='Number of Dimensions citations'):
    """Return the share of rows whose ``column_name`` value is greater than zero.

    The result is a proportion between 0 and 1 (not multiplied by 100),
    rounded to two decimals. Every row of ``condition_df`` counts in the
    denominator, including rows where the comparison is false.
    An empty frame yields 0.
    """
    is_cited = condition_df[column_name] > 0
    n_cited = is_cited.sum()
    n_publications = len(condition_df)

    # Guard against dividing by zero when there are no publications at all.
    if n_publications == 0:
        return 0

    share = n_cited / n_publications
    return round(share, 2)
# create a function that calculates median academic citation
def median_academic_citation(condition_df, column_name='Number of Dimensions citations'):
    """Return the median of the values in ``condition_df[column_name]``."""
    citation_counts = condition_df[column_name]
    return citation_counts.median()

# create a function that calculates 25th percentile academic citation
def percentile25_academic(condition_df, column_name='Number of Dimensions citations'):
    """Return the 25th percentile (0.25 quantile) of ``condition_df[column_name]``."""
    citation_counts = condition_df[column_name]
    return citation_counts.quantile(0.25)

# create a function that calculates 75th percentile academic citation
def percentile75_academic(condition_df, column_name='Number of Dimensions citations'):
    """Return the 75th percentile (0.75 quantile) of ``condition_df[column_name]``."""
    citation_counts = condition_df[column_name]
    return citation_counts.quantile(0.75)



In [None]:
# Filter every dataset on year (year_subset keeps 2018-2020 by default).
# NOTE: the df_* names are rebound to the subset, so from here on they no longer
# hold the full 2018-2023 data. Re-running the earlier full-period cells after
# this one would compute them on the subset.
df_sciclips, df_alz, df_breastcancer, df_kidney, df_rhu = (
    year_subset(full_period_df)
    for full_period_df in (df_sciclips, df_alz, df_breastcancer, df_kidney, df_rhu)
)

In [None]:
# Show (rows, columns) of each dataset after the year filter. Each bare `shape`
# line is displayed separately because ast_node_interactivity is set to "all".
# Output order: SciClips, Alzheimer's, breast cancer, kidney, rheumatic.
shape=df_sciclips.shape
shape
shape=df_alz.shape
shape
shape=df_breastcancer.shape
shape
shape=df_kidney.shape
shape
shape=df_rhu.shape
shape

(10268, 27)

(14, 25)

(82, 25)

(9, 25)

(4, 25)

In [None]:
#Number of Publications, 2018-2020
def numpub20182020(condition_df):
    """Return the number of rows (one per publication) in ``condition_df``.

    The function itself does not filter on year; it counts whatever rows it
    is given, so the caller must pass a frame already restricted to 2018-2020.
    """
    return len(condition_df)


In [None]:
# Populate the Number of Publications, 2018-2020 (the df_* frames are year-filtered above).
# The tuple order matches the table's row order (0-4):
# Alzheimer's, breast cancer, rheumatic, kidney, all CDC publications.
for row_idx, condition_df in enumerate(
    (df_alz, df_breastcancer, df_rhu, df_kidney, df_sciclips)
):
    bib_results.loc[row_idx, 'Number of Publications, 2018-2020'] = numpub20182020(condition_df)

In [None]:
# Populate the share of publications with any academic citation.
# The tuple order matches the table's row order (0-4):
# Alzheimer's, breast cancer, rheumatic, kidney, all CDC publications.
for row_idx, condition_df in enumerate(
    (df_alz, df_breastcancer, df_rhu, df_kidney, df_sciclips)
):
    bib_results.loc[row_idx, 'Percent with any Academic Citation, Altmetric'] = academic_citation(condition_df)

In [None]:
# Populate the median academic citation count.
# The tuple order matches the table's row order (0-4):
# Alzheimer's, breast cancer, rheumatic, kidney, all CDC publications.
for row_idx, condition_df in enumerate(
    (df_alz, df_breastcancer, df_rhu, df_kidney, df_sciclips)
):
    bib_results.loc[row_idx, 'Median Academic Citation, Altmetric'] = median_academic_citation(condition_df)

In [None]:
# Populate the 25th percentile of academic citations.
# The tuple order matches the table's row order (0-4):
# Alzheimer's, breast cancer, rheumatic, kidney, all CDC publications.
for row_idx, condition_df in enumerate(
    (df_alz, df_breastcancer, df_rhu, df_kidney, df_sciclips)
):
    bib_results.loc[row_idx, '25th_Percentile Academic Citation, Altmetric'] = percentile25_academic(condition_df)

In [None]:
# Populate the 75th percentile of academic citations.
# The tuple order matches the table's row order (0-4):
# Alzheimer's, breast cancer, rheumatic, kidney, all CDC publications.
for row_idx, condition_df in enumerate(
    (df_alz, df_breastcancer, df_rhu, df_kidney, df_sciclips)
):
    bib_results.loc[row_idx, '75th_Percentile Academic Citation, Altmetric'] = percentile75_academic(condition_df)

In [37]:
# Display the table after the 2018-2020 counts and academic citation indicators have been filled in.
bib_results

Unnamed: 0,Disease Condition,"Percent with any Media Attention, Altmetric","Median Media Attention, Altmetric","25th_Percentile Media Attention, Altmetric","75th_Percentile Media Attention, Altmetric","Percent with any Academic Citation, Altmetric","Median Academic Citation, Altmetric","25th_Percentile Academic Citation, Altmetric","75th_Percentile Academic Citation, Altmetric","Percent with any Policy Citation, Altmetric, BMJ","Number of Publications, 2018-2023","Number of Publications, 2018-2020"
0,Alzheimer's Disease,0.91,14.0,6.0,69.5,0.93,33.0,14.0,75.75,,35,14
1,Breast Cancer,0.85,6.5,1.0,38.25,0.89,17.0,7.0,30.5,,146,82
2,Acute Rheumatic Fever and Chronic Rheumatic He...,0.93,14.5,3.5,97.0,0.75,26.5,16.5,59.25,,14,4
3,Infections of Kidney,0.83,4.0,1.0,28.75,1.0,21.0,13.0,31.0,,24,9
4,All CDC Publications,0.85,4.0,1.0,19.0,0.95,15.0,6.0,34.0,,20178,10268


In [38]:
# Cross-check: recompute the academic-citation share for each dataset and compare
# by eye with the table above. Each call is displayed separately because
# ast_node_interactivity is set to "all".
# Note the order here (SciClips, Alzheimer's, breast cancer, kidney, rheumatic)
# differs from the table's row order.
academic_citation(df_sciclips)
academic_citation(df_alz)
academic_citation(df_breastcancer)
academic_citation(df_kidney)
academic_citation(df_rhu)

0.95

0.93

0.89

1.0

0.75

In [None]:
# Policy indicator flag
def any_policy(condition_df, column_name='Any Policy'):
    """Return the share of rows whose ``column_name`` value is greater than zero.

    The result is a proportion between 0 and 1 (not multiplied by 100),
    rounded to two decimals. Every row of ``condition_df`` counts in the
    denominator, including rows where the comparison is false.
    An empty frame yields 0.
    """
    has_policy_citation = condition_df[column_name] > 0
    n_with_policy = has_policy_citation.sum()
    n_publications = len(condition_df)

    # Guard against dividing by zero when there are no publications at all.
    if n_publications == 0:
        return 0

    share = n_with_policy / n_publications
    return round(share, 2)

In [None]:
# Calculate the share with any policy citation, using data from Altmetric or BMJ.
# The tuple order matches the table's row order (0-4):
# Alzheimer's, breast cancer, rheumatic, kidney, all CDC publications.
for row_idx, condition_df in enumerate(
    (df_alz, df_breastcancer, df_rhu, df_kidney, df_sciclips)
):
    bib_results.loc[row_idx, 'Percent with any Policy Citation, Altmetric, BMJ'] = any_policy(condition_df)
bib_results

Unnamed: 0,Disease Condition,"Percent with any Media Attention, Altmetric","Median Media Attention, Altmetric","25th_Percentile Media Attention, Altmetric","75th_Percentile Media Attention, Altmetric","Percent with any Academic Citation, Altmetric","Median Academic Citation, Altmetric","25th_Percentile Academic Citation, Altmetric","75th_Percentile Academic Citation, Altmetric","Percent with any Policy Citation, Altmetric, BMJ","Number of Publications, 2018-2023","Number of Publications, 2018-2020"
0,Alzheimer's Disease,0.91,14.0,6.0,69.5,0.93,33.0,14.0,75.75,0.5,35,14
1,Breast Cancer,0.85,6.5,1.0,38.25,0.89,17.0,7.0,30.5,0.46,146,82
2,Acute Rheumatic Fever and Chronic Rheumatic He...,0.93,14.5,3.5,97.0,0.75,26.5,16.5,59.25,0.5,14,4
3,Infections of Kidney,0.83,4.0,1.0,28.75,1.0,21.0,13.0,31.0,0.22,24,9
4,All CDC Publications,0.85,4.0,1.0,19.0,0.95,15.0,6.0,34.0,0.49,20178,10268


In [41]:
# Cross-check: recompute the policy-citation share for each dataset and compare
# by eye with the table above. Each call is displayed separately because
# ast_node_interactivity is set to "all".
# Note the order here (SciClips, Alzheimer's, breast cancer, kidney, rheumatic)
# differs from the table's row order.
any_policy(df_sciclips)
any_policy(df_alz)
any_policy(df_breastcancer)
any_policy(df_kidney)
any_policy(df_rhu)

0.49

0.5

0.46

0.22

0.5

In [None]:
# Export the completed indicator table (the base of Table 2) to Excel.
bib_results.to_excel(
    excel_writer="Results/Bibliometric_Indicator_Table.xlsx"
)