In [1]:
#Code written by Victoria Dunkley

# Understanding the Reach and Impact of the Centers for Disease Control and Prevention’s Women’s Health Research, 2018–2023
#### Code Notebook Goal: The goal of this notebook is to merge CDC-authored publications on conditions of interest with media, academic, and policy data queried from Altmetric and BMJ Impact Analytics.

##### **Conditions of Interest (identified using female relative mortality risk)**:
 - Alzheimer disease
 - Breast Cancer
 - Infections of the Kidney
 - Acute rheumatic fever and chronic rheumatic heart diseases

  **Bibliometric indicators for all CDC-authored publications published within the same timeframe will be calculated as a baseline comparison**

##### Data Sources Explained:
 1. __CDC Science Clips__: CDC-authored publications on conditions of interest were identified by CDC librarians using CDC Science Clips (2018-2023).
    - Publications returned using systematic search terms were accessed from CDC Science Clips on September 24, 2024.      
 2. __Altmetric__: data pulled from Altmetric will be used to calculate media attention, academic citation, and policy citation metrics.
    - Bibliometric indicators were calculated using data from publications indexed in Altmetric and BMJ Impact Analytics accessed on December 16, 2024.
 3. __BMJ Impact Analytics__: data pulled from BMJ will be used to calculate the policy citation metric.
    - Bibliometric indicators were calculated using data from publications indexed in Altmetric and BMJ Impact Analytics accessed on December 16, 2024.


In [2]:
## import modules
import numpy as np
import pandas as pd
import os
from IPython.display import Image
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
## Enable multiple outputs from jupyter cells
# With "all", every top-level expression in a cell is displayed rather than only the last one
# (e.g. both os.getcwd() calls in the working-directory cell below show their value).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## silence the Pandas "setting a copy of a slice" (chained-assignment) warning
pd.set_option("mode.chained_assignment", None)

## show at most 20 rows whenever a DataFrame is displayed
pd.options.display.max_rows = 20

In [4]:
# Move from the notebook's folder up to the repository root so that the relative
# "Results/..." paths used by the cells below resolve.
# The chdir is guarded: once the "Results" folder is visible from the current directory the
# cell is a no-op, so re-running it no longer climbs one directory higher on every execution.
os.getcwd()
if not os.path.isdir("Results"):
    os.chdir('..')
os.getcwd()

'c:\\Users\\utu2\\OneDrive - CDC\\OS-OSQ-DataAnalytics - Documents\\Portfolio Analytics\\JWH Manuscript\\202509_REPO_for_GitHub_Share\\Code'

'c:\\Users\\utu2\\OneDrive - CDC\\OS-OSQ-DataAnalytics - Documents\\Portfolio Analytics\\JWH Manuscript\\202509_REPO_for_GitHub_Share'

### Science Clips quality check, cleaning, and combining

#### Data Import

In [5]:
# Science Clips data: one workbook for the full CDC-authored baseline set and one per
# condition of interest. The workbooks are read in the order listed and bound to the
# names on the left in that same order.
(
    df_sciclips_total,
    df_alz_total,
    df_breastcancer_total,
    df_kidney_total,
    df_rhu_total,
) = (
    pd.read_excel(workbook)
    for workbook in (
        "Results/Bibliometrics/SciClips Publications/2018_2023_SciClips_Publications_SciClips.xlsx",
        "Results/Bibliometrics/SciClips Publications/2018_2023_Alzheimer_Publications_SciClips.xlsx",
        "Results/Bibliometrics/SciClips Publications/2018_2023_BreastCancer_Publications_SciClips.xlsx",
        "Results/Bibliometrics/SciClips Publications/2018_2023_Kidney_Publications_SciClips.xlsx",
        "Results/Bibliometrics/SciClips Publications/2018_2023_Rheumatic_Publications_SciClips.xlsx",
    )
)

In [6]:
# Altmetric data: exports for the baseline set and for each condition of interest.
# The workbooks are read in the order listed and bound to the names on the left in that order.
(
    df_sciclips_alt,
    df_alz_alt,
    df_breastcancer_alt,
    df_kidney_alt,
    df_rhu_alt,
) = (
    pd.read_excel(workbook)
    for workbook in (
        "Results/Bibliometrics/Altmetric Data Pull/Altmetric_20241216_2018_2023_SciClips_Publications_SciClips.xlsx",
        "Results/Bibliometrics/Altmetric Data Pull/Altmetric_20241216_2018_2023_Alzheimers_Publications_SciClips.xlsx",
        "Results/Bibliometrics/Altmetric Data Pull/Altmetric_20241216_2018_2023_BreastCancer_Publications_SciClips.xlsx",
        "Results/Bibliometrics/Altmetric Data Pull/Altmetric_20241216_2018_2023_Kidney_Publications_SciClips.xlsx",
        "Results/Bibliometrics/Altmetric Data Pull/Altmetric_20241216_2018_2023_Rheumatic_Publications_SciClips.xlsx",
    )
)

In [7]:
# BMJ Impact Analytics data: exports for the baseline set and for each condition of interest.
# NOTE(review): these file names say 2018_2020 while the Science Clips and Altmetric files
# say 2018_2023 - confirm the BMJ pull covers the full study period.
(
    df_sciclips_bmj,
    df_alz_bmj,
    df_breastcancer_bmj,
    df_kidney_bmj,
    df_rhu_bmj,
) = (
    pd.read_excel(workbook)
    for workbook in (
        "Results/Bibliometrics/BMJ Data Pull/BMJ_20241216_2018_2020_SciClips_Publications_SciClips.xlsx",
        "Results/Bibliometrics/BMJ Data Pull/BMJ_20241216_2018_2020_Alzheimers_Publications_SciClips.xlsx",
        "Results/Bibliometrics/BMJ Data Pull/BMJ_20241216_2018_2020_BreastCancer_Publications_SciClips.xlsx",
        "Results/Bibliometrics/BMJ Data Pull/BMJ_20241216_2018_2020_Kidney_Publications_SciClips.xlsx",
        "Results/Bibliometrics/BMJ Data Pull/BMJ_20241216_2018_2020_Rheumatic_Publications_SciClips.xlsx",
    )
)

#### Merge, Filter, and restructure Science Clips and bibliometric indicator data

In [8]:
#Examine shape
# Prints the row count, column names, non-null counts and dtypes of the baseline Science Clips set.
df_sciclips_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20178 entries, 0 to 20177
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         20178 non-null  int64         
 1   Authors                            20178 non-null  object        
 2   CDC Authors                        20178 non-null  object        
 3   Title                              20178 non-null  object        
 4   Abstract                           20178 non-null  object        
 5   Year                               20178 non-null  int64         
 6   Journal                            20178 non-null  object        
 7   Volume                             19790 non-null  object        
 8   Issue                              17213 non-null  object        
 9   Pages                              18521 non-null  object        
 10  PMID                              

In [9]:
#Examine shape
# Prints the row count, column names, non-null counts and dtypes of the baseline Altmetric export.
df_sciclips_alt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19803 entries, 0 to 19802
Data columns (total 50 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   Altmetric Attention Score       19803 non-null  int64         
 1   Title                           19803 non-null  object        
 2   Journal/Collection Title        19778 non-null  object        
 3   Journal ISSNs                   18900 non-null  object        
 4   Authors at my Institution       18533 non-null  object        
 5   Departments                     3881 non-null   object        
 6   Output Type                     19803 non-null  object        
 7   OA Status                       19803 non-null  bool          
 8   OA Type                         19802 non-null  object        
 9   Subjects (FoR)                  19712 non-null  object        
 10  Sustainable Development Goals   6691 non-null   object        
 11  Af

In [10]:
# Preview the first rows of the Altmetric export
df_sciclips_alt.head()

Unnamed: 0,Altmetric Attention Score,Title,Journal/Collection Title,Journal ISSNs,Authors at my Institution,Departments,Output Type,OA Status,OA Type,Subjects (FoR),...,Q&A mentions,Video mentions,Clinical guidelines mentions,Bluesky mentions,Syllabi mentions,Number of Mendeley readers,Number of Dimensions citations,Details Page URL,Badge URL,Publisher Names
0,23250,Aerosol and Surface Stability of SARS-CoV-2 as...,New England Journal of Medicine,"0028-4793, 1533-4406","EOC, JICCLEARN/A; Holbrook, M. G.; EOC, JICCLE...",DVH; NCHHSTP; CDC; IOD,Article,True,green,32 Biomedical and Clinical Sciences; 42 Health...,...,6,18,34,6,0,11319,7952,https://www.altmetric.com/details/77699394,https://api.altmetric.com/v1/donut/77699394_24...,Massachusetts Medical Society; StatRef
1,19423,Preliminary Findings of mRNA Covid-19 Vaccine ...,New England Journal of Medicine,"0028-4793, 1533-4406","Olson, C. K.; Moro, P. L.; EOC, JICCLEARN/A; G...",,Article,True,green,32 Biomedical and Clinical Sciences; 3215 Repr...,...,1,5,1,0,0,1275,790,https://www.altmetric.com/details/104429885,https://api.altmetric.com/v1/donut/104429885_2...,Massachusetts Medical Society; StatRef
2,18273,"Outbreak of SARS-CoV-2 Infections, Including C...",MMWR: Morbidity & Mortality Weekly Report,"0149-2195, 1545-861X","Schubert, P. L.; Gharpure, R.; Brock-Fisher, T...",,Article,True,gold,32 Biomedical and Clinical Sciences; 3202 Clin...,...,0,5,0,5,0,394,509,https://www.altmetric.com/details/110687946,https://api.altmetric.com/v1/donut/110687946_2...,
3,17864,Myocarditis Cases Reported After mRNA-Based CO...,JAMA: Journal of the American Medical Association,"0098-7484, 1538-3598","Thompson, D. L.; Dendy, J. M.; Woo, J.; Ruberg...",,Article,True,bronze,42 Health Sciences; 4203 Health Services and S...,...,0,11,2,4,0,436,514,https://www.altmetric.com/details/121557704,https://api.altmetric.com/v1/donut/121557704_2...,American Medical Association; StatRef
4,13289,Laboratory-Confirmed COVID-19 Among Adults Hos...,MMWR: Morbidity & Mortality Weekly Report,"0149-2195, 1545-861X","Bozio, C. H.; Rao, S.; Azziz-Baumgartner, E.; ...",,Article,True,gold,32 Biomedical and Clinical Sciences; 3202 Clin...,...,0,4,1,2,0,142,98,https://www.altmetric.com/details/115962066,https://api.altmetric.com/v1/donut/115962066_2...,


In [11]:
# Keep every Altmetric record except the one whose details page is listed below.
# NOTE(review): the reason for excluding this record is not recorded in the notebook - please document it.
df_sciclips_alt = df_sciclips_alt.loc[
    df_sciclips_alt['Details Page URL'].ne('https://www.altmetric.com/details/72325352')
]

In [12]:
# Normalise DOI case in all three baseline frames so the DOI joins below do not miss
# publications whose DOI differs only in letter case between sources.
df_sciclips_bmj = df_sciclips_bmj.assign(DOI=df_sciclips_bmj['DOI'].str.lower())
df_sciclips_total = df_sciclips_total.assign(DOI=df_sciclips_total['DOI'].str.lower())
df_sciclips_alt = df_sciclips_alt.assign(DOI=df_sciclips_alt['DOI'].str.lower())

In [13]:
#merge in altmetric info
# First-pass match: attach Altmetric columns to each Science Clips publication by DOI.
# pandas joins null keys to each other, so an Altmetric row without a DOI would be attached
# to every publication that also lacks a DOI. Those rows are dropped from the right side here
# (the source frame is left untouched, so they remain available to the PMID pass below).
df_clips_DOI = pd.merge(
    df_sciclips_total,
    df_sciclips_alt.dropna(subset=['DOI']),
    how='left',
    on='DOI',
)

In [14]:
#get post merge dataset shape
# Compare the row count with the pre-merge Science Clips frame: a larger count would mean
# the left join duplicated publications.
df_clips_DOI.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20178 entries, 0 to 20177
Data columns (total 68 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         20178 non-null  int64         
 1   Authors                            20178 non-null  object        
 2   CDC Authors                        20178 non-null  object        
 3   Title_x                            20178 non-null  object        
 4   Abstract                           20178 non-null  object        
 5   Year                               20178 non-null  int64         
 6   Journal                            20178 non-null  object        
 7   Volume                             19790 non-null  object        
 8   Issue                              17213 non-null  object        
 9   Pages                              18521 non-null  object        
 10  PMID                              

In [15]:
# Keep only the publications that found an Altmetric record by DOI, i.e. rows where the
# Altmetric 'Details Page URL' column was filled in by the merge.
df_clips_DOI = df_clips_DOI.loc[df_clips_DOI['Details Page URL'].notna()]

In [16]:
# Row count of the DOI-matched publications
df_clips_DOI.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19810 entries, 0 to 20177
Data columns (total 68 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         19810 non-null  int64         
 1   Authors                            19810 non-null  object        
 2   CDC Authors                        19810 non-null  object        
 3   Title_x                            19810 non-null  object        
 4   Abstract                           19810 non-null  object        
 5   Year                               19810 non-null  int64         
 6   Journal                            19810 non-null  object        
 7   Volume                             19426 non-null  object        
 8   Issue                              16872 non-null  object        
 9   Pages                              18170 non-null  object        
 10  PMID                               1900

In [17]:
# Reduce the Science Clips frame to the publications that were NOT matched by DOI;
# these are the candidates for the PMID-based match below.
matched_by_doi = df_sciclips_total['DOI'].isin(df_clips_DOI['DOI'])
df_sciclips_total = df_sciclips_total.loc[~matched_by_doi]

In [18]:
# Row count of the publications still without an Altmetric match
df_sciclips_total.info()

<class 'pandas.core.frame.DataFrame'>
Index: 368 entries, 19 to 20161
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         368 non-null    int64         
 1   Authors                            368 non-null    object        
 2   CDC Authors                        368 non-null    object        
 3   Title                              368 non-null    object        
 4   Abstract                           368 non-null    object        
 5   Year                               368 non-null    int64         
 6   Journal                            368 non-null    object        
 7   Volume                             364 non-null    object        
 8   Issue                              341 non-null    object        
 9   Pages                              351 non-null    object        
 10  PMID                               269 n

In [19]:
 # Reduce the Altmetric frame to the records that were NOT used in the DOI match
 # (their 'Details Page URL' does not appear among the DOI-matched rows).
used_in_doi_match = df_sciclips_alt['Details Page URL'].isin(df_clips_DOI['Details Page URL'])
df_sciclips_alt = df_sciclips_alt.loc[~used_in_doi_match]

In [20]:
# Row count of the Altmetric records left over after the DOI match
df_sciclips_alt.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 2903 to 18274
Data columns (total 50 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   Altmetric Attention Score       3 non-null      int64         
 1   Title                           3 non-null      object        
 2   Journal/Collection Title        3 non-null      object        
 3   Journal ISSNs                   3 non-null      object        
 4   Authors at my Institution       2 non-null      object        
 5   Departments                     1 non-null      object        
 6   Output Type                     3 non-null      object        
 7   OA Status                       3 non-null      bool          
 8   OA Type                         3 non-null      object        
 9   Subjects (FoR)                  3 non-null      object        
 10  Sustainable Development Goals   2 non-null      object        
 11  Affiliat

In [21]:
# Give the Altmetric PubMed identifier the same column name as Science Clips ('PMID')
# so the two frames can be joined on it.
df_sciclips_alt = df_sciclips_alt.rename(columns={'PubMed ID': 'PMID'})

In [22]:
#PMID merge
# Second-pass match: publications that found no Altmetric record by DOI are matched on PMID.
# pandas joins null keys to each other, and the remaining Science Clips rows do contain null
# PMIDs, so an Altmetric row without a PMID would be attached to every one of them.
# Dropping null-PMID rows from the right side prevents those false matches.
df_clips_PMID = pd.merge(
    df_sciclips_total,
    df_sciclips_alt.dropna(subset=['PMID']),
    how='left',
    on='PMID',
)

In [23]:
# Shape after the PMID merge; the row count should equal the unmatched Science Clips frame
df_clips_PMID.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368 entries, 0 to 367
Data columns (total 68 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         368 non-null    int64         
 1   Authors                            368 non-null    object        
 2   CDC Authors                        368 non-null    object        
 3   Title_x                            368 non-null    object        
 4   Abstract                           368 non-null    object        
 5   Year                               368 non-null    int64         
 6   Journal                            368 non-null    object        
 7   Volume                             364 non-null    object        
 8   Issue                              341 non-null    object        
 9   Pages                              351 non-null    object        
 10  PMID                               269

In [24]:
# Keep only the publications that found an Altmetric record by PMID
# (rows where 'Details Page URL' was filled in by the merge).
df_clips_PMID = df_clips_PMID.loc[df_clips_PMID['Details Page URL'].notna()]

In [25]:
# Publications matched by neither DOI nor PMID. They carry no Altmetric columns and are
# appended as-is when the three pieces are concatenated later.
matched_by_pmid = df_sciclips_total['PMID'].isin(df_clips_PMID['PMID'])
df_sciclips_total = df_sciclips_total.loc[~matched_by_pmid]

In [26]:
# Re-check the DOI-matched frame's columns before aligning names for the concat
df_clips_DOI.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19810 entries, 0 to 20177
Data columns (total 68 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         19810 non-null  int64         
 1   Authors                            19810 non-null  object        
 2   CDC Authors                        19810 non-null  object        
 3   Title_x                            19810 non-null  object        
 4   Abstract                           19810 non-null  object        
 5   Year                               19810 non-null  int64         
 6   Journal                            19810 non-null  object        
 7   Volume                             19426 non-null  object        
 8   Issue                              16872 non-null  object        
 9   Pages                              18170 non-null  object        
 10  PMID                               1900

In [27]:
# The merge suffixed the Science Clips title as 'Title_x'; restore its original name so
# the column lines up with the other pieces in the concat.
df_clips_DOI = df_clips_DOI.rename(columns={'Title_x': 'Title'})

In [28]:
# Row count and columns of the PMID-matched publications
df_clips_PMID.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 252 to 336
Data columns (total 68 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         3 non-null      int64         
 1   Authors                            3 non-null      object        
 2   CDC Authors                        3 non-null      object        
 3   Title_x                            3 non-null      object        
 4   Abstract                           3 non-null      object        
 5   Year                               3 non-null      int64         
 6   Journal                            3 non-null      object        
 7   Volume                             3 non-null      object        
 8   Issue                              2 non-null      object        
 9   Pages                              3 non-null      object        
 10  PMID                               3 non-nu

In [29]:
# Build the list of Altmetric columns to drop from the PMID-matched frame: start from every
# Altmetric column, take out the ones that must survive, then add the suffixed duplicates
# ('Title_y', 'DOI_y') that the PMID merge created.
remove_columns_PMID = list(df_sciclips_alt.columns)
for kept_column in (
    'Altmetric Attention Score',
    'DOI',
    'Title',
    'PMID',
    'Policy mentions',
    'Number of Dimensions citations',
):
    remove_columns_PMID.remove(kept_column)
remove_columns_PMID += ['Title_y', 'DOI_y']

In [30]:
# Drop the unwanted Altmetric columns from the PMID-matched frame, then show what is left.
df_clips_PMID = df_clips_PMID.drop(columns=remove_columns_PMID)
df_clips_PMID.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 252 to 336
Data columns (total 22 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         3 non-null      int64         
 1   Authors                            3 non-null      object        
 2   CDC Authors                        3 non-null      object        
 3   Title_x                            3 non-null      object        
 4   Abstract                           3 non-null      object        
 5   Year                               3 non-null      int64         
 6   Journal                            3 non-null      object        
 7   Volume                             3 non-null      object        
 8   Issue                              2 non-null      object        
 9   Pages                              3 non-null      object        
 10  PMID                               3 non-nu

In [31]:
# Restore the Science Clips names of the columns the PMID merge suffixed with '_x'.
df_clips_PMID = df_clips_PMID.rename(columns={'Title_x': 'Title', 'DOI_x': 'DOI'})

In [32]:
#concat to get a full view of sciclips
# Stack the three disjoint pieces back into one frame: DOI-matched, PMID-matched and
# unmatched publications. Columns missing from a piece are filled with NaN by concat.
df_clips = pd.concat(
    [df_clips_DOI, df_clips_PMID, df_sciclips_total],
    axis=0,
)

In [33]:
# The row count should equal the original Science Clips publication count
df_clips.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20178 entries, 0 to 20161
Data columns (total 68 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         20178 non-null  int64         
 1   Authors                            20178 non-null  object        
 2   CDC Authors                        20178 non-null  object        
 3   Title                              20178 non-null  object        
 4   Abstract                           20178 non-null  object        
 5   Year                               20178 non-null  int64         
 6   Journal                            20178 non-null  object        
 7   Volume                             19790 non-null  object        
 8   Issue                              17213 non-null  object        
 9   Pages                              18521 non-null  object        
 10  PMID                               1926

In [34]:
#drop BMJ columns that are not of interest or already in the original dataset
columns_to_drop = ['Title', 'Journal','Authors','ORCIDs','Your tags']  # Specify the columns you want to drop
df_sciclips_bmj = df_sciclips_bmj.drop(columns=columns_to_drop)
df_sciclips_bmj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4665 entries, 0 to 4664
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   DOI                    4665 non-null   object        
 1   Published on           4665 non-null   datetime64[ns]
 2   Policy citation count  4665 non-null   int64         
 3   Type                   4665 non-null   object        
 4   Publisher              4664 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 182.4+ KB


In [35]:
#merge in bmj info
df_clips = pd.merge(df_clips,df_sciclips_bmj,how = 'left',on='DOI')
df_clips.info()
df_clips.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20178 entries, 0 to 20177
Data columns (total 72 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         20178 non-null  int64         
 1   Authors                            20178 non-null  object        
 2   CDC Authors                        20178 non-null  object        
 3   Title                              20178 non-null  object        
 4   Abstract                           20178 non-null  object        
 5   Year                               20178 non-null  int64         
 6   Journal                            20178 non-null  object        
 7   Volume                             19790 non-null  object        
 8   Issue                              17213 non-null  object        
 9   Pages                              18521 non-null  object        
 10  PMID                              

Unnamed: 0.1,Unnamed: 0,Authors,CDC Authors,Title,Abstract,Year,Journal,Volume,Issue,Pages,...,Syllabi mentions,Number of Mendeley readers,Number of Dimensions citations,Details Page URL,Badge URL,Publisher Names,Published on,Policy citation count,Type,Publisher
0,1967,"Abad, NM, S. D. | Huang, Q. | Hendrich, M. A. ...","Abad, N. | Wilhelm, E. | Baack, B. | Bonner,...",A qualitative study of behavioral and social d...,INTRODUCTION: Around one-third of Americans re...,2023,PLoS One,18,2,e0281497,...,0.0,30.0,6.0,https://www.altmetric.com/details/143244031,https://api.altmetric.com/v1/donut/143244031_2...,Public Library of Science (PLOS),NaT,,,
1,1968,"Abah, AUO, A. | Adewole, A. | Usifoh, N. | Iya...","Usifoh, N.",Environmental and psychosocial predictors of c...,Cervical cancer (CC) is the second leading cau...,2023,Afr J Reprod Health,27,7,32-42,...,0.0,18.0,2.0,https://www.altmetric.com/details/155195527,https://api.altmetric.com/v1/donut/155195527_2...,,NaT,,,
2,1969,"Abara, WEB, K. T. | Lewis, F. M. T. | Pathela,...","Abara, W. E. | Bernstein, K. T. | Kirkcaldy, R.",Healthy vaccinee bias and MenB-FHbp vaccine ef...,Observational studies demonstrated 30-40% effe...,2023,Sex Transm Dis,50,6,e8-e10,...,0.0,7.0,8.0,https://www.altmetric.com/details/144295219,https://api.altmetric.com/v1/donut/144295219_2...,Lippincott Williams & Wilkins (LWW); Wolters K...,NaT,,,
3,1970,"Abara, WEG, J. | Marquez, P. | Woo, J. | Myers...","Abara, W. E. | Gee, J. | Marquez, P. | Woo, J....",Reports of Guillain-Barré Syndrome after COVID...,IMPORTANCE: Because of historical associations...,2023,JAMA Netw Open,6,2,e2253845,...,0.0,64.0,57.0,https://www.altmetric.com/details/142033151,https://api.altmetric.com/v1/donut/142033151_2...,American Medical Association,NaT,,,
4,1971,"Abara, WES, P. | Carpino, T. | Sanchez, T. | A...","Abara, W. E. | Delaney, K. | Ogale, Y. | Gallo...",Characteristics of mpox vaccine recipients amo...,Mpox vaccination is recommended for persons ex...,2023,Sex Transm Dis,50,7,458-461,...,0.0,18.0,2.0,https://www.altmetric.com/details/151021228,https://api.altmetric.com/v1/donut/151021228_2...,Lippincott Williams & Wilkins (LWW); Wolters K...,NaT,,,


In [36]:
#Ensure numeric columns that will be used to calculate indicators have 0 in place of NA
df_clips['Altmetric Attention Score'] = df_clips['Altmetric Attention Score'].fillna(0)
df_clips['Number of Dimensions citations'] = df_clips['Number of Dimensions citations'].fillna(0)
df_clips['Policy mentions'] = df_clips['Policy mentions'].fillna(0)
df_clips['Policy citation count'] = df_clips['Policy citation count'].fillna(0)

In [None]:
# add an any policy flag column
df_clips['Any Policy'] = ((df_clips['Policy mentions'] > 0) | (df_clips['Policy citation count'] > 0)).astype(int)
df_clips.info()
df_clips.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20178 entries, 0 to 20177
Data columns (total 73 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         20178 non-null  int64         
 1   Authors                            20178 non-null  object        
 2   CDC Authors                        20178 non-null  object        
 3   Title                              20178 non-null  object        
 4   Abstract                           20178 non-null  object        
 5   Year                               20178 non-null  int64         
 6   Journal                            20178 non-null  object        
 7   Volume                             19790 non-null  object        
 8   Issue                              17213 non-null  object        
 9   Pages                              18521 non-null  object        
 10  PMID                              

Unnamed: 0.1,Unnamed: 0,Authors,CDC Authors,Title,Abstract,Year,Journal,Volume,Issue,Pages,...,Number of Mendeley readers,Number of Dimensions citations,Details Page URL,Badge URL,Publisher Names,Published on,Policy citation count,Type,Publisher,Any Policy
0,1967,"Abad, NM, S. D. | Huang, Q. | Hendrich, M. A. ...","Abad, N. | Wilhelm, E. | Baack, B. | Bonner,...",A qualitative study of behavioral and social d...,INTRODUCTION: Around one-third of Americans re...,2023,PLoS One,18,2,e0281497,...,30.0,6.0,https://www.altmetric.com/details/143244031,https://api.altmetric.com/v1/donut/143244031_2...,Public Library of Science (PLOS),NaT,0.0,,,0
1,1968,"Abah, AUO, A. | Adewole, A. | Usifoh, N. | Iya...","Usifoh, N.",Environmental and psychosocial predictors of c...,Cervical cancer (CC) is the second leading cau...,2023,Afr J Reprod Health,27,7,32-42,...,18.0,2.0,https://www.altmetric.com/details/155195527,https://api.altmetric.com/v1/donut/155195527_2...,,NaT,0.0,,,0
2,1969,"Abara, WEB, K. T. | Lewis, F. M. T. | Pathela,...","Abara, W. E. | Bernstein, K. T. | Kirkcaldy, R.",Healthy vaccinee bias and MenB-FHbp vaccine ef...,Observational studies demonstrated 30-40% effe...,2023,Sex Transm Dis,50,6,e8-e10,...,7.0,8.0,https://www.altmetric.com/details/144295219,https://api.altmetric.com/v1/donut/144295219_2...,Lippincott Williams & Wilkins (LWW); Wolters K...,NaT,0.0,,,0
3,1970,"Abara, WEG, J. | Marquez, P. | Woo, J. | Myers...","Abara, W. E. | Gee, J. | Marquez, P. | Woo, J....",Reports of Guillain-Barré Syndrome after COVID...,IMPORTANCE: Because of historical associations...,2023,JAMA Netw Open,6,2,e2253845,...,64.0,57.0,https://www.altmetric.com/details/142033151,https://api.altmetric.com/v1/donut/142033151_2...,American Medical Association,NaT,0.0,,,1
4,1971,"Abara, WES, P. | Carpino, T. | Sanchez, T. | A...","Abara, W. E. | Delaney, K. | Ogale, Y. | Gallo...",Characteristics of mpox vaccine recipients amo...,Mpox vaccination is recommended for persons ex...,2023,Sex Transm Dis,50,7,458-461,...,18.0,2.0,https://www.altmetric.com/details/151021228,https://api.altmetric.com/v1/donut/151021228_2...,Lippincott Williams & Wilkins (LWW); Wolters K...,NaT,0.0,,,0


In [None]:
#Putting the final columns in the final df
df_clips = df_clips.iloc[:, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
                              19,46,64,65,66,67,68,69,72]] 
                           
df_clips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20178 entries, 0 to 20177
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Authors                            20178 non-null  object        
 1   CDC Authors                        20178 non-null  object        
 2   Title                              20178 non-null  object        
 3   Abstract                           20178 non-null  object        
 4   Year                               20178 non-null  int64         
 5   Journal                            20178 non-null  object        
 6   Volume                             19790 non-null  object        
 7   Issue                              17213 non-null  object        
 8   Pages                              18521 non-null  object        
 9   PMID                               19269 non-null  float64       
 10  PMCID                             

In [None]:
#export pubs of interest and indicators
# to_excel writes the DataFrame index as the first column by default, so this file gains an
# unnamed leading column when it is read back.
df_clips.to_excel("Results/Bibliometrics/SciClips Publication plus Indicators/20250116_2018_2023_SciClips_Publications_SciClips_Indicators.xlsx")

## Conditions of Interest
CDC-authored publications will be merged with bibliometric data from Altmetric and BMJ Impact Analytics.

#### Alzheimer's disease

In [42]:
#look at all datasets pertaining to this condition
# Science Clips publications, Altmetric export and BMJ export, in that order.
df_alz_total.info()
df_alz_alt.info()
df_alz_bmj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Unnamed: 0                         35 non-null     int64         
 1   Authors                            35 non-null     object        
 2   CDC Authors                        35 non-null     object        
 3   Title                              35 non-null     object        
 4   Abstract                           35 non-null     object        
 5   Year                               35 non-null     int64         
 6   Journal                            35 non-null     object        
 7   Volume                             35 non-null     object        
 8   Issue                              29 non-null     object        
 9   Pages                              33 non-null     object        
 10  PMID                               35 no

In [None]:
#merge pubs and altmetric info
# The join uses a temporary lower-cased DOI key, because the DOIs of these condition frames are
# not case-normalised the way the baseline frames are, and a plain join would miss DOIs
# that differ only in letter case. Altmetric rows without a DOI are dropped from the right
# side because pandas would otherwise match null keys to each other.
# The publication's own 'DOI' column is kept unchanged and the helper key is removed, so the
# resulting column order is the same as a plain merge on 'DOI'.
alz_alt_by_doi = (
    df_alz_alt
    .dropna(subset=['DOI'])
    .assign(_doi_key=lambda alt: alt['DOI'].str.lower())
    .drop(columns='DOI')
)
df_alz = (
    df_alz_total
    .assign(_doi_key=df_alz_total['DOI'].str.lower())
    .merge(alz_alt_by_doi, how='left', on='_doi_key')
    .drop(columns='_doi_key')
)
df_alz.info()

In [None]:
#drop BMJ columns that are not of interest or already in the original dataset
# errors='ignore' keeps the cell re-runnable: without it a second execution raises KeyError
# because the columns were already removed by the first run.
columns_to_drop = ['Title', 'Journal','Authors','ORCIDs','Your tags']  
df_alz_bmj = df_alz_bmj.drop(columns=columns_to_drop, errors='ignore')
df_alz_bmj.info()

In [None]:
# Left-join the trimmed BMJ export onto the merged frame by DOI.
# NOTE(review): this cell overwrites df_alz with a merge of itself, so running
# it twice merges the BMJ columns in a second time.
df_alz = df_alz.merge(df_alz_bmj, on='DOI', how='left')
df_alz.info()

In [None]:
# Numeric columns used to calculate the indicators: replace missing values
# (e.g. rows without a match in the left joins) with 0.
indicator_columns = [
    'Altmetric Attention Score',
    'Number of Dimensions citations',
    'Policy mentions',
    'Policy citation count',
]
for indicator in indicator_columns:
    df_alz[indicator] = df_alz[indicator].fillna(0)

In [None]:
# 'Any Policy' is 1 when either policy column is greater than zero, else 0.
has_policy_mentions = df_alz['Policy mentions'].gt(0)
has_policy_citations = df_alz['Policy citation count'].gt(0)
df_alz['Any Policy'] = (has_policy_mentions | has_policy_citations).astype(int)
df_alz.info()
df_alz.head()

In [None]:
# Keep only the relevant columns, selected by position in the merged frame.
# NOTE(review): the positions depend on the exact column order produced by the
# merges; re-running this cell on the already trimmed frame raises IndexError.
leading_positions = list(range(1, 16))  # positions 1 through 15
indicator_positions = [19, 46, 64, 65, 66, 67, 68, 69, 72]
df_alz = df_alz.iloc[:, leading_positions + indicator_positions]

df_alz.info()

In [None]:
# Export the Alzheimer's publications together with their indicators.
# The output folder is created first so the export also works when the
# results directory does not exist yet (to_excel does not create it).
export_path = "Results/Bibliometrics/SciClips Publication plus Indicators/20250116_2018_2023_Alzheimer_Publications_SciClips_Indicators.xlsx"
os.makedirs(os.path.dirname(export_path), exist_ok=True)
df_alz.to_excel(export_path)

### Breast cancer

In [None]:
# Summarise the three breast cancer inputs in turn: the publication list,
# the Altmetric export and the BMJ export.
for breastcancer_frame in (df_breastcancer_total, df_breastcancer_alt, df_breastcancer_bmj):
    breastcancer_frame.info()

In [None]:
# Left-join the Altmetric export onto the publication list by DOI, so every
# publication is kept even when there is no matching Altmetric row.
df_breastcancer = df_breastcancer_total.merge(df_breastcancer_alt, on='DOI', how='left')
df_breastcancer.info()

In [None]:
# Drop the BMJ columns that are not of interest or that are already in the
# original dataset. errors="ignore" keeps this cell re-runnable: on a second
# run the columns are already gone and a plain drop would raise KeyError.
columns_to_drop = ['Title', 'Journal', 'Authors', 'ORCIDs', 'Your tags']
df_breastcancer_bmj = df_breastcancer_bmj.drop(columns=columns_to_drop, errors="ignore")
df_breastcancer_bmj.info()

In [None]:
# Left-join the trimmed BMJ export onto the merged frame by DOI.
# NOTE(review): this cell overwrites df_breastcancer with a merge of itself,
# so running it twice merges the BMJ columns in a second time.
df_breastcancer = df_breastcancer.merge(df_breastcancer_bmj, on='DOI', how='left')
df_breastcancer.info()

In [None]:
# Numeric columns used to calculate the indicators: replace missing values
# (e.g. rows without a match in the left joins) with 0.
indicator_columns = [
    'Altmetric Attention Score',
    'Number of Dimensions citations',
    'Policy mentions',
    'Policy citation count',
]
for indicator in indicator_columns:
    df_breastcancer[indicator] = df_breastcancer[indicator].fillna(0)

In [None]:
# 'Any Policy' is 1 when either policy column is greater than zero, else 0.
has_policy_mentions = df_breastcancer['Policy mentions'].gt(0)
has_policy_citations = df_breastcancer['Policy citation count'].gt(0)
df_breastcancer['Any Policy'] = (has_policy_mentions | has_policy_citations).astype(int)
df_breastcancer.info()
df_breastcancer.head()

In [None]:
# Keep only the relevant columns, selected by position in the merged frame.
# NOTE(review): the positions depend on the exact column order produced by the
# merges; re-running this cell on the already trimmed frame raises IndexError.
leading_positions = list(range(1, 16))  # positions 1 through 15
indicator_positions = [19, 46, 64, 65, 66, 67, 68, 69, 72]
df_breastcancer = df_breastcancer.iloc[:, leading_positions + indicator_positions]

df_breastcancer.info()

In [None]:
# Export the breast cancer publications together with their indicators.
# The output folder is created first so the export also works when the
# results directory does not exist yet (to_excel does not create it).
export_path = "Results/Bibliometrics/SciClips Publication plus Indicators/20250116_2018_2023_BreastCancer_Publications_SciClips_Indicators.xlsx"
os.makedirs(os.path.dirname(export_path), exist_ok=True)
df_breastcancer.to_excel(export_path)

### Infections of the kidney

In [None]:
# Summarise the three kidney inputs in turn: the publication list,
# the Altmetric export and the BMJ export.
for kidney_frame in (df_kidney_total, df_kidney_alt, df_kidney_bmj):
    kidney_frame.info()

In [None]:
# Left-join the Altmetric export onto the publication list by DOI, so every
# publication is kept even when there is no matching Altmetric row.
df_kidney = df_kidney_total.merge(df_kidney_alt, on='DOI', how='left')
df_kidney.info()

In [None]:
# Drop the BMJ columns that are not of interest or that are already in the
# original dataset. errors="ignore" keeps this cell re-runnable: on a second
# run the columns are already gone and a plain drop would raise KeyError.
columns_to_drop = ['Title', 'Journal', 'Authors', 'ORCIDs', 'Your tags']
df_kidney_bmj = df_kidney_bmj.drop(columns=columns_to_drop, errors="ignore")
df_kidney_bmj.info()

In [None]:
# Left-join the trimmed BMJ export onto the merged frame by DOI.
# NOTE(review): this cell overwrites df_kidney with a merge of itself, so
# running it twice merges the BMJ columns in a second time.
df_kidney = df_kidney.merge(df_kidney_bmj, on='DOI', how='left')
df_kidney.info()

In [None]:
# Numeric columns used to calculate the indicators: replace missing values
# (e.g. rows without a match in the left joins) with 0.
indicator_columns = [
    'Altmetric Attention Score',
    'Number of Dimensions citations',
    'Policy mentions',
    'Policy citation count',
]
for indicator in indicator_columns:
    df_kidney[indicator] = df_kidney[indicator].fillna(0)

In [None]:
# 'Any Policy' is 1 when either policy column is greater than zero, else 0.
has_policy_mentions = df_kidney['Policy mentions'].gt(0)
has_policy_citations = df_kidney['Policy citation count'].gt(0)
df_kidney['Any Policy'] = (has_policy_mentions | has_policy_citations).astype(int)
df_kidney.info()
df_kidney.head()

In [None]:
# Keep only the relevant columns, selected by position in the merged frame.
# NOTE(review): the positions depend on the exact column order produced by the
# merges; re-running this cell on the already trimmed frame raises IndexError.
leading_positions = list(range(1, 16))  # positions 1 through 15
indicator_positions = [19, 46, 64, 65, 66, 67, 68, 69, 72]
df_kidney = df_kidney.iloc[:, leading_positions + indicator_positions]

df_kidney.info()

In [None]:
# Export the kidney publications together with their indicators.
# The output folder is created first so the export also works when the
# results directory does not exist yet (to_excel does not create it).
export_path = "Results/Bibliometrics/SciClips Publication plus Indicators/20250116_2018_2023_Kidney_Publications_SciClips_Indicators.xlsx"
os.makedirs(os.path.dirname(export_path), exist_ok=True)
df_kidney.to_excel(export_path)

### Acute rheumatic fever and chronic rheumatic heart diseases

In [None]:
# Summarise the three rheumatic inputs in turn: the publication list,
# the Altmetric export and the BMJ export.
for rhu_frame in (df_rhu_total, df_rhu_alt, df_rhu_bmj):
    rhu_frame.info()

In [None]:
# Left-join the Altmetric export onto the publication list by DOI, so every
# publication is kept even when there is no matching Altmetric row.
df_rhu = df_rhu_total.merge(df_rhu_alt, on='DOI', how='left')
df_rhu.info()

In [None]:
# Drop the BMJ columns that are not of interest or that are already in the
# original dataset. errors="ignore" keeps this cell re-runnable: on a second
# run the columns are already gone and a plain drop would raise KeyError.
columns_to_drop = ['Title', 'Journal', 'Authors', 'ORCIDs', 'Your tags']
df_rhu_bmj = df_rhu_bmj.drop(columns=columns_to_drop, errors="ignore")
df_rhu_bmj.info()

In [None]:
# Left-join the trimmed BMJ export onto the merged frame by DOI.
# NOTE(review): this cell overwrites df_rhu with a merge of itself, so running
# it twice merges the BMJ columns in a second time.
df_rhu = df_rhu.merge(df_rhu_bmj, on='DOI', how='left')
df_rhu.info()

In [None]:
# Numeric columns used to calculate the indicators: replace missing values
# (e.g. rows without a match in the left joins) with 0.
indicator_columns = [
    'Altmetric Attention Score',
    'Number of Dimensions citations',
    'Policy mentions',
    'Policy citation count',
]
for indicator in indicator_columns:
    df_rhu[indicator] = df_rhu[indicator].fillna(0)

In [None]:
# 'Any Policy' is 1 when either policy column is greater than zero, else 0.
has_policy_mentions = df_rhu['Policy mentions'].gt(0)
has_policy_citations = df_rhu['Policy citation count'].gt(0)
df_rhu['Any Policy'] = (has_policy_mentions | has_policy_citations).astype(int)
df_rhu.info()
df_rhu.head()

In [None]:
# Keep only the relevant columns, selected by position in the merged frame.
# NOTE(review): the positions depend on the exact column order produced by the
# merges; re-running this cell on the already trimmed frame raises IndexError.
leading_positions = list(range(1, 16))  # positions 1 through 15
indicator_positions = [19, 46, 64, 65, 66, 67, 68, 69, 72]
df_rhu = df_rhu.iloc[:, leading_positions + indicator_positions]

df_rhu.info()

In [None]:
# Export the rheumatic publications together with their indicators.
# The output folder is created first so the export also works when the
# results directory does not exist yet (to_excel does not create it).
export_path = "Results/Bibliometrics/SciClips Publication plus Indicators/20250116_2018_2023_Rheumatic_Publications_SciClips_Indicators.xlsx"
os.makedirs(os.path.dirname(export_path), exist_ok=True)
df_rhu.to_excel(export_path)