# XML Final Analysis
### Daina Bouquin, Daniel Chivvis

Scripts below were used to generate all .csv files in the XML_RESULTS_060619/ folder

In [153]:
import pandas as pd
import numpy as np
import sys
import csv

In [154]:
XML_results = pd.read_csv("XML_CLEAN_INPUT_053119.csv") 

In [155]:
list(XML_results.columns.values)

['Index',
 'Alias',
 'Software_Package',
 'Identifier',
 'Pub_Year',
 'DOI',
 'Journal_Title',
 'Article_ID',
 'File_Name',
 'Parent1_Tag',
 'Parent2_Tag',
 'Parent3_Tag',
 'Parent4_Tag',
 'Parent1_Content',
 'Parent2_Content',
 'Parent3_Content',
 'Author(s)',
 'Publisher',
 'Title']

In [156]:
XML_results.head(5)

Unnamed: 0,Index,Alias,Software_Package,Identifier,Pub_Year,DOI,Journal_Title,Article_ID,File_Name,Parent1_Tag,Parent2_Tag,Parent3_Tag,Parent4_Tag,Parent1_Content,Parent2_Content,Parent3_Content,Author(s),Publisher,Title
0,I 1,astroblend,AstroBlend,0,2016,10.3847/0004-637X/818/2/115,The Astrophysical Journal,"apj521773, 10.3847/0004-637X/818/2/115, 521773...",apj_818_2_115.xml,sc,p,sec,sec,['astroblend'],['We note that our example scripts only explor...,['\n<label>3.5.</label>\n<title>From <sans-ser...,Vogt Frédéric P. A. Owen Chris I. Verdes-Mon...,The American Astronomical Society,ADVANCED DATA VISUALIZATION IN ASTROPHYSICS: T...
1,I 2,astroblend.com,AstroBlend,0,2016,10.3847/0004-637X/818/2/115,The Astrophysical Journal,"apj521773, 10.3847/0004-637X/818/2/115, 521773...",apj_818_2_115.xml,ext-link,p,fn,p,['http://www.astroblend.com'],"['\n<ext-link ext-link-type=""uri"" xlink:href=""...",['\n<label><sup>26</sup></label>\n<p>\n<ext-li...,Vogt Frédéric P. A. Owen Chris I. Verdes-Mon...,The American Astronomical Society,ADVANCED DATA VISUALIZATION IN ASTROPHYSICS: T...
2,I 3,10.1051/0004-6361/201322068,Astropy,1,2014,10.1088/0004-6256/148/1/13,The Astronomical Journal,"aj493368, ANJOAA, 10.1088/0004-6256/148/1/13, ...",aj_148_1_13.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...",Rodney Steven A.Riess Adam G.Strolger Louis-Gr...,The American Astronomical Society,TYPE Ia SUPERNOVA RATE MEASUREMENTS TO REDSHIF...
3,I 4,10.1051/0004-6361/201322068,Astropy,1,2014,10.1088/0004-6256/148/1/14,The Astronomical Journal,"aj495229, ANJOAA, 10.1088/0004-6256/148/1/14, ...",aj_148_1_14.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...",Bañados E.Venemans B. P.Morganson E.Decarli R....,The American Astronomical Society,DISCOVERY OF EIGHT z ∼ 6 QUASARS FROM Pan-STARRS1
4,I 5,10.1051/0004-6361/201322068,Astropy,1,2014,10.1088/0004-6256/148/3/53,The Astronomical Journal,"aj499538, ANJOAA, 10.1088/0004-6256/148/3/53, ...",aj_148_3_53.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...",Gullikson KevinDodson-Robinson SarahKraus Adam...,The American Astronomical Society,"CORRECTING FOR TELLURIC ABSORPTION: METHODS, C..."


In [157]:
# Convert tag, content, identifier, pub_year cols to strings

XML_results['Parent1_Tag'] = XML_results['Parent1_Tag'].astype('|S')
XML_results['Parent2_Tag'] = XML_results['Parent2_Tag'].astype('|S')
XML_results['Parent3_Tag'] = XML_results['Parent3_Tag'].astype('|S')
XML_results['Parent4_Tag'] = XML_results['Parent4_Tag'].astype('|S')

XML_results['Parent1_Content'] = XML_results['Parent1_Content'].astype('|S')
XML_results['Parent2_Content'] = XML_results['Parent2_Content'].astype('|S')
XML_results['Parent3_Content'] = XML_results['Parent3_Content'].astype('|S')

XML_results['Identifier'] = XML_results['Identifier'].astype('|S')
XML_results['Pub_Year'] = XML_results['Pub_Year'].astype('|S')
XML_results['Alias'] = XML_results['Alias'].astype('|S')

In [158]:
# Create column for references

# If the tag lable or tag content contain any of the following reference elements it will be marked "yes":
# element-citation 
# mixed-citation
# nlm-citation
# ref
# citation-alternatives
# ref-list 
# xref
# source
# bibr

references = ['Parent1_Tag','Parent2_Tag','Parent3_Tag','Parent4_Tag','Parent1_Content','Parent2_Content']
XML_results["ref"] = np.where((XML_results[references] == "element-citation").any(axis=1) | (XML_results[references]== "mixed-citation").any(axis=1) | (XML_results[references]== "nlm-citation").any(axis=1) | (XML_results[references]== "ref").any(axis=1) | (XML_results[references]== "citation-alternatives").any(axis=1) | (XML_results[references]== "ref-list").any(axis=1) | (XML_results[references]== "xref").any(axis=1) | (XML_results[references]== "source").any(axis=1) | (XML_results[references]== "bibr").any(axis=1), "yes", "no")

In [159]:
# Create column for acknowledgements
# If the tag lable or tag content contains "ack" it will be marked "yes"

acknowledgements = ['Parent1_Tag','Parent2_Tag','Parent3_Tag','Parent4_Tag','Parent1_Content','Parent2_Content']
XML_results["ack"] = np.where((XML_results[acknowledgements] == "ack").any(axis=1), "yes", "no")

In [160]:
# Create column for footnotes
# If the tag lable or tag content contains "fn" or "fn-group" it will be marked "yes"

footnotes = ['Parent1_Tag','Parent2_Tag','Parent3_Tag','Parent4_Tag','Parent1_Content','Parent2_Content']
XML_results["fn"] = np.where((XML_results[footnotes] == "fn").any(axis=1) | (XML_results[footnotes]== "fn-group").any(axis=1), "yes", "no")

In [161]:
# Check new cols

list(XML_results.columns.values)

['Index',
 'Alias',
 'Software_Package',
 'Identifier',
 'Pub_Year',
 'DOI',
 'Journal_Title',
 'Article_ID',
 'File_Name',
 'Parent1_Tag',
 'Parent2_Tag',
 'Parent3_Tag',
 'Parent4_Tag',
 'Parent1_Content',
 'Parent2_Content',
 'Parent3_Content',
 'Author(s)',
 'Publisher',
 'Title',
 'ref',
 'ack',
 'fn']

In [162]:
XML_results.to_csv("XML_FINAL_ANALYSIS_061019.csv")

## Summary of Results

In [163]:
# How many papers did we find for each software package?

XML_results.groupby('Software_Package')['File_Name'].nunique()

Software_Package
AstroBlend        1
Astropy         538
RADMC-3D        214
SAOImage DS9    341
Spec2d          304
Stingray          2
TARDIS            4
WCSTools        123
Name: File_Name, dtype: int64

In [169]:
# Total number of unique alias

XML_unique_alias = pd.DataFrame(XML_results.groupby(['Software_Package','Alias'])['Index'].nunique())
XML_unique_alias.to_csv("XML_unique_alias_061019.csv")
XML_unique_alias

Unnamed: 0_level_0,Unnamed: 1_level_0,Index
Software_Package,Alias,Unnamed: 2_level_1
AstroBlend,astroblend,1
AstroBlend,astroblend.com,1
Astropy,10.1051/0004-6361/201322068,449
Astropy,2013A&A...558A..33A,441
Astropy,AstroPy,38
Astropy,Astropy,554
Astropy,Astropy Collaboration,443
Astropy,Astropy Collaboration 2013,1
Astropy,astropy,190
Astropy,astropy cosmology,1


In [182]:
#Alias per paper

XML_alias_paper = pd.DataFrame(XML_results.groupby(['DOI','Software_Package'])['Alias'].nunique())
XML_alias_paper.to_csv("XML_alias_paper_061019.csv")
XML_alias_paper

Unnamed: 0_level_0,Unnamed: 1_level_0,Alias
DOI,Software_Package,Unnamed: 2_level_1
#VALUE!,SAOImage DS9,7
#VALUE!,Spec2d,4
#VALUE!,WCSTools,6
10.1086/304882,SAOImage DS9,1
10.1086/312617,SAOImage DS9,1
10.1086/313204,SAOImage DS9,1
10.1086/321561,SAOImage DS9,1
10.1086/339572,WCSTools,1
10.1086/340936,SAOImage DS9,2
10.1086/342702,WCSTools,1


In [228]:
# Identifiers per package

XML_unique_id = pd.DataFrame(XML_results.groupby(['Software_Package','Identifier'])['Index'].nunique())
XML_unique_id.to_csv("XML_unique_id_061019.csv")
XML_unique_id

Unnamed: 0_level_0,Unnamed: 1_level_0,Index
Software_Package,Identifier,Unnamed: 2_level_1
AstroBlend,0,2
Astropy,0,1341
Astropy,1,892
RADMC-3D,0,549
RADMC-3D,1,225
SAOImage DS9,0,445
SAOImage DS9,1,57
Spec2d,0,527
Spec2d,1,278
Stingray,0,9


In [193]:
# Tag count per parent

XML_unique_tags1 = pd.DataFrame(XML_results.groupby(['Parent1_Tag'])['Index'].nunique())
XML_unique_tags2 = pd.DataFrame(XML_results.groupby(['Parent2_Tag'])['Index'].nunique())
XML_unique_tags3 = pd.DataFrame(XML_results.groupby(['Parent3_Tag'])['Index'].nunique())
XML_unique_tags4 = pd.DataFrame(XML_results.groupby(['Parent4_Tag'])['Index'].nunique())

XML_unique_tags1.to_csv("XML_unique_tags1_061019.csv")
XML_unique_tags2.to_csv("XML_unique_tags2_061019.csv")
XML_unique_tags3.to_csv("XML_unique_tags3_061019.csv")
XML_unique_tags4.to_csv("XML_unique_tags4_061019.csv")

In [196]:
#Tags per paper

XML_tags_paper1 = pd.DataFrame(XML_results.groupby(['DOI'])['Parent1_Tag'].nunique())
XML_tags_paper2 = pd.DataFrame(XML_results.groupby(['DOI'])['Parent2_Tag'].nunique())
XML_tags_paper3 = pd.DataFrame(XML_results.groupby(['DOI'])['Parent3_Tag'].nunique())
XML_tags_paper4 = pd.DataFrame(XML_results.groupby(['DOI'])['Parent4_Tag'].nunique())

XML_tags_paper1.to_csv("XML_tags_paper1_061019.csv")
XML_tags_paper2.to_csv("XML_tags_paper2_061019.csv")
XML_tags_paper3.to_csv("XML_tags_paper3_061019.csv")
XML_tags_paper4.to_csv("XML_tags_paper4_061019.csv")

In [208]:
# Unique files for each alias per year
# Is this useful?

XML_unique_alias_year = pd.DataFrame(XML_results.groupby(['Alias', 'Pub_Year'])['File_Name'].nunique())
XML_unique_alias_year.to_csv("XML_unique_alias_year_061019.csv")
XML_unique_alias_year

Unnamed: 0_level_0,Unnamed: 1_level_0,File_Name
Alias,Pub_Year,Unnamed: 2_level_1
10.1051/0004-6361/201322068,2013,2
10.1051/0004-6361/201322068,2014,35
10.1051/0004-6361/201322068,2015,53
10.1051/0004-6361/201322068,2016,91
10.1051/0004-6361/201322068,2017,165
10.1051/0004-6361/201322068,2018,103
10.1051/0004-6361:20031768,2009,13
10.1051/0004-6361:20031768,2010,17
10.1051/0004-6361:20031768,2011,12
10.1051/0004-6361:20031768,2012,18


In [209]:
# Total number of unique papers with software aliases by year

XML_unique_paper_year = pd.DataFrame(XML_results.groupby(['Pub_Year'])['File_Name'].nunique())
XML_unique_paper_year.to_csv("XML_unique_paper_year_061019.csv")
XML_unique_paper_year

Unnamed: 0_level_0,File_Name
Pub_Year,Unnamed: 1_level_1
1997,1
1998,4
1999,2
2000,2
2001,4
2002,8
2003,7
2004,16
2005,18
2006,31


In [212]:
# Total number of unique papers with software aliases in the references section by year


XML_unique_paper_ref_year = pd.DataFrame(XML_results.groupby(['ref', 'Pub_Year'])['File_Name'].nunique())
XML_unique_paper_ref_year.to_csv("XML_unique_paper_ref_year_061019.csv")
XML_unique_paper_ref_year

Unnamed: 0_level_0,Unnamed: 1_level_0,File_Name
ref,Pub_Year,Unnamed: 2_level_1
no,1997,1
no,1998,4
no,1999,2
no,2000,2
no,2001,4
no,2002,8
no,2003,7
no,2004,16
no,2005,18
no,2006,31


In [214]:
# Trends over time for each AAS Journal



XML_journal_year = pd.DataFrame(XML_results.groupby(['Journal_Title', 'Pub_Year'])['File_Name'].nunique())
XML_journal_year.to_csv("XML_journal_year_061019.csv")
XML_journal_year

Unnamed: 0_level_0,Unnamed: 1_level_0,File_Name
Journal_Title,Pub_Year,Unnamed: 2_level_1
The Astronomical Journal,1998,4
The Astronomical Journal,1999,1
The Astronomical Journal,2000,1
The Astronomical Journal,2001,3
The Astronomical Journal,2002,4
The Astronomical Journal,2003,2
The Astronomical Journal,2004,4
The Astronomical Journal,2005,3
The Astronomical Journal,2006,8
The Astronomical Journal,2007,9


In [224]:
# Which software packages showed up in which journals

XML_package_per_journal = pd.DataFrame(XML_results.groupby(['Journal_Title', 'Software_Package'])['File_Name'].nunique())
XML_package_per_journal.to_csv("XML_package_per_journal_061019.csv")
XML_package_per_journal

Unnamed: 0_level_0,Unnamed: 1_level_0,File_Name
Journal_Title,Software_Package,Unnamed: 2_level_1
The Astronomical Journal,Astropy,76
The Astronomical Journal,RADMC-3D,7
The Astronomical Journal,SAOImage DS9,68
The Astronomical Journal,Spec2d,25
The Astronomical Journal,WCSTools,43
The Astrophysical Journal,AstroBlend,1
The Astrophysical Journal,Astropy,364
The Astrophysical Journal,RADMC-3D,168
The Astrophysical Journal,SAOImage DS9,196
The Astrophysical Journal,Spec2d,229


In [226]:
# Total number of articles with software mentions (1527) per journal

XML_articles_mentions_per_journal = pd.DataFrame(XML_results.groupby(['Journal_Title','Software_Package'])['File_Name'].nunique())
XML_articles_mentions_per_journal.to_csv("XML_articles_mentions_per_journal_061019.csv")
XML_articles_mentions_per_journal

Unnamed: 0_level_0,Unnamed: 1_level_0,File_Name
Journal_Title,Software_Package,Unnamed: 2_level_1
The Astronomical Journal,Astropy,76
The Astronomical Journal,RADMC-3D,7
The Astronomical Journal,SAOImage DS9,68
The Astronomical Journal,Spec2d,25
The Astronomical Journal,WCSTools,43
The Astrophysical Journal,AstroBlend,1
The Astrophysical Journal,Astropy,364
The Astrophysical Journal,RADMC-3D,168
The Astrophysical Journal,SAOImage DS9,196
The Astrophysical Journal,Spec2d,229


In [79]:
# Which tags were associated with each software package?

XML_tags1 = pd.DataFrame({'tags' : XML_results.groupby( [ "Software_Package", "Parent1_Tag"] ).size()}).reset_index()
XML_tags2 = pd.DataFrame({'tags' : XML_results.groupby( [ "Software_Package", "Parent2_Tag"] ).size()}).reset_index()
XML_tags3 = pd.DataFrame({'tags' : XML_results.groupby( [ "Software_Package", "Parent3_Tag"] ).size()}).reset_index()
XML_tags4 = pd.DataFrame({'tags' : XML_results.groupby( [ "Software_Package", "Parent4_Tag"] ).size()}).reset_index()
XML_all_tags = [XML_tags1, XML_tags2, XML_tags3, XML_tags4]
XML_all_tags = pd.concat(XML_all_tags)
# write tags to csv
XML_all_tags.to_csv("XML_all_tags_061019.csv")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [80]:
# AstroBlend
AstroBlend_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("AstroBlend")]
AstroBlend_Tags = pd.concat([AstroBlend_Tags['Parent1_Tag'], AstroBlend_Tags['Parent2_Tag'], AstroBlend_Tags['Parent3_Tag'], AstroBlend_Tags['Parent4_Tag']]).unique()
AstroBlend_Tags = np.ndarray.tolist(AstroBlend_Tags)
AstroBlend_Tags = pd.DataFrame(AstroBlend_Tags, columns = ["AstroBlend_XML_Tags"])
AstroBlend_Tags = AstroBlend_Tags[AstroBlend_Tags.AstroBlend_XML_Tags.notnull()]
AstroBlend_Tags = AstroBlend_Tags.sort_values(by=['AstroBlend_XML_Tags']).reset_index(drop=True)
# write tags to csv
AstroBlend_Tags.to_csv("AstroBlend_Tags_061019.csv")
AstroBlend_Tags

Unnamed: 0,AstroBlend_XML_Tags
0,ext-link
1,fn
2,p
3,sc
4,sec


In [81]:
# AstroPy
AstroPy_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("AstroPy")]
AstroPy_Tags = pd.concat([AstroPy_Tags['Parent1_Tag'], AstroPy_Tags['Parent2_Tag'], AstroPy_Tags['Parent3_Tag'], AstroPy_Tags['Parent4_Tag']]).unique()
AstroPy_Tags = np.ndarray.tolist(AstroPy_Tags)
AstroPy_Tags = pd.DataFrame(AstroPy_Tags, columns = ["AstroPy_XML_Tags"])
AstroPy_Tags = AstroPy_Tags[AstroPy_Tags.AstroPy_XML_Tags.notnull()]
AstroPy_Tags = AstroPy_Tags.sort_values(by=['AstroPy_XML_Tags']).reset_index(drop=True)
# write tags to csv
AstroPy_Tags.to_csv("AstroPy_Tags_061019.csv")
AstroPy_Tags

Unnamed: 0,AstroPy_XML_Tags


In [82]:
# RADMC-3D
RADMC3D_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("RADMC-3D")]
RADMC3D_Tags = pd.concat([RADMC3D_Tags['Parent1_Tag'], RADMC3D_Tags['Parent2_Tag'], RADMC3D_Tags['Parent3_Tag'], RADMC3D_Tags['Parent4_Tag']]).unique()
RADMC3D_Tags = np.ndarray.tolist(RADMC3D_Tags)
RADMC3D_Tags = pd.DataFrame(RADMC3D_Tags, columns = ["RADMC3D_XML_Tags"])
RADMC3D_Tags = RADMC3D_Tags[RADMC3D_Tags.RADMC3D_XML_Tags.notnull()]
RADMC3D_Tags = RADMC3D_Tags.sort_values(by=['RADMC3D_XML_Tags']).reset_index(drop=True)
# write tags to csv
RADMC3D_Tags.to_csv("RADMC3D_Tags_061019.csv")
RADMC3D_Tags

Unnamed: 0,RADMC3D_XML_Tags
0,abstract
1,ack
2,app
3,app-group
4,article
5,article-meta
6,article-title
7,back
8,body
9,caption


In [83]:
# DS9
SAOImageDS9_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("SAOImage DS9")]
SAOImageDS9_Tags = pd.concat([SAOImageDS9_Tags['Parent1_Tag'], SAOImageDS9_Tags['Parent2_Tag'], SAOImageDS9_Tags['Parent3_Tag'], SAOImageDS9_Tags['Parent4_Tag']]).unique()
SAOImageDS9_Tags = np.ndarray.tolist(SAOImageDS9_Tags)
SAOImageDS9_Tags = pd.DataFrame(SAOImageDS9_Tags, columns = ["SAOImageDS9_XML_Tags"])
SAOImageDS9_Tags = SAOImageDS9_Tags[SAOImageDS9_Tags.SAOImageDS9_XML_Tags.notnull()]
SAOImageDS9_Tags = SAOImageDS9_Tags.sort_values(by=['SAOImageDS9_XML_Tags']).reset_index(drop=True)
# write tags to csv
SAOImageDS9_Tags.to_csv("SAOImageDS9_Tags_061019.csv")
SAOImageDS9_Tags

Unnamed: 0,SAOImageDS9_XML_Tags
0,ack
1,app
2,app-group
3,article
4,back
5,body
6,caption
7,element-citation
8,ext-link
9,fig


In [84]:
# Spec2d
Spec2d_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("Spec2d")]
Spec2d_Tags = pd.concat([Spec2d_Tags['Parent1_Tag'], Spec2d_Tags['Parent2_Tag'], Spec2d_Tags['Parent3_Tag'], Spec2d_Tags['Parent4_Tag']]).unique()
Spec2d_Tags = np.ndarray.tolist(Spec2d_Tags)
Spec2d_Tags = pd.DataFrame(Spec2d_Tags, columns = ["Spec2d_XML_Tags"])
Spec2d_Tags = Spec2d_Tags[Spec2d_Tags.Spec2d_XML_Tags.notnull()]
Spec2d_Tags = Spec2d_Tags.sort_values(by=['Spec2d_XML_Tags']).reset_index(drop=True)
# write tags to csv
Spec2d_Tags.to_csv("Spec2d_Tags_061019.csv")
Spec2d_Tags

Unnamed: 0,Spec2d_XML_Tags
0,abstract
1,ack
2,app
3,app-group
4,article
5,article-id
6,article-meta
7,article-title
8,back
9,body


In [85]:
# Stingray
Stingray_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("Stingray")]
Stingray_Tags = pd.concat([Stingray_Tags['Parent1_Tag'], Stingray_Tags['Parent2_Tag'], Stingray_Tags['Parent3_Tag'], Stingray_Tags['Parent4_Tag']]).unique()
Stingray_Tags = np.ndarray.tolist(Stingray_Tags)
Stingray_Tags = pd.DataFrame(Stingray_Tags, columns = ["Stingray_XML_Tags"])
Stingray_Tags = Stingray_Tags[Stingray_Tags.Stingray_XML_Tags.notnull()]
Stingray_Tags = Stingray_Tags.sort_values(by=['Stingray_XML_Tags']).reset_index(drop=True)
# write tags to csv
Stingray_Tags.to_csv("Stingray_Tags_061019.csv")
Stingray_Tags

Unnamed: 0,Stingray_XML_Tags
0,article
1,body
2,element-citation
3,monospace
4,p
5,ref
6,ref-list
7,sec
8,source


In [86]:
# tardis
TARDIS_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("TARDIS")]
TARDIS_Tags = pd.concat([TARDIS_Tags['Parent1_Tag'], TARDIS_Tags['Parent2_Tag'], TARDIS_Tags['Parent3_Tag'], TARDIS_Tags['Parent4_Tag']]).unique()
TARDIS_Tags = np.ndarray.tolist(TARDIS_Tags)
TARDIS_Tags = pd.DataFrame(TARDIS_Tags, columns = ["TARDIS_XML_Tags"])
TARDIS_Tags = TARDIS_Tags[TARDIS_Tags.TARDIS_XML_Tags.notnull()]
TARDIS_Tags = TARDIS_Tags.sort_values(by=['TARDIS_XML_Tags']).reset_index(drop=True)
# write tags to csv
TARDIS_Tags.to_csv("TARDIS_Tags_061019.csv")
TARDIS_Tags

Unnamed: 0,TARDIS_XML_Tags
0,article
1,body
2,caption
3,element-citation
4,fig
5,p
6,pub-id
7,ref
8,ref-list
9,sc


In [87]:
# wcs tools
WCSTools_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("WCSTools")]
WCSTools_Tags = pd.concat([WCSTools_Tags['Parent1_Tag'], WCSTools_Tags['Parent2_Tag'], WCSTools_Tags['Parent3_Tag'], WCSTools_Tags['Parent4_Tag']]).unique()
WCSTools_Tags = np.ndarray.tolist(WCSTools_Tags)
WCSTools_Tags = pd.DataFrame(WCSTools_Tags, columns = ["WCSTools_XML_Tags"])
WCSTools_Tags = WCSTools_Tags[WCSTools_Tags.WCSTools_XML_Tags.notnull()]
WCSTools_Tags = WCSTools_Tags.sort_values(by=['WCSTools_XML_Tags']).reset_index(drop=True)
# write tags to csv
WCSTools_Tags.to_csv("WCSTools_Tags_061019.csv")
WCSTools_Tags

Unnamed: 0,WCSTools_XML_Tags
0,ack
1,app
2,app-group
3,article
4,back
5,body
6,comment
7,element-citation
8,ext-link
9,fn


In [88]:
# Proportion of articles with aliases in refrences
XML_all_refs = XML_results.loc[XML_results['ref'] == "yes"]
XML_ref_proportion = XML_all_refs.groupby('Software_Package')['File_Name'].nunique()/XML_results.groupby('Software_Package')['File_Name'].nunique()
XML_ref_proportion

Software_Package
AstroBlend           NaN
Astropy         0.858736
RADMC-3D        0.682243
SAOImage DS9    0.167155
Spec2d          0.467105
Stingray        0.500000
TARDIS          1.000000
WCSTools        0.300813
Name: File_Name, dtype: float64

In [89]:
# Total number of unique papers with software aliases in the references section

XML_all_refs = XML_results.loc[XML_results['ref'] == "yes"]
XML_ref_count = XML_all_refs.groupby('Software_Package')['File_Name'].nunique()
XML_ref_count

Software_Package
Astropy         462
RADMC-3D        146
SAOImage DS9     57
Spec2d          142
Stingray          1
TARDIS            4
WCSTools         37
Name: File_Name, dtype: int64

In [90]:
# How many unique aliases were used in the references for each package?
print (XML_all_refs.groupby('Software_Package')['Alias'].nunique())

Software_Package
Astropy          9
RADMC-3D        11
SAOImage DS9     3
Spec2d           6
Stingray         1
TARDIS           1
WCSTools         5
Name: Alias, dtype: int64


In [91]:
# Which aliases were used in the papers with aliases in references? Write results to csv
XML_ref_aliases = pd.DataFrame({'ref_count' : XML_all_refs.groupby(['Software_Package', 'Alias', 'Identifier'])['File_Name'].nunique()}).reset_index()
XML_ref_aliases.to_csv("XML_ref_aliases_061019.csv")
XML_ref_aliases

Unnamed: 0,Software_Package,Alias,Identifier,ref_count
0,Astropy,10.1051/0004-6361/201322068,1,449
1,Astropy,2013A&A...558A..33A,1,441
2,Astropy,Astropy,0,3
3,Astropy,Astropy Collaboration,0,433
4,Astropy,Astropy Collaboration 2013,0,1
5,Astropy,astropy/astroplan,0,1
6,Astropy,astropy/astroquery,0,2
7,Astropy,astropy/halotools,0,1
8,Astropy,astropy/photutils,0,3
9,RADMC-3D,10.1051/0004-6361:20031768,1,96


In [92]:
# Proportion of articles containing software aliases with the software mentioned in an acknowledgement
XML_all_ack = XML_results.loc[XML_results['ack'] == 'yes']
XML_ack_proportion = XML_all_ack.groupby('Software_Package')['File_Name'].nunique()/XML_results.groupby('Software_Package')['File_Name'].nunique()
XML_ack_proportion

Software_Package
AstroBlend           NaN
Astropy         0.903346
RADMC-3D        0.102804
SAOImage DS9    0.486804
Spec2d          0.138158
Stingray             NaN
TARDIS               NaN
WCSTools        0.081301
Name: File_Name, dtype: float64

In [93]:
# Total number of unique papers with software aliases in acknowledgements

XML_all_ack = XML_results.loc[XML_results['ack'] == "yes"]
XML_ack_count = XML_all_ack.groupby('Software_Package')['File_Name'].nunique()
XML_ack_count

Software_Package
Astropy         486
RADMC-3D         22
SAOImage DS9    166
Spec2d           42
WCSTools         10
Name: File_Name, dtype: int64

In [94]:
# Which aliases were used in the papers with aliases in acknowledgements? Write results to csv
XML_ack_aliases = pd.DataFrame({'ack_count' : XML_all_ack.groupby([ "Software_Package", "Alias", "Identifier"])['File_Name'].nunique()}).reset_index()
XML_ack_aliases.to_csv("XML_ack_aliases_061019.csv")
XML_ack_aliases

Unnamed: 0,Software_Package,Alias,Identifier,ack_count
0,Astropy,AstroPy,0,23
1,Astropy,Astropy,0,398
2,Astropy,Astropy Collaboration,0,5
3,Astropy,astropy,0,68
4,Astropy,astropy.org,0,35
5,Astropy,doi.org/10.1051/0004-6361/201322068,1,2
6,RADMC-3D,RADMC,0,10
7,RADMC-3D,RADMC-3D,0,11
8,RADMC-3D,RADMC3D,0,1
9,RADMC-3D,ita.uni-heidelberg.de/dullemond/software/radmc-3d,0,2


In [95]:
# Proportion of articles containing software aliases with the software mentioned in a footnote
XML_all_fn = XML_results.loc[XML_results['fn'] == 'yes']
XML_fn_proportion = XML_all_fn.groupby('Software_Package')['File_Name'].nunique()/XML_results.groupby('Software_Package')['File_Name'].nunique()
XML_fn_proportion

Software_Package
AstroBlend      1.000000
Astropy         0.091078
RADMC-3D        0.252336
SAOImage DS9    0.140762
Spec2d          0.243421
Stingray             NaN
TARDIS               NaN
WCSTools        0.260163
Name: File_Name, dtype: float64

In [96]:
# Total number of unique papers with software aliases in footnotes

XML_all_fn = XML_results.loc[XML_results['fn'] == "yes"]
XML_fn_count = XML_all_fn.groupby('Software_Package')['File_Name'].nunique()
XML_fn_count

Software_Package
AstroBlend       1
Astropy         49
RADMC-3D        54
SAOImage DS9    48
Spec2d          74
WCSTools        32
Name: File_Name, dtype: int64

In [97]:
# Which aliases were used in the papers with aliases in footnotes? Write results to csv
XML_fn_aliases = pd.DataFrame({'fn_count' : XML_all_fn.groupby( [ "Software_Package", "Alias", "Identifier"])['File_Name'].nunique()}).reset_index()
XML_fn_aliases.to_csv("XML_fn_aliases_061019.csv")
XML_fn_aliases

Unnamed: 0,Software_Package,Alias,Identifier,fn_count
0,AstroBlend,astroblend.com,0,1
1,Astropy,Astropy,0,4
2,Astropy,astropy,0,2
3,Astropy,astropy.org,0,40
4,Astropy,github.com/StingraySoftware/HENDRIC,0,1
5,Astropy,github.com/StingraySoftware/notebooks,0,1
6,Astropy,github.com/astropy/astroplan,0,1
7,Astropy,github.com/astropy/astroquery,0,1
8,Astropy,github.com/astropy/astroscrappy,0,2
9,Astropy,github.com/astropy/photutils,0,1


In [98]:
# Did any articles have no reference, acknowledgement, or footnote?
no_credit = XML_results.loc[(XML_results['fn'] == 'no') & (XML_results['ref'] == 'no') & (XML_results['ack'] == 'no')]
no_credit = no_credit.loc[~(no_credit['File_Name'].isin(XML_all_fn['File_Name']))]
no_credit = no_credit.loc[~(no_credit['File_Name'].isin(XML_all_refs['File_Name']))]
no_credit = no_credit.loc[~(no_credit['File_Name'].isin(XML_all_ack['File_Name']))]
no_credit.groupby('Software_Package')['File_Name'].nunique()

Software_Package
Astropy          7
RADMC-3D        23
SAOImage DS9    98
Spec2d          77
Stingray         1
WCSTools        51
Name: File_Name, dtype: int64

In [100]:
# total mentions of software aliases over time
XML_over_time = pd.DataFrame({'year_count' : XML_results.groupby(["Software_Package", "Pub_Year"])['File_Name'].nunique()}).reset_index()
# write results
XML_over_time.to_csv("XML_over_time_061019.csv")
# show results over time
XML_over_time

Unnamed: 0,Software_Package,Pub_Year,year_count
0,AstroBlend,2016,1
1,Astropy,2012,1
2,Astropy,2013,3
3,Astropy,2014,37
4,Astropy,2015,63
5,Astropy,2016,117
6,Astropy,2017,199
7,Astropy,2018,118
8,RADMC-3D,2004,1
9,RADMC-3D,2005,1
