# XML Final Analysis
### Daina Bouquin, Daniel Chivvis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
XML_results = pd.read_csv("XML_CLEAN_RESULTS_032519.csv") 

In [3]:
list(XML_results.columns.values)

['Alias',
 'Software_Package',
 'Identifier',
 'Pub_Year',
 'Journal_Title',
 'Article_id',
 'File_Name',
 'Parent1_Tag',
 'Parent2_Tag',
 'Parent3_Tag',
 'Parent4_Tag',
 'Parent1_Content',
 'Parent2_Content',
 'Parent3_Content',
 'Parent4_Content',
 'Title',
 'Author(s)',
 'Publisher',
 'Citation']

In [4]:
XML_results.head(5)

Unnamed: 0,Alias,Software_Package,Identifier,Pub_Year,Journal_Title,Article_id,File_Name,Parent1_Tag,Parent2_Tag,Parent3_Tag,Parent4_Tag,Parent1_Content,Parent2_Content,Parent3_Content,Parent4_Content,Title,Author(s),Publisher,Citation
0,astroblend.com,AstroBlend,0,2016,The Astrophysical Journal,"apj521773, 10.3847/0004-637X/818/2/115, 521773...",apj_818_2_115.xml,ext-link,p,fn,p,['http://www.astroblend.com'],"['\n<ext-link ext-link-type=""uri"" xlink:href=""...",['\n<label><sup>26</sup></label>\n<p>\n<ext-li...,['We note that our example scripts only explor...,ADVANCED DATA VISUALIZATION IN ASTROPHYSICS: T...,Vogt Frédéric P. A. Owen Chris I. Verdes-Mo...,The American Astronomical Society,False
1,10.1051/0004-6361/201322068,AstroPy,1,2014,The Astronomical Journal,"aj493368, ANJOAA, 10.1088/0004-6256/148/1/13, ...",aj_148_1_13.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...","['\n<title>References</title>\n<ref id=""aj4933...",TYPE Ia SUPERNOVA RATE MEASUREMENTS TO REDSHIF...,Rodney Steven A.Riess Adam G.Strolger Louis-Gr...,The American Astronomical Society,False
2,10.1051/0004-6361/201322068,AstroPy,1,2014,The Astronomical Journal,"aj495229, ANJOAA, 10.1088/0004-6256/148/1/14, ...",aj_148_1_14.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...","['\n<title>References</title>\n<ref id=""aj4952...",DISCOVERY OF EIGHT z ∼ 6 QUASARS FROM Pan-STARRS1,Bañados E.Venemans B. P.Morganson E.Decarli R....,The American Astronomical Society,False
3,10.1051/0004-6361/201322068,AstroPy,1,2014,The Astronomical Journal,"aj499538, ANJOAA, 10.1088/0004-6256/148/3/53, ...",aj_148_3_53.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...","['\n<title>References</title>\n<ref id=""aj4995...","CORRECTING FOR TELLURIC ABSORPTION: METHODS, C...",Gullikson KevinDodson-Robinson SarahKraus Adam...,The American Astronomical Society,False
4,10.1051/0004-6361/201322068,AstroPy,1,2014,The Astronomical Journal,"aj503145, ANJOAA, 10.1088/0004-6256/148/6/122,...",aj_148_6_122.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...","['\n<title>References</title>\n<ref id=""aj5031...",YSOVAR: MID-INFRARED VARIABILITY IN THE STAR-F...,Günther H. M.Cody A. M.Covey K. R.Hillenbrand ...,The American Astronomical Society,False


In [5]:
# Convert tag and content cols to strings

XML_results['Parent1_Tag'] = XML_results['Parent1_Tag'].astype('|S')
XML_results['Parent2_Tag'] = XML_results['Parent2_Tag'].astype('|S')
XML_results['Parent3_Tag'] = XML_results['Parent3_Tag'].astype('|S')
XML_results['Parent4_Tag'] = XML_results['Parent4_Tag'].astype('|S')

XML_results['Parent1_Content'] = XML_results['Parent1_Content'].astype('|S')
XML_results['Parent2_Content'] = XML_results['Parent2_Content'].astype('|S')
XML_results['Parent3_Content'] = XML_results['Parent3_Content'].astype('|S')
XML_results['Parent4_Content'] = XML_results['Parent4_Content'].astype('|S')

In [6]:
# Create new columns

# If the tag label or tag content contain any of the following reference elements it will be marked "yes":
# element-citation 
# mixed-citation
# nlm-citation
# ref
# citation-alternatives
# ref-list 
# xref
# source
# bibr

# Tags - check all parent tags
# Tag1
XML_results['ref'] = np.where((XML_results['Parent1_Tag']== "element-citation") | (XML_results['Parent1_Tag']== "mixed-citation") | (XML_results['Parent1_Tag']== "nlm-citation") | (XML_results['Parent1_Tag']== "ref") | (XML_results['Parent1_Tag']== "citation-alternatives") | (XML_results['Parent1_Tag']== "ref-list") | (XML_results['Parent1_Tag']== "xref") | (XML_results['Parent1_Tag']== "source") | (XML_results['Parent1_Tag']== "bibr"), 'yes', 'no')

# Tag2
XML_results['ref'] = np.where((XML_results['Parent2_Tag']== "element-citation") | (XML_results['Parent2_Tag']== "mixed-citation") | (XML_results['Parent2_Tag']== "nlm-citation") | (XML_results['Parent2_Tag']== "ref") | (XML_results['Parent2_Tag']== "citation-alternatives") | (XML_results['Parent2_Tag']== "ref-list") | (XML_results['Parent2_Tag']== "xref") | (XML_results['Parent2_Tag']== "source") | (XML_results['Parent2_Tag']== "bibr"), 'yes', 'no')

# Tag3
XML_results['ref'] = np.where((XML_results['Parent3_Tag']== "element-citation") | (XML_results['Parent3_Tag']== "mixed-citation") | (XML_results['Parent3_Tag']== "nlm-citation") | (XML_results['Parent3_Tag']== "ref") | (XML_results['Parent3_Tag']== "citation-alternatives") | (XML_results['Parent3_Tag']== "ref-list") | (XML_results['Parent3_Tag']== "xref") | (XML_results['Parent3_Tag']== "source") | (XML_results['Parent3_Tag']== "bibr"), 'yes', 'no')

# Tag4
XML_results['ref'] = np.where((XML_results['Parent4_Tag']== "element-citation") | (XML_results['Parent4_Tag']== "mixed-citation") | (XML_results['Parent4_Tag']== "nlm-citation") | (XML_results['Parent4_Tag']== "ref") | (XML_results['Parent4_Tag']== "citation-alternatives") | (XML_results['Parent4_Tag']== "ref-list") | (XML_results['Parent4_Tag']== "xref") | (XML_results['Parent4_Tag']== "source") | (XML_results['Parent4_Tag']== "bibr"), 'yes', 'no')

In [7]:
# Tag Content check
# Tag1 content
XML_results['ref'] = np.where((XML_results['Parent1_Content'].str.contains("element-citation")) | (XML_results['Parent1_Content'].str.contains("mixed-citation")) | (XML_results['Parent1_Content'].str.contains("nlm-citation")) | (XML_results['Parent1_Content'].str.contains("ref")) | (XML_results['Parent1_Content'].str.contains("citation-alternatives")) | (XML_results['Parent1_Content'].str.contains("ref-list")) | (XML_results['Parent1_Content'].str.contains("xref")) | (XML_results['Parent1_Content'].str.contains("source")) | (XML_results['Parent1_Content'].str.contains("bibr")), 'yes', 'no')

# Tag2 content
XML_results['ref'] = np.where((XML_results['Parent2_Content'].str.contains("element-citation")) | (XML_results['Parent2_Content'].str.contains("mixed-citation")) | (XML_results['Parent2_Content'].str.contains("nlm-citation")) | (XML_results['Parent2_Content'].str.contains("ref")) | (XML_results['Parent2_Content'].str.contains("citation-alternatives")) | (XML_results['Parent2_Content'].str.contains("ref-list")) | (XML_results['Parent2_Content'].str.contains("xref")) | (XML_results['Parent2_Content'].str.contains("source")) | (XML_results['Parent2_Content'].str.contains("bibr")), 'yes', 'no')

# Tag3 content
XML_results['ref'] = np.where((XML_results['Parent3_Content'].str.contains("element-citation")) | (XML_results['Parent3_Content'].str.contains("mixed-citation")) | (XML_results['Parent3_Content'].str.contains("nlm-citation")) | (XML_results['Parent3_Content'].str.contains("ref")) | (XML_results['Parent3_Content'].str.contains("citation-alternatives")) | (XML_results['Parent3_Content'].str.contains("ref-list")) | (XML_results['Parent3_Content'].str.contains("xref")) | (XML_results['Parent3_Content'].str.contains("source")) | (XML_results['Parent3_Content'].str.contains("bibr")), 'yes', 'no')

# Tag4 content
XML_results['ref'] = np.where((XML_results['Parent4_Content'].str.contains("element-citation")) | (XML_results['Parent4_Content'].str.contains("mixed-citation")) | (XML_results['Parent4_Content'].str.contains("nlm-citation")) | (XML_results['Parent4_Content'].str.contains("ref")) | (XML_results['Parent4_Content'].str.contains("citation-alternatives")) | (XML_results['Parent4_Content'].str.contains("ref-list")) | (XML_results['Parent4_Content'].str.contains("xref")) | (XML_results['Parent4_Content'].str.contains("source")) | (XML_results['Parent4_Content'].str.contains("bibr")), 'yes', 'no')

In [8]:
# Create column for acknowledgements
# If the tag lable or tag content contains "ack" it will be marked "yes"

# tags
XML_results['ack'] = np.where((XML_results['Parent1_Tag']== "ack") | (XML_results['Parent2_Tag']== "ack") | (XML_results['Parent3_Tag']== "ack") | (XML_results['Parent4_Tag']== "ack"), 'yes', 'no')

# content
XML_results['ack'] = np.where((XML_results['Parent1_Content'].str.contains("ack")) | (XML_results['Parent2_Content'].str.contains("ack")) | (XML_results['Parent3_Content'].str.contains("ack")) | (XML_results['Parent4_Content'].str.contains("ack")), 'yes', 'no')

In [9]:
# Create column for footnotes
# If the tag lable or tag content contains "fn" or "fn-group" it will be marked "yes"

# tags
XML_results['fn'] = np.where((XML_results['Parent1_Tag']== "fn") | (XML_results['Parent2_Tag']== "fn") | (XML_results['Parent3_Tag']== "fn") | (XML_results['Parent4_Tag']== "fn"), 'yes', 'no')
XML_results['fn'] = np.where((XML_results['Parent1_Tag']== "fn-group") | (XML_results['Parent2_Tag']== "fn-group") | (XML_results['Parent3_Tag']== "fn-group") | (XML_results['Parent4_Tag']== "fn-group"), 'yes', 'no')

# content
XML_results['fn'] = np.where((XML_results['Parent1_Content'].str.contains("fn")) | (XML_results['Parent2_Content'].str.contains("fn")) | (XML_results['Parent3_Content'].str.contains("fn")) | (XML_results['Parent4_Content'].str.contains("fn")), 'yes', 'no')
XML_results['fn'] = np.where((XML_results['Parent1_Content'].str.contains("fn-group")) | (XML_results['Parent2_Content'].str.contains("fn-group")) | (XML_results['Parent3_Content'].str.contains("fn-group")) | (XML_results['Parent4_Content'].str.contains("fn-group")), 'yes', 'no')

In [10]:
# Check new cols
list(XML_results.columns.values)

['Alias',
 'Software_Package',
 'Identifier',
 'Pub_Year',
 'Journal_Title',
 'Article_id',
 'File_Name',
 'Parent1_Tag',
 'Parent2_Tag',
 'Parent3_Tag',
 'Parent4_Tag',
 'Parent1_Content',
 'Parent2_Content',
 'Parent3_Content',
 'Parent4_Content',
 'Title',
 'Author(s)',
 'Publisher',
 'Citation',
 'ref',
 'ack',
 'fn']

In [11]:
XML_results.to_csv("XML_RESULTS_FINAL_040419.csv")

## Summary of Results

In [49]:
# How many papers did we find for each software package?
XML_results.groupby('Software_Package')['File_Name'].nunique()

Software_Package
AstroBlend        1
AstroPy         483
RADMC-3D        200
SAOImage DS9    323
Spec2d          252
Stingray          1
TARDIS            5
WCS Tools       113
Name: File_Name, dtype: int64

In [50]:
# Which tags were associated with each software package?
XML_tags1 = pd.DataFrame({'tags' : XML_results.groupby( [ "Software_Package", "Parent1_Tag"] ).size()}).reset_index()
XML_tags2 = pd.DataFrame({'tags' : XML_results.groupby( [ "Software_Package", "Parent2_Tag"] ).size()}).reset_index()
XML_tags3 = pd.DataFrame({'tags' : XML_results.groupby( [ "Software_Package", "Parent3_Tag"] ).size()}).reset_index()
XML_tags4 = pd.DataFrame({'tags' : XML_results.groupby( [ "Software_Package", "Parent4_Tag"] ).size()}).reset_index()
XML_all_tags = [XML_tags1, XML_tags2, XML_tags3, XML_tags4]
XML_all_tags = pd.concat(XML_all_tags)
# write tags to csv
XML_all_tags.to_csv("XML_all_tags_040419.csv")

In [51]:
# AstroBlend
AstroBlend_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("AstroBlend")]
AstroBlend_Tags = pd.concat([AstroBlend_Tags['Parent1_Tag'], AstroBlend_Tags['Parent2_Tag'], AstroBlend_Tags['Parent3_Tag'], AstroBlend_Tags['Parent4_Tag']]).unique()
AstroBlend_Tags = np.ndarray.tolist(AstroBlend_Tags)
AstroBlend_Tags = pd.DataFrame(AstroBlend_Tags, columns = ["AstroBlend_XML_Tags"])
AstroBlend_Tags = AstroBlend_Tags[AstroBlend_Tags.AstroBlend_XML_Tags.notnull()]
AstroBlend_Tags = AstroBlend_Tags.sort_values(by=['AstroBlend_XML_Tags']).reset_index(drop=True)
# write tags to csv
AstroBlend_Tags.to_csv("AstroBlend_Tags_040419.csv")
AstroBlend_Tags

Unnamed: 0,AstroBlend_XML_Tags
0,ext-link
1,fn
2,p


In [52]:
# AstroPy
AstroPy_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("AstroPy")]
AstroPy_Tags = pd.concat([AstroPy_Tags['Parent1_Tag'], AstroPy_Tags['Parent2_Tag'], AstroPy_Tags['Parent3_Tag'], AstroPy_Tags['Parent4_Tag']]).unique()
AstroPy_Tags = np.ndarray.tolist(AstroPy_Tags)
AstroPy_Tags = pd.DataFrame(AstroPy_Tags, columns = ["AstroPy_XML_Tags"])
AstroPy_Tags = AstroPy_Tags[AstroPy_Tags.AstroPy_XML_Tags.notnull()]
AstroPy_Tags = AstroPy_Tags.sort_values(by=['AstroPy_XML_Tags']).reset_index(drop=True)
# write tags to csv
AstroPy_Tags.to_csv("AstroPy_Tags_040419.csv")
AstroPy_Tags

Unnamed: 0,AstroPy_XML_Tags
0,abstract
1,ack
2,article
3,article-meta
4,back
5,body
6,element-citation
7,ext-link
8,fn
9,list


In [53]:
# RADMC-3D
RADMC3D_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("RADMC-3D")]
RADMC3D_Tags = pd.concat([RADMC3D_Tags['Parent1_Tag'], RADMC3D_Tags['Parent2_Tag'], RADMC3D_Tags['Parent3_Tag'], RADMC3D_Tags['Parent4_Tag']]).unique()
RADMC3D_Tags = np.ndarray.tolist(RADMC3D_Tags)
RADMC3D_Tags = pd.DataFrame(RADMC3D_Tags, columns = ["RADMC3D_XML_Tags"])
RADMC3D_Tags = RADMC3D_Tags[RADMC3D_Tags.RADMC3D_XML_Tags.notnull()]
RADMC3D_Tags = RADMC3D_Tags.sort_values(by=['RADMC3D_XML_Tags']).reset_index(drop=True)
# write tags to csv
RADMC3D_Tags.to_csv("RADMC3D_Tags_040419.csv")
RADMC3D_Tags

Unnamed: 0,RADMC3D_XML_Tags
0,abstract
1,ack
2,app
3,app-group
4,article
5,article-meta
6,article-title
7,back
8,body
9,caption


In [54]:
# DS9
SAOImageDS9_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("SAOImage DS9")]
SAOImageDS9_Tags = pd.concat([SAOImageDS9_Tags['Parent1_Tag'], SAOImageDS9_Tags['Parent2_Tag'], SAOImageDS9_Tags['Parent3_Tag'], SAOImageDS9_Tags['Parent4_Tag']]).unique()
SAOImageDS9_Tags = np.ndarray.tolist(SAOImageDS9_Tags)
SAOImageDS9_Tags = pd.DataFrame(SAOImageDS9_Tags, columns = ["SAOImageDS9_XML_Tags"])
SAOImageDS9_Tags = SAOImageDS9_Tags[SAOImageDS9_Tags.SAOImageDS9_XML_Tags.notnull()]
SAOImageDS9_Tags = SAOImageDS9_Tags.sort_values(by=['SAOImageDS9_XML_Tags']).reset_index(drop=True)
# write tags to csv
SAOImageDS9_Tags.to_csv("SAOImageDS9_Tags_040419.csv")
SAOImageDS9_Tags

Unnamed: 0,SAOImageDS9_XML_Tags
0,ack
1,app
2,app-group
3,article
4,back
5,body
6,caption
7,element-citation
8,ext-link
9,fig


In [55]:
# Spec2d
Spec2d_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("Spec2d")]
Spec2d_Tags = pd.concat([Spec2d_Tags['Parent1_Tag'], Spec2d_Tags['Parent2_Tag'], Spec2d_Tags['Parent3_Tag'], Spec2d_Tags['Parent4_Tag']]).unique()
Spec2d_Tags = np.ndarray.tolist(Spec2d_Tags)
Spec2d_Tags = pd.DataFrame(Spec2d_Tags, columns = ["Spec2d_XML_Tags"])
Spec2d_Tags = Spec2d_Tags[Spec2d_Tags.Spec2d_XML_Tags.notnull()]
Spec2d_Tags = Spec2d_Tags.sort_values(by=['Spec2d_XML_Tags']).reset_index(drop=True)
# write tags to csv
Spec2d_Tags.to_csv("Spec2d_Tags_040419.csv")
Spec2d_Tags

Unnamed: 0,Spec2d_XML_Tags
0,abstract
1,ack
2,app
3,app-group
4,article
5,article-id
6,article-meta
7,article-title
8,back
9,body


In [56]:
# Stingray
Stingray_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("Stingray")]
Stingray_Tags = pd.concat([Stingray_Tags['Parent1_Tag'], Stingray_Tags['Parent2_Tag'], Stingray_Tags['Parent3_Tag'], Stingray_Tags['Parent4_Tag']]).unique()
Stingray_Tags = np.ndarray.tolist(Stingray_Tags)
Stingray_Tags = pd.DataFrame(Stingray_Tags, columns = ["Stingray_XML_Tags"])
Stingray_Tags = Stingray_Tags[Stingray_Tags.Stingray_XML_Tags.notnull()]
Stingray_Tags = Stingray_Tags.sort_values(by=['Stingray_XML_Tags']).reset_index(drop=True)
# write tags to csv
Stingray_Tags.to_csv("Stingray_Tags_040419.csv")
Stingray_Tags

Unnamed: 0,Stingray_XML_Tags
0,ack
1,article
2,back
3,element-citation
4,ext-link
5,fn
6,monospace
7,p
8,ref
9,ref-list


In [57]:
# tardis
TARDIS_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("TARDIS")]
TARDIS_Tags = pd.concat([TARDIS_Tags['Parent1_Tag'], TARDIS_Tags['Parent2_Tag'], TARDIS_Tags['Parent3_Tag'], TARDIS_Tags['Parent4_Tag']]).unique()
TARDIS_Tags = np.ndarray.tolist(TARDIS_Tags)
TARDIS_Tags = pd.DataFrame(TARDIS_Tags, columns = ["TARDIS_XML_Tags"])
TARDIS_Tags = TARDIS_Tags[TARDIS_Tags.TARDIS_XML_Tags.notnull()]
TARDIS_Tags = TARDIS_Tags.sort_values(by=['TARDIS_XML_Tags']).reset_index(drop=True)
# write tags to csv
TARDIS_Tags.to_csv("TARDIS_Tags_040419.csv")
TARDIS_Tags

Unnamed: 0,TARDIS_XML_Tags
0,article
1,body
2,element-citation
3,p
4,pub-id
5,ref
6,ref-list
7,sec


In [58]:
# wcs tools
WCSTools_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("WCS Tools")]
WCSTools_Tags = pd.concat([WCSTools_Tags['Parent1_Tag'], WCSTools_Tags['Parent2_Tag'], WCSTools_Tags['Parent3_Tag'], WCSTools_Tags['Parent4_Tag']]).unique()
WCSTools_Tags = np.ndarray.tolist(WCSTools_Tags)
WCSTools_Tags = pd.DataFrame(WCSTools_Tags, columns = ["WCSTools_XML_Tags"])
WCSTools_Tags = WCSTools_Tags[WCSTools_Tags.WCSTools_XML_Tags.notnull()]
WCSTools_Tags = WCSTools_Tags.sort_values(by=['WCSTools_XML_Tags']).reset_index(drop=True)
# write tags to csv
WCSTools_Tags.to_csv("WCSTools_Tags_040419.csv")
WCSTools_Tags

Unnamed: 0,WCSTools_XML_Tags
0,ack
1,app
2,app-group
3,article
4,back
5,body
6,comment
7,element-citation
8,ext-link
9,fn


In [103]:
# Proportion of articles with a software aliase mentioned in the references
XML_all_refs = XML_results.loc[XML_results['ref'] == 'yes']
XML_ref_proportion = XML_all_refs.groupby('Software_Package')['File_Name'].nunique()/XML_results.groupby('Software_Package')['File_Name'].nunique()
XML_ref_proportion

Software_Package
AstroBlend      1.000000
AstroPy         1.000000
RADMC-3D        1.000000
SAOImage DS9    1.000000
Spec2d          0.996032
Stingray        1.000000
TARDIS          1.000000
WCS Tools       1.000000
Name: File_Name, dtype: float64

In [104]:
# How many unique aliases were used in the references for each package?
print (XML_all_refs.groupby('Software_Package')['Alias'].nunique())

Software_Package
AstroBlend       1
AstroPy          6
RADMC-3D        12
SAOImage DS9     8
Spec2d          12
Stingray         2
TARDIS           3
WCS Tools       12
Name: Alias, dtype: int64


In [154]:
# Which aliases were used in the papers with aliases in references? Write results to csv
XML_ref_aliases = pd.DataFrame({'ref_count' : XML_all_refs.groupby(['Software_Package', 'Alias', 'Identifier'])['File_Name'].nunique()}).reset_index()
XML_ref_aliases.to_csv("XML_ref_aliases_040419.csv")
XML_ref_aliases

Unnamed: 0,Software_Package,Alias,Identifier,ref_count
0,AstroBlend,astroblend.com,0,1
1,AstroPy,10.1051/0004-6361/201322068,1,449
2,AstroPy,2013A&A...558A..33A,1,441
3,AstroPy,AstroPy,0,27
4,AstroPy,astropy.org,0,74
5,AstroPy,doi.org/10.1051/0004-6361/201322068,1,2
6,AstroPy,github.com/astropy,0,6
7,RADMC-3D,10.1051/0004-6361:20031768,1,96
8,RADMC-3D,10.1051/0004-6361:20040017,1,11
9,RADMC-3D,2004A&A...417..159D,1,96


In [155]:
# Proportion of articles containing software aliases with the software mentioned in an acknowledgement
XML_all_ack = XML_results.loc[XML_results['ack'] == 'yes']
XML_ack_proportion = XML_all_ack.groupby('Software_Package')['File_Name'].nunique()/XML_results.groupby('Software_Package')['File_Name'].nunique()
XML_ack_proportion

Software_Package
AstroBlend           NaN
AstroPy         0.374741
RADMC-3D        0.595000
SAOImage DS9    0.941176
Spec2d          0.563492
Stingray        1.000000
TARDIS          0.400000
WCS Tools       0.893805
Name: File_Name, dtype: float64

In [156]:
# Which aliases were used in the papers with aliases in acknowledgements? Write results to csv
XML_ack_aliases = pd.DataFrame({'ack_count' : XML_all_ack.groupby([ "Software_Package", "Alias", "Identifier"])['File_Name'].nunique()}).reset_index()
XML_ack_aliases.to_csv("XML_ack_aliases_040419.csv")
XML_ack_aliases

Unnamed: 0,Software_Package,Alias,Identifier,ack_count
0,AstroPy,10.1051/0004-6361/201322068,1,95
1,AstroPy,2013A&A...558A..33A,1,96
2,AstroPy,AstroPy,0,27
3,AstroPy,astropy.org,0,66
4,AstroPy,doi.org/10.1051/0004-6361/201322068,1,2
5,AstroPy,github.com/astropy,0,5
6,RADMC-3D,10.1051/0004-6361:20031768,1,15
7,RADMC-3D,10.1051/0004-6361:20040017,1,10
8,RADMC-3D,2004A&A...417..159D,1,15
9,RADMC-3D,2004A&A...417..793P,1,10


In [157]:
# Proportion of articles containing software aliases with the software mentioned in a footnote
XML_all_fn = XML_results.loc[XML_results['fn'] == 'yes']
XML_fn_proportion = XML_all_fn.groupby('Software_Package')['File_Name'].nunique()/XML_results.groupby('Software_Package')['File_Name'].nunique()
XML_fn_proportion

Software_Package
AstroBlend           NaN
AstroPy              NaN
RADMC-3D             NaN
SAOImage DS9    0.099071
Spec2d          0.103175
Stingray             NaN
TARDIS               NaN
WCS Tools       0.221239
Name: File_Name, dtype: float64

In [159]:
# Which aliases were used in the papers with aliases in footnotes? Write results to csv
XML_fn_aliases = pd.DataFrame({'fn_count' : XML_all_fn.groupby( [ "Software_Package", "Alias", "Identifier"])['File_Name'].nunique()}).reset_index()
XML_fn_aliases.to_csv("XML_fn_aliases_040419.csv")
XML_fn_aliases

Unnamed: 0,Software_Package,Alias,Identifier,fn_count
0,SAOImage DS9,DS9,0,15
1,SAOImage DS9,ds9,0,6
2,SAOImage DS9,SAOImage,0,2
3,SAOImage DS9,SAOImage DS9,0,10
4,Spec2d,DEEP2 DEIMOS data pipeline,0,1
5,Spec2d,DEEP2 DEIMOS pipeline,0,2
6,Spec2d,DEIMOS pipeline,0,3
7,Spec2d,Spec2d,0,1
8,Spec2d,spec2d,0,20
9,WCS Tools,WCS Tools,0,1


In [163]:
# Did any articles have no reference, acknowledgement, or footnote?
no_credit = XML_results.loc[(XML_results['fn'] == 'no') & (XML_results['ref'] == 'no') & (XML_results['ack'] == 'no')]
no_credit = no_credit.loc[~(no_credit['File_Name'].isin(XML_all_fn['File_Name']))]
no_credit = no_credit.loc[~(no_credit['File_Name'].isin(XML_all_refs['File_Name']))]
no_credit = no_credit.loc[~(no_credit['File_Name'].isin(XML_all_ack['File_Name']))]
no_credit.groupby('Software_Package')['File_Name'].nunique()

Software_Package
Spec2d    1
Name: File_Name, dtype: int64

In [167]:
# total mentions of software aliases over time
XML_over_time = pd.DataFrame({'year_count' : XML_results.groupby(["Software_Package", "Pub_Year"])['File_Name'].nunique()}).reset_index()
# write results
XML_over_time.to_csv("XML_over_time_040419.csv")
# show results over time
XML_over_time

Unnamed: 0,Software_Package,Pub_Year,year_count
0,AstroBlend,2016,1
1,AstroPy,2013,3
2,AstroPy,2014,36
3,AstroPy,2015,60
4,AstroPy,2016,100
5,AstroPy,2017,176
6,AstroPy,2018,108
7,RADMC-3D,2009,13
8,RADMC-3D,2010,19
9,RADMC-3D,2011,13
