# XML Final Analysis
### Daina Bouquin, Daniel Chivvis

Scripts below were used to generate all .csv files in the XML_RESULTS_082019/ folder

In [139]:
import pandas as pd
import numpy as np
import sys
import csv

In [140]:
XML_results = pd.read_csv("XML_CLEAN_INPUT_082019.csv") 

In [141]:
list(XML_results.columns.values)

['Alias',
 'Software_Package',
 'Identifier',
 'Pub_Year',
 'DOI',
 'Journal_Title',
 'Article_ID',
 'File_Name',
 'Parent1_Tag',
 'Parent2_Tag',
 'Parent3_Tag',
 'Parent4_Tag',
 'Parent1_Content',
 'Parent2_Content',
 'Parent3_Content',
 'Author(s)',
 'Publisher',
 'Title']

In [142]:
XML_results.head(5)

Unnamed: 0,Alias,Software_Package,Identifier,Pub_Year,DOI,Journal_Title,Article_ID,File_Name,Parent1_Tag,Parent2_Tag,Parent3_Tag,Parent4_Tag,Parent1_Content,Parent2_Content,Parent3_Content,Author(s),Publisher,Title
0,astroblend,AstroBlend,0,2016,10.3847/0004-637X/818/2/115,The Astrophysical Journal,"apj521773, 10.3847/0004-637X/818/2/115, 521773...",apj_818_2_115.xml,sc,p,sec,sec,['astroblend'],['We note that our example scripts only explor...,['\n<label>3.5.</label>\n<title>From <sans-ser...,Vogt Frédéric P. A. Owen Chris I. Verdes-Mon...,The American Astronomical Society,ADVANCED DATA VISUALIZATION IN ASTROPHYSICS: T...
1,astroblend.com,AstroBlend,0,2016,10.3847/0004-637X/818/2/115,The Astrophysical Journal,"apj521773, 10.3847/0004-637X/818/2/115, 521773...",apj_818_2_115.xml,ext-link,p,fn,p,['http://www.astroblend.com'],"['\n<ext-link ext-link-type=""uri"" xlink:href=""...",['\n<label><sup>26</sup></label>\n<p>\n<ext-li...,Vogt Frédéric P. A. Owen Chris I. Verdes-Mon...,The American Astronomical Society,ADVANCED DATA VISUALIZATION IN ASTROPHYSICS: T...
2,10.1051/0004-6361/201322068,Astropy,1,2014,10.1088/0004-6256/148/1/13,The Astronomical Journal,"aj493368, ANJOAA, 10.1088/0004-6256/148/1/13, ...",aj_148_1_13.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...",Rodney Steven A.Riess Adam G.Strolger Louis-Gr...,The American Astronomical Society,TYPE Ia SUPERNOVA RATE MEASUREMENTS TO REDSHIF...
3,10.1051/0004-6361/201322068,Astropy,1,2014,10.1088/0004-6256/148/1/14,The Astronomical Journal,"aj495229, ANJOAA, 10.1088/0004-6256/148/1/14, ...",aj_148_1_14.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...",Bañados E.Venemans B. P.Morganson E.Decarli R....,The American Astronomical Society,DISCOVERY OF EIGHT z ∼ 6 QUASARS FROM Pan-STARRS1
4,10.1051/0004-6361/201322068,Astropy,1,2014,10.1088/0004-6256/148/3/53,The Astronomical Journal,"aj499538, ANJOAA, 10.1088/0004-6256/148/3/53, ...",aj_148_3_53.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...",Gullikson KevinDodson-Robinson SarahKraus Adam...,The American Astronomical Society,"CORRECTING FOR TELLURIC ABSORPTION: METHODS, C..."


In [143]:
# Convert tag, content, identifier, pub_year cols to strings

XML_results['Parent1_Tag'] = XML_results['Parent1_Tag'].astype('|S')
XML_results['Parent2_Tag'] = XML_results['Parent2_Tag'].astype('|S')
XML_results['Parent3_Tag'] = XML_results['Parent3_Tag'].astype('|S')
XML_results['Parent4_Tag'] = XML_results['Parent4_Tag'].astype('|S')

XML_results['Parent1_Content'] = XML_results['Parent1_Content'].astype('|S')
XML_results['Parent2_Content'] = XML_results['Parent2_Content'].astype('|S')
XML_results['Parent3_Content'] = XML_results['Parent3_Content'].astype('|S')

XML_results['Identifier'] = XML_results['Identifier'].astype('|S')
XML_results['Pub_Year'] = XML_results['Pub_Year'].astype('|S')
XML_results['Alias'] = XML_results['Alias'].astype('|S')

In [144]:
# Create column for reference section

# If the tag lable or tag content contain any of the following reference elements it will be marked "yes":
# element-citation 
# mixed-citation
# nlm-citation
# ref
# citation-alternatives
# ref-list 
# xref
# source
# bibr
# collab
# contrib-group
# person-group
# pub-id

references = ['Parent1_Tag','Parent2_Tag','Parent3_Tag','Parent4_Tag','Parent1_Content','Parent2_Content']
XML_results["ref"] = np.where((XML_results[references] == "element-citation").any(axis=1) | (XML_results[references]== "mixed-citation").any(axis=1) | (XML_results[references]== "nlm-citation").any(axis=1) | (XML_results[references]== "ref").any(axis=1) | (XML_results[references]== "citation-alternatives").any(axis=1) | (XML_results[references]== "ref-list").any(axis=1) | (XML_results[references]== "xref").any(axis=1) | (XML_results[references]== "source").any(axis=1) | (XML_results[references]== "collab").any(axis=1) | (XML_results[references]== "contrib-group").any(axis=1) | (XML_results[references]== "person-group").any(axis=1) | (XML_results[references]== "pub-id").any(axis=1) | (XML_results[references]== "bibr").any(axis=1), "yes", "no")

In [145]:
# Create column for acknowledgements
# If the tag lable or tag content contains "ack" it will be marked "yes"

acknowledgements = ['Parent1_Tag','Parent2_Tag','Parent3_Tag','Parent4_Tag','Parent1_Content','Parent2_Content']
XML_results["ack"] = np.where((XML_results[acknowledgements] == "ack").any(axis=1), "yes", "no")

In [146]:
# Create column for footnotes
# If the tag lable or tag content contains "fn" or "fn-group" it will be marked "yes"

footnotes = ['Parent1_Tag','Parent2_Tag','Parent3_Tag','Parent4_Tag','Parent1_Content','Parent2_Content']
XML_results["fn"] = np.where((XML_results[footnotes] == "fn").any(axis=1) | (XML_results[footnotes]== "fn-group").any(axis=1), "yes", "no")

In [147]:
# Create column for attempt at recognizable credit (ref + ack + fn + ext-link + back)

# element-citation 
# mixed-citation
# nlm-citation
# ref
# citation-alternatives
# ref-list 
# xref
# source
# bibr
# collab
# contrib-group
# person-group
# pub-id
# ext-link
# back

rec_credit = ['Parent1_Tag','Parent2_Tag','Parent3_Tag','Parent4_Tag','Parent1_Content','Parent2_Content']
XML_results["rec_credit"] = np.where((XML_results[rec_credit] == "element-citation").any(axis=1) | (XML_results[rec_credit]== "mixed-citation").any(axis=1) | (XML_results[rec_credit]== "nlm-citation").any(axis=1) | (XML_results[rec_credit]== "ref").any(axis=1) | (XML_results[rec_credit]== "citation-alternatives").any(axis=1) | (XML_results[rec_credit]== "ref-list").any(axis=1) | (XML_results[rec_credit]== "xref").any(axis=1) | (XML_results[rec_credit]== "source").any(axis=1) | (XML_results[rec_credit]== "collab").any(axis=1) | (XML_results[rec_credit]== "contrib-group").any(axis=1) | (XML_results[rec_credit]== "person-group").any(axis=1) | (XML_results[rec_credit]== "pub-id").any(axis=1) | (XML_results[rec_credit] == "fn").any(axis=1) | (XML_results[rec_credit]== "fn-group").any(axis=1) | (XML_results[rec_credit]== "ack").any(axis=1) |(XML_results[rec_credit]== "back").any(axis=1) |(XML_results[rec_credit]== "ext-link").any(axis=1) | (XML_results[rec_credit]== "bibr").any(axis=1), "yes", "no")

In [148]:
# Check new cols

list(XML_results.columns.values)

['Alias',
 'Software_Package',
 'Identifier',
 'Pub_Year',
 'DOI',
 'Journal_Title',
 'Article_ID',
 'File_Name',
 'Parent1_Tag',
 'Parent2_Tag',
 'Parent3_Tag',
 'Parent4_Tag',
 'Parent1_Content',
 'Parent2_Content',
 'Parent3_Content',
 'Author(s)',
 'Publisher',
 'Title',
 'ref',
 'ack',
 'fn',
 'rec_credit']

In [149]:
XML_results.to_csv("XML_FINAL_ANALYSIS_082019.csv")

## Summary of Results

In [150]:
XML_alias_per_paper = pd.DataFrame({'count' : XML_results.groupby(["Software_Package","File_Name"])['Alias'].nunique()})
XML_alias_per_paper.to_csv("XML_alias_per_paper_082019.csv")
XML_alias_per_paper

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Software_Package,File_Name,Unnamed: 2_level_1
AstroBlend,apj_818_2_115.xml,2
Astropy,aj_148_1_13.xml,4
Astropy,aj_148_1_14.xml,5
Astropy,aj_148_3_53.xml,4
Astropy,aj_148_6_122.xml,4
Astropy,aj_150_4_118.xml,4
Astropy,aj_150_4_132.xml,4
Astropy,aj_150_5_145.xml,4
Astropy,aj_150_6_179.xml,4
Astropy,aj_150_6_189.xml,4


In [57]:
# How many papers did we find for each software package?

XML_results.groupby('Software_Package')['File_Name'].nunique()

Software_Package
AstroBlend        1
Astropy         538
RADMC-3D        214
SAOImage DS9    341
Spec2d          304
Stingray          2
TARDIS            4
WCSTools        123
Name: File_Name, dtype: int64

In [12]:
# Total number of unique XML files

XML_results.File_Name.nunique()

1469

In [119]:
# Date range of all XML files

XML_results.groupby('Pub_Year')['File_Name'].nunique()

Pub_Year
1997      1
1998      4
1999      2
2000      2
2001      4
2002      8
2003      7
2004     16
2005     18
2006     31
2007     49
2008     36
2009     63
2010     76
2011     72
2012     94
2013     82
2014    103
2015    160
2016    205
2017    285
2018    151
Name: File_Name, dtype: int64

In [19]:
# Date range of all XML files (REVISIT THIS)

XML_results.groupby('Pub_Year')['DOI'].nunique()

Pub_Year
1997      1
1998      1
1999      2
2000      2
2001      2
2002      5
2003      6
2004     13
2005     16
2006     24
2007     41
2008     36
2009     63
2010     76
2011     72
2012     94
2013     82
2014    103
2015    160
2016    205
2017    285
2018    151
Name: DOI, dtype: int64

In [16]:
# Alias per package

XML_results.groupby('Software_Package')['Alias'].nunique()

Software_Package
AstroBlend       2
Astropy         21
RADMC-3D        18
SAOImage DS9    21
Spec2d          22
Stingray         5
TARDIS           3
WCSTools        17
Name: Alias, dtype: int64

In [27]:
# Alias per journal (REVISIT THIS)

XML_results.groupby('Journal_Title')['DOI'].nunique()

Journal_Title
The Astronomical Journal                       176
The Astrophysical Journal                      979
The Astrophysical Journal Letters              128
The Astrophysical Journal Supplement Series    148
Name: DOI, dtype: int64

In [30]:


XML_results.groupby('Journal_Title')['DOI'].isna().sum()

AttributeError: Cannot access callable attribute 'isna' of 'SeriesGroupBy' objects, try using the 'apply' method

In [18]:
# Alias per journal

XML_results.groupby('Journal_Title')['File_Name'].nunique()

Journal_Title
The Astronomical Journal                       214
The Astrophysical Journal                      979
The Astrophysical Journal Letters              128
The Astrophysical Journal Supplement Series    148
Name: File_Name, dtype: int64

In [25]:
# Most common identifier per package

XML_alias_paper = pd.DataFrame(XML_results.groupby(['Alias','Software_Package'])['Identifier'].nunique())
XML_alias_paper

SyntaxError: invalid syntax (<ipython-input-25-45b66a564767>, line 3)

In [60]:
# Alias per package

XML_results.groupby('Pub_Year')['Alias'].nunique()
XML_alias_paper

Unnamed: 0_level_0,Unnamed: 1_level_0,Identifier
Alias,Software_Package,Unnamed: 2_level_1
10.1051/0004-6361/201322068,Astropy,1
10.1051/0004-6361:20031768,RADMC-3D,1
10.1051/0004-6361:20040017,RADMC-3D,1
10.1088/0067-0049/208/1/5,Spec2d,1
10.1093/mnras/stu055,TARDIS,1
1997ASPC..125..249M,WCSTools,1
1999ASPC..172..498M,WCSTools,1
2000ascl.soft03002S,SAOImage DS9,1
2002ASPC..281..169M,WCSTools,1
2003ASPC..295..489J,SAOImage DS9,1


In [24]:
# Total number of unique alias

XML_unique_alias = pd.DataFrame(XML_results.groupby(['Software_Package','Alias'])['Index'].nunique())
#XML_unique_alias.to_csv("XML_unique_alias_082019.csv")
XML_unique_alias

KeyError: 'Column not found: Index'

In [61]:
#Alias per paper

XML_alias_paper = pd.DataFrame(XML_results.groupby(['DOI','Software_Package'])['Alias'].nunique())
#XML_alias_paper.to_csv("XML_alias_paper_082019.csv")
XML_alias_paper

Unnamed: 0_level_0,Unnamed: 1_level_0,Alias
DOI,Software_Package,Unnamed: 2_level_1
#VALUE!,SAOImage DS9,7
#VALUE!,Spec2d,4
#VALUE!,WCSTools,6
10.1086/304882,SAOImage DS9,1
10.1086/312617,SAOImage DS9,1
10.1086/313204,SAOImage DS9,1
10.1086/321561,SAOImage DS9,1
10.1086/339572,WCSTools,1
10.1086/340936,SAOImage DS9,2
10.1086/342702,WCSTools,1


In [62]:
# Identifiers per package

XML_unique_id = pd.DataFrame(XML_results.groupby(['Software_Package','Identifier'])['Index'].nunique())
#XML_unique_id.to_csv("XML_unique_id_082019.csv")
XML_unique_id

KeyError: 'Column not found: Index'

In [16]:
# Tag count per parent

XML_unique_tags1 = pd.DataFrame(XML_results.groupby(['Parent1_Tag'])['Index'].nunique())
XML_unique_tags2 = pd.DataFrame(XML_results.groupby(['Parent2_Tag'])['Index'].nunique())
XML_unique_tags3 = pd.DataFrame(XML_results.groupby(['Parent3_Tag'])['Index'].nunique())
XML_unique_tags4 = pd.DataFrame(XML_results.groupby(['Parent4_Tag'])['Index'].nunique())

XML_unique_tags1.to_csv("XML_unique_tags1_082019.csv")
XML_unique_tags2.to_csv("XML_unique_tags2_082019.csv")
XML_unique_tags3.to_csv("XML_unique_tags3_082019.csv")
XML_unique_tags4.to_csv("XML_unique_tags4_082019.csv")

In [63]:
#Tags per paper

XML_tags_paper1 = pd.DataFrame(XML_results.groupby(['File_Name'])['Parent1_Tag'].nunique())
XML_tags_paper2 = pd.DataFrame(XML_results.groupby(['File_Name'])['Parent2_Tag'].nunique())
XML_tags_paper3 = pd.DataFrame(XML_results.groupby(['File_Name'])['Parent3_Tag'].nunique())
XML_tags_paper4 = pd.DataFrame(XML_results.groupby(['File_Name'])['Parent4_Tag'].nunique())

XML_tags_paper1.to_csv("XML_tags_paper1_082019.csv")
XML_tags_paper2.to_csv("XML_tags_paper2_082019.csv")
XML_tags_paper3.to_csv("XML_tags_paper3_082019.csv")
XML_tags_paper4.to_csv("XML_tags_paper4_082019.csv")

In [13]:
#Tags per package

XML_unique_tags_package1 = pd.DataFrame(XML_results.groupby(['Software_Package', 'Parent1_Tag'])['Index'].nunique())
XML_unique_tags_package2 = pd.DataFrame(XML_results.groupby(['Software_Package', 'Parent2_Tag'])['Index'].nunique())
XML_unique_tags_package3 = pd.DataFrame(XML_results.groupby(['Software_Package', 'Parent3_Tag'])['Index'].nunique())
XML_unique_tags_package4 = pd.DataFrame(XML_results.groupby(['Software_Package', 'Parent4_Tag'])['Index'].nunique())

XML_unique_tags_package1.to_csv("XML_unique_tags_package1_082019.csv")
XML_unique_tags_package2.to_csv("XML_unique_tags_package2_082019.csv")
XML_unique_tags_package3.to_csv("XML_unique_tags_package3_082019.csv")
XML_unique_tags_package4.to_csv("XML_unique_tags_package4_082019.csv")

In [64]:
# Unique files for each alias per year
# Is this useful?

XML_unique_alias_year = pd.DataFrame(XML_results.groupby(['Alias', 'Pub_Year'])['File_Name'].nunique())
#XML_unique_alias_year.to_csv("XML_unique_alias_year_082019.csv")
XML_unique_alias_year

Unnamed: 0_level_0,Unnamed: 1_level_0,File_Name
Alias,Pub_Year,Unnamed: 2_level_1
10.1051/0004-6361/201322068,2013,2
10.1051/0004-6361/201322068,2014,35
10.1051/0004-6361/201322068,2015,53
10.1051/0004-6361/201322068,2016,91
10.1051/0004-6361/201322068,2017,165
10.1051/0004-6361/201322068,2018,103
10.1051/0004-6361:20031768,2009,13
10.1051/0004-6361:20031768,2010,17
10.1051/0004-6361:20031768,2011,12
10.1051/0004-6361:20031768,2012,18


In [65]:
# Total number of unique papers with software aliases by year

XML_unique_paper_year = pd.DataFrame(XML_results.groupby(['Pub_Year'])['File_Name'].nunique())
#XML_unique_paper_year.to_csv("XML_unique_paper_year_082019.csv")
XML_unique_paper_year

Unnamed: 0_level_0,File_Name
Pub_Year,Unnamed: 1_level_1
1997,1
1998,4
1999,2
2000,2
2001,4
2002,8
2003,7
2004,16
2005,18
2006,31


In [66]:
# Total number of unique papers with software aliases in the references section by year


XML_unique_paper_ref_year = pd.DataFrame(XML_results.groupby(['ref', 'Pub_Year'])['File_Name'].nunique())
#XML_unique_paper_ref_year.to_csv("XML_unique_paper_ref_year_082019.csv")
XML_unique_paper_ref_year

Unnamed: 0_level_0,Unnamed: 1_level_0,File_Name
ref,Pub_Year,Unnamed: 2_level_1
no,1997,1
no,1998,4
no,1999,2
no,2000,2
no,2001,3
no,2002,8
no,2003,7
no,2004,13
no,2005,17
no,2006,30


In [67]:
# Trends over time for each AAS Journal



XML_journal_year = pd.DataFrame(XML_results.groupby(['Journal_Title', 'Pub_Year'])['File_Name'].nunique())
#XML_journal_year.to_csv("XML_journal_year_082019.csv")
XML_journal_year

Unnamed: 0_level_0,Unnamed: 1_level_0,File_Name
Journal_Title,Pub_Year,Unnamed: 2_level_1
The Astronomical Journal,1998,4
The Astronomical Journal,1999,1
The Astronomical Journal,2000,1
The Astronomical Journal,2001,3
The Astronomical Journal,2002,4
The Astronomical Journal,2003,2
The Astronomical Journal,2004,4
The Astronomical Journal,2005,3
The Astronomical Journal,2006,8
The Astronomical Journal,2007,9


In [68]:
# Which software packages showed up in which journals

XML_package_per_journal = pd.DataFrame(XML_results.groupby(['Journal_Title', 'Software_Package'])['File_Name'].nunique())
XML_package_per_journal.to_csv("XML_package_per_journal_082019.csv")
XML_package_per_journal

Unnamed: 0_level_0,Unnamed: 1_level_0,File_Name
Journal_Title,Software_Package,Unnamed: 2_level_1
The Astronomical Journal,Astropy,76
The Astronomical Journal,RADMC-3D,7
The Astronomical Journal,SAOImage DS9,68
The Astronomical Journal,Spec2d,25
The Astronomical Journal,WCSTools,43
The Astrophysical Journal,AstroBlend,1
The Astrophysical Journal,Astropy,364
The Astrophysical Journal,RADMC-3D,168
The Astrophysical Journal,SAOImage DS9,196
The Astrophysical Journal,Spec2d,229


In [69]:
# Total number of articles with software mentions (1527) per journal

XML_articles_mentions_per_journal = pd.DataFrame(XML_results.groupby(['Journal_Title','Software_Package'])['File_Name'].nunique())
XML_articles_mentions_per_journal.to_csv("XML_articles_mentions_per_journal_082019.csv")
XML_articles_mentions_per_journal

Unnamed: 0_level_0,Unnamed: 1_level_0,File_Name
Journal_Title,Software_Package,Unnamed: 2_level_1
The Astronomical Journal,Astropy,76
The Astronomical Journal,RADMC-3D,7
The Astronomical Journal,SAOImage DS9,68
The Astronomical Journal,Spec2d,25
The Astronomical Journal,WCSTools,43
The Astrophysical Journal,AstroBlend,1
The Astrophysical Journal,Astropy,364
The Astrophysical Journal,RADMC-3D,168
The Astrophysical Journal,SAOImage DS9,196
The Astrophysical Journal,Spec2d,229


In [70]:
# Which tags were associated with each software package?

XML_tags1 = pd.DataFrame({'tags' : XML_results.groupby( [ "Software_Package", "Parent1_Tag"] ).size()}).reset_index()
XML_tags2 = pd.DataFrame({'tags' : XML_results.groupby( [ "Software_Package", "Parent2_Tag"] ).size()}).reset_index()
XML_tags3 = pd.DataFrame({'tags' : XML_results.groupby( [ "Software_Package", "Parent3_Tag"] ).size()}).reset_index()
XML_tags4 = pd.DataFrame({'tags' : XML_results.groupby( [ "Software_Package", "Parent4_Tag"] ).size()}).reset_index()
XML_all_tags = [XML_tags1, XML_tags2, XML_tags3, XML_tags4]
XML_all_tags = pd.concat(XML_all_tags)
# write tags to csv
XML_all_tags.to_csv("XML_all_tags_082019.csv")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [71]:
# Parent 1 tags per package

XML_tags1 = pd.DataFrame({'Count' : XML_results.groupby( [ "Software_Package", "Parent1_Tag"] ).size()})
XML_tags1.to_csv("XML_tags1_082019.csv")
XML_tags1

Unnamed: 0_level_0,Unnamed: 1_level_0,Count
Software_Package,Parent1_Tag,Unnamed: 2_level_1
AstroBlend,ext-link,1
AstroBlend,sc,1
Astropy,collab,389
Astropy,comment,41
Astropy,conf-name,1
Astropy,disp-formula,3
Astropy,ext-link,576
Astropy,institution,1
Astropy,italic,14
Astropy,monospace,108


In [72]:
# Parent 2 tags per package

XML_tags2 = pd.DataFrame({'Count' : XML_results.groupby( [ "Software_Package", "Parent2_Tag"] ).size()})
XML_tags2.to_csv("XML_tags2_082019.csv")
XML_tags2

Unnamed: 0_level_0,Unnamed: 1_level_0,Count
Software_Package,Parent2_Tag,Unnamed: 2_level_1
AstroBlend,p,2
Astropy,abstract,1
Astropy,ack,549
Astropy,aff,1
Astropy,app,4
Astropy,disp-formula,3
Astropy,element-citation,852
Astropy,fn,4
Astropy,list-item,6
Astropy,name,9


In [73]:
# Parent 3 tags per package

XML_tags3 = pd.DataFrame({'Count' : XML_results.groupby( [ "Software_Package", "Parent3_Tag"] ).size()})
XML_tags3.to_csv("XML_tags3_082019.csv")
XML_tags3

Unnamed: 0_level_0,Unnamed: 1_level_0,Count
Software_Package,Parent3_Tag,Unnamed: 2_level_1
AstroBlend,fn,1
AstroBlend,sec,1
Astropy,abstract,2
Astropy,ack,114
Astropy,app,5
Astropy,app-group,4
Astropy,article,31
Astropy,article-meta,1
Astropy,back,549
Astropy,caption,1


In [74]:
# Parent 4 tags per package

XML_tags4 = pd.DataFrame({'Count' : XML_results.groupby( [ "Software_Package", "Parent4_Tag"] ).size()})
XML_tags4.to_csv("XML_tags4_082019.csv")
XML_tags4

Unnamed: 0_level_0,Unnamed: 1_level_0,Count
Software_Package,Parent4_Tag,Unnamed: 2_level_1
AstroBlend,p,1
AstroBlend,sec,1
Astropy,app-group,5
Astropy,article,619
Astropy,article-meta,3
Astropy,back,118
Astropy,body,31
Astropy,element-citation,7
Astropy,fig,1
Astropy,front,1


In [75]:
# AstroBlend
AstroBlend_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("AstroBlend")]
AstroBlend_Tags = pd.concat([AstroBlend_Tags['Parent1_Tag'], AstroBlend_Tags['Parent2_Tag'], AstroBlend_Tags['Parent3_Tag'], AstroBlend_Tags['Parent4_Tag']]).unique()
AstroBlend_Tags = np.ndarray.tolist(AstroBlend_Tags)
AstroBlend_Tags = pd.DataFrame(AstroBlend_Tags, columns = ["AstroBlend_XML_Tags"])
AstroBlend_Tags = AstroBlend_Tags[AstroBlend_Tags.AstroBlend_XML_Tags.notnull()]
AstroBlend_Tags = AstroBlend_Tags.sort_values(by=['AstroBlend_XML_Tags']).reset_index(drop=True)
# write tags to csv
AstroBlend_Tags.to_csv("AstroBlend_Tags_082019.csv")
AstroBlend_Tags

Unnamed: 0,AstroBlend_XML_Tags
0,ext-link
1,fn
2,p
3,sc
4,sec


In [76]:
# AstroPy
AstroPy_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("AstroPy")]
AstroPy_Tags = pd.concat([AstroPy_Tags['Parent1_Tag'], AstroPy_Tags['Parent2_Tag'], AstroPy_Tags['Parent3_Tag'], AstroPy_Tags['Parent4_Tag']]).unique()
AstroPy_Tags = np.ndarray.tolist(AstroPy_Tags)
AstroPy_Tags = pd.DataFrame(AstroPy_Tags, columns = ["AstroPy_XML_Tags"])
AstroPy_Tags = AstroPy_Tags[AstroPy_Tags.AstroPy_XML_Tags.notnull()]
AstroPy_Tags = AstroPy_Tags.sort_values(by=['AstroPy_XML_Tags']).reset_index(drop=True)
# write tags to csv
AstroPy_Tags.to_csv("AstroPy_Tags_082019.csv")
AstroPy_Tags

Unnamed: 0,AstroPy_XML_Tags


In [77]:
# RADMC-3D
RADMC3D_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("RADMC-3D")]
RADMC3D_Tags = pd.concat([RADMC3D_Tags['Parent1_Tag'], RADMC3D_Tags['Parent2_Tag'], RADMC3D_Tags['Parent3_Tag'], RADMC3D_Tags['Parent4_Tag']]).unique()
RADMC3D_Tags = np.ndarray.tolist(RADMC3D_Tags)
RADMC3D_Tags = pd.DataFrame(RADMC3D_Tags, columns = ["RADMC3D_XML_Tags"])
RADMC3D_Tags = RADMC3D_Tags[RADMC3D_Tags.RADMC3D_XML_Tags.notnull()]
RADMC3D_Tags = RADMC3D_Tags.sort_values(by=['RADMC3D_XML_Tags']).reset_index(drop=True)
# write tags to csv
RADMC3D_Tags.to_csv("RADMC3D_Tags_082019.csv")
RADMC3D_Tags

Unnamed: 0,RADMC3D_XML_Tags
0,abstract
1,ack
2,app
3,app-group
4,article
5,article-meta
6,article-title
7,back
8,body
9,caption


In [78]:
# DS9
SAOImageDS9_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("SAOImage DS9")]
SAOImageDS9_Tags = pd.concat([SAOImageDS9_Tags['Parent1_Tag'], SAOImageDS9_Tags['Parent2_Tag'], SAOImageDS9_Tags['Parent3_Tag'], SAOImageDS9_Tags['Parent4_Tag']]).unique()
SAOImageDS9_Tags = np.ndarray.tolist(SAOImageDS9_Tags)
SAOImageDS9_Tags = pd.DataFrame(SAOImageDS9_Tags, columns = ["SAOImageDS9_XML_Tags"])
SAOImageDS9_Tags = SAOImageDS9_Tags[SAOImageDS9_Tags.SAOImageDS9_XML_Tags.notnull()]
SAOImageDS9_Tags = SAOImageDS9_Tags.sort_values(by=['SAOImageDS9_XML_Tags']).reset_index(drop=True)
# write tags to csv
SAOImageDS9_Tags.to_csv("SAOImageDS9_Tags_082019.csv")
SAOImageDS9_Tags

Unnamed: 0,SAOImageDS9_XML_Tags
0,ack
1,app
2,app-group
3,article
4,back
5,body
6,caption
7,element-citation
8,ext-link
9,fig


In [79]:
# Spec2d
Spec2d_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("Spec2d")]
Spec2d_Tags = pd.concat([Spec2d_Tags['Parent1_Tag'], Spec2d_Tags['Parent2_Tag'], Spec2d_Tags['Parent3_Tag'], Spec2d_Tags['Parent4_Tag']]).unique()
Spec2d_Tags = np.ndarray.tolist(Spec2d_Tags)
Spec2d_Tags = pd.DataFrame(Spec2d_Tags, columns = ["Spec2d_XML_Tags"])
Spec2d_Tags = Spec2d_Tags[Spec2d_Tags.Spec2d_XML_Tags.notnull()]
Spec2d_Tags = Spec2d_Tags.sort_values(by=['Spec2d_XML_Tags']).reset_index(drop=True)
# write tags to csv
Spec2d_Tags.to_csv("Spec2d_Tags_082019.csv")
Spec2d_Tags

Unnamed: 0,Spec2d_XML_Tags
0,abstract
1,ack
2,app
3,app-group
4,article
5,article-id
6,article-meta
7,article-title
8,back
9,body


In [80]:
# Stingray
Stingray_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("Stingray")]
Stingray_Tags = pd.concat([Stingray_Tags['Parent1_Tag'], Stingray_Tags['Parent2_Tag'], Stingray_Tags['Parent3_Tag'], Stingray_Tags['Parent4_Tag']]).unique()
Stingray_Tags = np.ndarray.tolist(Stingray_Tags)
Stingray_Tags = pd.DataFrame(Stingray_Tags, columns = ["Stingray_XML_Tags"])
Stingray_Tags = Stingray_Tags[Stingray_Tags.Stingray_XML_Tags.notnull()]
Stingray_Tags = Stingray_Tags.sort_values(by=['Stingray_XML_Tags']).reset_index(drop=True)
# write tags to csv
Stingray_Tags.to_csv("Stingray_Tags_082019.csv")
Stingray_Tags

Unnamed: 0,Stingray_XML_Tags
0,article
1,body
2,element-citation
3,ext-link
4,fn
5,monospace
6,p
7,ref
8,ref-list
9,sec


In [81]:
# tardis
TARDIS_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("TARDIS")]
TARDIS_Tags = pd.concat([TARDIS_Tags['Parent1_Tag'], TARDIS_Tags['Parent2_Tag'], TARDIS_Tags['Parent3_Tag'], TARDIS_Tags['Parent4_Tag']]).unique()
TARDIS_Tags = np.ndarray.tolist(TARDIS_Tags)
TARDIS_Tags = pd.DataFrame(TARDIS_Tags, columns = ["TARDIS_XML_Tags"])
TARDIS_Tags = TARDIS_Tags[TARDIS_Tags.TARDIS_XML_Tags.notnull()]
TARDIS_Tags = TARDIS_Tags.sort_values(by=['TARDIS_XML_Tags']).reset_index(drop=True)
# write tags to csv
TARDIS_Tags.to_csv("TARDIS_Tags_082019.csv")
TARDIS_Tags

Unnamed: 0,TARDIS_XML_Tags
0,article
1,body
2,caption
3,element-citation
4,fig
5,p
6,pub-id
7,ref
8,ref-list
9,sc


In [82]:
# wcs tools
WCSTools_Tags = XML_all_tags.loc[XML_all_tags['Software_Package'].str.contains("WCSTools")]
WCSTools_Tags = pd.concat([WCSTools_Tags['Parent1_Tag'], WCSTools_Tags['Parent2_Tag'], WCSTools_Tags['Parent3_Tag'], WCSTools_Tags['Parent4_Tag']]).unique()
WCSTools_Tags = np.ndarray.tolist(WCSTools_Tags)
WCSTools_Tags = pd.DataFrame(WCSTools_Tags, columns = ["WCSTools_XML_Tags"])
WCSTools_Tags = WCSTools_Tags[WCSTools_Tags.WCSTools_XML_Tags.notnull()]
WCSTools_Tags = WCSTools_Tags.sort_values(by=['WCSTools_XML_Tags']).reset_index(drop=True)
# write tags to csv
WCSTools_Tags.to_csv("WCSTools_Tags_082019.csv")
WCSTools_Tags

Unnamed: 0,WCSTools_XML_Tags
0,ack
1,app
2,app-group
3,article
4,back
5,body
6,comment
7,element-citation
8,ext-link
9,fn


In [83]:
# Proportion of articles with aliases in refrences
XML_all_refs = XML_results.loc[XML_results['ref'] == "yes"]
XML_ref_proportion = XML_all_refs.groupby('Software_Package')['File_Name'].nunique()/XML_results.groupby('Software_Package')['File_Name'].nunique()
XML_ref_proportion

Software_Package
AstroBlend      1.000000
Astropy         0.907063
RADMC-3D        0.883178
SAOImage DS9    0.299120
Spec2d          0.631579
Stingray        0.500000
TARDIS          1.000000
WCSTools        0.544715
Name: File_Name, dtype: float64

In [85]:
# Total number of unique papers with software aliases in the references section

XML_all_refs = XML_results.loc[XML_results['ref'] == "yes"]
XML_ref_count = XML_all_refs.groupby('Software_Package')['File_Name'].nunique()
XML_ref_count.to_csv("XML_ref_count_082019.csv")
XML_ref_count

  """


Software_Package
AstroBlend        1
Astropy         488
RADMC-3D        189
SAOImage DS9    102
Spec2d          192
Stingray          1
TARDIS            4
WCSTools         67
Name: File_Name, dtype: int64

In [86]:
# How many unique aliases were used in the references for each package?
print (XML_all_refs.groupby('Software_Package')['Alias'].nunique())

Software_Package
AstroBlend       1
Astropy         17
RADMC-3D        16
SAOImage DS9    10
Spec2d          15
Stingray         3
TARDIS           1
WCSTools        10
Name: Alias, dtype: int64


In [87]:
# Which aliases were used in the papers with aliases in references? Write results to csv
XML_ref_aliases = pd.DataFrame({'ref_count' : XML_all_refs.groupby(['Software_Package', 'Alias', 'Identifier'])['File_Name'].nunique()}).reset_index()
XML_ref_aliases.to_csv("XML_ref_aliases_082019.csv")
XML_ref_aliases

Unnamed: 0,Software_Package,Alias,Identifier,ref_count
0,AstroBlend,astroblend.com,0,1
1,Astropy,10.1051/0004-6361/201322068,1,449
2,Astropy,2013A&A...558A..33A,1,441
3,Astropy,Astropy,0,6
4,Astropy,Astropy Collaboration,0,433
5,Astropy,Astropy Collaboration 2013,0,1
6,Astropy,astropy,0,2
7,Astropy,astropy.org,0,73
8,Astropy,astropy/astroplan,0,1
9,Astropy,astropy/astroquery,0,2


In [88]:
# Proportion of articles containing software aliases with the software mentioned in an acknowledgement
XML_all_ack = XML_results.loc[XML_results['ack'] == 'yes']
XML_ack_proportion = XML_all_ack.groupby('Software_Package')['File_Name'].nunique()/XML_results.groupby('Software_Package')['File_Name'].nunique()
XML_ack_proportion

Software_Package
AstroBlend           NaN
Astropy         0.903346
RADMC-3D        0.102804
SAOImage DS9    0.486804
Spec2d          0.138158
Stingray             NaN
TARDIS               NaN
WCSTools        0.081301
Name: File_Name, dtype: float64

In [91]:
# Total number of unique papers with software aliases in acknowledgements

XML_all_ack = XML_results.loc[XML_results['ack'] == "yes"]
XML_ack_count = XML_all_ack.groupby('Software_Package')['File_Name'].nunique()
XML_ack_count.to_csv("XML_ack_count_082019.csv")
XML_ack_count

  """


Software_Package
Astropy         486
RADMC-3D         22
SAOImage DS9    166
Spec2d           42
WCSTools         10
Name: File_Name, dtype: int64

In [92]:
# Which aliases were used in the papers with aliases in acknowledgements? Write results to csv
XML_ack_aliases = pd.DataFrame({'ack_count' : XML_all_ack.groupby([ "Software_Package", "Alias", "Identifier"])['File_Name'].nunique()}).reset_index()
XML_ack_aliases.to_csv("XML_ack_aliases_082019.csv")
XML_ack_aliases

Unnamed: 0,Software_Package,Alias,Identifier,ack_count
0,Astropy,AstroPy,0,23
1,Astropy,Astropy,0,398
2,Astropy,Astropy Collaboration,0,5
3,Astropy,astropy,0,68
4,Astropy,astropy.org,0,35
5,Astropy,doi.org/10.1051/0004-6361/201322068,1,2
6,RADMC-3D,RADMC,0,10
7,RADMC-3D,RADMC-3D,0,11
8,RADMC-3D,RADMC3D,0,1
9,RADMC-3D,ita.uni-heidelberg.de/dullemond/software/radmc-3d,0,2


In [93]:
# Proportion of articles containing software aliases with the software mentioned in a footnote
XML_all_fn = XML_results.loc[XML_results['fn'] == 'yes']
XML_fn_proportion = XML_all_fn.groupby('Software_Package')['File_Name'].nunique()/XML_results.groupby('Software_Package')['File_Name'].nunique()
XML_fn_proportion

Software_Package
AstroBlend      1.000000
Astropy         0.089219
RADMC-3D        0.252336
SAOImage DS9    0.140762
Spec2d          0.243421
Stingray        0.500000
TARDIS               NaN
WCSTools        0.260163
Name: File_Name, dtype: float64

In [96]:
# Total number of unique papers with software aliases in footnotes

XML_all_fn = XML_results.loc[XML_results['fn'] == "yes"]
XML_fn_count = XML_all_fn.groupby('Software_Package')['File_Name'].nunique()
XML_fn_count.to_csv("XML_fn_count_082019.csv")
XML_fn_count

  """


Software_Package
AstroBlend       1
Astropy         48
RADMC-3D        54
SAOImage DS9    48
Spec2d          74
Stingray         1
WCSTools        32
Name: File_Name, dtype: int64

In [97]:
# Which aliases were used in the papers with aliases in footnotes? Write results to csv
XML_fn_aliases = pd.DataFrame({'fn_count' : XML_all_fn.groupby( [ "Software_Package", "Alias", "Identifier"])['File_Name'].nunique()}).reset_index()
XML_fn_aliases.to_csv("XML_fn_aliases_082019.csv")
XML_fn_aliases

Unnamed: 0,Software_Package,Alias,Identifier,fn_count
0,AstroBlend,astroblend.com,0,1
1,Astropy,Astropy,0,4
2,Astropy,astropy,0,2
3,Astropy,astropy.org,0,40
4,Astropy,github.com/astropy/astroplan,0,1
5,Astropy,github.com/astropy/astroquery,0,1
6,Astropy,github.com/astropy/astroscrappy,0,2
7,Astropy,github.com/astropy/photutils,0,1
8,RADMC-3D,RADMC,0,2
9,RADMC-3D,RADMC-3D,0,3


In [98]:
# Did any articles have no reference, acknowledgement, or footnote?
no_credit = XML_results.loc[(XML_results['fn'] == 'no') & (XML_results['ref'] == 'no') & (XML_results['ack'] == 'no')]
no_credit = no_credit.loc[~(no_credit['File_Name'].isin(XML_all_fn['File_Name']))]
no_credit = no_credit.loc[~(no_credit['File_Name'].isin(XML_all_refs['File_Name']))]
no_credit = no_credit.loc[~(no_credit['File_Name'].isin(XML_all_ack['File_Name']))]
no_credit.groupby('Software_Package')['File_Name'].nunique()

Software_Package
Astropy          6
RADMC-3D        20
SAOImage DS9    94
Spec2d          77
Stingray         1
WCSTools        49
Name: File_Name, dtype: int64

In [99]:
# total mentions of software aliases over time
XML_over_time_journal = pd.DataFrame({'year_count' : XML_results.groupby(["Journal_Title", "Software_Package", "Pub_Year"])['File_Name'].nunique()}).reset_index()
# write results
XML_over_time_journal.to_csv("XML_over_time_journal_082019.csv")
# show results over time
XML_over_time_journal

Unnamed: 0,Journal_Title,Software_Package,Pub_Year,year_count
0,The Astronomical Journal,Astropy,2014,4
1,The Astronomical Journal,Astropy,2015,5
2,The Astronomical Journal,Astropy,2016,14
3,The Astronomical Journal,Astropy,2017,32
4,The Astronomical Journal,Astropy,2018,21
5,The Astronomical Journal,RADMC-3D,2010,2
6,The Astronomical Journal,RADMC-3D,2016,1
7,The Astronomical Journal,RADMC-3D,2017,3
8,The Astronomical Journal,RADMC-3D,2018,1
9,The Astronomical Journal,SAOImage DS9,1998,3


In [125]:
#XML_over_time_journal = pd.DataFrame({'year_count' : XML_results.groupby(["Journal_Title", "Software_Package", "Pub_Year"])['File_Name'].nunique()}).reset_index()

XML_alias_per_paper = pd.DataFrame({'count' : XML_results.groupby('File_Name')['Alias'].count()})
# XML_alias_per_paper.to_csv("XML_alias_per_paper_082019.csv")
XML_alias_per_paper

Unnamed: 0_level_0,count
File_Name,Unnamed: 1_level_1
10.1086_300314.xml,1
10.1086_300338.xml,1
10.1086_300462.xml,1
10.1086_300535.xml,1
10.1086_300997.xml,1
10.1086_301493.xml,1
10.1086_304882.xml,2
10.1086_312617.xml,1
10.1086_313204.xml,1
10.1086_318776.xml,1


In [123]:
#Alias per paper

XML_papers_over_time = pd.DataFrame(XML_results.groupby(['Software_Package','Pub_Year'])['File_Name'].nunique())
XML_papers_over_time.to_csv("XML_papers_over_time_082019.csv")
XML_papers_over_time

Unnamed: 0_level_0,Unnamed: 1_level_0,File_Name
Software_Package,Pub_Year,Unnamed: 2_level_1
AstroBlend,2016,1
Astropy,2012,1
Astropy,2013,3
Astropy,2014,37
Astropy,2015,63
Astropy,2016,117
Astropy,2017,199
Astropy,2018,118
RADMC-3D,2004,1
RADMC-3D,2005,1


In [130]:

XML_alias_per_paper = pd.DataFrame({'count' : XML_results.groupby(["Software_Package","File_Name"])['Alias'].nunique()})
XML_alias_per_paper.to_csv("XML_alias_per_paper_082019.csv")
XML_alias_per_paper

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Software_Package,File_Name,Unnamed: 2_level_1
AstroBlend,apj_818_2_115.xml,2
Astropy,aj_148_1_13.xml,4
Astropy,aj_148_1_14.xml,5
Astropy,aj_148_3_53.xml,4
Astropy,aj_148_6_122.xml,4
Astropy,aj_150_4_118.xml,4
Astropy,aj_150_4_132.xml,4
Astropy,aj_150_5_145.xml,4
Astropy,aj_150_6_179.xml,4
Astropy,aj_150_6_189.xml,4
