## Software Citation Study - XML Analysis
Daina Bouquin, Daniel Chivvis

In [70]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Data Clean Up

In [71]:
XML_results = pd.read_csv("XML_RESULTS_2_22-XML_RESULTS_2_22.csv") 

In [72]:
list(XML_results.columns.values)

['Alias',
 'Software_Package',
 'Identifier',
 'Pub_Year',
 'Journal_Title',
 'Article_id',
 'File_Name',
 'Parent1_Tag',
 'Parent2_Tag',
 'Parent3_Tag',
 'Parent4_Tag',
 'Parent1_Content',
 'Parent2_Content',
 'Parent3_Content',
 'Parent4_Content',
 'Title',
 'Author(s)',
 'Publisher']

In [73]:
XML_results.head(5)

Unnamed: 0,Alias,Software_Package,Identifier,Pub_Year,Journal_Title,Article_id,File_Name,Parent1_Tag,Parent2_Tag,Parent3_Tag,Parent4_Tag,Parent1_Content,Parent2_Content,Parent3_Content,Parent4_Content,Title,Author(s),Publisher
0,astroblend.com,AstroBlend,0,2016,The Astrophysical Journal,"apj521773, 10.3847/0004-637X/818/2/115, 521773...",apj_818_2_115.xml,ext-link,p,fn,p,['http://www.astroblend.com'],"['\n<ext-link ext-link-type=""uri"" xlink:href=""...",['\n<label><sup>26</sup></label>\n<p>\n<ext-li...,['We note that our example scripts only explor...,ADVANCED DATA VISUALIZATION IN ASTROPHYSICS: T...,Vogt Frédéric P. A. Owen Chris I. Verdes-Mon...,The American Astronomical Society
1,10.1051/0004-6361/201322068,AstroPy,1,2014,The Astronomical Journal,"aj493368, ANJOAA, 10.1088/0004-6256/148/1/13, ...",aj_148_1_13.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...","['\n<title>References</title>\n<ref id=""aj4933...",TYPE Ia SUPERNOVA RATE MEASUREMENTS TO REDSHIF...,Rodney Steven A.Riess Adam G.Strolger Louis-Gr...,The American Astronomical Society
2,10.1051/0004-6361/201322068,AstroPy,1,2014,The Astronomical Journal,"aj495229, ANJOAA, 10.1088/0004-6256/148/1/14, ...",aj_148_1_14.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...","['\n<title>References</title>\n<ref id=""aj4952...",DISCOVERY OF EIGHT z ∼ 6 QUASARS FROM Pan-STARRS1,Bañados E.Venemans B. P.Morganson E.Decarli R....,The American Astronomical Society
3,10.1051/0004-6361/201322068,AstroPy,1,2014,The Astronomical Journal,"aj499538, ANJOAA, 10.1088/0004-6256/148/3/53, ...",aj_148_3_53.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...","['\n<title>References</title>\n<ref id=""aj4995...","CORRECTING FOR TELLURIC ABSORPTION: METHODS, C...",Gullikson KevinDodson-Robinson SarahKraus Adam...,The American Astronomical Society
4,10.1051/0004-6361/201322068,AstroPy,1,2014,The Astronomical Journal,"aj503145, ANJOAA, 10.1088/0004-6256/148/6/122,...",aj_148_6_122.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...","['\n<title>References</title>\n<ref id=""aj5031...",YSOVAR: MID-INFRARED VARIABILITY IN THE STAR-F...,Günther H. M.Cody A. M.Covey K. R.Hillenbrand ...,The American Astronomical Society


In [74]:
# Convert tag and content cols to strings

XML_results['Parent1_Tag'] = XML_results['Parent1_Tag'].astype('|S')
XML_results['Parent2_Tag'] = XML_results['Parent2_Tag'].astype('|S')
XML_results['Parent3_Tag'] = XML_results['Parent3_Tag'].astype('|S')
XML_results['Parent4_Tag'] = XML_results['Parent4_Tag'].astype('|S')

XML_results['Parent1_Content'] = XML_results['Parent1_Content'].astype('|S')
XML_results['Parent2_Content'] = XML_results['Parent2_Content'].astype('|S')
XML_results['Parent3_Content'] = XML_results['Parent3_Content'].astype('|S')
XML_results['Parent4_Content'] = XML_results['Parent4_Content'].astype('|S')

### Create columns for different types software mentions

In [75]:
# Create column for references

# If the tag lable or tag content contain any of the following reference elements it will be marked "yes":
# element-citation 
# mixed-citation
# nlm-citation
# ref
# citation-alternatives
# ref-list 
# xref
# source
# bibr

# Tags - check all parent tags
# Tag1
XML_results['ref'] = np.where((XML_results['Parent1_Tag']== "element-citation") | (XML_results['Parent1_Tag']== "mixed-citation") | (XML_results['Parent1_Tag']== "nlm-citation") | (XML_results['Parent1_Tag']== "ref") | (XML_results['Parent1_Tag']== "citation-alternatives") | (XML_results['Parent1_Tag']== "ref-list") | (XML_results['Parent1_Tag']== "xref") | (XML_results['Parent1_Tag']== "source") | (XML_results['Parent1_Tag']== "bibr"), 'yes', 'no')

# Tag2
XML_results['ref'] = np.where((XML_results['Parent2_Tag']== "element-citation") | (XML_results['Parent2_Tag']== "mixed-citation") | (XML_results['Parent2_Tag']== "nlm-citation") | (XML_results['Parent2_Tag']== "ref") | (XML_results['Parent2_Tag']== "citation-alternatives") | (XML_results['Parent2_Tag']== "ref-list") | (XML_results['Parent2_Tag']== "xref") | (XML_results['Parent2_Tag']== "source") | (XML_results['Parent2_Tag']== "bibr"), 'yes', 'no')

# Tag3
XML_results['ref'] = np.where((XML_results['Parent3_Tag']== "element-citation") | (XML_results['Parent3_Tag']== "mixed-citation") | (XML_results['Parent3_Tag']== "nlm-citation") | (XML_results['Parent3_Tag']== "ref") | (XML_results['Parent3_Tag']== "citation-alternatives") | (XML_results['Parent3_Tag']== "ref-list") | (XML_results['Parent3_Tag']== "xref") | (XML_results['Parent3_Tag']== "source") | (XML_results['Parent3_Tag']== "bibr"), 'yes', 'no')

# Tag4
XML_results['ref'] = np.where((XML_results['Parent4_Tag']== "element-citation") | (XML_results['Parent4_Tag']== "mixed-citation") | (XML_results['Parent4_Tag']== "nlm-citation") | (XML_results['Parent4_Tag']== "ref") | (XML_results['Parent4_Tag']== "citation-alternatives") | (XML_results['Parent4_Tag']== "ref-list") | (XML_results['Parent4_Tag']== "xref") | (XML_results['Parent4_Tag']== "source") | (XML_results['Parent4_Tag']== "bibr"), 'yes', 'no')

In [76]:
# Tag Content check
# Tag1 content
XML_results['ref'] = np.where((XML_results['Parent1_Content'].str.contains("element-citation")) | (XML_results['Parent1_Content'].str.contains("mixed-citation")) | (XML_results['Parent1_Content'].str.contains("nlm-citation")) | (XML_results['Parent1_Content'].str.contains("ref")) | (XML_results['Parent1_Content'].str.contains("citation-alternatives")) | (XML_results['Parent1_Content'].str.contains("ref-list")) | (XML_results['Parent1_Content'].str.contains("xref")) | (XML_results['Parent1_Content'].str.contains("source")) | (XML_results['Parent1_Content'].str.contains("bibr")), 'yes', 'no')

# Tag2 content
XML_results['ref'] = np.where((XML_results['Parent2_Content'].str.contains("element-citation")) | (XML_results['Parent2_Content'].str.contains("mixed-citation")) | (XML_results['Parent2_Content'].str.contains("nlm-citation")) | (XML_results['Parent2_Content'].str.contains("ref")) | (XML_results['Parent2_Content'].str.contains("citation-alternatives")) | (XML_results['Parent2_Content'].str.contains("ref-list")) | (XML_results['Parent2_Content'].str.contains("xref")) | (XML_results['Parent2_Content'].str.contains("source")) | (XML_results['Parent2_Content'].str.contains("bibr")), 'yes', 'no')

# Tag3 content
XML_results['ref'] = np.where((XML_results['Parent3_Content'].str.contains("element-citation")) | (XML_results['Parent3_Content'].str.contains("mixed-citation")) | (XML_results['Parent3_Content'].str.contains("nlm-citation")) | (XML_results['Parent3_Content'].str.contains("ref")) | (XML_results['Parent3_Content'].str.contains("citation-alternatives")) | (XML_results['Parent3_Content'].str.contains("ref-list")) | (XML_results['Parent3_Content'].str.contains("xref")) | (XML_results['Parent3_Content'].str.contains("source")) | (XML_results['Parent3_Content'].str.contains("bibr")), 'yes', 'no')

# Tag4 content
XML_results['ref'] = np.where((XML_results['Parent4_Content'].str.contains("element-citation")) | (XML_results['Parent4_Content'].str.contains("mixed-citation")) | (XML_results['Parent4_Content'].str.contains("nlm-citation")) | (XML_results['Parent4_Content'].str.contains("ref")) | (XML_results['Parent4_Content'].str.contains("citation-alternatives")) | (XML_results['Parent4_Content'].str.contains("ref-list")) | (XML_results['Parent4_Content'].str.contains("xref")) | (XML_results['Parent4_Content'].str.contains("source")) | (XML_results['Parent4_Content'].str.contains("bibr")), 'yes', 'no')


In [77]:
# Create column for acknowledgements
# If the tag lable or tag content contains "ack" it will be marked "yes"

# tags
XML_results['ack'] = np.where((XML_results['Parent1_Tag']== "ack") | (XML_results['Parent2_Tag']== "ack") | (XML_results['Parent3_Tag']== "ack") | (XML_results['Parent4_Tag']== "ack"), 'yes', 'no')

# content
XML_results['ack'] = np.where((XML_results['Parent1_Content'].str.contains("ack")) | (XML_results['Parent2_Content'].str.contains("ack")) | (XML_results['Parent3_Content'].str.contains("ack")) | (XML_results['Parent4_Content'].str.contains("ack")), 'yes', 'no')

In [78]:
# Create column for footnotes
# If the tag lable or tag content contains "fn" or "fn-group" it will be marked "yes"

# tags
XML_results['fn'] = np.where((XML_results['Parent1_Tag']== "fn") | (XML_results['Parent2_Tag']== "fn") | (XML_results['Parent3_Tag']== "fn") | (XML_results['Parent4_Tag']== "fn"), 'yes', 'no')
XML_results['fn'] = np.where((XML_results['Parent1_Tag']== "fn-group") | (XML_results['Parent2_Tag']== "fn-group") | (XML_results['Parent3_Tag']== "fn-group") | (XML_results['Parent4_Tag']== "fn-group"), 'yes', 'no')

# content
XML_results['fn'] = np.where((XML_results['Parent1_Content'].str.contains("fn")) | (XML_results['Parent2_Content'].str.contains("fn")) | (XML_results['Parent3_Content'].str.contains("fn")) | (XML_results['Parent4_Content'].str.contains("fn")), 'yes', 'no')
XML_results['fn'] = np.where((XML_results['Parent1_Content'].str.contains("fn-group")) | (XML_results['Parent2_Content'].str.contains("fn-group")) | (XML_results['Parent3_Content'].str.contains("fn-group")) | (XML_results['Parent4_Content'].str.contains("fn-group")), 'yes', 'no')

In [91]:
# Check new cols
list(XML_results.columns.values)

['Alias',
 'Software_Package',
 'Identifier',
 'Pub_Year',
 'Journal_Title',
 'Article_id',
 'File_Name',
 'Parent1_Tag',
 'Parent2_Tag',
 'Parent3_Tag',
 'Parent4_Tag',
 'Parent1_Content',
 'Parent2_Content',
 'Parent3_Content',
 'Parent4_Content',
 'Title',
 'Author(s)',
 'Publisher',
 'ref',
 'ack',
 'fn']

### Remove confounding records

In [80]:
# drop known confounds
# tardis emails
XML_results = XML_results[~XML_results.Parent1_Content.str.contains("@tardis.byu.edu")]
XML_results = XML_results[~XML_results.Parent1_Content.str.contains("@tardis.ln.byu.edu")]
XML_results = XML_results[~XML_results.Parent1_Content.str.contains("@tardis.pha.jhu.edu")]
XML_results = XML_results[~XML_results.Parent1_Content.str.contains("tardis.ln.byu.edu")]

XML_results = XML_results[~XML_results.Parent2_Content.str.contains("@tardis.byu.edu")]
XML_results = XML_results[~XML_results.Parent2_Content.str.contains("@tardis.ln.byu.edu")]
XML_results = XML_results[~XML_results.Parent2_Content.str.contains("@tardis.pha.jhu.edu")]
XML_results = XML_results[~XML_results.Parent2_Content.str.contains("tardis.ln.byu.edu")]

XML_results = XML_results[~XML_results.Parent3_Content.str.contains("@tardis.byu.edu")]
XML_results = XML_results[~XML_results.Parent3_Content.str.contains("@tardis.ln.byu.edu")]
XML_results = XML_results[~XML_results.Parent3_Content.str.contains("@tardis.pha.jhu.edu")]
XML_results = XML_results[~XML_results.Parent3_Content.str.contains("tardis.ln.byu.edu")]

XML_results = XML_results[~XML_results.Parent4_Content.str.contains("@tardis.byu.edu")]
XML_results = XML_results[~XML_results.Parent4_Content.str.contains("@tardis.ln.byu.edu")]
XML_results = XML_results[~XML_results.Parent4_Content.str.contains("@tardis.pha.jhu.edu")]
XML_results = XML_results[~XML_results.Parent4_Content.str.contains("tardis.ln.byu.edu")]

In [82]:
# more tardis
# detector
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("detector") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("Detector") & XML_results.Software_Package.str.contains("TARDIS"))]

XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("detector") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("Detector") & XML_results.Software_Package.str.contains("TARDIS"))]

XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("detector") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("Detector") & XML_results.Software_Package.str.contains("TARDIS"))]

XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("detector") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("Detector") & XML_results.Software_Package.str.contains("TARDIS"))]

# detection
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("detection") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("Detection") & XML_results.Software_Package.str.contains("TARDIS"))]

XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("detection") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("Detection") & XML_results.Software_Package.str.contains("TARDIS"))]

XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("detection") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("Detection") & XML_results.Software_Package.str.contains("TARDIS"))]

XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("detection") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("Detection") & XML_results.Software_Package.str.contains("TARDIS"))]

# stardisk
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("stardisk") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("stardisk") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("stardisk") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("stardisk") & XML_results.Software_Package.str.contains("TARDIS"))]

XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("STARDISK") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("STARDISK") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("STARDISK") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("STARDISK") & XML_results.Software_Package.str.contains("TARDIS"))]

XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("Stardisk") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("Stardisk") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("Stardisk") & XML_results.Software_Package.str.contains("TARDIS"))]
XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("Stardisk") & XML_results.Software_Package.str.contains("TARDIS"))]

In [84]:
# stingray
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("Stingray Nebula") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("stingray nebula") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("Stingray nebula") & XML_results.Software_Package.str.contains("Stingray"))]

XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("Stingray Nebula") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("stingray nebula") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("Stingray nebula") & XML_results.Software_Package.str.contains("Stingray"))]

XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("Stingray Nebula") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("stingray nebula") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("Stingray nebula") & XML_results.Software_Package.str.contains("Stingray"))]

XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("Stingray Nebula") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("stingray nebula") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("Stingray nebula") & XML_results.Software_Package.str.contains("Stingray"))]

XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("The Stingray") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("The stingray") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("the stingray") & XML_results.Software_Package.str.contains("Stingray"))]

XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("The Stingray") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("The stingray") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("the stingray") & XML_results.Software_Package.str.contains("Stingray"))]

XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("The Stingray") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("The stingray") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("the stingray") & XML_results.Software_Package.str.contains("Stingray"))]

XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("The Stingray") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("The stingray") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("the stingray") & XML_results.Software_Package.str.contains("Stingray"))]

# detector
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("Stingray Light Curve") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("Stingray light curve") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("stingray light curve") & XML_results.Software_Package.str.contains("Stingray"))]

XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("Stingray Light Curve") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("Stingray light curve") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("stingray light curve") & XML_results.Software_Package.str.contains("Stingray"))]

XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("Stingray Light Curve") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("Stingray light curve") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("stingray light curve") & XML_results.Software_Package.str.contains("Stingray"))]

XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("Stingray Light Curve") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("Stingray light curve") & XML_results.Software_Package.str.contains("Stingray"))]
XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("stingray light curve") & XML_results.Software_Package.str.contains("Stingray"))]

In [88]:
# Spec2d (IDL library - related software but separate)
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("idlspec2d") & XML_results.Software_Package.str.contains("Spec2d"))]
XML_results = XML_results[~(XML_results.Parent1_Content.str.contains("IDLspec2d") & XML_results.Software_Package.str.contains("Spec2d"))]
XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("idlspec2d") & XML_results.Software_Package.str.contains("Spec2d"))]
XML_results = XML_results[~(XML_results.Parent2_Content.str.contains("IDLspec2d") & XML_results.Software_Package.str.contains("Spec2d"))]
XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("idlspec2d") & XML_results.Software_Package.str.contains("Spec2d"))]
XML_results = XML_results[~(XML_results.Parent3_Content.str.contains("IDLspec2d") & XML_results.Software_Package.str.contains("Spec2d"))]
XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("idlspec2d") & XML_results.Software_Package.str.contains("Spec2d"))]
XML_results = XML_results[~(XML_results.Parent4_Content.str.contains("IDLspec2d") & XML_results.Software_Package.str.contains("Spec2d"))]


### Write final (clean) results

In [93]:
XML_results.to_csv("XML_RESULTS_CLEAN_022619.csv")

### Summary of results

In [180]:
# How many papers did we find for each software package?
XML_results.groupby('Software_Package')['File_Name'].count()

Software_Package
AstroBlend         1
AstroPy         1009
RADMC-3D         573
SAOImage DS9    1328
Spec2d           581
Stingray          13
TARDIS            11
WCS Tools        207
Name: File_Name, dtype: int64

In [181]:
# How many articles containing software aliases had a software aliases mentioned in the references?
XML_all_refs = XML_results.loc[XML_results['ref'] == 'yes']
XML_all_refs.groupby('Software_Package')['File_Name'].count()

Software_Package
AstroBlend         1
AstroPy         1009
RADMC-3D         572
SAOImage DS9    1290
Spec2d           575
Stingray          13
TARDIS            10
WCS Tools        207
Name: File_Name, dtype: int64

In [182]:
# Proportion of articles containing software aliases with the software mentioned in the references
XML_ref_proportion = XML_all_refs.groupby('Software_Package')['File_Name'].count()/XML_results.groupby('Software_Package')['File_Name'].count()
XML_ref_proportion

Software_Package
AstroBlend      1.000000
AstroPy         1.000000
RADMC-3D        0.998255
SAOImage DS9    0.971386
Spec2d          0.989673
Stingray        1.000000
TARDIS          0.909091
WCS Tools       1.000000
Name: File_Name, dtype: float64

In [183]:
# How many unique aliases were used in the references for each package?
print (XML_all_refs.groupby('Software_Package')['Alias'].nunique())

Software_Package
AstroBlend       1
AstroPy          6
RADMC-3D        12
SAOImage DS9     7
Spec2d          12
Stingray         2
TARDIS           2
WCS Tools       12
Name: Alias, dtype: int64


In [231]:
# Which aliases were used in the papers with aliases in references? Write results to csv
XML_ref_aliases = pd.DataFrame({'count' : XML_all_refs.groupby( [ "Software_Package", "Alias", "Identifier", 'File_Name'] ).size()}).reset_index()
XML_ref_aliases.to_csv("XML_ref_aliases_022719.csv")

# show readable results w/o file names
XML_ref_aliases_summary = pd.DataFrame({'count' : XML_all_refs.groupby( [ "Software_Package", "Alias", 'Identifier'] ).size()}).reset_index()
XML_ref_aliases_summary

Unnamed: 0,Software_Package,Alias,Identifier,count
0,AstroBlend,astroblend.com,0,1
1,AstroPy,10.1051/0004-6361/201322068,1,449
2,AstroPy,2013A&A...558A..33A,1,441
3,AstroPy,AstroPy,0,31
4,AstroPy,astropy.org,0,78
5,AstroPy,doi.org/10.1051/0004-6361/201322068,1,2
6,AstroPy,github.com/astropy,0,8
7,RADMC-3D,10.1051/0004-6361:20031768,1,96
8,RADMC-3D,10.1051/0004-6361:20040017,1,11
9,RADMC-3D,2004A&A...417..159D,1,96


In [105]:
# How many articles containing software aliases had a software aliases mentioned in acknowledgements?
XML_all_ack = XML_results.loc[XML_results['ack'] == 'yes']
XML_all_ack.groupby('Software_Package')['File_Name'].count()

Software_Package
AstroPy         299
RADMC-3D        288
SAOImage DS9    880
Spec2d          228
Stingray          4
TARDIS            6
WCS Tools       142
Name: File_Name, dtype: int64

In [106]:
# Proportion of articles containing software aliases with the software mentioned in an acknowledgement
XML_ack_proportion = XML_all_ack.groupby('Software_Package')['File_Name'].count()/XML_results.groupby('Software_Package')['File_Name'].count()
XML_ack_proportion

Software_Package
AstroBlend           NaN
AstroPy         0.296333
RADMC-3D        0.502618
SAOImage DS9    0.662651
Spec2d          0.392427
Stingray        0.307692
TARDIS          0.545455
WCS Tools       0.685990
Name: File_Name, dtype: float64

In [234]:
# Which aliases were used in the papers with aliases in acknowledgements? Write results to csv
XML_ack_aliases = pd.DataFrame({'count' : XML_all_ack.groupby( [ "Software_Package", "Alias", "Identifier", 'File_Name'] ).size()}).reset_index()
XML_ack_aliases.to_csv("XML_ack_aliases_022719.csv")

# show readable results w/o file names
XML_ack_aliases_summary = pd.DataFrame({'count' : XML_all_ack.groupby( [ "Software_Package", "Alias", "Identifier"] ).size()}).reset_index()
XML_ack_aliases_summary

Unnamed: 0,Software_Package,Alias,Identifier,count
0,AstroPy,10.1051/0004-6361/201322068,1,95
1,AstroPy,2013A&A...558A..33A,1,96
2,AstroPy,AstroPy,0,31
3,AstroPy,astropy.org,0,68
4,AstroPy,doi.org/10.1051/0004-6361/201322068,1,2
5,AstroPy,github.com/astropy,0,7
6,RADMC-3D,10.1051/0004-6361:20031768,1,15
7,RADMC-3D,10.1051/0004-6361:20040017,1,10
8,RADMC-3D,2004A&A...417..159D,1,15
9,RADMC-3D,2004A&A...417..793P,1,10


In [107]:
# How many articles containing software aliases had a software aliases mentioned in a footnote?
XML_all_fn = XML_results.loc[XML_results['fn'] == 'yes']
XML_all_fn.groupby('Software_Package')['File_Name'].count()

Software_Package
SAOImage DS9    223
Spec2d           36
WCS Tools        34
Name: File_Name, dtype: int64

In [108]:
# Proportion of articles containing software aliases with the software mentioned in a footnote
XML_fn_proportion = XML_all_fn.groupby('Software_Package')['File_Name'].count()/XML_results.groupby('Software_Package')['File_Name'].count()
XML_fn_proportion

Software_Package
AstroBlend           NaN
AstroPy              NaN
RADMC-3D             NaN
SAOImage DS9    0.167922
Spec2d          0.061962
Stingray             NaN
TARDIS               NaN
WCS Tools       0.164251
Name: File_Name, dtype: float64

In [235]:
# Which aliases were used in the papers with aliases in footnotes? Write results to csv
XML_fn_aliases = pd.DataFrame({'count' : XML_all_fn.groupby( [ "Software_Package", "Alias", "Identifier", 'File_Name'] ).size()}).reset_index()
XML_fn_aliases.to_csv("XML_ack_aliases_022719.csv")

# show readable results w/o file names
XML_ack_aliases_summary = pd.DataFrame({'count' : XML_all_ack.groupby( [ "Software_Package", "Alias", "Identifier"] ).size()}).reset_index()
XML_ack_aliases_summary

Unnamed: 0,Software_Package,Alias,Identifier,count
0,AstroPy,10.1051/0004-6361/201322068,1,95
1,AstroPy,2013A&A...558A..33A,1,96
2,AstroPy,AstroPy,0,31
3,AstroPy,astropy.org,0,68
4,AstroPy,doi.org/10.1051/0004-6361/201322068,1,2
5,AstroPy,github.com/astropy,0,7
6,RADMC-3D,10.1051/0004-6361:20031768,1,15
7,RADMC-3D,10.1051/0004-6361:20040017,1,10
8,RADMC-3D,2004A&A...417..159D,1,15
9,RADMC-3D,2004A&A...417..793P,1,10


In [113]:
# Did any articles have no reference, acknowledgement, or footnote?
no_credit = XML_results.loc[(XML_results['fn'] == 'no') & (XML_results['ref'] == 'no') & (XML_results['ack'] == 'no')]
no_credit = no_credit.loc[~(no_credit['File_Name'].isin(XML_all_fn['File_Name']))]
no_credit = no_credit.loc[~(no_credit['File_Name'].isin(XML_all_refs['File_Name']))]
no_credit = no_credit.loc[~(no_credit['File_Name'].isin(XML_all_ack['File_Name']))]
no_credit.groupby('Software_Package')['File_Name'].count()

Software_Package
SAOImage DS9    8
Spec2d          2
TARDIS          1
Name: File_Name, dtype: int64

In [117]:
# total mentions of software aliases over time
over_time = pd.DataFrame({'Count' : XML_results.groupby( [ "Software_Package", "Pub_Year"] ).size()}).reset_index()
over_time

Unnamed: 0,Software_Package,Pub_Year,Count
0,AstroBlend,2016,1
1,AstroPy,2013,7
2,AstroPy,2014,78
3,AstroPy,2015,120
4,AstroPy,2016,212
5,AstroPy,2017,364
6,AstroPy,2018,228
7,RADMC-3D,2009,28
8,RADMC-3D,2010,46
9,RADMC-3D,2011,29
