# Fetches Data Packages, Keywords and Thesaurus Info
This notebook uses web services to download the package ID, title, and list of keywords for each LTER dataset in EDI (based on the knb-lter-* scope). It then queries the LTER Thesaurus through the TemaTres web services to determine which keywords are found in the thesaurus and, for those that are, which high-level category each one falls into. Several intermediate CSV files are created along the way; they contain the information that a separate R program needs to produce a spreadsheet listing the keywording data.  -John Porter, 2022

### Fetch keyword data from PASTA

In [2]:
import requests

# Download packageid, title and keywords for every matching data package from
# the PASTA search service and save the raw XML response to resultSetEDI.xml.
# NOTE(review): verify=False disables TLS certificate verification for this
# request; confirm it is still required for pasta.lternet.edu.
r = requests.get('https://pasta.lternet.edu/package/search/eml?defType=edismax&q=*&fq=-scope:ecotrends&fq=-scope:knb-lter-landsat*&fq=scope:edi*&fl=packageid,title,keyword&sort=score,desc&sort=packageid,asc&debug=false&rows=1000000', verify=False)
# Stop here on an HTTP error instead of saving an error page as if it were XML.
r.raise_for_status()
with open("resultSetEDI.xml", "w", encoding='utf-8') as f:
    f.write(r.text)
print("Done")



Done


### Parse ResultSet XML and create a csv file to read in as a Pandas data frame

In [3]:
import xml.etree.ElementTree as Xet
import numpy as np
import pandas as pd 

def _csv_quote(text):
    """Return *text* as a double-quoted CSV field, doubling embedded quotes."""
    return '"' + text.replace('"', '""') + '"'


# parse the saved search-result XML
resultSetxmlparse = Xet.parse("resultSetEDI.xml")
root = resultSetxmlparse.getroot()

# create an output file, write the header, then one row per (package, keyword)
with open("resultSetKeywordsEDI.csv", "w", encoding="utf-8") as f:
    f.write("packageid,title,keyword\n")
    for i in root:
        # findtext() returns "" instead of None when the element is missing or
        # empty, so the string operations below cannot fail on None
        packageid = i.findtext("packageid", default="")
        title = i.findtext("title", default="")
        title = title.replace('\r', '').replace('\n', '')  # get rid of embedded linefeeds in titles
        for k in i.iter("keyword"):
            keyword = str(k.text)
            # quote title and keyword so embedded commas and quotes stay inside the field
            f.write(packageid + ',' + _csv_quote(title) + ',' + _csv_quote(keyword) + '\n')

# read the CSV back in; the header row is skipped and replaced by `names`
# NOTE(review): "pakcageid" looks like a misspelling of "packageid". It is kept
# as-is because other cells may refer to the column by this name - confirm, then rename.
resultSetDataFrame = pd.read_csv(
    "resultSetKeywordsEDI.csv",
    skiprows=1,
    sep=",",
    quotechar='"',
    names=[
        "pakcageid",
        "title",
        "keyword",
    ],
)
print(resultSetDataFrame.info())
print("Done")
        
        

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13078 entries, 0 to 13077
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pakcageid  13078 non-null  object
 1   title      13078 non-null  object
 2   keyword    13078 non-null  object
dtypes: object(3)
memory usage: 306.6+ KB
None
Done


### Get list of top-level LTER Vocab units 

In [4]:
# Download the thesaurus top-level terms from the vocabulary web service and
# save the raw XML response to TopLevelTerms.xml.
# NOTE(review): verify=False disables TLS certificate verification for this
# request; confirm it is still required for vocab.lternet.edu.
topr = requests.get('https://vocab.lternet.edu/vocab/vocab/services.php?task=fetchTopTerms', verify=False)
# Stop here on an HTTP error instead of saving an error page as if it were XML.
topr.raise_for_status()
with open("TopLevelTerms.xml", "w", encoding='utf-8') as f:
    f.write(topr.text)
print("Done")


Done




### ...and create lists of their names and term_ids

In [5]:
# Parse the saved top-level-terms XML into two parallel lists (ids and names),
# print them, and write them to TopLevelTerms.csv as "name,term_id" rows.
topTermsxmlparse = Xet.parse("TopLevelTerms.xml")
rootTop = topTermsxmlparse.getroot()

term_id = [elem.text for elem in rootTop.iter('term_id')]
term_string = [elem.text for elem in rootTop.iter('string')]
topCounter = len(term_string)  # number of <string> elements found
print(term_id)
print(term_string)

with open("TopLevelTerms.csv", "w", encoding="utf-8") as f:
    # pair names and ids by position; zip() stops at the shorter list
    for name, tid in zip(term_string, term_id):
        f.write(name + "," + str(tid) + "\n")


['799', '651', '698', '667', '693', '655', '643', '694', '173', '395']
['organizational units', 'disciplines', 'events', 'measurements', 'methods', 'processes', 'substances', 'substrates', 'ecosystems', 'organisms']


### Query the TemaTres web services for each keyword to see whether it matches a thesaurus term and, if so, what the top of its hierarchy is

In [6]:
import warnings

# create a list of keywords without duplicates, keeping first-seen order
# (dict keys are unique and preserve insertion order)
kwList = list(dict.fromkeys(resultSetDataFrame.keyword))
print(len(kwList))
print(kwList)


        
    

3930
['LTREB', 'NSF', 'plant community', 'succession', 'fertilization', 'Macrosystems Biology', 'MSB', 'Cross-scale Interactions', 'CSI', 'National Science Foundation', 'lakes', 'multi-scaled', 'geospatial', 'geography', 'temporal', 'database', 'LAGOS', 'LAGOS-NE', 'nutrients', 'dissolved nutrients', 'inorganic nutrients', 'water quality', 'water properties', 'water clarity', 'chlorophyll', 'ecological context', 'lake trophic state', 'eutrophication', 'support package', 'semantic web', 'annotations', 'functional diversity', 'functional traits', 'Orinoquia', 'Morphometric traits', 'life-history traits', 'Darwin Core Archive (DwC-A) Event Core', 'GBIF', 'Global Biodiversity Information Facility', 'Taxon: ScientificName', 'ExtendedMeasurementOrFact', 'Event', 'Occurrence', 'Population Abundance', 'Disturbance Patterns', 'populations', 'abundance', 'aquatic invertebrates', 'biomass', 'gastropods', 'invertebrates', 'long term', 'long term ecological research', 'LTER', 'macroinvertebrates', 

In [None]:
# For every unique keyword, search the thesaurus web service and, for each
# returned term whose name equals the keyword (case-insensitive), print the
# match and write a '"keyword",index' row to termHierarchiesEDI.csv.
VOCAB_SERVICE_URL = 'https://vocab.lternet.edu/vocab/vocab/services.php'
counter = 0
startCounter = 0  # use for restarts. Be sure to rename old file before it is overwritten

# buffering=1 keeps the file line-buffered so finished rows reach disk as the loop runs;
# the Session reuses one connection instead of opening a new one per keyword
with open("termHierarchiesEDI.csv", "w", encoding='utf-8', buffering=1) as f, requests.Session() as session:
    for counter, kw in enumerate(kwList, start=1):
        if counter < startCounter:
            continue
        keywordText = str(kw)
        # catch_warnings() restores the previous warning filters on exit
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # eliminate security warning about verify option
            # passing the keyword through params lets requests URL-encode it, so
            # characters such as '&', '#' or '+' cannot corrupt the query string
            termr = session.get(
                VOCAB_SERVICE_URL,
                params={"task": "search", "arg": keywordText},
                verify=False,
            )
        # fail with the HTTP error rather than with an XML parse error on an error page
        termr.raise_for_status()
        kwRoot = Xet.fromstring(termr.text)
        keywordLower = keywordText.lower()
        for i in kwRoot.iter("term"):
            stringElement = i.find("string")
            if stringElement is None:
                continue
            termString = str(stringElement.text)
            indexElement = i.find("index")
            # an <index> element without text yields "None"; a missing element is treated the same way
            indexString = str(indexElement.text) if indexElement is not None else "None"
            if keywordLower == termString.lower():
                print(str(counter) + ',"' + termString + '",' + indexString)
                # double any embedded quotes so the keyword stays a single CSV field
                f.write('"' + keywordText.replace('"', '""') + '",' + indexString + '\n')
print("done")

4,"succession",|655|656|686
5,"fertilization",|655|191
11,"lakes",|173|41|278
14,"geography",|651|653
19,"nutrients",|643|665|384
20,"dissolved nutrients",|643|665|384|159
21,"inorganic nutrients",|643|665|266
22,"water quality",|667|691|623
23,"water properties",|667|691
25,"chlorophyll",|643|665|102
45,"disturbance patterns",|667|712|165
46,"populations",|799|439
47,"abundance",|667|674|6
48,"aquatic invertebrates",|395|33|271|42
49,"biomass",|667|716|68
50,"gastropods",|395|33|271|348|222
51,"invertebrates",|395|33|271
52,"long term",|667|670|311
55,"macroinvertebrates",|395|33|271|318
56,"marine",|173|41|695|325
57,"measurements",None
58,"mollusks",|395|33|271|348
61,"species",|799|541
62,"transects",|693|801|594
63,"community structure",|799|114|119
73,"reefs",|173|41|960
74,"species",|799|541
76,"crustaceans",|395|33|271|134
78,"communities",|799|114
79,"community composition",|667|674|115
80,"community dynamics",|651|172|116
81,"community patterns",|799|114|117
82,"species compo

712,"temperature",|667|668|579
713,"dissolved oxygen",|643|666|397|161
715,"respiration",|655|656|88|467
716,"gross primary production",|655|656|449|447|905
719,"light",|667|668|454|299
736,"corals",|395|33|271|128
737,"populations",|799|439
739,"coral reefs",|173|41|960|127
744,"ocean acidification",|655|657|962
750,"wind speed",|667|692|633|635
751,"wind direction",|667|692|633|634
752,"maximum temperature",|667|668|579|329
753,"minimum temperature",|667|668|579|342
754,"wind",|667|692|633
755,"fertilizer",|643|192
756,"soil",|694|535
757,"lysimeters",|693|801|315
758,"soil moisture",|643|617|525
759,"crops",|395|433|612|133
760,"soil phosphorus",|643|666|416|987
761,"soil samples",|693|801|989
762,"soil ph",|667|671|410|988
763,"potassium",|643|666|442
764,"relative humidity",|667|692|611|253|463
765,"soil temperature",|667|668|579|530
775,"regeneration",|655|656|298|461
787,"zooplankton",|395|802|641
788,"nitrogen fixation",|655|658|383|375|377
790,"air temperature",|667|668|579|22