# Fetches Data Packages, Keywords and Thesaurus Info
This notebook uses web services to download the packageid, title, and list of keywords for each LTER dataset in EDI (based on the knb-lter-* scope). It then queries the LTER Thesaurus through the TemaTres web services to determine which keywords are found in the thesaurus and, for those that are, which high-level category each one falls into. Several intermediate CSV files are created along the way; they contain the information that a separate R program needs to produce a spreadsheet listing the keywording data.  -John Porter, 2022

### Fetch keyword data from PASTA

In [19]:
import requests

# Ask the PASTA search service for the packageid, title and keywords of every
# package in a knb-lter-* scope (ecotrends and lter-landsat* scopes excluded)
# and save the raw XML answer for the parsing step.
# NOTE(review): verify=False switches off TLS certificate verification. It is
# kept as in the original; remove it once the server certificate validates.
r=requests.get('https://pasta.lternet.edu/package/search/eml?defType=edismax&q=*&fq=-scope:ecotrends&fq=-scope:lter-landsat*&fq=scope:knb-lter-*&fl=packageid,title,keyword&sort=score,desc&sort=packageid,asc&debug=false&rows=1000000',verify=False)
# Stop here on an HTTP error instead of saving an error page as the result set.
r.raise_for_status()
# The context manager closes the file even when the write fails.
with open("resultSetAll.xml","w",encoding='utf-8') as f:
    f.write(r.text)
print("Done")



Done


### Parse ResultSet XML and create a csv file to read in as a Pandas data frame

In [96]:
import xml.etree.ElementTree as Xet
import numpy as np
import pandas as pd 

import csv

# Flatten the saved result set into a CSV file with one row per
# (package, keyword) pair, then load that file as a pandas data frame.
resultSetxmlparse=Xet.parse("resultSetAll.xml")
root=resultSetxmlparse.getroot()

# newline="" leaves line endings to the csv writer, as the csv module asks for.
# The context manager closes the file even if parsing fails part-way.
with open("resultSetKeywords.csv","w",encoding="utf-8",newline="") as f:
    # header line, written exactly as before
    f.write("packageid,title,keyword\n")
    # The csv writer doubles embedded double quotes, so a title or keyword
    # that contains '"' no longer corrupts its row. Every field is quoted.
    writer=csv.writer(f,quoting=csv.QUOTE_ALL,lineterminator="\n")
    for i in root:
        # findtext() yields "" for a missing or empty element instead of
        # failing on None
        packageid=i.findtext("packageid",default="")
        title=i.findtext("title",default="")
        title=title.replace('\r','').replace('\n','') # get rid of embedded linefeeds in titles
        for k in i.iter("keyword"):
            # an empty <keyword/> element is still written as the text "None"
            keyword=str(k.text)
            writer.writerow([packageid,title,keyword])

# skiprows=1 drops the header line of the file; the column names come from
# `names` instead.
# NOTE(review): "pakcageid" looks like a typo for "packageid". It is kept
# because code outside this cell may refer to the column by this name.
resultSetDataFrame=pd.read_csv(
    "resultSetKeywords.csv",
    skiprows=1,
    sep=",",
    quotechar='"',
    names=[
        "pakcageid",
        "title",
        "keyword",
    ],
)
# info() prints its report itself and returns None, so it is not wrapped in
# print() (which only added a stray "None" line to the output).
resultSetDataFrame.info()
print("Done")
        
        

  and should_run_async(code)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110843 entries, 0 to 110842
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   pakcageid  110843 non-null  object
 1   title      110843 non-null  object
 2   keyword    110842 non-null  object
dtypes: object(3)
memory usage: 2.5+ MB
None
Done


### Get list of top-level LTER Vocab units 

In [78]:
# Download the top terms of the LTER vocabulary and save the raw XML answer.
# NOTE(review): verify=False switches off TLS certificate verification. It is
# kept as in the original; remove it once the server certificate validates.
topr=requests.get('https://vocab.lternet.edu/vocab/vocab/services.php?task=fetchTopTerms',verify=False)
# Stop here on an HTTP error instead of saving an error page as the term list.
topr.raise_for_status()
# The context manager closes the file even when the write fails.
with open("TopLevelTerms.xml","w",encoding='utf-8') as f:
    f.write(topr.text)
print("Done")


Done


### Create lists of the top-level term names and term_ids

In [114]:
# Read the saved top-term XML, list the term ids and names, and write them to
# TopLevelTerms.csv as "name,term_id" lines.
topTermsxmlparse=Xet.parse("TopLevelTerms.xml")
rootTop=topTermsxmlparse.getroot()

# Ids and names are collected in two separate document-order passes and are
# paired up by position when the CSV is written.
term_id=[elem.text for elem in rootTop.iter('term_id')]
term_string=[elem.text for elem in rootTop.iter('string')]
topCounter=len(term_string)  # number of <string> elements found
print(term_id)
print(term_string)

# Check before opening the output file, so a mismatch cannot leave a
# half-written CSV behind (the index loop used to fail part-way through).
if len(term_string) < len(term_id):
    raise ValueError(
        "TopLevelTerms.xml has "+str(len(term_id))+" term_id elements but only "
        +str(len(term_string))+" string elements"
    )

# The context manager closes the file even when a write fails.
with open("TopLevelTerms.csv","w",encoding="utf-8") as f:
    for name,tid in zip(term_string,term_id):
        f.write(name+","+str(tid)+"\n")


['799', '651', '698', '667', '693', '655', '643', '694', '173', '395']
['organizational units', 'disciplines', 'events', 'measurements', 'methods', 'processes', 'substances', 'substrates', 'ecosystems', 'organisms']


### Run Queries of keywords against Tematres web services to see if they match and what the top of the hierarchy is

In [104]:
import warnings

# Create a list of keywords without duplicates, in order of first appearance.
# Dict keys are unique and keep insertion order, so this gives the same list
# as appending each unseen value, without rescanning the growing list for
# every row of the data frame.
kwList=list(dict.fromkeys(resultSetDataFrame.keyword))
print(len(kwList))
print(kwList)


        
    

8598


In [110]:
# Search the vocabulary service for every distinct keyword. When a returned
# term matches the keyword exactly (ignoring case), write the keyword and the
# term's index string to termHierarchies.csv.
counter=0
startCounter=0  # use for restarts. Be sure to rename old file before it is overwritten
# buffering=1 keeps the file line-buffered; the context manager closes it even
# if a request or the XML parsing fails.
with open("termHierarchies.csv","w",encoding='utf-8',buffering=1) as f:
    for kw in kwList:
        #print(kw)
        counter=counter+1
        if counter < startCounter:
            continue
        # A missing keyword reaches this loop as a float NaN, not a string;
        # there is nothing to look up (and .lower() would fail on it).
        if not isinstance(kw,str):
            continue
        # Silence the security warning about the verify option for this call
        # only; catch_warnings() restores the previous filters afterwards
        # instead of overwriting them.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # Passing the keyword through `params` URL-encodes it, so
            # characters such as '&', '#' or '+' no longer alter the query.
            termr=requests.get('https://vocab.lternet.edu/vocab/vocab/services.php',
                               params={'task':'search','arg':kw},verify=False)
        kwRoot=Xet.fromstring(termr.text)
        for i in kwRoot.iter("term"):
            stringElem=i.find("string")
            if stringElem is None:
                continue
            termString=str(stringElem.text)
            indexElem=i.find("index")
            # a term without index text is reported as "None", as before
            indexString=str(indexElem.text) if indexElem is not None else "None"
            if kw.lower() == termString.lower():
                print(str(counter)+',"'+termString+'",'+indexString)
                # double any embedded quote so the quoted CSV field stays valid
                f.write('"'+kw.replace('"','""')+'",'+indexString+'\n')
print("done")

1,"history",|651|248
8,"air temperature",|667|668|579|22
9,"maximum temperature",|667|668|579|329
10,"historical value",|651|248|247
11,"forest ecosystems",|173|798|212
16,"arthropods",|395|33|271|45
17,"forest ecosystems",|173|798|212
18,"insects",|395|33|271|45|267
19,"invertebrates",|395|33|271
21,"populations",|799|439
22,"vascular plants",|395|433|951
23,"birds",|395|33|614|69
24,"vertebrates",|395|33|614
25,"amphibians",|395|33|614|32
26,"reptiles",|395|33|614|466
27,"mosses",|395|433|354
28,"mammals",|395|33|614|320
29,"bryophytes",|395|433|354|75
31,"fungi",|395|218
33,"ecology",|651|172
35,"lichens",|395|297
36,"phenology",|651|67|413
37,"stand structure",|799|114|119|991
38,"forest dynamics",|651|172|427|931
39,"measurements",None
40,"productivity",|667|716|450
41,"biomass",|667|716|68
42,"plant properties",|667|917
43,"wind",|667|692|633
44,"community composition",|667|674|115
45,"plant species composition",|651|64|432
46,"spatial properties",|667|673
47,"long term",|667|670

403,"specific conductivity",|667|668|176|122|548
404,"surveys",|693|571
410,"conservation",|651|124
411,"accumulation",|655|8
413,"species abundance",|667|674|6|990
414,"metabolism",|655|800|332
415,"recruitment",|698|710|458
416,"census",|693|571|96
417,"soil water content",|667|672|621|534
418,"time domain reflectometry",|693|801|588
419,"droughts",|698|805|168
420,"stable isotopes",|643|276|551
422,"genetics",|651|67|223
424,"leaf area",|667|917|288
425,"tundra",|173|798|605
426,"fertilization",|655|191
427,"foliar nitrogen",|667|671|374|930
428,"carbon to nitrogen ratio",|667|671|92
429,"specific leaf area",|667|917|288|549
430,"leaves",|395|433|293
431,"species lists",|799|541|545
432,"relative abundance",|667|674|6|462
433,"weather",|667|692|627
434,"radiation",|667|668|454
436,"herbivory",|655|656|544|244
437,"fertilizer",|643|192
438,"rain",|667|672|443|455
443,"nitrogen",|643|666|372
444,"carbon",|643|666|86
448,"plant biomass",|667|716|68|423
449,"bacteria",|395|336|51
450,"b

1229,"species composition",|667|674|166|542
1231,"density",|667|668|916
1233,"tree rings",|395|433|951|1|599
1234,"dendrochronology",|651|67|140
1241,"browsing",|655|239|74
1243,"clearcuts",|173|798|212|2|107
1245,"stand density",|667|668|916|992
1246,"basal area",|667|917|56
1251,"pressure",None
1252,"inorganic nitrogen",|643|666|372|922
1253,"hydrogen",|643|666|933
1256,"soil ph",|667|671|410|988
1269,"floodplain",|173|630|197
1271,"boreal forests",|173|798|212|2|72
1273,"evaporation",|655|657|661|185|184
1278,"charcoal",|643|665|98
1286,"ions",|643|272
1287,"neutron probe",|693|801|366
1288,"digital elevation model",|693|705|152
1296,"allometry",|667|716|25
1298,"mesic soils",|694|535|331
1299,"humic soils",|694|535|252
1303,"dissolved inorganic nitrogen",|643|666|372|922|923
1308,"seedling establishment",|698|710|497
1310,"bark",|395|433|54
1311,"beetles",|395|33|271|45|267|60
1313,"pheromones",|643|665|414
1318,"bud burst",|698|710|901
1324,"subsidence",|655|657|660|564
1331,"peat

4288,"slugs",|395|33|271|348|222|510
4299,"tides",|698|586
4303,"cesium",|643|666|97
4367,"polychaetes",|395|33|271|42|894
4393,"holocene",|667|670|249
4406,"salt marshes",|173|630|326|482
4448,"speciation",|655|656|186|540
4501,"morphology",None
4663,"evolution",|655|656|186
4690,"acid neutralizing capacity",|667|671|9
4691,"fluorine",|643|666|204
4726,"plant biomass",|667|716|68|423
4733,"fluoride",|643|666|204|203
4739,"frost",|698|217
4740,"snow water equivalence",|694|513|803
4913,"belowground production",|655|656|449|903
4940,"soil horizons",|694|535|523
4992,"digital elevation model",|693|705|152
4995,"altitude",None
4998,"hurricane damage",|667|712|255
5005,"acid rain",|643|436|10
5009,"extinction",|655|656|186|188
5010,"life history",|655|656|298
5013,"paleolimnology",|651|301|400
5015,"hurricanes",|698|805|254
5016,"defoliation",|655|239|138
5018,"geological processes",|655|657|660
5019,"fens",|173|630|926
5020,"dendrometers",|693|801|141
5022,"ferns",|395|433|951|190
5023,"h