This notebook generates a sitemap index and a collection of sitemap files linked from that index. The sitemaps contain links to the HTML pages for all records in the CINERGI catalog, and are generated by calls to the Elasticsearch API. The output is written to the local c:\tmp folder; this will probably need to be changed to run in this Jupyter hub environment (not tested here). The original code was produced on my local Jupyter instance using a Python 2.7 kernel.

Stephen Richard 2018-06-08


In [2]:
from datetime import datetime
import requests
import sys
# see http://docs.python-requests.org/en/master/user/quickstart/ for package documentation

# XML declaration written as the first line of every generated file.
XML_HEADER = '<?xml version="1.0" encoding="UTF-8"?>'

# Folder prefix prepended to every output file name (note the trailing separator).
fileLocationBase = "c:\\tmp\\"
print(fileLocationBase)

# XSL stylesheet referenced from the xml-stylesheet processing instruction
# of the index file and of each sitemap file.
sitemaptohtml = (
    "https://raw.githubusercontent.com/CINERGI/xmlsitemap/master/xml-sitemap.xsl"
)


# Elasticsearch search URL, assembled from endpoint root + index + resource.
espath = "http://cinergi.sdsc.edu/geoportal/elastic/"
esindex = "metadata"
esresource = "/item/_search"
baseURL = "".join([espath, esindex, esresource])


# Scrolling is needed because there are more than 10000 records.
# Time to live of the scroll index; it is renewed on each search call.
p_scroll = "1m"
# Number of records to return per request (use a small value such as "10"
# when experimenting).
p_size = "10000"
# Comma-delimited list of index fields to return from the _source section of
# each hit. The modified date is the only field the sitemap needs; add
# ",title" (i.e. "sys_modified_dt,title") to also fetch titles for debugging.
p_source = "sys_modified_dt"

c:\tmp\


In [3]:
# generates the top level index file. one entry for each sitemap written out
def indexFile():
    """Create the sitemap index file and write its XML preamble.

    Opens ``CinergiSiteIndex.xml`` under ``fileLocationBase`` for writing and
    emits the XML declaration, the xml-stylesheet processing instruction
    pointing at ``sitemaptohtml``, and the opening ``<sitemapindex>`` tag.

    Returns:
        The open file object. The caller writes the ``<sitemap>`` entries and
        the closing ``</sitemapindex>`` tag, and closes the file.

    Raises:
        SystemExit: with status 1 if the file cannot be opened.
    """
    try:
        file_object = open(fileLocationBase + "CinergiSiteIndex.xml", "w")
    except (IOError, OSError) as err:
        # Only I/O failures are expected here; anything else is a programming
        # error and should surface with its traceback.
        print("ERROR: Can't open the index file, bailing out")
        print(err)
        # Non-zero status: this is a failure, not a normal termination.
        sys.exit(1)

    # put in the header stuff
    header_lines = [
        XML_HEADER,
        '<?xml-stylesheet type="text/xsl" href="' + sitemaptohtml + '"?>',
        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    ]
    file_object.write("\n".join(header_lines) + "\n")

    return file_object

In [4]:
# routine to generate individual sitemap file objects
def siteMapFile(name):
    """Create one sitemap file and write its XML preamble.

    Opens ``fileLocationBase + name`` for writing and emits the XML
    declaration, the xml-stylesheet processing instruction pointing at
    ``sitemaptohtml``, and the opening ``<urlset>`` tag.

    Args:
        name: File name of the sitemap, appended to ``fileLocationBase``.

    Returns:
        The open file object. The caller writes the ``<url>`` entries and the
        closing ``</urlset>`` tag, and closes the file.

    Raises:
        SystemExit: with status 1 if the file cannot be opened.
    """
    try:
        file_object = open(fileLocationBase + name, "w")
    except (IOError, OSError) as err:
        # Only I/O failures are expected here; anything else is a programming
        # error and should surface with its traceback.
        print("ERROR: Can't open the new sitemap file: " + name + ", bailing out")
        print(err)
        # Non-zero status: this is a failure, not a normal termination.
        sys.exit(1)

    # put in the header stuff
    header_lines = [
        XML_HEADER,
        '<?xml-stylesheet type="text/xsl" href="' + sitemaptohtml + '"?>',
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    ]
    file_object.write("\n".join(header_lines) + "\n")
    return file_object

In [5]:
def writeLinks( response, mfile ):
    """Write one sitemap ``<url>`` entry to ``mfile`` for each search hit.

    Args:
        response: Decoded search response; hits are read from
            ``response["hits"]["hits"]``. Each hit supplies ``_id`` and
            ``_source.sys_modified_dt``.
        mfile: Writable text file object receiving the entries.

    Returns:
        None.

    A hit that cannot be turned into an entry (missing key, non-string
    value, write failure) is reported on stdout and skipped; processing
    continues with the next hit.
    """
    # Local import keeps this function self-contained.
    from xml.sax.saxutils import escape

    for hit in response["hits"]["hits"]:
        # Bound before the try so the error message below never hits an
        # unbound (or stale, from the previous hit) identifier.
        hitid = "<unknown>"
        try:
            hitid = hit["_id"]
            hitmodified = hit["_source"]["sys_modified_dt"]

            # Build the whole entry first and write it in one call, so a bad
            # value cannot leave a half-written, unclosed <url> element.
            # Values are entity-escaped because they are embedded in XML.
            entry = "\n".join([
                '<url>',
                '<loc>http://cinergi.sdsc.edu/geoportal/rest/metadata/item/'
                + escape(hitid) + '/html</loc>',
                '<lastmod>' + escape(hitmodified) + '</lastmod>',
                '<changefreq>monthly</changefreq>',
                '</url>',
            ]) + "\n"
            mfile.write(entry)
        except Exception:
            # Deliberately best-effort: one bad record must not abort the
            # whole sitemap.
            print("ERROR writing sitemap url for _id= %s" % (hitid,))
            print(sys.exc_info()[1])


In [7]:
#This is the main routine.

# Runs an initial search to obtain the scroll id and the total number of
# records, then keeps requesting scroll pages. Each page of hits is written to
# its own sitemap file, and each sitemap file gets one entry in the index.


def _writeSitemapPage(response, filecount, indexhandle):
    """Write one page of hits to its own sitemap file and add it to the index.

    Returns the number of hits contained in the page.
    """
    sitemapfilename = "cinergisitemap" + str(filecount) + ".xml"
    sitemaphandle = siteMapFile(sitemapfilename)
    try:
        writeLinks(response, sitemaphandle)
        sitemaphandle.write('</urlset>')
    finally:
        # Close the sitemap even if writing it failed.
        sitemaphandle.close()

    #new index entry
    # <lastmod> must be in W3C Datetime format; a plain YYYY-MM-DD date is
    # valid, whereas str(datetime.now()) is not.
    indexhandle.write(
        '<sitemap>\n'
        + '<loc>http://cinergi.sdsc.edu/geoportal/' + sitemapfilename + '</loc>\n'
        + '<lastmod>' + datetime.now().strftime('%Y-%m-%d') + '</lastmod>\n'
        + '</sitemap>\n')
    return len(response["hits"]["hits"])


counter = 0
filecount = 0

#have to hit the scroll resource for Elasticsearch
#cinergi requires publisher role to run the scroll resource
# NOTE(review): credentials are hard-coded in this URL; consider moving them
# out of the notebook (e.g. environment variables) before sharing it.
# Kept in its own variable so baseURL/espath/esresource are not overwritten
# and the cell can be re-run.
scrollURL = "http://admin:admin@cinergi.sdsc.edu/geoportal/elastic/" + "_search/scroll"

#first request to get scrolling set up
p = {'scroll': p_scroll,
     'size': p_size,
     '_source': p_source}
r = requests.get(baseURL, params=p)
print ("request1: ", r.url)

if r.status_code != requests.codes.ok:
    # Raises for 4xx/5xx; any other non-200 status still ends the run, with a
    # non-zero exit status because nothing was generated.
    r.raise_for_status()
    sys.exit(1)

response = r.json()
totalRecords = response["hits"]["total"]
scrollID = response["_scroll_id"]

#    set up the index file
indexhandle = indexFile()
print ("total records: ", totalRecords)

try:
    # The first page comes from the initial search response.
    counter = counter + _writeSitemapPage(response, filecount, indexhandle)
    filecount = filecount + 1

    while counter < totalRecords:
        p = {'scroll': p_scroll,
             'scroll_id': scrollID}
        r = requests.get(scrollURL, params=p)
        if r.status_code != requests.codes.ok:
            r.raise_for_status()
            break

        response = r.json()
        scrollID = response["_scroll_id"]
        if not response["hits"]["hits"]:
            # An empty page means there is nothing more to read; stop rather
            # than loop forever or write empty sitemap files.
            break

        # Count the hits actually received, not the requested page size.
        counter = counter + _writeSitemapPage(response, filecount, indexhandle)
        filecount = filecount + 1
        print ("count: ", counter)
finally:
    # Always terminate and close the index so it stays well-formed for the
    # sitemaps written so far, even after an HTTP error or an interrupt.
    indexhandle.write('</sitemapindex>')
    indexhandle.close()

print ("done, counter = ",counter)

request1:  http://cinergi.sdsc.edu/geoportal/elastic/metadata/item/_search?size=10000&_source=sys_modified_dt&scroll=1m
total records:  1129855
count:  20000
count:  30000
count:  40000
count:  50000
count:  60000


KeyboardInterrupt: 