# 20 Working with HTML and KML
* 20.1 Working with HTML
  * 20.1.1 Specifying Links
  * 20.1.2 Embedding Images
  * 20.1.3 HTML Lists
  * 20.1.4 HTML Tables
  * 20.1.5 Writing HTML with Python
  * 20.1.6 Parsing HTML with BeautifulSoup
* 20.2 Fetching and Uncompressing Data
  * 20.2.1 Fetching HTML
  * 20.2.2 Fetching Compressed Data
  * 20.2.3 Expanding Compressed Data
* 20.3 Working with KML
  * 20.3.1 The Structure of KML
  * 20.3.2 Parsing KML
  * 20.3.3 Converting KML to Shapefile
* 20.4 Discussion
* 20.5 Key Terms
* 20.6 Exercises

## 20.1 Working with HTML

### 20.1.1 Specifying Links

### 20.1.2 Embedding Images

### 20.1.3 HTML Lists

### 20.1.4 HTML Tables

### 20.1.5 Writing HTML with Python

In [None]:
# %load script/writeSimpleHTML.py
# writeSimpleHTML.py
# Purpose: Write HTML page from hard-coded string.
# Usage: No arguments needed.

# The complete page markup, kept as one hard-coded string.
mystr = '''<!DOCTYPE html>
<html>
    <body>
        <h1>Asian Elephant</h1>
        <img src="../data/ch20/pics/lakshmi.jpg" alt="elephant">
    </body>
</html>
'''
htmlFile = 'C:/gispy/scratch/output.html'

# The with statement closes the file even if the write fails,
# which a separate open()/close() pair does not guarantee.
with open(htmlFile, 'w') as outf:
    outf.write(mystr)

# print(...) with a single argument behaves the same on Python 2 and 3.
print('{0} created.'.format(htmlFile))


In [3]:
%run script2/writeSimpleHTML.py

scratch/output.html created.


In [None]:
# %load script/writeSimpleHTML2.py
# writeSimpleHTML2.py
# Purpose: Write HTML page in 3 parts.
# Usage: workspace title image_path
# Example input: C:/gispy/scratch "Asian Elephant" ../data/ch20/pics/lakshmi.jpg
import sys
# Script arguments: output folder, page heading, image path.
workspace = sys.argv[1]
title = sys.argv[2]
image = sys.argv[3]

# Part 1: opening markup (no trailing newline; part 2 begins with one).
beginning = '\n'.join(['<!DOCTYPE html>',
                       '<html>',
                       '    <body>'])

# Part 2: heading and image, filled in from the script arguments.
middle = '\n'.join(['',
                    '        <h1>{0}</h1>',
                    "        <img src='{1}' >",
                    '']).format(title, image)

# Part 3: closing markup.
end = '\n'.join(['   </body>',
                 '</html>',
                 ''])

# Write the three parts, in order, to output2.html in the workspace.
htmlfile = '{0}/output2.html'.format(workspace)
with open(htmlfile, 'w') as outf:
    outf.writelines([beginning, middle, end])

print('{0} created.'.format(htmlfile))


In [7]:
%run script/writeSimpleHTML2.py scratch "Asian Elephant" ../data/pics/lakshmi.jpg

scratch/output2.html created.


In [None]:
# %load script/printHTMLList.py
# printHTMLList.py
# Purpose: Call a function that converts
#           a Python list to an HTML list.


def python2htmlList(myList, listType, attrs=''):
    '''Build the HTML markup for a list from a Python list.

    myList -- items to show; each is converted with str().
    listType -- list tag name, e.g. 'ul' or 'ol'.
    attrs -- optional attribute text placed inside the opening tag.

    Returns a string of the form (for listType 'ul', items a and b):
    <ul >
        <li>a</li>
        <li>b</li>
    </ul>
    with a leading newline and four-space block indentation.
    '''
    indent = ' ' * 4

    # One <li> element per item, each on its own doubly indented line.
    itemBreak = '\n' + indent * 2
    body = itemBreak.join('<li>{0}</li>'.format(str(entry))
                          for entry in myList)

    # Assemble: blank first line, opening tag, items, closing tag,
    # and a final line holding only the block indent.
    pieces = [
        '',
        indent + '<{0} {1}>'.format(listType, attrs),
        indent * 2 + body,
        indent + '</{0}>'.format(listType),
        indent,
    ]
    return '\n'.join(pieces)

# Sample raster names to render.
rasts = [u'elev', u'landcov', u'soilsid', u'getty_cover']

# Unordered (bulleted) list.
htmlList = python2htmlList(rasts, listType='ul')
print(htmlList)

# Ordered list with default numbering.
htmlList2 = python2htmlList(rasts, listType='ol')
print(htmlList2)

# Ordered list with an extra attribute in the opening tag.
htmlList3 = python2htmlList(rasts, listType='ol', attrs='type="a"')
print(htmlList3)


In [9]:
%run script2/printHTMLList.py


    <ul >
        <li>elev</li>
        <li>landcov</li>
        <li>soilsid</li>
        <li>getty_cover</li>
    </ul>
    

    <ol >
        <li>elev</li>
        <li>landcov</li>
        <li>soilsid</li>
        <li>getty_cover</li>
    </ol>
    

    <ol type="a">
        <li>elev</li>
        <li>landcov</li>
        <li>soilsid</li>
        <li>getty_cover</li>
    </ol>
    


### 20.1.6 Parsing HTML with BeautifulSoup

In [10]:
import sys
sys.path.append('script')
import BeautifulSoup

In [11]:
mystr = '<!DOCTYPE html><html><body><h1>Asian Elephant</h1><img src="lakshmi.jpg" alt="elephant"></body></html>'

In [12]:
soup = BeautifulSoup.BeautifulSoup(mystr)

In [14]:
h = soup.prettify()
print h

<!DOCTYPE html>
<html>
 <body>
  <h1>
   Asian Elephant
  </h1>
  <img src="lakshmi.jpg" alt="elephant" />
 </body>
</html>


In [15]:
t = soup.find('h1')
t

<h1>Asian Elephant</h1>

In [16]:
type(t)

BeautifulSoup.Tag

In [17]:
t.name

u'h1'

In [18]:
t.attr

In [19]:
t.contents

[u'Asian Elephant']

In [20]:
t.contents[0]

u'Asian Elephant'

In [22]:
t2 = soup.find('img')
t2

<img src="lakshmi.jpg" alt="elephant" />

In [23]:
t2.attrs

[(u'src', u'lakshmi.jpg'), (u'alt', u'elephant')]

In [24]:
t2['src']

u'lakshmi.jpg'

In [25]:
t2['alt']

u'elephant'

In [26]:
for name, value in t2.attrs:
  print 'Name: ' + name + ' Value: ' + value

Name: src Value: lakshmi.jpg
Name: alt Value: elephant


In [28]:
htmlList = '\n <ul>\n <li>elev</li>\n <li>landcov</li>\n<li>soilsid</li>\n <li>getty_cover</li>\n </ul>\n'

In [31]:
soup2 = BeautifulSoup.BeautifulSoup(htmlList)
soup2


<ul>
<li>elev</li>
<li>landcov</li>
<li>soilsid</li>
<li>getty_cover</li>
</ul>

In [32]:
tags = soup2.findAll('li')
tags

[<li>elev</li>, <li>landcov</li>, <li>soilsid</li>, <li>getty_cover</li>]

In [33]:
for t in tags:
  print t.contents[0]

elev
landcov
soilsid
getty_cover


In [34]:
Link: elephant1.html

SyntaxError: invalid syntax (<ipython-input-34-38a3b9dab961>, line 1)

In [None]:
# %load script/getLinks.py
# getLinks.py
# Purpose: Read and print the links in an html file.

import sys
basedir = 'C:/gispy/'
sys.path.append(basedir + 'sample_scripts/ch20')
import BeautifulSoup

# Read the HTML file contents and build the soup object. The file is only
# needed while parsing, so it is closed before the tags are processed.
with open(basedir + 'data/ch20/htmlExamplePages/elephant2.html', 'r') as infile:
    soup = BeautifulSoup.BeautifulSoup(infile)

# Find all the hyperlinks. href=True keeps only <a> tags that carry an
# href attribute, so an anchor without one (e.g. <a name="top">) cannot
# raise KeyError in the loop below.
linkTags = soup.findAll('a', href=True)

# Print each hyperlink reference.
for linkTag in linkTags:
    print('Link: {0}'.format(linkTag['href']))


In [38]:
%run script2/getLinks.py

Link: elephant1.html
Link: https://www.google.com/


## 20.2 Fetching and Uncompressing Data

### 20.2.1 Fetching HTML

In [None]:
# %load script/fetchHTML.py
# fetchHTML.py
# Fetch HTML from a site and print the number of characters in the HTML
import urllib2

url = 'http://www.google.com'

# Open the page and read its entire source into memory.
response = urllib2.urlopen(url)
try:
    contents = response.read()
finally:
    # Release the connection even if the read fails part-way.
    response.close()

# len() gives the length of the page source that was read.
print('This page has {0} characters.'.format(len(contents)))


In [40]:
%run script2/fetchHTML.py

This page has 10486 characters.


In [41]:
pics = soup.findAll('img')
pics

[<img src="../pics/lakshmi.jpg" width="32" height="32" />]

### 20.2.2 Fetching Compressed Data

In [None]:
# %load script/fetchZip.py
# fetchZip.py
# Purpose: Fetch a zip file and place it in an output directory.
import os, urllib2


def fetchZip(url, outputDir):
    '''Fetch binary web content located at 'url'
    and write it in the output directory.

    url -- address of the file to fetch; its last path component
           is used as the output file name.
    outputDir -- directory to write into, given with or without
                 a trailing slash.
    '''
    response = urllib2.urlopen(url)
    try:
        binContents = response.read()
    finally:
        # Close the response even if the read fails.
        response.close()

    # Join the pieces with os.path.join so a directory passed without a
    # trailing slash still yields <dir>/<file> rather than <dir><file>.
    outFileName = os.path.join(outputDir, os.path.basename(url))

    # Save zip file to output dir (write it in 'wb' mode).
    with open(outFileName, 'wb') as outf:
        outf.write(binContents)

# Source archive (a file:// URL) and the folder that receives the copy.
theURL = 'file:///C:/gispy/data/ch20/getty.zip'
outputDir = 'C:/gispy/scratch/'

fetchZip(url=theURL, outputDir=outputDir)

# Report the output folder followed by the archive's file name.
print('{0}{1} created.'.format(outputDir, os.path.basename(theURL)))


In [44]:
%run script2/fetchZip.py

scratch/getty.zip created.


### 20.2.3 Expanding Compressed Data

In [None]:
# %load script/extractFiles.py
# extractFiles.py
# Purpose: Extract files from an archive;
#     Place the files into an output directory.
# Usage: No script arguments

import os, zipfile


def unzipArchive(archiveName, dest):
    '''Extract files from an archive
    and save them in the destination directory.

    archiveName -- path of the zip archive to read.
    dest -- directory that receives the extracted files.

    Prints the archive and destination, one line per archive member,
    and a completion message. Returns None.
    '''
    # print(...) with a single argument behaves the same on Python 2
    # and 3, so this function does not depend on the print statement.
    print('Unzip {0} to {1}'.format(archiveName, dest))

    # Get a ZipFile object; the with statement closes the archive.
    with zipfile.ZipFile(archiveName, 'r') as zipObj:
        zipObj.extractall(dest)
        # Report the list of files extracted from the archive.
        for fileName in zipObj.namelist():
            print(' Extract file: {0} ...'.format(fileName))
    print('Extraction complete.')

# Locate the sample archive and name the output folder after it
# (the archive file name without its extension).
baseDir = 'C:/gispy/'
archive = 'park.zip'
archiveFullName = '{0}data/ch20/{1}'.format(baseDir, archive)
destination = '{0}scratch/{1}/'.format(baseDir,
                                       os.path.splitext(archive)[0])

# Create the output folder if it is missing, then extract into it.
if not os.path.exists(destination):
    os.makedirs(destination)
unzipArchive(archiveFullName, destination)


In [46]:
%run script2/extractFiles.py

Unzip data/park.zip to scratch/park/
 Extract file: park.prj ...
 Extract file: park.sbn ...
 Extract file: park.sbx ...
 Extract file: park.shp ...
 Extract file: park.shp.xml ...
 Extract file: park.shx ...
 Extract file: park.dbf ...
 Extract file: park.kmz ...
Extraction complete.


In [47]:
%load script/restaurants.kml

ValueError: 'script/restaurants.kml' was not found in history, as a file, url, nor in the user namespace.

## 20.3 Working with KML

### 20.3.1 The Structure of KML

### 20.3.2 Parsing KML

In [None]:
# %load script/parseKMLrestaurants.py
# parseKMLrestaurants.py
# Purpose: Print kml placemark names and descriptions.
import sys

baseDir = 'C:/gispy/'
sys.path.append(baseDir + 'sample_scripts/ch20')
import BeautifulSoup

fileName = '{0}data/ch20/restaurants.kml'.format(baseDir)

# Read the KML text and parse it into a soup object.
with open(fileName, 'r') as kmlCode:
    soup = BeautifulSoup.BeautifulSoup(kmlCode.read())

# Collect the name and description tags, then walk them in parallel:
# the first item of each name on one line, and the full contents list
# of the matching description tab-indented on the next.
names = soup.findAll('name')
descriptions = soup.findAll('description')
for name, description in zip(names, descriptions):
    print(name.contents[0])
    print('\t{0}'.format(description.contents))


In [50]:
%run script2/parseKMLrestaurants.py

Bubba's Tofu Gumbo
	[u'Tofu Gumbo and Zydeco!', <br />, u'Score: 97']
Joe Bob's Good Cookin'
	[u"The best tree top grits n' greens restaurant south of the Mason-Dixon line.", <br />, u'Score: 94']


In [51]:
description.contents

[u"The best tree top grits n' greens restaurant south of the Mason-Dixon line.",
 <br />,
 u'Score: 94']

In [52]:
scoreString = description.contents[2]
scoreString

u'Score: 94'

In [53]:
scoreList = scoreString.split(':')
scoreList

[u'Score', u' 94']

In [54]:
score = float(scoreList[1])
score

94.0

### 20.3.3 Converting KML to Shapefile

In [None]:
# %load script/restaurantKML2shp.py
# restaurantKML2shp.py
# Purpose: Create a shapefile from a kml file using an insert cursor.
# Usage: kml_directory output_directory
# Example input: C:/gispy/data/ch20 C:/gispy/scratch/

import arcpy, os, sys, BeautifulSoup

# Script arguments: folder holding the KML file, and the output folder.
dataDir = sys.argv[1]
outDir = sys.argv[2]
arcpy.env.workspace = outDir
kmlFile = 'restaurants.kml'
kmlPath = os.path.join(dataDir, kmlFile)
baseName = os.path.splitext(kmlFile)[0]
fc = baseName + '.shp'

# Attribute fields to add, with their types in matching order.
fieldNames = ['name', 'blurb', 'score']
fieldTypes = ['TEXT', 'TEXT', 'FLOAT']

# If the shapefile has already been created, delete it.
if arcpy.Exists(fc):
    arcpy.Delete_management(fc)

# NOTE(review): KML coordinates are normally WGS 1984 longitude/latitude;
# confirm that a projected UTM reference is intended for this data.
sr = arcpy.SpatialReference('NAD 1983 UTM Zone 17N')
arcpy.CreateFeatureclass_management(outDir, fc, 'POINT', '#', '#', '#', sr)
# Loop variable is 'fieldType' so the builtin 'type' is not shadowed.
for field, fieldType in zip(fieldNames, fieldTypes):
    arcpy.AddField_management(fc, field, fieldType)

# Get the tag soup.
with open(kmlPath, 'r') as kmlCode:
    soup = BeautifulSoup.BeautifulSoup(kmlCode)
coordinates = soup.findAll('coordinates')
names = soup.findAll('name')
descriptions = soup.findAll('description')

# Populate the shapefile. The three tag lists are paired by position.
with arcpy.da.InsertCursor(fc, ['SHAPE@XY'] + fieldNames) as ic:
    for c, n, d in zip(coordinates, names, descriptions):
        # Get field values.
        # The coordinate text is comma separated; only the first two
        # values are used, so a missing third value is tolerated, and
        # they are converted to numbers before building the point.
        xyz = c.contents[0].strip().split(',')
        x, y = float(xyz[0]), float(xyz[1])
        myPoint = arcpy.Point(x, y)
        name = n.contents[0]
        # The description holds the blurb text first and a
        # 'Score: <number>' string as its third item.
        blurb = d.contents[0]
        scoreString = d.contents[2]
        scoreList = scoreString.split(':')
        score = float(scoreList[1])
        # Put row values in a list & insert the new row.
        newRow = [myPoint, name, blurb, score]
        ic.insertRow(newRow)

# The with statement has already released the cursor, so no explicit
# 'del ic' is needed. os.path.join keeps the reported path correct
# whether or not outDir ends with a slash.
print('{0} created.'.format(os.path.join(outDir, fc)))

In [56]:
%run script2/restaurantKML2shp.py data scratch/

scratch/restaurants.shp created.


## 20.4 Discussion

## 20.5 Key Terms

## 20.6 Exercises