# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [60]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [61]:
document_tree = ET.parse( './data/mondial_database_less.xml' )
type(document_tree)

xml.etree.ElementTree.ElementTree

In [62]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [63]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':'),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [64]:
document = ET.parse( './data/mondial_database.xml')
# print all group name
country = document.find('country')
for c in country.getiterator():
    #print(c.tag)  # print all the tags(variables)
    #print(c.attrib)
    #print(c.text)
    #print(c.tail)
    #print(c.tag,c.attrib,c.text,c.tail)
    pass

In [65]:
# print all root tag name
root = document.getroot()
roottaglist = []
for child in root:
    roottaglist.append(child.tag)
# unique value in a list
# but set change element order
roottagset = set(roottaglist)
print(roottagset)

{'country', 'sea', 'desert', 'airport', 'island', 'lake', 'river', 'organization', 'mountain', 'continent'}


In [66]:
# print tag name under country
a = document.findall("./country/ethnicgroup")
#for ai in a:
#    print(ai.attrib)


### 1. 10 countries with the lowest infant mortality rates

In [67]:
# extract country name and its infant mortality rate
clist =[]
mortallist =[]
for c in document.findall('country'):
    for node in c.getchildren():
        # get country name
        if node.tag == 'name':
            clist.append(node.text)
        # get infant mortality rate
        elif node.tag == 'infant_mortality':
            mortallist.append(float(node.text))
        
# put clist and mortallist into cmdict
keys = clist
values = mortallist
cmdict = dict(zip(keys,values))
# print(cmdict)

# sort dictionary in ascending order
cmdict_sort = sorted(cmdict.items(), key= lambda x:x[1], reverse=False)
# print the 10 countries with the lowest infant mortality rates
print(cmdict_sort[:10])

[('Norway', 1.81), ('Papua New Guinea', 2.13), ('Faroe Islands', 2.48), ('Anguilla', 2.48), ('Japan', 2.53), ('Finland', 2.6), ('Spain', 2.63), ('Thailand', 2.73), ('Cambodia', 3.13), ('Gibraltar', 3.15)]


### 2. 10 cities with the largest population

In [68]:
# extract city name and its population
citylist = []
poplist = []
flag = False
for c in document.findall('country'):
    for node in c.getchildren():
        if node.tag == 'city':
            for cn in node.getchildren():
                # get city population in 2011
                if cn.tag == 'population':
                    a = cn.attrib
                    if a['year'] == '2011':
                        poplist.append(cn.text)
                        flag = True
                # record the city name if the city has population data
                if cn.tag == 'name':
                    if flag == True:
                        citylist.append(cn.text)
                        flag = False
# put citylist and poplist in dict
keys= citylist
values = poplist
cpdict = dict(zip(keys,values))
# sort dictionary in descending order
cpdict_sort = sorted(cpdict.items(), key =lambda x:x[1], reverse = False)
cpdict_sort[:10]

[('Harare', '100079'),
 ('Pokhara', '1003285'),
 ('Gaza', '101000'),
 ('Cayenne', '104011'),
 ('Tbilisi', '1060138'),
 ('Beograd', '107745'),
 ('Vlorë', '113249'),
 ('Keflavik', '118061'),
 ('Galway', '118912'),
 ('Plovdiv', '1270284')]

### 3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)

In [69]:
import math
flagct = False
# extract ethnic group and population across countries
ethnicpopdict = dict()
for c in document.findall('country'):
    # get latest population of the country
    pops = c.findall('population')
    popdict = dict()
    
    # get population from all years
    for ipop in pops:
        thisyear = ipop.attrib['year']
        popdict[thisyear] = int(ipop.text)
    popdict_sort = sorted(popdict.items(), key = lambda x:x[0],reverse = True)
    # select the latest population
    thispop = popdict_sort[0][1]
    
    # get ethnic group
    eths = c.findall('ethnicgroup')
    for ieth in eths:
        # get population percentage
        popper = float(ieth.attrib['percentage'])
        # get population number
        popnum = math.floor(thispop*popper/100)
        # add population number to this particular ethnic group
        # node.text
        ethnicpopdict[ieth.text] = ethnicpopdict.get(ieth.text,0)+popnum

    
    # reset pop for next country
    thispop = 0

# sort dict
ethnicpopdict_sorted = sorted(ethnicpopdict.items(), key =  lambda x:x[1], reverse = True)
ethnicpopdict_sorted[:10]


[('Han Chinese', 1245058800),
 ('Indo-Aryan', 871815583),
 ('European', 494872201),
 ('African', 318325104),
 ('Dravidian', 302713744),
 ('Mestizo', 157734349),
 ('Bengali', 146776916),
 ('Russian', 131856989),
 ('Japanese', 126534212),
 ('Malay', 121993548)]

### 4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [70]:
# river: country -> length
# lake:country -> area
# airport: country -> elevation
def sortdict(d):
    dsort = sorted(d.items(), key = lambda x:x[0], reverse = True)
    return dsort

# extract country name and river, lake and airport information
riverlist = dict()
lakelist = dict()
airportlist =dict()
# river
for c in document.findall('river'):
    for ic in c.getchildren():
        if ic.tag == 'length':
            riverlist[ic.text] = c.attrib
riverlist_sort = sortdict(riverlist)
print(riverlist_sort[0])


#lake
for l in document.findall('lake'):
    for ik in l.getchildren():
        if ik.tag == 'area':
            lakelist[ik.text] = l.attrib
lakelist_sort = sortdict(lakelist)
print(lakelist_sort[0])


# airport
for a in document.findall('airport'):
    for ia in a.getchildren():
        if ia.tag == 'elevation':
            if ia.text is None:
                airportlist['0'] = a.attrib
            else:
                airportlist[ia.text] = a.attrib
airportlist_sort = sortdict(airportlist)           
print(airportlist_sort[0])


('992', {'country': 'R MNG', 'id': 'river-Selenge'})
('981', {'country': 'USA', 'type': 'dam', 'id': 'lake-FortPeckLake'})
('995', {'iatacode': 'MHD', 'country': 'IR', 'city': 'cty-Iran-3'})
