# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [2]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [8]:
# Parse the reduced Mondial sample that the example cells below walk through.
document_tree = ET.parse(source='./data/mondial_database_less.xml')

In [11]:
# print names of all countries (every top-level element of the sample tree)
for child in document_tree.getroot():
    # The parenthesised single-argument form prints identically on
    # Python 2 and Python 3, unlike the bare print statement.
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [5]:
# print names of all countries and their cities, one line per country
for country in document_tree.iterfind('country'):
    # iter() replaces the deprecated getiterator(); it visits every nested
    # <city> element below the country, not only direct children.
    city_names = [city.find('name').text for city in country.iter('city')]
    # Build the whole line and print it once: the output matches the old
    # "print x," + "print y" pair and the call is valid on Python 2 and 3.
    print('* ' + country.find('name').text + ': ' + ', '.join(city_names))

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [3]:
# Parse the full Mondial database used by all exercise cells below.
document = ET.parse(source='./data/mondial_database.xml')

In [22]:
# Exercise 1: the 10 countries with the lowest infant mortality rates.
# Collect name and rate from every top-level element that reports both.
records = []
for child in document.getroot():
    name_node = child.find('name')
    mortality_node = child.find('infant_mortality')
    # Test against None explicitly: find() returns None when the tag is
    # absent, and an Element's own truth value depends on its children.
    if name_node is None or mortality_node is None:
        continue
    records.append({'country': name_node.text,
                    'mortality': mortality_node.text})

# Explicit column order keeps the table layout stable across pandas versions.
df = pd.DataFrame(records, columns=['country', 'mortality'])
df['mortality'] = df['mortality'].astype(float)
# sort_values() is the replacement for the removed DataFrame.sort().
df = df.sort_values('mortality', ascending=True)
df.head(10)

(228, 228)


Unnamed: 0,country,mortality
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


In [37]:
# Exercise 2: the 10 cities with the largest population.
records = []
for country in document.iterfind('country'):
    # iter() replaces the deprecated getiterator(); it also reaches cities
    # nested deeper below the country element.
    for city in country.iter('city'):
        populations = city.findall('population')
        # The last <population> entry is used as the latest value
        # (assumes entries are listed chronologically -- TODO confirm).
        # Cities with no reported population get NaN so they stay in the
        # table but sort after every real figure.
        latest = populations[-1].text if populations else np.nan
        records.append({'city': city.find('name').text,
                        'population': latest})

# Explicit column order keeps the table layout stable across pandas versions.
df = pd.DataFrame(records, columns=['city', 'population'])
df['population'] = df['population'].astype(float)
# sort_values() is the replacement for the removed DataFrame.sort().
df = df.sort_values('population', ascending=False)
df.head(10)

Unnamed: 0,city,population
1341,Shanghai,22315474
771,Istanbul,13710512
1527,Mumbai,12442373
479,Moskva,11979529
1340,Beijing,11716620
2810,São Paulo,11152344
1342,Tianjin,11090314
1064,Guangzhou,11071424
1582,Delhi,11034555
1067,Shenzhen,10358381


In [104]:
# Exercise 3: the 10 ethnic groups with the largest overall population.
# For every top-level element listing <ethnicgroup> entries, each group's
# percentage is applied to that element's population, then summed per group.
rows = []
for child in document.getroot():
    # Look the children up once per element rather than once per group.
    groups = child.findall('ethnicgroup')
    populations = child.findall('population')
    # Without a population figure there is nothing to scale the shares by
    # (indexing [-1] on an empty list would raise IndexError).
    if not groups or not populations:
        continue
    # The last <population> entry is used as the latest estimate
    # (assumes entries are listed chronologically -- TODO confirm).
    latest_population = populations[-1].text
    for group in groups:
        share = group.get('percentage')
        # A group without a percentage cannot be converted to a head count.
        if share is None:
            continue
        rows.append({'ethnic group': group.text,
                     'percentage': share,
                     'population': latest_population})

df = pd.DataFrame(rows, columns=['ethnic group', 'percentage', 'population'])
df['population'] = df['population'].astype(float)
df['percentage'] = df['percentage'].astype(float)
# Percentages are on a 0-100 scale, hence the division by 100.
df['real population'] = df['population'] * df['percentage'] / 100
# Series.sort_values() is the replacement for the removed Series.order().
(df.groupby('ethnic group')['real population']
   .sum()
   .sort_values(ascending=False)
   .head(10))

ethnic group
Han Chinese    1.245059e+09
Indo-Aryan     8.718156e+08
European       4.948722e+08
African        3.183251e+08
Dravidian      3.027137e+08
Mestizo        1.577344e+08
Bengali        1.467769e+08
Russian        1.318570e+08
Japanese       1.265342e+08
Malay          1.219936e+08
Name: real population, dtype: float64

In [14]:
# Exercise 4a: name and countries of the longest river.
rows = []
# Restrict the scan to <river> elements, the same way the lake and airport
# cells select their own tag, so no other element type can leak in.
for river in document.iterfind('river'):
    length_node = river.find('length')
    located_nodes = river.findall('located')
    # Keep only rivers that report a length and at least one <located>
    # entry; the country codes are read from those entries.
    if length_node is None or not located_nodes:
        continue
    rows.append({'country': [node.attrib['country'] for node in located_nodes],
                 'length': length_node.text,
                 'river name': river.find('name').text})

# Explicit column order keeps the table layout stable across pandas versions.
df = pd.DataFrame(rows, columns=['country', 'length', 'river name'])
df['length'] = df['length'].astype(float)
# sort_values() is the replacement for the removed DataFrame.sort().
df = df.sort_values('length', ascending=False)
df.head(1)


Unnamed: 0,country,length,river name
164,"[CO, BR, PE]",6448,Amazonas


In [15]:
# Exercise 4b: name and countries of the largest lake (by area).
rows = []
for lake in document.iterfind('lake'):
    area_node = lake.find('area')
    located_nodes = lake.findall('located')
    # Keep only lakes that report an area and at least one <located>
    # entry; the country codes are read from those entries.
    if area_node is None or not located_nodes:
        continue
    rows.append({'area': area_node.text,
                 'country': [node.attrib['country'] for node in located_nodes],
                 # Labelled 'lake name': this column holds lakes, not rivers.
                 'lake name': lake.find('name').text})

# Explicit column order keeps the table layout stable across pandas versions.
df = pd.DataFrame(rows, columns=['area', 'country', 'lake name'])
df['area'] = df['area'].astype(float)
# sort_values() is the replacement for the removed DataFrame.sort().
df = df.sort_values('area', ascending=False)
df.head(1)

Unnamed: 0,area,country,river name
52,386400,"[R, KAZ, IR, TM]",Caspian Sea


In [17]:
# Exercise 4c: name and country of the airport at the highest elevation.
rows = []
for airport in document.iterfind('airport'):
    elevation_node = airport.find('elevation')
    # .get() returns None for a missing attribute; indexing .attrib would
    # raise KeyError before the comparison with None was ever evaluated.
    country_code = airport.get('country')
    if elevation_node is None or country_code is None:
        continue
    rows.append({'airport name': airport.find('name').text,
                 'country': country_code,
                 'elevation': elevation_node.text})

# Explicit column order keeps the table layout stable across pandas versions.
df = pd.DataFrame(rows, columns=['airport name', 'country', 'elevation'])
# astype(float) turns an empty <elevation/> (text None) into NaN.
df['elevation'] = df['elevation'].astype(float)
# sort_values() is the replacement for the removed DataFrame.sort();
# NaN elevations sort last, so they never win the top spot.
df = df.sort_values('elevation', ascending=False)
df.head(1)

Unnamed: 0,airport name,country,elevation
80,El Alto Intl,BOL,4063
