##### XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [3]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [4]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [5]:
# print names of all countries
for child in document_tree.getroot():
    print child.find('name').text

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [6]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print '* ' + element.find('name').text + ':',
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print capitals_string[:-2]

* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia: Skopje, Kumanovo
* Serbia: Beograd, Novi Sad, Niš
* Montenegro: Podgorica
* Kosovo: Prishtine
* Andorra: Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [7]:
document = ET.parse( './data/mondial_database.xml' )

In [16]:
data = {}
for child in document.iterfind('country'):
    name = child.find('name').text
    if child.find('infant_mortality') != None:
        infant_mortality = float(child.find('infant_mortality').text)
    data[infant_mortality] = name

data = sorted(data.items())
data[:10]

[(1.81, 'Monaco'),
 (2.13, 'Japan'),
 (2.48, 'Bermuda'),
 (2.53, 'Singapore'),
 (2.6, 'Sweden'),
 (2.63, 'Czech Republic'),
 (2.73, 'Hong Kong'),
 (3.13, 'Macao'),
 (3.15, 'Iceland'),
 (3.31, 'Italy')]

In [19]:
# Create a dataframe with all countries and their most recent measured population 
import pandas as pd

cities_list = []
for element in document.iterfind('country'):
    for subelement in element.getiterator('city'):
            if subelement.find('population') != None:
                dict = {}
                dict['city'] = subelement.find('name').text
                # Take the most recent population measured
                dict['population'] = float(subelement.findall('population')[-1].text)
                cities_list.append(dict)

cities = pd.DataFrame(cities_list)  
cities.sort_values(by = 'population', ascending = False).head(10)

Unnamed: 0,city,population
1251,Shanghai,22315474
707,Istanbul,13710512
1421,Mumbai,12442373
443,Moskva,11979529
1250,Beijing,11716620
2594,São Paulo,11152344
1252,Tianjin,11090314
974,Guangzhou,11071424
1467,Delhi,11034555
977,Shenzhen,10358381


In [22]:
# Create a dataframe with all countries, their population and the ethnic groups with percentage

countries_list = []
for element in document.iterfind('country'):
        # read in the data using a datadictionary
        dict = {}
        if element.find('population') != None: 
            for ethnic in element.findall('ethnicgroup'):
                dict['country'] = element.find('name').text
                dict['population'] = float(element.findall('population')[-1].text)
                dict['ethnicgroup'] = ethnic.text
                dict['percentage'] = float(ethnic.attrib['percentage'])
                countries_list.append(dict)
                
countries = pd.DataFrame(countries_list)               

# calculate the population based on the ethnic group
countries['population_group'] = countries.population*countries.percentage/100.0
countries.groupby('ethnicgroup').population_group.sum().sort_values(ascending = False).head(10)

ethnicgroup
Han Chinese        1.245059e+09
African            3.400925e+08
Malay              2.547754e+08
Arab-Berber        1.708847e+08
Eastern Hamitic    1.656608e+08
Bengali            1.467769e+08
Japanese           1.265342e+08
European           1.234517e+08
Mongol             1.089769e+08
Thai               9.897249e+07
Name: population_group, dtype: float64

In [25]:
# Create a dataframe with all airports, their elevation and the country

airport_list = []
for element in document.iterfind('airport'):
        # read in the data using a datadictionary
        dict = {}
        if element.find('elevation') != None: 
            dict['country'] = element.attrib['country']
            dict['name'] = element.find('name').text
            dict['elevation'] = element.find('elevation').text
            airport_list.append(dict)
# throw out the ones with missing elevation                
airports = pd.DataFrame(airport_list).dropna()  
# make the elevation a float
airports.elevation = airports.elevation.map(int)

airports.sort_values(by = 'elevation', ascending = False).head(1)

Unnamed: 0,country,elevation,name
80,BOL,4063,El Alto Intl


In [26]:
# Create a dataframe with all lakes, their size and the country

lake_list = []
for element in document.iterfind('lake'):
        # read in the data using a datadictionary
        dict = {}
        if element.find('area') != None: 
            dict['country'] = element.attrib['country']
            dict['name'] = element.find('name').text
            dict['area'] = float(element.find('area').text)
            
        lake_list.append(dict)
        
# throw out the ones with missing elevation                
lakes = pd.DataFrame(lake_list).dropna()  
lakes.sort_values(by = 'area', ascending = False).head(1)

Unnamed: 0,area,country,name
54,386400,R AZ KAZ IR TM,Caspian Sea


In [28]:
# Create a dataframe with all rivers, their length and the country

river_list = []
for element in document.iterfind('river'):
        # read in the data using a datadictionary
        dict = {}
        if element.find('length') != None: 
            dict['country'] = element.attrib['country']
            dict['name'] = element.find('name').text
            dict['length'] = float(element.find('length').text)
        river_list.append(dict)
# throw out the ones with missing elevation                
rivers = pd.DataFrame(river_list).dropna()
rivers.sort_values(by = 'length', ascending = False).head(1)

Unnamed: 0,country,length,name
174,CO BR PE,6448,Amazonas
