# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [1]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [12]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [13]:
# print names of all countries
for child in document_tree.getroot():
    print(child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [14]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print('* ' + element.find('name').text + ':')
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print(capitals_string[:-2])

* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella


****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [1]:
from xml.etree import ElementTree as ET
import pandas as pd

countries_tree = ET.parse('./data/mondial_database.xml')
countries_root = countries_tree.getroot()

In [17]:
#1. Find the 10 countries with the lowest infant mortality rates.

 #create dictionary, set country names as keys and infant mortality rate as values
IM_dict = {}
for element in countries_tree.iterfind('country'):
    try: IM_dict[element.find('name').text] = [float(element.find('infant_mortality').text)]
    except: pass 
    
 #use dictionary to construct df
IM_df = pd.DataFrame([[key, value] for key, value in IM_dict.items()], columns = ['country', 'IM_rate'])
IM_df.sort_values(by = 'IM_rate').head(10)

In [18]:
#2. Find the 10 cities with the largest population.

 #create dictionary, set city names as keys and population as values
city_popn_dict = {}
for element in countries_tree.iterfind('country'):
    for city in element.iterfind('city'):
        try: city_popn_dict[city.find('name').text] = [int(city.find('population').text)]
        except: pass 
    
 #use dictionary to construct df, 
city_popn_df = pd.DataFrame([[key, value] for key, value in city_popn_dict.items()], 
                            columns = ['city', 'population'])
city_popn_df.sort_values(by = 'population', ascending = False).head(10)

In [51]:
#3. Find the 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries).

 #create list, add country, ethnic group, and percentage
EG_list = []
for element in countries_root.iterfind('country'):
    country_name = element.find('name').text
    country_popn = element.find('population').text
    for EG in element.iterfind('ethnicgroup'):
        EG_name = EG.text
        EG_pct = EG.attrib['percentage']
        EG_list.append([country_name, country_popn, EG_name, EG_pct])

 #create df, convert percentage to decimal and country population to numeric
EG_df = pd.DataFrame(EG_list, columns = ['country', 'country_popn', 'ethnic_group', 'percentage'])
EG_df['percentage'] = pd.to_numeric(EG_df['percentage']) / 100
EG_df['country_popn'] = pd.to_numeric(EG_df['country_popn'])

 #add column capturing populations of ethnic groups, group, sort, and show top 10
EG_df['EG_popn'] = EG_df['country_popn'] * EG_df['percentage'] 
EG_df_group = EG_df.groupby('ethnic_group').sum()
EG_df_group.sort_values(by = 'EG_popn', ascending = False).head(10)


Unnamed: 0_level_0,country_popn,percentage,EG_popn
ethnic_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Han Chinese,543776080,0.915,497555100.0
European,362717873,9.7082,192865800.0
Indo-Aryan,238396327,0.72,171645400.0
Russian,202263854,2.241,92758440.0
African,357529690,18.6855,86329370.0
Japanese,82199470,0.994,81706270.0
German,145710759,1.656,66232190.0
Dravidian,238396327,0.25,59599080.0
English,50616012,0.836,42314990.0
Mestizo,67185932,8.707,35542330.0


In [24]:
#4a) Find the name and country of the longest river.

 #pull name, country, and length for rivers and add to relevant lists
r_name_list = []
r_country_list = []
r_length_list = []

for river in countries_root.findall('river'):
    r_name = river.find('name').text
    r_name_list.append(r_name)
    r_country = river.get('country')
    r_country_list.append(r_country)
    
for river in countries_root.findall('river'):
    try: length = int(river.find('length').text)
    except: pass
    r_length_list.append(length)

 #check length of each list (commented out in final) and create master list
#len(name_list)
#len(country_list)
#len(length_list)
rivers_list = [r_name_list, r_country_list, r_length_list]

 #create df and sort for longest 
rivers_df = pd.DataFrame(rivers_list).T
rivers_df.columns = ['name', 'country', 'length']
rivers_df.sort_values(by = 'length', ascending = False).head(1)

Unnamed: 0,name,country,length
174,Amazonas,CO BR PE,6448


In [22]:
#4b) Find the name and country of largest lake.

 #pull name, country, and size for lakes and add to relevant lists
l_name_list = []
l_country_list = []
l_area_list = []

for lake in countries_root.findall('lake'):
    l_name = lake.find('name').text
    l_name_list.append(l_name)
    l_country = lake.get('country')
    l_country_list.append(l_country)
    
for lake in countries_root.findall('lake'):
    try: area = int(lake.find('area').text)
    except: pass
    l_area_list.append(area) 
    
 #check length of each list (commented out in final) and create master list
#len(name_list)
#len(country_list)
#len(area_list)
lakes_list = [l_name_list, l_country_list, l_area_list]

 #create df and sort for largest (by area) 
lakes_df = pd.DataFrame(lakes_list).T
lakes_df.columns = ['name', 'country', 'area']
lakes_df.sort_values(by = 'area', ascending = False).head(1)

Unnamed: 0,name,country,area
54,Caspian Sea,R AZ KAZ IR TM,386400


In [20]:
#4c) Find the name and country of airport at highest elevation.

 #pull name, country, and elevation for airports and add to relevant lists
a_name_list = []
a_country_list = []
a_elevation_list = []

for airport in countries_root.findall('airport'):
    a_name = airport.find('name').text
    a_name_list.append(a_name)
    a_country = airport.get('country')
    a_country_list.append(a_country)
    
for airport in countries_root.findall('airport'):
    try: elevation = int(airport.find('elevation').text)
    except: pass
    a_elevation_list.append(elevation) 
    
 #check length of each list (commented out in final) and create master list
#len(a_name_list)
#len(a_country_list)
#len(a_elevation_list)
airports_list = [a_name_list, a_country_list, a_elevation_list]

 #create df and sort for largest (by area) 
airports_df = pd.DataFrame(airports_list).T
airports_df.columns = ['name', 'country', 'elevation']
airports_df.sort_values(by = 'elevation', ascending = False).head(1)

Unnamed: 0,name,country,elevation
80,El Alto Intl,BOL,4063
