# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [740]:
from xml.etree import ElementTree as ET
import pandas as pd
import numpy as np

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [741]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [742]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [743]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':',
    capitals_string = '')
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

TypeError: 'capitals_string' is an invalid keyword argument for this function

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [None]:
#1. 10 Countries with the lowest infant mortality rates

In [744]:
# Pulling XML data into an object
document = ET.parse( './data/mondial_database.xml' )
doc = document.getroot()

# Empy diccionary to collect specify keys and values.
dict_c = {}

for element in doc.iterfind('country'):
    country = element.find('name')
    inf_mr = element.find('infant_mortality')
    
    # Condition to ignore None values for 'infant_mortality table'
    if inf_mr != None:                          
        dict_c[country.text] = float(inf_mr.text) 
    

#Building created diccionary(dict_c) into a dataframe.
df_c = pd.DataFrame.from_dict(dict_c, orient='index')
df_c.columns = ['inf_mort_rate']
df1 = df_c.sort_values(by='inf_mort_rate', ascending=True)
# Obtaining the 10 Countries with the lowest infant mortality rates.
df1.head(10)

                

Unnamed: 0,inf_mort_rate
Monaco,1.81
Japan,2.13
Bermuda,2.48
Norway,2.48
Singapore,2.53
Sweden,2.6
Czech Republic,2.63
Hong Kong,2.73
Macao,3.13
Iceland,3.15


In [None]:
# .10 cities with the largest population

In [746]:
# Pulling XML data into an object
document = ET.parse( './data/mondial_database.xml' )
doc = document.getroot()

pop_lst = []     #list to build data into a data frame.


#Loop to find cities' names and population
for elements in doc.iterfind('country'):
    c = elements.iterfind('city')
    city = elements.find('name')
    pltion = elements.find('.//population[last()]')
    
    
    # Condition to ignore None values for 'infant_mortality table'
    if city != None and pltion != None:
        pop_lst.append([city.text, pltion.text]) 
               
            
#Building list into DataFrame        
df = pd.DataFrame(data = pop_lst, columns = ['City','population'])
# Turning 'population' column's values into numeric values to be able to sort
df['population'] = pd.to_numeric(df['population'])
df = df.sort_values('population', ascending=False)
df.head(10)

Unnamed: 0,City,population
55,China,1360720000
67,India,1210854977
120,United States,318857056
88,Indonesia,252124458
176,Brazil,202768562
57,Pakistan,173149306
202,Nigeria,164294516
65,Bangladesh,149772364
23,Russia,143666931
98,Japan,127298000


In [None]:
#3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries

In [747]:
# Getting xml data 
document = ET.parse( './data/mondial_database.xml' )
doc = document.getroot()

# empty list to start searching for needed objects
eth_lst = []


# finding name of countries
for element in doc.iterfind('country'):
    country = element.find('name').text
    pltion = element.find('.//population[last()]').text
    #print(country, c_pltion)
    
    # finding ethnic groups and their population's percentages 
    for eg in element.iterfind('ethnicgroup'):
        eg_name = eg.text
        #print(eg_name)                                 #Print comments are to check each step of the way
        #print(c_pltion)
        eg_perc = eg.attrib['percentage']
        #print(eg_perc)
        #print(c_pltion)
        
        
        # Building list of values
        if pltion != None:
            eth_lst.append([country, eg_name, eg_perc, pltion])
        #print(eth_lst)
    

# Building columns for list values. Also, building list into data frame.
eg_df = pd.DataFrame(eth_lst, columns = ['Country', 'Ethnic_g', 'Ethnic_pct', 'Population'])

# Turning text number values into numeric values.
eg_df['Population'] = pd.to_numeric(eg_df['Population'])
eg_df['Fraction'] = (eg_df['Ethnic_pct'].astype(float)/100) * eg_df['Population']

eg_df.head(5)
# On population letter 'e' represents (times 10), and numbers right after represent to that given power(ex. 06= to power of 6)

Unnamed: 0,Country,Ethnic_g,Ethnic_pct,Population,Fraction
0,Albania,Albanian,95.0,2800138,2660131.0
1,Albania,Greek,3.0,2800138,84004.14
2,Greece,Greek,93.0,10816286,10059150.0
3,Macedonia,Macedonian,64.2,2059794,1322388.0
4,Macedonia,Albanian,25.2,2059794,519068.1


In [748]:
# Grouping by ethnicity to get the largest 10 overall populations.
ethnic_df = eg_df.groupby('Ethnic_g').sum()
et = ethnic_df.sort_values('Fraction', ascending=False)
et.head(10)

Unnamed: 0_level_0,Population,Fraction
Ethnic_g,Unnamed: 1_level_1,Unnamed: 2_level_1
Han Chinese,1360720000,1245059000.0
Indo-Aryan,1210854977,871815600.0
European,1157295639,494872200.0
African,975352746,318325100.0
Dravidian,1210854977,302713700.0
Mestizo,279743964,157734400.0
Bengali,149772364,146776900.0
Russian,322438406,131857000.0
Japanese,127298000,126534200.0
Malay,377500275,121993600.0


In [None]:
#4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [None]:
#4  a) Name and country of the longest river

In [749]:
document = ET.parse( './data/mondial_database.xml' )
doc = document.getroot()
# Build a diccionary with Countries' name and Countries' code.
country_dict = {}

for elements in doc.iterfind('country'):
    country = elements.find('name').text
    c_code = elements.attrib['car_code']
    
    
    country_dict[c_code] = country
    #print(country, c_code)
    #country_lst.append([country, c_code])
    
# Building diccionary into a dataframe   
country_df = pd.DataFrame.from_dict(country_dict, orient='index')
country_df.reset_index(drop=False, inplace=True)
country_df.columns = [ 'C_code', 'Country']

country_df.head(5)
    

Unnamed: 0,C_code,Country
0,AL,Albania
1,GR,Greece
2,MK,Macedonia
3,SRB,Serbia
4,MNE,Montenegro


In [750]:
# New list to append the attributes of rivers
river_lst = [] 
    
for elements in doc.iterfind('river'):
        river_n = elements.find('name').text
        river_lng = elements.find('length')
        #print(river_n, river_lng)
        if river_lng != None:
            river_l = river_lng.text
            
            
        # There is more than one country for the same exact river, so they
        # need to be split individually istead of being group all together through
        # the .split() function.
        for code in elements.attrib['country'].split():
            country_r_code = country_dict[code]
        
            
        river_lst.append([country_r_code, code, river_n,float(river_l)])
#print(river_lst)



# Build new dataframe for river_lst
labels = ['Country', 'C_code', 'River_name', 'River_l']     
rivers_df = pd.DataFrame(river_lst, columns=labels )
rivers_df.head()

Unnamed: 0,Country,C_code,River_name,River_l
0,Iceland,IS,Thjorsa,230.0
1,Iceland,IS,Joekulsa a Fjoellum,206.0
2,Norway,N,Glomma,604.0
3,Norway,N,Lagen,322.0
4,Sweden,S,Goetaaelv,93.0


In [751]:
# Obtaining country and name of the longest river.
lngest_r = rivers_df.sort_values('River_l', ascending=False)
lngest_r.head(1)

Unnamed: 0,Country,C_code,River_name,River_l
174,Peru,PE,Amazonas,6448.0


In [None]:
# b) Name and Country of largest lake.

In [753]:
# Build list to loop through different lakes' characters.
lake_lst = []

for elements in doc.iterfind('lake'):
    lake_n = elements.find('name').text
    lake_a = elements.find('area')
    
    # Condtion, igonore None value.
    if lake_a != None:
        lake_area = lake_a.text
        
        
         #There is more than one country for the same exact river, so they
        # need to be split individually istead of being group all together through
        # the .split() function.    
    for code in elements.attrib['country'].split():
        l_cntry = country_dict[code]
        
    lake_lst.append([l_cntry, code, lake_n, float(lake_area)])
    
#print(lake_lst)

labels = ['Lake_Country','Country_code', 'Lake_name','Lake_area']
lakes_df = pd.DataFrame(lake_lst, columns=labels)
longest_lake = lakes_df.sort_values('Lake_area', ascending=False)
# Name and country of the largest lake
longest_lake.head(1)

Unnamed: 0,Lake_Country,Country_code,Lake_name,Lake_area
54,Turkmenistan,TM,Caspian Sea,386400.0


In [None]:
# c) Name and Country of airport at highest elevation

In [None]:
# Build list to append different airports' characteristics.
airpt_lst = []

for elements in doc.iterfind('airport'):
    airport_n = elements.find('name').text
    airport_ele_v = elements.find('elevation')
    
    if airport_ele_v != None:
        airp_ele = airport_ele_v.text
        
    for code in elements.attrib['country'].split():
        country = country_dict[code]
        
    airpt_lst.append([country, code, airport_n, airp_ele])
    
#print(airpt_lst)

In [754]:
# Build airpt_lst into dataframe, and defining different columns
labels = ['Country', 'Country_code','Airport_name', 'Airport_elevetion']
ndf = pd.DataFrame(airpt_lst, columns = labels)
# Change 'Airport_elevetion' values into numeric values to be able to sort.
ndf['Airport_elevetion'] = pd.to_numeric(ndf['Airport_elevetion'])
airport_hgest = ndf.sort_values('Airport_elevetion', ascending=False)
# Name and Country of airport at highest elevation
airport_hgest.head(1)

Unnamed: 0,Country,Country_code,Airport_name,Airport_elevetion
80,Bolivia,BOL,El Alto Intl,4063.0
