# XML example and exercise
****
+ study examples of accessing nodes in XML tree structure  
+ work on exercise to be completed and submitted
****
+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial
****

In [5]:
from xml.etree import ElementTree as ET

## XML example

+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html

In [6]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [15]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)

Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra


In [17]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':', capitals_string = '')
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])

TypeError: 'capitals_string' is an invalid keyword argument for this function

****
## XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

1. 10 countries with the lowest infant mortality rates
2. 10 cities with the largest population
3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [18]:
document = ET.parse( './data/mondial_database.xml' )
root = document.getroot() 

# Anwers

EXERCISE 1 

In [10]:
import pandas as pd

In [11]:
# Extracting values to future DataFrame - PLan: Create a easy dataframe and use Pandas on it



ex1 = { 'mort': [], 'name':[]}


for x in root.iter('country'):
    for name,mort in zip(x.findall('name'),x.findall('infant_mortality')):
        ex1['name'].append(name.text)
        ex1['mort'].append(mort.text)
    
    
# Using Pandas to show the results

data = pd.DataFrame(data=ex1)
data = data[['name','mort']]
data['mort']=data.mort.astype(float)

data.sort_values(by='mort').head(10)


Unnamed: 0,name,mort
36,Monaco,1.81
90,Japan,2.13
109,Bermuda,2.48
34,Norway,2.48
98,Singapore,2.53
35,Sweden,2.6
8,Czech Republic,2.63
72,Hong Kong,2.73
73,Macao,3.13
39,Iceland,3.15


EXERCISE 2

In [12]:
import numpy as np

In [13]:
# Extracting values to future DataFrame - PLan: Create a easy dataframe and use Pandas on it

#ASSUMING THE POPULATION ONLY IN CITIES WITH 2011 CENSUS INFO - NOT DESCRIBED IN THE QUESTION

ex2 = { 'pop': [], 'name':[]}

for ct in root.iter('city'):
    ex2['name'].append(ct[0].text)
    #checking if there is a population tag
    p = ct.find('population')
    if p is None:
        #if not, nan
        ex2['pop'].append(np.nan)
    else:
        # if there is:
        #iterate into population tags to check if is there a attrib = 2011
        check=[]
        for x in ct.iter('population'):
            check.append(x.attrib['year'])
        if '2011' in check:
            for x in ct.iter('population'):
                if x.attrib['year'] == '2011':
                    ex2['pop'].append(x.text)
        else:
            ex2['pop'].append(np.nan)
            
          
    
    
# Using Pandas to show the results

data = pd.DataFrame(data=ex2)
data_clean= data.dropna(axis=0)
data_clean['pop'] = data_clean['pop'].astype(int)



data_clean.sort_values(by='pop', ascending=False).head(10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,name,pop
1527,Mumbai,12442373
1582,Delhi,11034555
1515,Bangalore,8443675
1000,London,8250205
1382,Tehran,8154051
1470,Dhaka,7423137
1591,Hyderabad,6731790
1505,Ahmadabad,5577940
3056,Luanda,5000000
1556,Chennai,4646732


EXERCISE 3

In [14]:
# Extracting values to future DataFrame - PLan: Create a easy dataframe and use Pandas on it

ex3 = {}

for ct in root.iter('country'):
    #checking if there is a ethnicgroup tag
    p = ct.find('ethnicgroup')
    if p is not None:
    # Final Plan:  ex3['ethnicgroup'] += % * population
        
        #getting population variable
        bestyear = []
        for pop in ct.findall('population'):
            bestyear.append(pop.attrib['year'])
        
        population= []
        for pop in ct.findall('population'):
            if pop.attrib['year'] == bestyear[len(bestyear) -1]:
                population.append(pop.text)
        
    
        for et in ct.findall('ethnicgroup'):
            a = et.get('percentage')
            ex3[et.text] =+ ((float(a)/100 )*(float(population[0])))

# seting dict to pandas dataframe            
ex33 = {'ethnic': [], 'pop' : []}
for etc,pop in ex3.items():
    ex33['ethnic'].append(etc)
    ex33['pop'].append(pop)   
    
# Using Pandas to show the results
data = pd.DataFrame(data=ex33)
data.sort_values(by='pop', ascending=False).head(10)



Unnamed: 0,ethnic,pop
80,Han Chinese,1245059000.0
106,Indo-Aryan,871815600.0
105,Dravidian,302713700.0
98,Bengali,146776900.0
139,Japanese,126534200.0
130,Eastern Hamitic,82830380.0
147,Mulatto,78065900.0
119,Viet/Kinh,76078380.0
70,English,53592330.0
17,Mediterranean Nordic,46815920.0


EXERCISE 4

a) Longest River

In [54]:
# dict to create DataFrame to show the results in the end
ex4a = {'river': [], 'country':[], 'length':[] }

#Filling the dict with , country, river, length

country =[]
river = []
length = []

for ct in root.iter('river'):
    
    country.append(ct.get('country')) # getting the name of the country
    
    
    for n in ct.findall('name'): # exploring the name tags inside the lake`s tags
        river.append(n.text)
        
    length_check = ct.find('length')
    if length_check is not None: # just checking if there isn`t any missing tag
        for a in ct.findall('length'): # exploring the area tags inside the lake's tags
            length.append(a.text)     
    else:
        length.append('0')
        
ex4a['country'] = country
ex4a['river'] = river
ex4a['length'] = length

#print(len(ex4a['country']))
#print(len(ex4a['river']))
#print(len(ex4a['length'])) 
        
data = pd.DataFrame(data=ex4a)
data['length'] = data['length'].astype(float)
data.sort_values(by='length', ascending=False).head(10)

Unnamed: 0,country,length,river
174,CO BR PE,6448.0,Amazonas
137,CN,6380.0,Jangtse
136,CN,4845.0,Hwangho
123,R,4400.0,Lena
205,RCB ZRE,4374.0,Zaire
138,CN LAO THA K VN,4350.0,Mekong
115,R KAZ CN,4248.0,Irtysch
186,RMM RN WAN RG,4184.0,Niger
160,USA,4130.0,Missouri
119,R,4092.0,Jenissej


b) Largest Lake

In [50]:
# dict to create DataFrame to show the results in the end
ex4b = {'lake': [], 'country':[], 'area':[] }

#Filling the dict with , country, lake, area

country =[]
lake = []
area = []

for ct in root.iter('lake'):
    
    country.append(ct.get('country')) # getting the name of the country
    
    
    for n in ct.findall('name'): # exploring the name tags inside the lake`s tags
        lake.append(n.text)
        
    area_check = ct.find('area')
    if area_check is not None: # there are two lines with no area tag so we need to check first e add a 0 value to them
        for a in ct.findall('area'): # exploring the area tags inside the lake's tags
            area.append(a.text)     
    else:
        area.append('0')
        
ex4b['country'] = country
ex4b['lake'] = lake
ex4b['area'] = area

#print(len(ex4a['country']))
#print(len(ex4a['lake']))
#print(len(ex4a['area'])) #- 139 !! There are two lines with no area value, so we need to address this in the code
        
data = pd.DataFrame(data=ex4b)
data['area'] = data['area'].astype(float)
data.sort_values(by='area', ascending=False).head(10)

Unnamed: 0,area,country,lake
54,386400.0,R AZ KAZ IR TM,Caspian Sea
109,82103.0,CDN USA,Lake Superior
81,68870.0,EAT EAK EAU,Lake Victoria
106,59600.0,CDN USA,Lake Huron
108,57800.0,USA,Lake Michigan
47,41650.0,IL JOR WEST,Dead Sea
83,32893.0,ZRE Z BI EAT,Lake Tanganjika
98,31792.0,CDN,Great Bear Lake
43,31492.0,R,Ozero Baikal
89,29600.0,MW MOC EAT,Lake Malawi


c) airport at highest elevation 

In [62]:
# dict to create DataFrame to show the results in the end
ex4c = {'airport': [], 'country':[], 'elevation':[] }

#Filling the dict with , country, airport, elevation

country =[]
airport = []
elevation = []

for ct in root.iter('airport'):
    
    country.append(ct.get('country')) # getting the name of the country
    
    
    for n in ct.findall('name'): # exploring the name tags inside the lake`s tags
        airport.append(n.text)
        
    elevation_check = ct.find('elevation')
    if elevation_check is not None: # there are two lines with no area tag so we need to check first e add a 0 value to them
        for a in ct.findall('elevation'): # exploring the elevation tags inside the lake's tags
            elevation.append(a.text)     
    else:
        area.append('0')
        
ex4c['country'] = country
ex4c['airport'] = airport
ex4c['elevation'] = elevation

#print(len(ex4c['country']))
#print(len(ex4c['airport']))
#print(len(ex4c['elevation'])) 
        
data = pd.DataFrame(data=ex4c)
data['elevation'] = data['elevation'].astype(float)
data.sort_values(by='elevation', ascending=False).head(10)

Unnamed: 0,airport,country,elevation
80,El Alto Intl,BOL,4063.0
219,Lhasa-Gonggar,CN,4005.0
241,Yushu Batang,CN,3963.0
813,Juliaca,PE,3827.0
815,Teniente Alejandro Velasco Astete Intl,PE,3311.0
82,Juana Azurduy De Padilla,BOL,2905.0
334,Mariscal Sucre Intl,EC,2813.0
805,Coronel Fap Alfredo Mendivil Duarte,PE,2719.0
807,Mayor General FAP Armando Revoredo Iglesias Ai...,PE,2677.0
692,Licenciado Adolfo Lopez Mateos Intl,MEX,2581.0
