## Problem 4
Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, 
find the name and country of a) the longest river, b) the largest lake, and c) the airport at the highest elevation.

In [3]:
from xml.etree import ElementTree as ET
import pandas as pd

In [23]:
# Get data. The tree is parsed once here and reused by the later cells.
document_tree = ET.parse("data/mondial_database.xml")
document_root = document_tree.getroot()

# Make a data frame for the country codes. This will be used throughout each part of this problem.
# Rows are gathered in a list and the frame is built in a single step:
# DataFrame.append copied the whole frame on every call and was removed
# in pandas 2.0.
country_rows = [
    {
        "Country": elem.find("name").text,
        "Code": elem.attrib["car_code"],
    }
    for elem in document_tree.iter(tag="country")
]
df_country_codes = pd.DataFrame(country_rows, columns=["Country", "Code"])

Unnamed: 0,Country,Code
141,Colombia,CO
176,Brazil,BR
180,Peru,PE


## a) The longest river.

In [20]:
# Data frame for the rivers, their length, and the codes of any countries they pass through.
# Rows are collected first and the frame is built once, because
# DataFrame.append was removed in pandas 2.0 and re-copied the frame per row.
river_rows = []

# Use an iterator to search the tree and gather one record per river.
# iterfind('river') matches only direct children of the root element.
for elem in document_tree.iterfind("river"):
    length_elem = elem.find("length")
    # Rivers without a <length> child cannot be ranked, so skip them.
    if length_elem is None:
        continue
    river_rows.append(
        {
            "River": elem.find("name").text,
            "Length": length_elem.text,
            "Country": elem.attrib["country"],
        }
    )

df_river = pd.DataFrame(river_rows, columns=["River", "Length", "Country"])

# Make the river lengths numeric so they can be sorted, then sort them.
# The XML text is a string; sorting strings would order them lexically.
df_river["Length"] = pd.to_numeric(df_river["Length"])
df_river.sort_values("Length", ascending=False).head(1)

Unnamed: 0,River,Length,Country
174,Amazonas,6448.0,CO BR PE


In [None]:
# Look up the country names for the codes listed on the longest river.
river_codes = ["CO", "BR", "PE"]
df_country_codes[df_country_codes["Code"].isin(river_codes)]

## Thus, the longest river is the Amazonas River that runs through the countries of Colombia, Brazil, and Peru.

## b) The largest lake.

In [24]:
# Data frame for the lakes, their areas, and the codes of any countries they are part of.
# Rows are collected first and the frame is built once, because
# DataFrame.append was removed in pandas 2.0 and re-copied the frame per row.
lake_rows = []

# Use an iterator to search the tree and gather one record per lake.
# iterfind('lake') matches only direct children of the root element.
for elem in document_tree.iterfind("lake"):
    area_elem = elem.find("area")
    # Lakes without an <area> child cannot be ranked, so skip them.
    if area_elem is None:
        continue
    lake_rows.append(
        {
            "Lake": elem.find("name").text,
            "Area": area_elem.text,
            "Country": elem.attrib["country"],
        }
    )

df_lake = pd.DataFrame(lake_rows, columns=["Lake", "Area", "Country"])

# Make the lake areas numeric so they can be sorted, then sort them.
# The XML text is a string; sorting strings would order them lexically.
df_lake["Area"] = pd.to_numeric(df_lake["Area"])
df_lake.sort_values("Area", ascending=False).head(1)

Unnamed: 0,Lake,Area,Country
54,Caspian Sea,386400.0,R AZ KAZ IR TM


In [25]:
# Look up the country names for the codes listed on the largest lake.
lake_codes = ["R", "AZ", "KAZ", "IR", "TM"]
df_country_codes[df_country_codes["Code"].isin(lake_codes)]

Unnamed: 0,Country,Code
23,Russia,R
56,Iran,IR
59,Turkmenistan,TM
63,Azerbaijan,AZ
75,Kazakhstan,KAZ


## Therefore, the largest lake is the Caspian Sea, which is part of Russia, Iran, Turkmenistan, Azerbaijan, and Kazakhstan.

## c) The airport at the highest elevation.

In [26]:
# Data frame for the airports, their elevations, and the code of the country each is located in.
# Rows are collected first and the frame is built once, because
# DataFrame.append was removed in pandas 2.0 and re-copied the frame per row.
airport_rows = []

# Use an iterator to search the tree and gather one record per airport.
# iterfind('airport') matches only direct children of the root element.
for elem in document_tree.iterfind("airport"):
    elevation_elem = elem.find("elevation")
    # Airports without an <elevation> child cannot be ranked, so skip them.
    if elevation_elem is None:
        continue
    airport_rows.append(
        {
            "Airport": elem.find("name").text,
            "Elevation": elevation_elem.text,
            "Country": elem.attrib["country"],
        }
    )

df_airport = pd.DataFrame(airport_rows, columns=["Airport", "Elevation", "Country"])

# Make the airport elevations numeric so they can be sorted, then sort them.
# The XML text is a string; sorting strings would order them lexically.
df_airport["Elevation"] = pd.to_numeric(df_airport["Elevation"])
df_airport.sort_values("Elevation", ascending=False).head(1)

Unnamed: 0,Airport,Elevation,Country
80,El Alto Intl,4063.0,BOL


In [27]:
# Look up the country name for the code listed on the highest airport.
is_airport_country = df_country_codes["Code"] == "BOL"
df_country_codes.loc[is_airport_country]

Unnamed: 0,Country,Code
175,Bolivia,BOL


## The airport at the highest elevation is the El Alto International Airport in Bolivia.