## Problem 2
Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find the 10 cities with the largest population.

In [2]:
from xml.etree import ElementTree as ET
import pandas as pd

In [3]:
# Get data.
document_tree = ET.parse("data/mondial_database.xml")
document_root = document_tree.getroot()

# Collect one record per population measurement: the city name, the year the
# population was measured, and the population itself. Both the year (an XML
# attribute) and the population (element text) are kept as strings here.
# Records are gathered in a plain list and turned into a data frame once at
# the end, which avoids re-copying the frame for every row.
city_pop_records = []
for country in document_root.iter("country"):
    for city in country.iter("city"):
        city_name = city.find("name").text

        # findall() returns an empty list when a city has no <population>
        # children, so such cities simply contribute no records.
        for pop in city.findall("population"):
            city_pop_records.append(
                {
                    "city": city_name,
                    "year": pop.attrib["year"],
                    "population": pop.text,
                }
            )

# Passing `columns` fixes the column order and keeps the expected columns
# even if no records were found.
df_city_pop = pd.DataFrame(city_pop_records, columns=["city", "year", "population"])

# Check the data.
df_city_pop.head()

Unnamed: 0,city,year,population
0,Tirana,1987,192000
1,Tirana,1990,244153
2,Tirana,2011,418495
3,Shkodër,1987,62000
4,Shkodër,2011,77075


In [4]:
# Work on a copy so the raw data frame from the previous cell stays untouched.
df_city_pop_c = df_city_pop.copy()

# Population was read from the XML as text; convert it so it sorts numerically.
df_city_pop_c["population"] = pd.to_numeric(df_city_pop_c["population"])

# Years were also read as text. Compare them as numbers so that "most recent"
# is decided numerically rather than by string ordering.
year_numeric = pd.to_numeric(df_city_pop_c["year"])

# For every row, look up the most recent measurement year of its city.
# NOTE: rows are grouped by city name only, so distinct cities that share a
# name are treated as one city here.
latest_year = year_numeric.groupby(df_city_pop_c["city"]).transform("max")

# Keep only the most recent measurement(s) for each city and drop the year
# column, which is no longer needed.
df_latest_city_pop = df_city_pop_c.loc[year_numeric == latest_year, ["city", "population"]]

# Show the 10 cities with the largest population.
df_latest_city_pop.sort_values("population", ascending=False).head(10)

Unnamed: 0,city,population
3750,Shanghai,22315474
2607,Istanbul,13710512
4303,Mumbai,12442373
1546,Moskva,11979529
3746,Beijing,11716620
8208,São Paulo,11152344
3754,Tianjin,11090314
3364,Guangzhou,11071424
4399,Delhi,11034555
3371,Shenzhen,10358381
