In [1]:
# Import the standard-library ElementTree XML parser

import xml.etree.ElementTree as ET

In [2]:
# Load data.xml (path relative to the working directory) and parse it into an ElementTree
tree = ET.parse('data.xml')

In [3]:
# Print the type of the object returned by ET.parse()
print(type(tree))

<class 'xml.etree.ElementTree.ElementTree'>


In [4]:
# Get the root (top-level) element of the parsed tree
root = tree.getroot()
# A bare expression on the last line of a cell makes the notebook display its repr
root

<Element 'data' at 0x7d202019e480>

In [None]:
# Show, one per line: the root's tag, its attribute dict, and its number of direct children
print(root.tag, root.attrib, len(root), sep='\n')

data
{}
3


In [7]:
# First child of the root (the first <country> element)
country1 = root[0]

# First child of that country
rank = country1[0]

# Tag name of this grandchild
print(rank.tag)

# Text content of this grandchild
print(rank.text)

# Attributes of the country's last child. Index -1 always selects the final
# child, however many children the element has (a fixed index such as 4 only
# works when there are exactly five).
print(country1[-1].attrib)

rank
1
{'name': 'Switzerland', 'direction': 'W'}


## Extract the information for all children by iterating through the file

In [8]:
# Loop over every <country> element that is a direct child of the root
for country in root.findall('country'):
    # 'name' is stored as an attribute on the <country> element itself
    name = country.get('name')

    # 'rank' is a child element; its value is that element's text
    rank = country.find('rank').text

    print(f"{name} {rank}")

Liechtenstein 1
Singapore 4
Panama 68


In [9]:
# Nested elements can be reached directly by tag name with iter();
# print the attribute dict of each <neighbor> found under the root
for neighbor_attrs in (elem.attrib for elem in root.iter('neighbor')):
    print(neighbor_attrs)

{'name': 'Austria', 'direction': 'E'}
{'name': 'Switzerland', 'direction': 'W'}
{'name': 'Malaysia', 'direction': 'N'}
{'name': 'Costa Rica', 'direction': 'W'}
{'name': 'Colombia', 'direction': 'E'}


## Tips and Tricks for root.findall()

In [10]:
# NOTE: a notebook cell only echoes the value of its final expression, so the
# first four results below are computed and then discarded; assign or print
# them to inspect each one.

# Top-level elements
root.findall(".")
# All 'neighbor' grandchildren of 'country' children of the top-level elements
root.findall("./country/neighbor")
# Elements with name = 'Singapore' that have a 'year' child
root.findall(".//year/..[@name='Singapore']")
# 'year' elements that are children of elements with name = 'Singapore'
root.findall(".//*[@name='Singapore']/year")
# All 'neighbor' elements that are the second 'neighbor' child of their parent
root.findall(".//neighbor[2]")

[<Element 'neighbor' at 0x7d202019e660>,
 <Element 'neighbor' at 0x7d202019e9d0>]

## Exercise: Extract the name, rank, year and gdppc from the countries and create a Pandas DataFrame

In [11]:
#1. import necessary modules

import xml.etree.ElementTree as ET
import pandas as pd


#2. Parse file and obtain root
tree = ET.parse('data.xml') # load from file
root = tree.getroot()

#3. Create dictionary columns for our dataframe
# Child tags of <country> whose text becomes a column of the same name
child_fields = ('rank', 'year', 'gdppc')
my_dict = {'name': [],
            'rank': [],
            'year': [],
            'gdppc': []}


#4. Iterate through file and extract necessary information for dataframe
# Children are looked up by tag rather than by position, so the result does not
# depend on the order of the elements inside <country>. A missing attribute or
# child yields None, which keeps all four column lists the same length.
for country in root.findall('country'):
    my_dict['name'].append(country.get('name'))
    for field in child_fields:
        my_dict[field].append(country.findtext(field))

#5. convert dictionary with extracted values into DataFrame
# (values are kept as the raw text read from the XML; no numeric conversion is done)
df = pd.DataFrame(my_dict)
df

Unnamed: 0,name,rank,year,gdppc
0,Liechtenstein,1,2008,141100
1,Singapore,4,2011,59900
2,Panama,68,2011,13600
