# Parsing XML

Import the `ElementTree` submodule from the standard-library `xml` package

In [1]:
import xml.etree.ElementTree as et
from itertools import islice

Parsing the XML file

In [75]:
tree = et.parse('mondial.xml')

Get the `root` element of the file

In [76]:
root = tree.getroot()

Display the tag

In [24]:
root.tag

'mondial'

Get the attributes of this element (an empty dictionary in this case, since the root has none)

In [25]:
root.attrib

{}

Iterate over the first ten subelements ("children") of the root

In [27]:
# Show the tag and attribute dictionary of each of the first ten direct children.
for child in root[:10]:
    print(child.tag, child.attrib)

continent {'id': 'f0_119', 'name': 'Europe'}
continent {'id': 'f0_123', 'name': 'Asia'}
continent {'id': 'f0_126', 'name': 'America'}
continent {'id': 'f0_129', 'name': 'Australia/Oceania'}
continent {'id': 'f0_132', 'name': 'Africa'}
country {'id': 'f0_136', 'name': 'Albania', 'capital': 'f0_1461', 'population': '3249136', 'datacode': 'AL', 'total_area': '28750', 'population_growth': '1.34', 'infant_mortality': '49.2', 'gdp_agri': '55', 'gdp_total': '4100', 'inflation': '16', 'indep_date': '28 11 1912', 'government': 'emerging democracy', 'car_code': 'AL'}
country {'id': 'f0_144', 'name': 'Andorra', 'capital': 'f0_1464', 'population': '72766', 'datacode': 'AN', 'total_area': '450', 'population_growth': '2.96', 'infant_mortality': '2.2', 'gdp_total': '1000', 'government': '     parliamentary democracy that retains as its heads of state a coprincipality', 'car_code': 'AND'}
country {'id': 'f0_149', 'name': 'Austria', 'capital': 'f0_1467', 'population': '8023244', 'datacode': 'AU', 'tota

In [30]:
[element.tag for element in root[:10]]

['continent',
 'continent',
 'continent',
 'continent',
 'continent',
 'country',
 'country',
 'country',
 'country',
 'country']

In [45]:
# root.iter() walks the whole tree (the root itself first, then every descendant
# in document order). islice stops after 20 elements instead of building a list
# of every element in the document and then discarding all but the first 20.
[[element.tag, element.attrib] for element in islice(root.iter(), 20)]

[['mondial', {}],
 ['continent', {'id': 'f0_119', 'name': 'Europe'}],
 ['continent', {'id': 'f0_123', 'name': 'Asia'}],
 ['continent', {'id': 'f0_126', 'name': 'America'}],
 ['continent', {'id': 'f0_129', 'name': 'Australia/Oceania'}],
 ['continent', {'id': 'f0_132', 'name': 'Africa'}],
 ['country',
  {'id': 'f0_136',
   'name': 'Albania',
   'capital': 'f0_1461',
   'population': '3249136',
   'datacode': 'AL',
   'total_area': '28750',
   'population_growth': '1.34',
   'infant_mortality': '49.2',
   'gdp_agri': '55',
   'gdp_total': '4100',
   'inflation': '16',
   'indep_date': '28 11 1912',
   'government': 'emerging democracy',
   'car_code': 'AL'}],
 ['name', {}],
 ['city',
  {'id': 'f0_1461',
   'country': 'f0_136',
   'longitude': '10.7',
   'latitude': '46.2'}],
 ['name', {}],
 ['population', {'year': '87'}],
 ['city',
  {'id': 'f0_36498',
   'country': 'f0_136',
   'longitude': '19.2',
   'latitude': '42.2'}],
 ['name', {}],
 ['population', {'year': '87'}],
 ['located_at', {

We can also parse XML directly from a string

In [62]:
xml_str = '''<breakfast_menu>
<food>
<name>Belgian Waffles</name>
<price>$5.95</price>
<description>
Two of our famous Belgian Waffles with plenty of real maple syrup
</description>
<calories>650</calories>
</food>
<food>
<name>Strawberry Belgian Waffles</name>
<price>$7.95</price>
<description>
Light Belgian waffles covered with strawberries and whipped cream
</description>
<calories>900</calories>
</food>
<food>
<name>Berry-Berry Belgian Waffles</name>
<price>$8.95</price>
<description>
Light Belgian waffles covered with an assortment of fresh berries and whipped cream
</description>
<calories>900</calories>
</food>
<food>
<name>French Toast</name>
<price>$4.50</price>
<description>
Thick slices made from our homemade sourdough bread
</description>
<calories>600</calories>
</food>
<food>
<name>Homestyle Breakfast</name>
<price>$6.95</price>
<description>
Two eggs, bacon or sausage, toast, and our ever-popular hash browns
</description>
<calories>950</calories>
</food>
</breakfast_menu>'''

### fromstring

In [111]:
root = et.fromstring(xml_str)

By default `tostring` returns a `bytes` object, so printing it shows the raw byte literal (with `\n` escapes) rather than readable XML

In [73]:
print(et.tostring(root))

b'<breakfast_menu>\n<food>\n<name>Belgian Waffles</name>\n<price>$5.95</price>\n<description>\nTwo of our famous Belgian Waffles with plenty of real maple syrup\n</description>\n<calories>650</calories>\n</food>\n<food>\n<name>Strawberry Belgian Waffles</name>\n<price>$7.95</price>\n<description>\nLight Belgian waffles covered with strawberries and whipped cream\n</description>\n<calories>900</calories>\n</food>\n<food>\n<name>Berry-Berry Belgian Waffles</name>\n<price>$8.95</price>\n<description>\nLight Belgian waffles covered with an assortment of fresh berries and whipped cream\n</description>\n<calories>900</calories>\n</food>\n<food>\n<name>French Toast</name>\n<price>$4.50</price>\n<description>\nThick slices made from our homemade sourdough bread\n</description>\n<calories>600</calories>\n</food>\n<food>\n<name>Homestyle Breakfast</name>\n<price>$6.95</price>\n<description>\nTwo eggs, bacon or sausage, toast, and our ever-popular hash browns\n</description>\n<calories>950</calor

To get readable text, we serialize with an explicit encoding and then decode the resulting bytes back into a string

In [74]:
# Serialize to UTF-8 bytes first, then decode so print() shows real line breaks.
menu_bytes = et.tostring(root, encoding='utf8')
print(menu_bytes.decode('utf8'))

<?xml version='1.0' encoding='utf8'?>
<breakfast_menu>
<food>
<name>Belgian Waffles</name>
<price>$5.95</price>
<description>
Two of our famous Belgian Waffles with plenty of real maple syrup
</description>
<calories>650</calories>
</food>
<food>
<name>Strawberry Belgian Waffles</name>
<price>$7.95</price>
<description>
Light Belgian waffles covered with strawberries and whipped cream
</description>
<calories>900</calories>
</food>
<food>
<name>Berry-Berry Belgian Waffles</name>
<price>$8.95</price>
<description>
Light Belgian waffles covered with an assortment of fresh berries and whipped cream
</description>
<calories>900</calories>
</food>
<food>
<name>French Toast</name>
<price>$4.50</price>
<description>
Thick slices made from our homemade sourdough bread
</description>
<calories>600</calories>
</food>
<food>
<name>Homestyle Breakfast</name>
<price>$6.95</price>
<description>
Two eggs, bacon or sausage, toast, and our ever-popular hash browns
</description>
<calories>950</calories

Get the text within the elements using `text`

In [115]:
# Emit the text of every element (the root included) in document order,
# back to back with no separator added between them.
print(''.join(str(node.text) for node in root.iter()), end='')



Belgian Waffles$5.95
Two of our famous Belgian Waffles with plenty of real maple syrup
650
Strawberry Belgian Waffles$7.95
Light Belgian waffles covered with strawberries and whipped cream
900
Berry-Berry Belgian Waffles$8.95
Light Belgian waffles covered with an assortment of fresh berries and whipped cream
900
French Toast$4.50
Thick slices made from our homemade sourdough bread
600
Homestyle Breakfast$6.95
Two eggs, bacon or sausage, toast, and our ever-popular hash browns
950

We can restrict `iter` to elements with a specific tag by passing the tag name

In [117]:
# Only <description> elements are visited; their text is printed back to back.
print(*(desc.text for desc in root.iter('description')), sep='', end='')


Two of our famous Belgian Waffles with plenty of real maple syrup

Light Belgian waffles covered with strawberries and whipped cream

Light Belgian waffles covered with an assortment of fresh berries and whipped cream

Thick slices made from our homemade sourdough bread

Two eggs, bacon or sausage, toast, and our ever-popular hash browns


In [123]:
# No earlier cell shown defines `mondial`, and `root` was rebound to the
# breakfast menu above, so load the Mondial document again under its own name.
# Without this the cell raises NameError on a fresh kernel.
mondial = et.parse('mondial.xml').getroot()

# encoding='unicode' makes tostring return a str directly (no XML declaration),
# which avoids the encode-then-decode round trip.
print(et.tostring(mondial, encoding='unicode'))

<mondial>
   <continent id="f0_119" name="Europe" />
   <continent id="f0_123" name="Asia" />
   <continent id="f0_126" name="America" />
   <continent id="f0_129" name="Australia/Oceania" />
   <continent id="f0_132" name="Africa" />
   <country capital="f0_1461" car_code="AL" datacode="AL" gdp_agri="55" gdp_total="4100" government="emerging democracy" id="f0_136" indep_date="28 11 1912" infant_mortality="49.2" inflation="16" name="Albania" population="3249136" population_growth="1.34" total_area="28750">
     <name>
       Albania
     </name>
     <city country="f0_136" id="f0_1461" latitude="46.2" longitude="10.7">
       <name>
         Tirane
       </name>
       <population year="87">
         192000
       </population>
     </city>
     <city country="f0_136" id="f0_36498" latitude="42.2" longitude="19.2">
       <name>
         Shkoder
       </name>
       <population year="87">
         62000
       </population>
       <located_at type="lake" water="f0_39058" />
     </ci

### findall
Get all movies from 1992

In [138]:
# Load the movie catalogue; `tree` and `root` now refer to movies.xml.
tree = et.parse('movies.xml')
root = tree.getroot()

# Select the <movie> elements that have a <year> child whose text is '1992'.
movies_1992 = root.findall("./genre/decade/movie/[year='1992']")
for movie in movies_1992:
    print(movie.attrib)

{'favorite': 'True', 'title': 'Batman Returns'}
{'favorite': 'False', 'title': 'Reservoir Dogs'}


Find the `format` elements whose `multiple` attribute is `'Yes'` (one for each movie that is available in multiple formats)

In [132]:
# The path selects <format> elements, not movies, so name the loop variable accordingly.
multi_formats = root.findall("./genre/decade/movie/format/[@multiple='Yes']")
for fmt in multi_formats:
    print(fmt.tag, fmt.attrib)

format {'multiple': 'Yes'}
format {'multiple': 'Yes'}
format {'multiple': 'Yes'}


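If we want the actual titles of the movies, we can end the path with `...`: the `..` step selects the parent `movie` element of each matching `format` (the final `.` simply refers to that element itself)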

In [133]:
# Step from each matching <format> back up to its parent <movie> element.
multi_format_movies = root.findall("./genre/decade/movie/format[@multiple='Yes']...")
for movie in multi_format_movies:
    print(movie.attrib)

{'favorite': 'True', 'title': 'THE KARATE KID'}
{'favorite': 'False', 'title': 'X-Men'}
{'favorite': 'False', 'title': 'ALIEN'}


Unlike `findall`, `find` does not return a list but a single element: the first match, or `None` if nothing matches

In [145]:
b2tf = root.find("./genre/decade/movie[@title='Back 2 the Future']")
print(b2tf)

<Element 'movie' at 0x16C0ABD0>


We can modify an attribute easily that way

In [146]:
# find() returns None when no element matches. That happens when the notebook is
# re-run after movies.xml has already been overwritten with the corrected title,
# so fall back to looking the movie up under its corrected name.
if b2tf is None:
    b2tf = root.find("./genre/decade/movie[@title='Back to the Future']")

# Element.attrib is a plain dict, so assigning a key updates the attribute in place.
b2tf.attrib["title"] = "Back to the Future"
print(b2tf.attrib)

{'favorite': 'False', 'title': 'Back to the Future'}


### write
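We can save our changes back to the original file using the `write` method. Note that this overwrites `movies.xml` in place, so the old title is no longer on disk once this cell has run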

In [147]:
tree.write('movies.xml')

Let's make sure that our change has been saved by reading the file again

In [148]:
tree = et.parse('movies.xml')
root = tree.getroot()

In [150]:
# List every <movie> in the re-read file so the renamed title can be checked.
for movie in root.iter('movie'):
    print(movie.tag, movie.attrib)

movie {'favorite': 'True', 'title': 'Indiana Jones: The raiders of the lost Ark'}
movie {'favorite': 'True', 'title': 'THE KARATE KID'}
movie {'favorite': 'False', 'title': 'Back to the Future'}
movie {'favorite': 'False', 'title': 'X-Men'}
movie {'favorite': 'True', 'title': 'Batman Returns'}
movie {'favorite': 'False', 'title': 'Reservoir Dogs'}
movie {'favorite': 'False', 'title': 'ALIEN'}
movie {'favorite': 'True', 'title': "Ferris Bueller's Day Off"}
movie {'favorite': 'FALSE', 'title': 'American Psycho'}
