## XML Analysis

In [1]:
from xml.dom.minidom import parse, parseString
import pandas as pd

## DOM

In [2]:
# Open in binary mode so the XML parser takes the encoding from the document
# itself rather than the platform default, and use a context manager so the
# file handle is closed as soon as parsing is done.
with open('cd.xml', 'rb') as datasource:
    dom = parse(datasource)  # parse an open file


In [3]:
def getText(nodelist):
    """Concatenate the data of every text node in ``nodelist``.

    Nodes whose ``nodeType`` is not ``TEXT_NODE`` are skipped; their own
    children are not visited. Returns an empty string when ``nodelist``
    contains no text nodes.
    """
    return ''.join(
        child.data
        for child in nodelist
        if child.nodeType == child.TEXT_NODE
    )

def handleContent(slides):
    """Convert CD element nodes into a DataFrame with one row per element.

    Parameters
    ----------
    slides : iterable of DOM elements
        For each element, the first TITLE, ARTIST, COUNTRY, COMPANY, PRICE
        and YEAR element found beneath it is read.

    Returns
    -------
    pandas.DataFrame
        Columns ``title, artist, country, company, price, year`` (in that
        order). Every value is the text returned by ``getText``; no numeric
        conversion is applied. An empty ``slides`` yields an empty frame
        that still has these columns.

    Raises
    ------
    IndexError
        If the lookup for one of the tags returns an empty list.
    """
    columns = ['title', 'artist', 'country', 'company', 'price', 'year']
    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and it copied the whole frame on every call.
    records = [
        {
            # The XML tag is the upper-case form of the column name.
            column: getText(
                slide.getElementsByTagName(column.upper())[0].childNodes)
            for column in columns
        }
        for slide in slides
    ]
    return pd.DataFrame(records, columns=columns)


def handleCATALOG(catalog):
    """Return the table built by ``handleContent`` from all CD elements.

    ``catalog`` is any DOM node offering ``getElementsByTagName``; every
    ``CD`` element found beneath it is passed on to ``handleContent``.
    """
    return handleContent(catalog.getElementsByTagName("CD"))



In [4]:
# Build the CD table from the parsed DOM document and preview the first rows.
df = handleCATALOG(dom)
df.head()

Unnamed: 0,title,artist,country,company,price,year
0,Empire Burlesque,Bob Dylan,USA,Columbia,10.9,1985
1,Hide your heart,Bonnie Tyler,UK,CBS Records,9.9,1988
2,Greatest Hits,Dolly Parton,USA,RCA,9.9,1982
3,Still got the blues,Gary Moore,UK,Virgin records,10.2,1990
4,Eros,Eros Ramazzotti,EU,BMG,9.9,1997


## ElementTree

In [5]:
import xml.etree.ElementTree as ET
# Parse cd.xml again, this time with ElementTree instead of minidom.
tree = ET.parse('cd.xml')
# Root element of the document; the next cell iterates over its children.
CATALOG = tree.getroot()

In [6]:
# Columns that must always be present, in display order.
cd_columns = ['title', 'artist', 'country', 'company', 'price', 'year']

# Gather one dict per child of CATALOG and build the frame in a single call:
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
cd_records = []
for CD in CATALOG:
    CD_info = {}
    for child in CD:
        # Lower-case the tag so it lines up with the column names above.
        CD_info[str(child.tag).lower()] = child.text
    cd_records.append(CD_info)

df = pd.DataFrame(cd_records)
# Put the expected columns first (so they exist even when CATALOG is empty),
# followed by any additional tags that showed up in the data.
df = df.reindex(
    columns=cd_columns + [col for col in df.columns if col not in cd_columns])

df.head()


Unnamed: 0,title,artist,country,company,price,year
0,Empire Burlesque,Bob Dylan,USA,Columbia,10.9,1985
1,Hide your heart,Bonnie Tyler,UK,CBS Records,9.9,1988
2,Greatest Hits,Dolly Parton,USA,RCA,9.9,1982
3,Still got the blues,Gary Moore,UK,Virgin records,10.2,1990
4,Eros,Eros Ramazzotti,EU,BMG,9.9,1997
