<h1> Exploring HDFS Metadata Using XML and XPATH</h1>

In [None]:
from lxml import etree
import re

In [2]:
!ls
flatten = lambda l: [item for sublist in l for item in sublist]


Untitled.ipynb fsimage.xml    index.xml      xml.ipynb
bibs.xml       index.py       search.py


In [3]:
f= open('bibs.xml')
tree=etree.parse(f)
print(etree.tostring(tree,pretty_print=True))

b'<bib>\n<cd>abc</cd>\n<book price="35">\n\t<publisher>Addison-Wesley</publisher>\n        <author>Serge Abiteboul</author>\n        <author><first-name>Rick</first-name><last-name>Hull</last-name></author>\n        <author age="20">Victor Vianu</author>\n        <title>Foundations of Databases</title>\n        <year>1995</year>\n\t<price>38.8</price>\n</book>\n<book price="55">\n        <publisher>Freeman</publisher>\n        <author>Jeffrey D. Ullman</author>\n        <title>Principles of Database and Knowledge Base Systems</title>\n        <year>1998</year>\n</book>\n<book>\n\t<title>xyz</title>\n\t<author/>\n</book>\n</bib>\n'


In [4]:
for element in tree.xpath("//author"):
    print(etree.tostring(element))


b'<author>Serge Abiteboul</author>\n        '
b'<author><first-name>Rick</first-name><last-name>Hull</last-name></author>\n        '
b'<author age="20">Victor Vianu</author>\n        '
b'<author>Jeffrey D. Ullman</author>\n        '
b'<author/>\n'


In [5]:


for element in tree.xpath("//author"):
    print(element.tag,element.text)

author Serge Abiteboul
author None
author Victor Vianu
author Jeffrey D. Ullman
author None


In [6]:
# We can add a condition to the selections
# For example only authors who have a first name element of Rick 

for element in tree.xpath('//author[first-name="Rick"]'):
    print(etree.tostring(element))

b'<author><first-name>Rick</first-name><last-name>Hull</last-name></author>\n        '


In [7]:
def printf(elems,mode = 'node'):
    if mode == 'node':
        for elem in elems:
            print (etree.tostring(elem).decode('utf-8'))
    if mode == 'text':
        for elem in elems:
            print(elem.text)

In [8]:
printf(tree.xpath('//author[first-name="Rick"]'))


<author><first-name>Rick</first-name><last-name>Hull</last-name></author>
        


In [9]:
# In order to work with an HTML document we must import html from lxml
from lxml import html

In [10]:
# First we import the file fsimage.xml as a tree
f= open('fsimage.xml')
fstree=etree.parse(f)

In [11]:
printf(fstree.xpath('//INodeSection'))

<INodeSection><lastInodeId>16422</lastInodeId><numInodes>38</numInodes><inode><id>16385</id><type>DIRECTORY</type><name/><mtime>1581231015982</mtime><permission>ec2-user:supergroup:0755</permission><nsquota>9223372036854775807</nsquota><dsquota>-1</dsquota></inode>
<inode><id>16386</id><type>DIRECTORY</type><name>user</name><mtime>1581231034866</mtime><permission>ec2-user:supergroup:0755</permission><nsquota>-1</nsquota><dsquota>-1</dsquota></inode>
<inode><id>16387</id><type>DIRECTORY</type><name>ec2-user</name><mtime>1581875598912</mtime><permission>ec2-user:supergroup:0755</permission><nsquota>-1</nsquota><dsquota>-1</dsquota></inode>
<inode><id>16388</id><type>DIRECTORY</type><name>input1</name><mtime>1581379063205</mtime><permission>ec2-user:supergroup:0755</permission><nsquota>-1</nsquota><dsquota>-1</dsquota></inode>
<inode><id>16389</id><type>FILE</type><name>a.txt</name><replication>1</replication><mtime>1581379063191</mtime><atime>1581379062314</atime><preferredBlockSize>1342

In [12]:
for element in fstree.xpath("//INodeSection/inode/name"):
    if(element.text):
        print(element.text.replace(".txt","").replace(".xml",""))


user
ec2-user
input1
a
b
input
capacity-scheduler
core-site
hadoop-policy
hdfs-site
httpfs-site
kms-acls
kms-site
mapred-site
yarn-site
inf551
input
capacity-scheduler
core-site
hadoop-policy
hdfs-site
httpfs-site
kms-acls
kms-site
mapred-site
yarn-site
inf351
input
capacity-scheduler
core-site
hadoop-policy
hdfs-site
httpfs-site
kms-acls
kms-site
mapred-site
yarn-site


In [13]:
elem_list =[]
for element in fstree.xpath("//INodeSection/inode/name"):
    if(element.text):
        elem_list.append(element.text.replace(".txt","").replace(".xml",""))

token_list = [sub.replace('-',' ').split(' ') for sub in elem_list]

In [14]:
print(flatten(token_list))
flat_token_list = set(flatten(token_list))


['user', 'ec2', 'user', 'input1', 'a', 'b', 'input', 'capacity', 'scheduler', 'core', 'site', 'hadoop', 'policy', 'hdfs', 'site', 'httpfs', 'site', 'kms', 'acls', 'kms', 'site', 'mapred', 'site', 'yarn', 'site', 'inf551', 'input', 'capacity', 'scheduler', 'core', 'site', 'hadoop', 'policy', 'hdfs', 'site', 'httpfs', 'site', 'kms', 'acls', 'kms', 'site', 'mapred', 'site', 'yarn', 'site', 'inf351', 'input', 'capacity', 'scheduler', 'core', 'site', 'hadoop', 'policy', 'hdfs', 'site', 'httpfs', 'site', 'kms', 'acls', 'kms', 'site', 'mapred', 'site', 'yarn', 'site']


In [15]:
for name in flat_token_list:
    for element in fstree.xpath("//INodeSection/inode[contains(name,'{}')]/id".format(name)):
        print(name,element.text)



a 16389
a 16392
a 16394
a 16397
a 16399
a 16400
a 16403
a 16405
a 16408
a 16410
a 16411
a 16414
a 16416
a 16419
a 16421
a 16422
user 16386
user 16387
acls 16397
acls 16408
acls 16419
hdfs 16395
hdfs 16406
hdfs 16417
site 16393
site 16395
site 16396
site 16398
site 16399
site 16400
site 16404
site 16406
site 16407
site 16409
site 16410
site 16411
site 16415
site 16417
site 16418
site 16420
site 16421
site 16422
yarn 16400
yarn 16411
yarn 16422
input 16388
input 16391
input 16402
input 16413
kms 16397
kms 16398
kms 16408
kms 16409
kms 16419
kms 16420
hadoop 16394
hadoop 16405
hadoop 16416
core 16393
core 16404
core 16415
scheduler 16392
scheduler 16403
scheduler 16414
input1 16388
ec2 16387
policy 16394
policy 16405
policy 16416
inf351 16412
inf551 16401
httpfs 16396
httpfs 16407
httpfs 16418
mapred 16399
mapred 16410
mapred 16421
b 16390
capacity 16392
capacity 16403
capacity 16414


In [16]:
# We have created both a list of file and directory names. We have also found the inode numbers for each of these tokens 

printf(tree.xpath("//book[contains(author,'Serge')]/title"))

<title>Foundations of Databases</title>
        


In [17]:
#We must then make another tree and add all of the tokens to the index
index_root=etree.Element("index")
print(index_root.tag)

index


In [18]:
# postings = index_root.append(etree.Element("postings"))
# postings = index_root[0]
# etree.SubElement(postings, "name").text = "check"

In [19]:
print(etree.tostring(index_root,pretty_print=True))

b'<index/>\n'


In [20]:
# posting =index_root.append(etree.Element("postings"))
# print(etree.tostring(index_root,pretty_print=True))

In [21]:
# etree.SubElement(index_root, "postings").text="Core"
# etree.SubElement(index_root, "postings").text="site"


In [22]:
# print(etree.tostring(index_root,pretty_print=True))

In [23]:
# Lets try to make it 
index_root=etree.Element("index")

In [24]:
i=0
for token in flat_token_list:
    postings = index_root.append(etree.Element("postings"))
    postings = index_root[i]
    etree.SubElement(postings, "name").text = token
    i =i+1
    j = 0
    for element in fstree.xpath("//INodeSection/inode[contains(name,'{}')]/id".format(token)):
        etree.SubElement(postings, "inumber").text = element.text

print(etree.tostring(index_root,pretty_print=True))



b'<index>\n  <postings>\n    <name>a</name>\n    <inumber>16389</inumber>\n    <inumber>16392</inumber>\n    <inumber>16394</inumber>\n    <inumber>16397</inumber>\n    <inumber>16399</inumber>\n    <inumber>16400</inumber>\n    <inumber>16403</inumber>\n    <inumber>16405</inumber>\n    <inumber>16408</inumber>\n    <inumber>16410</inumber>\n    <inumber>16411</inumber>\n    <inumber>16414</inumber>\n    <inumber>16416</inumber>\n    <inumber>16419</inumber>\n    <inumber>16421</inumber>\n    <inumber>16422</inumber>\n  </postings>\n  <postings>\n    <name>user</name>\n    <inumber>16386</inumber>\n    <inumber>16387</inumber>\n  </postings>\n  <postings>\n    <name>acls</name>\n    <inumber>16397</inumber>\n    <inumber>16408</inumber>\n    <inumber>16419</inumber>\n  </postings>\n  <postings>\n    <name>hdfs</name>\n    <inumber>16395</inumber>\n    <inumber>16406</inumber>\n    <inumber>16417</inumber>\n  </postings>\n  <postings>\n    <name>site</name>\n    <inumber>16393</inumber

In [25]:
with open("index.xml", 'wb') as doc:
   doc.write(etree.tostring(index_root, pretty_print = True))