In [53]:
#libraries
from chemdataextractor import Document
from chemdataextractor.model import BaseModel, Compound, StringType, ModelType
from chemdataextractor.model.units import TemperatureModel, LengthModel
from chemdataextractor.parse import R, I, W, Optional, merge, join, AutoSentenceParser
from chemdataextractor.doc import Paragraph, Heading, Sentence
from pprint import pprint
from lxml import etree

In [54]:
# Load the saved article HTML and parse it into a ChemDataExtractor Document.
html_path = 'html_files (2).html'
with open(html_path, mode='rb') as html_file:
    # Opened in binary mode; Document.from_file receives the raw file object.
    doc = Document.from_file(html_file)
print(doc)

<Document: 280 elements>


In [55]:
# Document elements
# Show the full list of elements parsed from the file as the cell's rich
# output (the summary printed after loading reports 280 elements).
doc.elements

[{'title': 'High performance 4.7 THz GaAs quantum cascade lasers based on four quantum wells', 'authors': ['Keita Ohtani', 'Dana Turčinková', 'Christopher Bonzon', 'Ileana-Cristina Benea-Chelmus', 'Mattias Beck', 'Jérôme Faist', 'Matthias Justen', 'Urs U Graf', 'Marc Mertens', 'Jürgen Stutzki'], 'publisher': 'IOP Publishing', 'journal': 'New Journal of Physics', 'date': '2016/12/02', 'language': 'en', 'volume': '18', 'issue': '12', 'firstpage': '123004', 'doi': '10.1088/1367-2630/18/12/123004', 'pdf_url': 'https://iopscience.iop.org/article/10.1088/1367-2630/18/12/123004/pdf', 'html_url': 'https://iopscience.iop.org/article/10.1088/1367-2630/18/12/123004/meta'},
 Paragraph(id='cookieBanner', references=[], text='\nThis site uses cookies. By continuing to use this site you agree to our use of cookies. To find out more, see our\nPrivacy and Cookies policy.\nClose this notification\n'),
 Paragraph(id=None, references=[], text='Accessibility Links'),
 Paragraph(id=None, references=[], text

In [56]:
# Document metadata
# Show the bibliographic metadata extracted from the file; the recorded
# output includes title, authors, journal, date and DOI.
doc.metadata

{'title': 'High performance 4.7 THz GaAs quantum cascade lasers based on four quantum wells', 'authors': ['Keita Ohtani', 'Dana Turčinková', 'Christopher Bonzon', 'Ileana-Cristina Benea-Chelmus', 'Mattias Beck', 'Jérôme Faist', 'Matthias Justen', 'Urs U Graf', 'Marc Mertens', 'Jürgen Stutzki'], 'publisher': 'IOP Publishing', 'journal': 'New Journal of Physics', 'date': '2016/12/02', 'language': 'en', 'volume': '18', 'issue': '12', 'firstpage': '123004', 'doi': '10.1088/1367-2630/18/12/123004', 'pdf_url': 'https://iopscience.iop.org/article/10.1088/1367-2630/18/12/123004/pdf', 'html_url': 'https://iopscience.iop.org/article/10.1088/1367-2630/18/12/123004/meta'}

In [57]:
# Document tokens
# Walk every sentence of every paragraph, in document order, and print its
# list of (token, part-of-speech tag) pairs -- one sentence per line.
all_sentences = (
    sentence
    for paragraph in doc.paragraphs
    for sentence in paragraph.sentences
)
for sentence in all_sentences:
    print(sentence.tagged_tokens)

[('This', 'DT'), ('site', 'NN'), ('uses', 'VBZ'), ('cookies', 'NNS'), ('.', '.')]
[('By', 'IN'), ('continuing', 'VBG'), ('to', 'TO'), ('use', 'VB'), ('this', 'DT'), ('site', 'NN'), ('you', 'PRP'), ('agree', 'VBP'), ('to', 'IN'), ('our', 'PRP$'), ('use', 'NN'), ('of', 'IN'), ('cookies', 'NNS'), ('.', '.')]
[('To', 'TO'), ('find', 'VB'), ('out', 'RP'), ('more', 'JJR'), (',', ','), ('see', 'VB'), ('our', 'PRP$'), ('Privacy', 'NN'), ('and', 'CC'), ('Cookies', 'NNP'), ('policy', 'NN'), ('.', '.')]
[('Close', 'RB'), ('this', 'DT'), ('notification', 'NN')]
[('Accessibility', 'NN'), ('Links', 'VBZ')]
[('Skip', 'NN'), ('to', 'TO'), ('content', 'VB')]
[('Skip', 'NN'), ('to', 'TO'), ('search', 'VB'), ('IOPscience', 'NN')]
[('Skip', 'NN'), ('to', 'IN'), ('Journals', 'NNS'), ('list', 'NN')]
[('Accessibility', 'NN'), ('help', 'VBP')]
[('IOP', 'NNP'), ('Science', 'NNP'), ('home', 'NN')]
[('Skip', 'NN'), ('to', 'TO'), ('content', 'VB')]
[('Accessibility', 'NN'), ('Help', 'NNP')]
[('Search', 'NNP')]
[(

In [58]:
#working temperature property model
class WorkingTemperature(TemperatureModel):#inherits the temperature model to handle the units
    """A working (operating) temperature property.

    Value and unit handling come from ``TemperatureModel``; this class only
    adds the phrases that identify a working temperature in a sentence and
    an optional link to the compound the temperature refers to.

    Attributes:
        specifier_expr: Parse expression matching any of the trigger phrases
            below; the matched tokens are merged by the ``join`` action.
        specifier: Required, contextual string field parsed by
            ``specifier_expr``.
        compound: Optional, contextual link to a ``Compound``.
        parsers: Sentence-level parsing via ``AutoSentenceParser``.
    """
    # Alternatives are combined with `|` in the order written. Every word is
    # wrapped in I(...) so the whole phrase is matched case-insensitively.
    specifier_expr = (
        (I('heat') + I('-') + I('sink') + I('temperature') + I('of'))
        | (I('operating') + I('temperature'))
        | (I('operation') + I('temperature'))
        | (I('maximum') + I('temperature'))
        | (I('working') + I('temperature'))
        # 'mode' must be an I(...) element too, so it is matched the same
        # way as the other words of the phrase.
        | (I('pulsed') + I('mode'))
        # Correct spelling of "continuous"; a misspelled token would never
        # match properly spelled text.
        | (I('continuous') + I('wave') + I('mode'))
        | (I('maximum') + I('heat') + I('-') + I('sink') + I('temperature'))
        | (I('temperatures') + I('up') + I('to'))
        | (I('lased') + I('up') + I('to'))
    ).add_action(join)
    specifier = StringType(parse_expression=specifier_expr, required=True, contextual=True)
    compound = ModelType(Compound, required=False, contextual=True)
    # NOTE: this mutates the shared Compound model's 'names' field, so a
    # record may be produced even when no compound name is found.
    compound.model_class.fields['names'].required = False
    parsers = [AutoSentenceParser()]

In [59]:
# Register the custom property model on the document, run the extraction,
# and pretty-print the serialized records.
doc.models = [WorkingTemperature]
extracted_records = doc.records
results = extracted_records.serialize()
pprint(results)

[{'WorkingTemperature': {'compound': {'Compound': {'names': ['Cu']}},
                         'raw_units': 'K',
                         'raw_value': '150',
                         'specifier': 'operation temperature',
                         'units': 'Kelvin^(1.0)',
                         'value': [150.0]}}]
