In [127]:
#libraries
from chemdataextractor import Document
from chemdataextractor.model import BaseModel, Compound, StringType, ModelType
from chemdataextractor.model.units import TemperatureModel, LengthModel
from chemdataextractor.parse import R, I, W, Optional, merge, join, AutoSentenceParser
from chemdataextractor.doc import Paragraph, Heading, Sentence
from pprint import pprint
from lxml import etree
from bs4 import BeautifulSoup 

In [128]:
# Load the article HTML (opened in binary mode) into a ChemDataExtractor Document
with open('345.html', 'rb') as html_file:
    doc = Document.from_file(html_file)
# Printing the Document shows a one-line summary with its element count
print(doc)

<Document: 204 elements>


In [130]:
#document elements
#doc.elements

In [131]:
# Display the bibliographic metadata attached to the document
# (the recorded output includes title, authors, journal, date and DOI).
doc.metadata

{'title': '3.4-THz quantum cascade laser based on longitudinal-optical-phonon scattering for depopulation', 'authors': ['Benjamin S.  Williams', 'Hans  Callebaut', 'Sushil  Kumar', 'Qing  Hu', 'John L.  Reno'], 'publisher': 'American Institute of PhysicsAIP', 'journal': 'Applied Physics Letters', 'date': '2003-02-17', 'volume': '82', 'issue': '7', 'firstpage': '1015', 'doi': '10.1063/1.1554479', 'pdf_url': 'https://aip.scitation.org/doi/pdf/10.1063/1.1554479', 'html_url': 'https://aip.scitation.org/doi/abs/10.1063/1.1554479'}

In [132]:
#working temperature property model
class WorkingTemperature(TemperatureModel):
    """Property model for the working (operating) temperature of a device.

    Derives from TemperatureModel so that the temperature value and unit
    handling come from the base model.
    """
    # Phrases that introduce a working temperature. Each alternative is a
    # sequence of case-insensitive word matchers; ``join`` is attached as the
    # parse action of the combined expression.
    specifier_expr = (
        (I('heat') + I('-') + I('sink') + I('temperature') + I('of'))
        | (I('operating') + I('temperature'))
        | (I('operation') + I('temperature'))
        | (I('maximum') + I('temperature'))
        | (I('working') + I('temperature'))
        | (I('maximum') + I('heat') + I('-') + I('sink') + I('temperature'))
        | (I('temperatures') + I('up') + I('to'))
        | (I('lased') + I('up') + I('to'))
    ).add_action(join)

    # The specifier phrase is mandatory for a record.
    specifier = StringType(
        parse_expression=specifier_expr,
        required=True,
        contextual=True,
    )

    # An associated compound is optional.
    compound = ModelType(Compound, required=False, contextual=True)
    # Mark the 'names' field of the compound's model class as not required.
    # NOTE(review): model_class is presumably the shared Compound class, so
    # this change would not be limited to this model -- confirm.
    compound.model_class.fields['names'].required = False

    parsers = [AutoSentenceParser()]

In [133]:
# Analyse the whole document at once: register WorkingTemperature as the only
# model on the document, serialize the extracted records, and pretty-print them.
# NOTE(review): the recorded output lists the compound name 'SIGN IN', which
# looks like page navigation text rather than a chemical name -- confirm.
doc.models = [WorkingTemperature]
results=doc.records.serialize()
pprint(results)

[{'WorkingTemperature': {'compound': {'Compound': {'names': ['SIGN IN']}},
                         'raw_units': 'K',
                         'raw_value': '65',
                         'specifier': 'temperatures up to',
                         'units': 'Kelvin^(1.0)',
                         'value': [65.0]}}]


In [None]:
# Analyse the document sentence by sentence, reporting only the sentences
# that actually produced at least one WorkingTemperature record.
for paragraph in doc.paragraphs:
    for sentence in paragraph.sentences:
        # Attach the model to this sentence before asking for its records.
        sentence.models = [WorkingTemperature]
        results = sentence.records.serialize()
        # Skip sentences with nothing to report so genuine hits are not
        # buried under one empty result per sentence.
        if results:
            pprint(results)

In [135]:
# Create a standalone AutoSentenceParser with lenient=True and bind it to the
# WorkingTemperature model, so the parser can be driven by hand on token lists.
sentence_parser = AutoSentenceParser(lenient=True)
sentence_parser.model = WorkingTemperature

In [None]:
# Inspect what the sentence parser matches on each sentence's tagged tokens,
# then show the records serialized for that same sentence.
for paragraph in doc.paragraphs:
    for sentence in paragraph.sentences:
        tagged = sentence.tagged_tokens
        for match in sentence_parser.root.scan(tagged):
            # Element 0 of each scan item is what gets rendered as XML here.
            match_xml = etree.tostring(match[0])
            print(match_xml, '\n')
        pprint(sentence.records.serialize())