In [1]:
#libraries
from chemdataextractor import Document
from chemdataextractor.model import BaseModel, Compound, StringType, ModelType
from chemdataextractor.model.units import TemperatureModel, LengthModel
from chemdataextractor.parse import R, I, W, Optional, merge, join, AutoSentenceParser
from chemdataextractor.doc import Paragraph, Heading, Sentence
from lxml import etree
from pprint import pprint

In [2]:
# Read the article's HTML file in binary mode and build a ChemDataExtractor
# Document from it, then print the document summary.
with open('p1.html', 'rb') as html_file:
    doc = Document.from_file(html_file)
print(doc)

<Document: 909 elements>


In [3]:
# Document metadata: display the bibliographic fields parsed from the file
# (e.g. title, authors, journal, date, DOI, PDF/HTML URLs).
doc.metadata

{'title': '2.1 THz quantum-cascade laser operating up to 144 K based on a scattering-assisted injection design', 'authors': ['Sudeep Khanal', 'John L. Reno', 'Sushil Kumar'], 'publisher': 'Optica Publishing Group', 'journal': 'Optics Express', 'date': '2015/07/22', 'language': 'EN', 'volume': '23', 'issue': '15', 'firstpage': '19689', 'lastpage': '19697', 'doi': '10.1364/OE.23.019689', 'pdf_url': 'https://opg.optica.org/viewmedia.cfm?uri=oe-23-15-19689&seq=0', 'html_url': 'https://opg.optica.org/abstract.cfm?uri=oe-23-15-19689'}

In [4]:
# Property model for the heterostructure (material system) of a QCL.
class QCLHeterostructure(BaseModel):
    """Heterostructure (material) property model for quantum-cascade lasers.

    Derives from ``BaseModel`` and links to the built-in ``Compound`` model
    through the ``compound`` field, so a record pairs a trigger phrase
    (``specifier``) with the material named near it.
    """

    # Trigger phrases that signal a heterostructure/material statement.
    # Alternatives are tried in the order listed; ``join`` collapses the
    # matched tokens into a single string (e.g. 'material system').
    specifier_expr = (
        (I('laser') + I('structure'))
        | (I('design') + I('based') + I('on'))
        | (W('QW') + I('structure') + I('based') + I('on'))
        | I('growth')
        | I('system')
        | (I('material') + I('of') + I('choice') + I('is'))
        | (I('was') + I('grown') + I('in'))
        | (I('module') + I('grown') + I('in'))
        | (I('material') + I('system'))
        | (I('material') + I('systems'))
        | (I('material') + I('combination'))
        | I('wafer')
        | I('MBE')
    ).add_action(join)

    # The specifier is mandatory for a record; the compound is optional.
    specifier = StringType(
        parse_expression=specifier_expr,
        required=True,
        contextual=True,
    )
    compound = ModelType(
        Compound,
        required=False,
        contextual=True,
    )

    # Sentence-level parser generated automatically from the fields above.
    parsers = [AutoSentenceParser()]

In [5]:
# Analyze the whole document at once: register the model on the document,
# collect every extracted record in serialized form (a list of nested dicts),
# and pretty-print the result.
doc.models = [QCLHeterostructure]
results=doc.records.serialize()
pprint(results)

[{'QCLHeterostructure': {'compound': {'Compound': {'names': ['THz',
                                                             'terahertz']}},
                         'specifier': 'material system'}},
 {'QCLHeterostructure': {'compound': {'Compound': {'names': ['GaAs / '
                                                             'Al0.15Ga0.85As']}},
                         'specifier': 'material system'}},
 {'QCLHeterostructure': {'compound': {'Compound': {'names': ['THz',
                                                             'terahertz']}},
                         'specifier': 'growth'}},
 {'QCLHeterostructure': {'compound': {'Compound': {'names': ['SARP172']}},
                         'specifier': 'wafer'}},
 {'QCLHeterostructure': {'compound': {'Compound': {'names': ['GaAs / '
                                                             'Al0.15Ga0.85As']}},
                         'specifier': 'was grown in'}},
 {'QCLHeterostructure': {'compound': {'Compound': {'name

In [6]:
# Set up a standalone sentence parser bound to the QCLHeterostructure model,
# so that its grammar (sentence_parser.root) can be run on individual sentences.
# NOTE(review): lenient=True presumably relaxes the generated grammar's
# matching -- confirm against the AutoSentenceParser documentation.
sentence_parser = AutoSentenceParser(lenient=True)
sentence_parser.model = QCLHeterostructure

In [None]:
# Analyze individual sentences: run the model's grammar over each sentence's
# tagged tokens and print that sentence's records when the grammar matches.
#
# The earlier version printed inside a loop over every scan result without
# using the result, so a sentence with several matches had the same records
# recomputed and printed several times. Each matching sentence is now
# reported exactly once.
for paragraph in doc.paragraphs:
    for sentence in paragraph.sentences:
        tagged = sentence.tagged_tokens
        # Only the existence of a match matters here; any() stops consuming
        # the scan at the first result instead of walking all of them.
        if any(True for _ in sentence_parser.root.scan(tagged)):
            pprint(sentence.records.serialize())