In [1]:
#future imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import abstractproperty, abstractmethod

In [2]:
#libraries
from chemdataextractor import Document
from chemdataextractor.model import BaseModel, Compound, StringType, ModelType,ListType, ModelType, SetType
from chemdataextractor.model.units import TemperatureModel, LengthModel
from chemdataextractor.parse import R, I, W, Optional, merge, join, AutoSentenceParser
from chemdataextractor.doc import Paragraph, Heading, Sentence
from pprint import pprint
from lxml import etree

In [3]:
#the qcl material property model
class QclMaterialDesign(BaseModel):
    heterostructure=StringType()

In [4]:
#grammar rules for identifyimg the qcl material
import re
from chemdataextractor.parse import R,I,W,Optional,merge,join

prefix= (I('grown')+I('in')|I('based')+I('on')|I('material')+I('system')|I('laser')+I('structure')|I('design')+I('based')+I('on')|I('QW')+I('structure')+I('based')+I('on')|I('growth')+I('system')|I('material')+I('of')+I('choice')+I('is')+I('material')+I('systems')|I('in')+I('the')|I('material')+I('combination')|I('wafer')|I('MBE')).hide()
material_name= (R('G[aAs]')+W('/')+R('A[l]')+R('^\d+(\.\d+)?$')+R('G[a]')+R('^\d+(\.\d+)?$')+R('A[s]')|R('I[nGaAs]')+W('/')+R('A[lInAsInP]')|R('I[n]')+R('^\d+(\.\d+)?$')+R('G[a]')+R('^\d+(\.\d+)?$')+R('A[s]')+W('/')+R('G[a]')+R('A[s]')+R('^\d+(\.\d+)?$')+R('S[b]')|R('I[nGaAs]')+W('/')+R('G[aAsSb]')|R('G[aAs]')+W('/')+R('A[lGaAs]')|R('I[nGaAs]')+W('/')+R('A[lInGaAs]'))('material_name').add_action(join)
material = ((prefix + material_name| material_name))('material')

In [5]:
#logic for parsing the qcl material property
from chemdataextractor.parse.base import BaseSentenceParser
from chemdataextractor.utils import first

class qclmaterialParser(BaseSentenceParser):
    root = material

    def interpret(self, result, start, end):
        qcl_material = self.model( heterostructure=first(result.xpath('//text()')))
        yield qcl_material

In [6]:
#setting the parser and the property model
QclMaterialDesign.parsers = [qclmaterialParser()]
sentence_parser = qclmaterialParser
sentence_parser.model = QclMaterialDesign

In [7]:
#sample sentences from the qcl journals
s1 =  Sentence('High performance based on InGaAs/GaAsSb terahertz quantum cascade lasers operating up to 142 K')
s2 =  Sentence('We report a quatum cascade laser grown in GaAs/AlGaAs material system.')
s3 = Sentence('Here, we present a study in a magnetic field for InGaAs/AlInGaAs THz QCLs.')
s4 =  Sentence('The four quantum well module  GaAs/Al0.15Ga0.85As is repeated multiple times to allow electron transport in a cascade scheme.')
s5 = Sentence('Figure 1 shows the conduction band and moduli wavefunctions of the quantum states of the design which is based on GaAs/Al0.25Ga0.75As material system.')
s6 = Sentence('Both devices were grown by molecular beam epitaxy with GaAs/Al0.3Ga0.7As quantum wells.')
s7 = Sentence('We present two different terahertz quantum cascade laser QCL designs based on GaAs/Al0.3Ga0.7As heterostructures that feature a depopulation mechanism of two longitudinal-optical phonon scattering events.') 

In [8]:
#specifying the model to be used for extracting the qcl material values
s1.models = [QclMaterialDesign]
s2.models = [QclMaterialDesign]
s3.models = [QclMaterialDesign] 
s4.models = [QclMaterialDesign] 
s5.models = [QclMaterialDesign] 
s6.models = [QclMaterialDesign] 
s7.models = [QclMaterialDesign] 

In [9]:
#analyzing sentence s1
pprint(s1.records.serialize())

[{'QclMaterialDesign': {'heterostructure': 'InGaAs / GaAsSb'}}]


In [10]:
#material record in s1
for result in sentence_parser.root.scan(s1.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s1.records.serialize())

b'<material>InGaAs / GaAsSb</material>' 

[{'QclMaterialDesign': {'heterostructure': 'InGaAs / GaAsSb'}}]


In [11]:
#analyzing sentence s2
pprint(s2.records.serialize())

[{'QclMaterialDesign': {'heterostructure': 'GaAs / AlGaAs'}}]


In [12]:
#material record in s2
for result in sentence_parser.root.scan(s2.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s2.records.serialize())

b'<material>GaAs / AlGaAs</material>' 

[{'QclMaterialDesign': {'heterostructure': 'GaAs / AlGaAs'}}]


In [13]:
#analyzing sentence s3
pprint(s3.records.serialize())

[{'QclMaterialDesign': {'heterostructure': 'InGaAs / AlInGaAs'}}]


In [14]:
#material record in s3
for result in sentence_parser.root.scan(s3.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s3.records.serialize())

b'<material>InGaAs / AlInGaAs</material>' 

[{'QclMaterialDesign': {'heterostructure': 'InGaAs / AlInGaAs'}}]


In [15]:
#analyzing sentence s4
pprint(s4.records.serialize())

[{'QclMaterialDesign': {'heterostructure': 'GaAs / Al0.15Ga0.85As'}}]


In [16]:
#material record in s4
for result in sentence_parser.root.scan(s4.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s4.records.serialize())

b'<material>GaAs / Al0.15Ga0.85As</material>' 

[{'QclMaterialDesign': {'heterostructure': 'GaAs / Al0.15Ga0.85As'}}]


In [17]:
#analyzing sentence s5
pprint(s5.records.serialize())

[{'QclMaterialDesign': {'heterostructure': 'GaAs / Al0.25Ga0.75As'}}]


In [18]:
#material record in s5
for result in sentence_parser.root.scan(s4.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s5.records.serialize())

b'<material>GaAs / Al0.15Ga0.85As</material>' 

[{'QclMaterialDesign': {'heterostructure': 'GaAs / Al0.25Ga0.75As'}}]


In [19]:
#analyzing sentence s6
pprint(s6.records.serialize())

[{'QclMaterialDesign': {'heterostructure': 'GaAs / Al0.3Ga0.7As'}}]


In [20]:
#material record in s6
for result in sentence_parser.root.scan(s4.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s6.records.serialize())

b'<material>GaAs / Al0.15Ga0.85As</material>' 

[{'QclMaterialDesign': {'heterostructure': 'GaAs / Al0.3Ga0.7As'}}]


In [21]:
#analyzing sentence s7
pprint(s7.records.serialize())

[{'QclMaterialDesign': {'heterostructure': 'GaAs / Al0.3Ga0.7As'}}]


In [22]:
#material record in s7
for result in sentence_parser.root.scan(s4.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s6.records.serialize())

b'<material>GaAs / Al0.15Ga0.85As</material>' 

[{'QclMaterialDesign': {'heterostructure': 'GaAs / Al0.3Ga0.7As'}}]


In [23]:
#reading a sample journal paper
with open('p (11).html', 'rb') as f:
    doc = Document.from_file(f)
print(doc)

<Document: 476 elements>


In [24]:
#Document Metadata
doc.metadata

{'title': 'High performance InGaAs/GaAsSb terahertz quantum cascade lasers operating up to 142\u2009K', 'authors': ['C.  Deutsch', 'M.  Krall', 'M.  Brandstetter', 'H.  Detz', 'A. M.  Andrews', 'P.  Klang', 'W.  Schrenk', 'G.  Strasser', 'K.  Unterrainer'], 'publisher': 'American Institute of PhysicsAIP', 'journal': 'Applied Physics Letters', 'date': '2012-11-19', 'volume': '101', 'issue': '21', 'firstpage': '211117', 'doi': '10.1063/1.4766915', 'pdf_url': 'https://aip.scitation.org/doi/pdf/10.1063/1.4766915', 'html_url': 'https://aip.scitation.org/doi/abs/10.1063/1.4766915'}

In [25]:
#document elements
#doc.elements

In [26]:
#setting the property model for the document
doc.models = [QclMaterialDesign]

In [27]:
#analyzing the individual tokens in the document and extracting the qcl material  record
for p in doc.paragraphs:
    for s in p.sentences:
        x=s.tagged_tokens
        for result in sentence_parser.root.scan(x):
            print(etree.tostring(result[0]), '\n')
            pprint(s.records.serialize())

b'<material>InGaAs / GaAsSb</material>' 

[{'QclMaterialDesign': {'heterostructure': 'InGaAs / GaAsSb'}}]
b'<material>InGaAs / AlGaAs</material>' 

[{'QclMaterialDesign': {'heterostructure': 'InGaAs / AlGaAs'}}]
b'<material>GaAs / Al0.15Ga0.85As</material>' 

[{'QclMaterialDesign': {'heterostructure': 'GaAs / Al0.15Ga0.85As'}}]
b'<material>InGaAs / AlAs(Sb)</material>' 

[{'QclMaterialDesign': {'heterostructure': 'InGaAs / AlAs(Sb)'}}]
b'<material>InGaAs / InAlAs</material>' 

[{'QclMaterialDesign': {'heterostructure': 'InGaAs / InAlAs'}}]
b'<material>In0.53Ga0.47As / GaAs0.51Sb0.49</material>' 

[{'QclMaterialDesign': {'heterostructure': 'In0.53Ga0.47As / GaAs0.51Sb0.49'}}]
b'<material>InGaAs / InAlAs,44</material>' 

[{'QclMaterialDesign': {'heterostructure': 'InGaAs / InAlAs,44'}}]
b'<material>GaAs / Al0.15Ga0.85As.5,65</material>' 

[{'QclMaterialDesign': {'heterostructure': 'GaAs / Al0.15Ga0.85As.5,65'}}]
b'<material>InGaAs / InAlAs</material>' 

[{'QclMaterialDesign': {'heterostr

In [28]:
#analyzing the whole document at once
doc.models = [QclMaterialDesign]
results=doc.records.serialize()
pprint(results)

[{'QclMaterialDesign': {'heterostructure': 'InGaAs / GaAsSb'}},
 {'QclMaterialDesign': {'heterostructure': 'InGaAs / AlGaAs'}},
 {'QclMaterialDesign': {'heterostructure': 'GaAs / Al0.15Ga0.85As'}},
 {'QclMaterialDesign': {'heterostructure': 'InGaAs / AlAs(Sb)'}},
 {'QclMaterialDesign': {'heterostructure': 'InGaAs / InAlAs'}},
 {'QclMaterialDesign': {'heterostructure': 'In0.53Ga0.47As / GaAs0.51Sb0.49'}},
 {'QclMaterialDesign': {'heterostructure': 'InGaAs / InAlAs,44'}},
 {'QclMaterialDesign': {'heterostructure': 'GaAs / Al0.15Ga0.85As.5,65'}},
 {'QclMaterialDesign': {'heterostructure': 'GaAs / AlGaAs'}},
 {'QclMaterialDesign': {'heterostructure': 'InGaAs / InAlAs.8'}},
 {'QclMaterialDesign': {'heterostructure': 'GaAs / AlGaAs.5,95'}},
 {'QclMaterialDesign': {'heterostructure': 'InAs / AlSb'}},
 {'QclMaterialDesign': {'heterostructure': 'GaAs / Al0.40Ga0.60As'}},
 {'QclMaterialDesign': {'heterostructure': 'In0.53Ga0.47As / '
                                           'GaAs0.51Sb0.49.13,