In [1]:
#future imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

In [2]:
#libraries
from chemdataextractor import Document
from chemdataextractor.model import BaseModel, Compound, StringType, ModelType
from chemdataextractor.parse import R, I, W, Optional, merge, join, AutoSentenceParser
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore
from chemdataextractor.doc import Paragraph, Heading, Sentence
from lxml import etree
from pprint import pprint

In [3]:
#Defining the BarrierThicknessModel to handle the sequence layer units and its conversions
from chemdataextractor.model.units.dimension import Dimension
from chemdataextractor.model.units.quantity_model import QuantityModel
from chemdataextractor.model.units.unit import Unit
import logging
log = logging.getLogger(__name__)

class barrierthickness(Dimension):
    """
    Dimension subclass for the qcl barrier thickness.
    """
    pass


class BarrierThicknessModel(QuantityModel):
    dimensions = barrierthickness()


class barrierthicknessUnit(Unit):

    def __init__(self, magnitude=0.0, powers=None):
        super(barrierthicknessUnit, self).__init__(barrierthickness(), magnitude, powers)


class Nanometers(barrierthicknessUnit):
    """
    Class for barrier thickness in nanometers.
    """

    def convert_value_to_standard(self, value):
        return value

    def convert_value_from_standard(self, value):
        return value

    def convert_error_to_standard(self, error):
        return error

    def convert_error_from_standard(self, error):
        return error
    
class Angstrom(barrierthicknessUnit):
    """
    Class for barrier thickness in nanometers.
    """

    def convert_value_to_standard(self, value):
        return value/10

    def convert_value_from_standard(self, value):
        return value*10
    def convert_error_to_standard(self, error):
        return error
    
    def convert_error_from_standard(self, error):
        return error


units_dict = {R('((nm|\(nm\)|(N|n)anometers))\.?', group=0): Nanometers, R('Å|\(Å\)|((A|a)ngstrom)\.?', group=0): Angstrom}
barrierthickness.units_dict.update(units_dict)
barrierthickness.standard_units = Nanometers()

In [4]:
#the qcl barrier thickness property model
class QclBarrierThickness(BarrierThicknessModel):
    value=StringType()

In [5]:
#the possible combinations of the barrier thickness values with forwad slashes
i=W('/')+R('^\d+(\.\d+)?$')
x2 = R('^\d+(\.\d+)?$')+W('/')+R('^\d+(\.\d+)?$')
x5 = x2+x2+i
x6=x2+W('/')+R('^\d+(\.\d+)?$')+W('/')+R('^\d+(\.\d+)?$')+W('/')+R('^\d+(\.\d+)?$')+W('/')+R('^\d+(\.\d+)?$')
x7=x6+i
x8 = x2+W('/')+R('^\d+(\.\d+)?$')+W('/')+R('^\d+(\.\d+)?$')+W('/')+R('^\d+(\.\d+)?$')+W('/')+R('^\d+(\.\d+)?$')+W('/')+R('^\d+(\.\d+)?$')+W('/')+R('^\d+(\.\d+)?$')
x9 = x8+i
x10=x9+i
x11=x10+i
x12=x11+i
x13=x12+i
x14=x13+i
x15=x14+i
x16=x15+i
x17=x16+i
x18=x17+i
x19=x18+i
x20=x19+i
x21=x20+i
x22=x21+i
x23=x22+i
x24=x23+i
x25=x24+i

In [6]:
#differnt combinations of expressions for the different combination of values with forward slashes
y5=x5+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x5|W('nanomters')+x5|W('nanometer')+('is')+x5|W('nanomters')+ W('are')+x5|W('in')+W('Å')+ W('are')+x5)
y6=x6+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x6|W('nanomters')+x6|W('nanometer')+('is')+x6|W('nanomters')+ W('are')+x6|W('in')+W('Å')+ W('are')+x6)
y7=x7+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x7|W('nanomters')+x7|W('nanometer')+('is')+x7|W('nanomters')+ W('are')+x7|W('in')+W('Å')+ W('are')+x7)
y8=x8+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x8|W('nanomters')+x8|W('nanometer')+('is')+x8|W('nanomters')+ W('are')+x8|W('in')+W('Å')+ W('are')+x8)
y9=x9+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x9|W('nanomters')+x9|W('nanometer')+('is')+x9|W('nanomters')+ W('are')+x9|W('in')+W('Å')+ W('are')+x9)
y10=x10+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x10|W('nanomters')+x10|W('nanometer')+('is')+x10|W('nanomters')+ W('are')+x10|W('in')+W('Å')+ W('are')+x10)
y11=x11+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x11|W('nanomters')+x11|W('nanometer')+('is')+x11|W('nanomters')+ W('are')+x11|W('in')+W('Å')+ W('are')+x11)
y12=x12+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x12|W('nanomters')+x12|W('nanometer')+('is')+x12|W('nanomters')+ W('are')+x12|W('in')+W('Å')+ W('are')+x12)
y13=x13+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x13|W('nanomters')+x13|W('nanometer')+('is')+x13|W('nanomters')+ W('are')+x13|W('in')+W('Å')+ W('are')+x13)
y14=x14+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x14|W('nanomters')+x14|W('nanometer')+('is')+x14|W('nanomters')+ W('are')+x14|W('in')+W('Å')+ W('are')+x14)
y15=x15+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x15|W('nanomters')+x15|W('nanometer')+('is')+x15|W('nanomters')+ W('are')+x15|W('in')+W('Å')+ W('are')+x15)
y16=x16+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x16|W('nanomters')+x16|W('nanometer')+('is')+x16|W('nanomters')+ W('are')+x16|W('in')+W('Å')+ W('are')+x16)
y17=x17+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x17|W('nanomters')+x17|W('nanometer')+('is')+x17|W('nanomters')+ W('are')+x17|W('in')+W('Å')+ W('are')+x17)
y18=x18+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x18|W('nanomters')+x18|W('nanometer')+('is')+x18|W('nanomters')+ W('are')+x18|W('in')+W('Å')+ W('are')+x18)
y19=x19+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x19|W('nanomters')+x19|W('nanometer')+('is')+x19|W('nanomters')+ W('are')+x19|W('in')+W('Å')+ W('are')+x19)
y20=x20+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x20|W('nanomters')+x20|W('nanometer')+('is')+x20|W('nanomters')+ W('are')+x20|W('in')+W('Å')+ W('are')+x20)
y21=x21+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x21|W('nanomters')+x21|W('nanometer')+('is')+x21|W('nanomters')+ W('are')+x21|W('in')+W('Å')+ W('are')+x21)
y22=x22+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x22|W('nanomters')+x22|W('nanometer')+('is')+x22|W('nanomters')+ W('are')+x22|W('in')+W('Å')+ W('are')+x22)
y23=x23+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x23|W('nanomters')+x23|W('nanometer')+('is')+x23|W('nanomters')+ W('are')+x23|W('in')+W('Å')+ W('are')+x23)
y24=x24+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x24|W('nanomters')+x24|W('nanometer')+('is')+x24|W('nanomters')+ W('are')+x24|W('in')+W('Å')+ W('are')+x24)
y25=x25+(W('nm')|W('Å')|W('(')+W('nm')+W(')')|W('Angstrom')+x25|W('nanomters')+x25|W('nanometer')+('is')+x25|W('nanomters')+ W('are')+x25|W('in')+W('Å')+ W('are')+x25)

In [7]:
#grammar rules for identifyimg the qcl barrier/sequence layer thickness
import re
from chemdataextractor.parse import R,I,W,Optional,merge,join
prefix= (I('layer')+I('sequence')+I('in')+I('each')+I('stage')+I('of')+I('the')+I('active')+I('region')+I('is')|I('are')|I('and')|I('layer')+I('sequence')|I('layer')+I('thicknesses')+I('in')+I('Å')+I('are')|I('the')+I('layer')+I('thicknesses')+I('in')+I('nm')+I('(')+I('with')+I('barriers')+I('indicated')+I('in')+I('bold')+I('-')+I('face')+I('font')+I(')')+I('are')|I('the')+I('layer')+I('thickness')+I('in')+I('Å')+I('are')|I('the')+I('layer')+I('thickness')+I('in')+I('nm')+I('are')|I('the')+I('layer')+I('thicknesses')+I('in')+I('nanometer')+I('are')|I('layer')+I('thickness')+I('in')+I('nanometer')+I('is')|I('quantum')+I('structure')+I(',')+I('starting')+I('with')+I('an')+I('injector')+I('barrier')+I(',') +I('is')|I('injection')+I('barrier')+I('is')|I('starting')+I('from')+I('the')+I('widest')+I('well')+I('is')|I('active')+I('region')+I('is')+I('as')+I('following')+I(':')|I('materials')+I('are')|I('active')+I('region')+I('is')|I('layer')+I('thicknesses')+I('of')|I('in')+I('nanometers')+I('is')+I('(')+I('from') +I('right') +I('to') +I('left')+I(')')+I(':')|I('the')+I('thicknesses')+I('are')|I('thicknesses')+I('of')|I('nanometer')+I('is')|I('barrier')+I('is')).hide()
p=(y5|y6|y7|y8|y9|y10|y11|y12|y13|y14|y15|y16|y17|y18|y19|y20|y21|y22|y23|y24|y25)
prefix_value_unit=(prefix+(p))('prefix_value_unit').add_action(join)
value_unit = (p)('value_unit').add_action(join)
bt1=(prefix_value_unit)('bt1')
bt2 =(value_unit)('bt2')

In [8]:
#logic for parsing the qcl barrier/sequence layer thickness
from chemdataextractor.parse.base import BaseSentenceParser
from chemdataextractor.utils import first

class qclthicknesslParser(BaseSentenceParser):
    root = bt1|bt2

    def interpret(self, result, start, end):
        qcl_thickness = self.model(value=first(result.xpath('//text()')))
        yield qcl_thickness

In [9]:
#setting the parser and the property model
QclBarrierThickness.parsers = [qclthicknesslParser()]
sentence_parser = qclthicknesslParser
sentence_parser.model = QclBarrierThickness

In [10]:
#sample sentences from journal papers
s1=  Sentence('The layer sequence of one period for the active region is as following: 5.2/10.3/1.7/10.75/3.6/8.8/3.95/17.2 nm.')
s2 = Sentence('The Al0.15Ga0.85As/GaAs layer sequence in each stage of the active region is 3.8/14.0/0.6/9.0/0.6/15.8/1.5/12.8/1.8/12.2/2.0/12.0/2.0/11.4/2.7/11.3/3.5/11.6 (nm).')
s3 = Sentence('The improved structure has layer sequence 31/93/14/73.4/23/155.4/11/110.2/14/84.7/20/155.4/17/110.1 Å, where altered layers are highlighted in red (and italicized well is doped to 3.15 × 1016 cm−3).')

In [11]:
#specifying the model to be used for extracting the qcl barrier/sequence layer thickness
s1.models = [QclBarrierThickness]
s2.models = [QclBarrierThickness]
s3.models = [QclBarrierThickness] 

In [12]:
#analyzing sentence s1
pprint(s1.records.serialize())

[{'QclBarrierThickness': {'value': '5.2 / 10.3 / 1.7 / 10.75 / 3.6 / 8.8 / '
                                   '3.95 / 17.2 nm'}}]


In [13]:
#barrier/sequence layer thickness record in s1
for result in sentence_parser.root.scan(s1.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s1.records.serialize())

b'<bt1>5.2 / 10.3 / 1.7 / 10.75 / 3.6 / 8.8 / 3.95 / 17.2 nm</bt1>' 

[{'QclBarrierThickness': {'value': '5.2 / 10.3 / 1.7 / 10.75 / 3.6 / 8.8 / '
                                   '3.95 / 17.2 nm'}}]


In [14]:
#analyzing sentence s2
pprint(s2.records.serialize())

[{'QclBarrierThickness': {'value': '3.8 / 14.0 / 0.6 / 9.0 / 0.6 / 15.8 / 1.5 '
                                   '/ 12.8 / 1.8 / 12.2 / 2.0 / 12.0 / 2.0 / '
                                   '11.4 / 2.7 / 11.3 / 3.5 / 11.6 ( nm )'}}]


In [15]:
#barrier/sequence layer thickness record in s2
for result in sentence_parser.root.scan(s2.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s2.records.serialize())

b'<bt1>3.8 / 14.0 / 0.6 / 9.0 / 0.6 / 15.8 / 1.5 / 12.8 / 1.8 / 12.2 / 2.0 / 12.0 / 2.0 / 11.4 / 2.7 / 11.3 / 3.5 / 11.6 ( nm )</bt1>' 

[{'QclBarrierThickness': {'value': '3.8 / 14.0 / 0.6 / 9.0 / 0.6 / 15.8 / 1.5 '
                                   '/ 12.8 / 1.8 / 12.2 / 2.0 / 12.0 / 2.0 / '
                                   '11.4 / 2.7 / 11.3 / 3.5 / 11.6 ( nm )'}}]


In [16]:
#analyzing sentence s3
pprint(s3.records.serialize())

[{'QclBarrierThickness': {'value': '31 / 93 / 14 / 73.4 / 23 / 155.4 / 11 / '
                                   '110.2 / 14 / 84.7 / 20 / 155.4 / 17 / '
                                   '110.1 Å'}}]


In [17]:
#barrier/sequence layer thickness record in s3
for result in sentence_parser.root.scan(s3.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s3.records.serialize())

b'<bt1>31 / 93 / 14 / 73.4 / 23 / 155.4 / 11 / 110.2 / 14 / 84.7 / 20 / 155.4 / 17 / 110.1 &#197;</bt1>' 

[{'QclBarrierThickness': {'value': '31 / 93 / 14 / 73.4 / 23 / 155.4 / 11 / '
                                   '110.2 / 14 / 84.7 / 20 / 155.4 / 17 / '
                                   '110.1 Å'}}]


In [18]:
#reading a sample journal paper
with open('p (32).html', 'rb') as f:
    doc = Document.from_file(f)
print(doc)

<Document: 818 elements>


In [19]:
#Document Metadata
doc.metadata

{'title': 'Terahertz quantum cascade lasers with copper metal-metal waveguides operating up to 178 K', 'authors': ['Mikhail A. Belkin', 'Jonathan A. Fan', 'Sahand Hormoz', 'Federico Capasso', 'Suraj P. Khanna', 'Mohamed Lachab', 'A. Giles Davies', 'Edmund H. Linfield'], 'publisher': 'Optica Publishing Group', 'journal': 'Optics Express', 'date': '2008/02/25', 'language': 'EN', 'volume': '16', 'issue': '5', 'firstpage': '3242', 'lastpage': '3248', 'doi': '10.1364/OE.16.003242', 'pdf_url': 'https://opg.optica.org/viewmedia.cfm?uri=oe-16-5-3242&seq=0', 'html_url': 'https://opg.optica.org/abstract.cfm?uri=oe-16-5-3242'}

In [20]:
#document elements
#doc.elements

In [21]:
#setting the property model for the document
doc.models = [QclBarrierThickness]

In [22]:
#analyzing the individual tokens in the document and extracting the qcl material  record
for p in doc.paragraphs:
    for s in p.sentences:
        x=s.tagged_tokens
        for result in sentence_parser.root.scan(x):
            #print(etree.tostring(result[0]), '\n')
            pprint(s.records.serialize())

[{'QclBarrierThickness': {'value': '48 / 96 / 20 / 74 / 42 / 161 Å'}},
 {'QclBarrierThickness': {'value': '49 / 79 / 25 / 66 / 41 / 156 / 33 / 90 Å'}}]
[{'QclBarrierThickness': {'value': '48 / 96 / 20 / 74 / 42 / 161 Å'}},
 {'QclBarrierThickness': {'value': '49 / 79 / 25 / 66 / 41 / 156 / 33 / 90 Å'}}]
[{'QclBarrierThickness': {'value': '48 / 96 / 20 / 74 / 42 / 161 Å'}},
 {'QclBarrierThickness': {'value': '49 / 79 / 25 / 66 / 41 / 156 / 33 / 90 Å'}}]
[{'QclBarrierThickness': {'value': '48 / 96 / 20 / 74 / 42 / 161 Å'}},
 {'QclBarrierThickness': {'value': '49 / 79 / 25 / 66 / 41 / 156 / 33 / 90 Å'}}]
