In [15]:
#future imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

In [16]:
#libraries
from chemdataextractor import Document
from chemdataextractor.model import BaseModel, Compound, StringType, ModelType
from chemdataextractor.model.units import TemperatureModel, LengthModel
from chemdataextractor.parse import R, I, W, Optional, merge, join, AutoSentenceParser
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore
from chemdataextractor.doc import Paragraph, Heading, Sentence
from lxml import etree
from pprint import pprint

In [17]:
#Defining the barrier thickness dimension, quantity model and units (nm, Å) together with their conversions
from chemdataextractor.model.units.dimension import Dimension
from chemdataextractor.model.units.quantity_model import QuantityModel
from chemdataextractor.model.units.unit import Unit
import logging
log = logging.getLogger(__name__)

class barrierthickness(Dimension):
    """Dimension subclass representing the barrier thickness of a QCL structure.

    The class adds nothing of its own to ``Dimension``; it only provides a
    distinct dimension type for the barrier-thickness model and units.
    """


class BarrierThicknessModel(QuantityModel):
    """Quantity model whose dimension is ``barrierthickness``."""
    # A single shared barrierthickness() instance is created once, at class-definition time.
    dimensions = barrierthickness()


class barrierthicknessUnit(Unit):
    """Base class for all units that measure the ``barrierthickness`` dimension."""

    def __init__(self, magnitude=0.0, powers=None):
        """Create the unit, fixing its dimension to ``barrierthickness``.

        :param magnitude: forwarded unchanged to ``Unit.__init__`` (default 0.0).
        :param powers: forwarded unchanged to ``Unit.__init__`` (default None).
        """
        # A fresh dimension instance is built for every unit object.
        dimension = barrierthickness()
        super(barrierthicknessUnit, self).__init__(dimension, magnitude, powers)


class Nanometers(barrierthicknessUnit):
    """
    Barrier thickness expressed in nanometers.

    Every conversion here hands its argument back untouched, i.e. no scaling
    is applied to values or errors given in nanometers.
    """

    # -- conversions towards the standard unit --------------------------------

    def convert_value_to_standard(self, value):
        """Return ``value`` unchanged."""
        return value

    def convert_error_to_standard(self, error):
        """Return ``error`` unchanged."""
        return error

    # -- conversions away from the standard unit ------------------------------

    def convert_value_from_standard(self, value):
        """Return ``value`` unchanged."""
        return value

    def convert_error_from_standard(self, error):
        """Return ``error`` unchanged."""
        return error
    
class Angstrom(barrierthicknessUnit):
    """
    Class for barrier thickness in angstroms.

    Conversions use a factor of 10 between this unit and the standard unit
    (10 Å = 1 nm). Errors are scaled by the same factor as values, because an
    uncertainty is a quantity in the same unit as the value it belongs to.
    """

    def convert_value_to_standard(self, value):
        """Convert a value in angstroms to the standard unit (value / 10)."""
        return value / 10

    def convert_value_from_standard(self, value):
        """Convert a value in the standard unit to angstroms (value * 10)."""
        return value * 10

    def convert_error_to_standard(self, error):
        """Convert an error in angstroms to the standard unit (error / 10)."""
        # Previously returned `error` unscaled, leaving the uncertainty 10x too
        # large relative to the converted value.
        return error / 10

    def convert_error_from_standard(self, error):
        """Convert an error in the standard unit to angstroms (error * 10)."""
        return error * 10


#Map each unit pattern to the unit class it denotes.
#Raw strings are used so that `\(`, `\)` and `\.` reach the regex engine as written
#instead of being parsed as (invalid) Python string escapes; the pattern text is unchanged.
#NOTE(review): in the Angstrom pattern the trailing `\.?` binds only to the last
#alternative ("angstrom"), unlike the nanometer pattern where it follows the whole group.
units_dict = {
    R(r'((nm|\(nm\)|(N|n)anometers))\.?', group=0): Nanometers,
    R(r'Å|\(Å\)|((A|a)ngstrom)\.?', group=0): Angstrom,
}
#Register the patterns on the dimension and declare nanometers as its standard unit.
barrierthickness.units_dict.update(units_dict)
barrierthickness.standard_units = Nanometers()

In [18]:
#the barrier thickness property model with autosentence parser
class BarrierThickness(BarrierThicknessModel):
    """Property model for the barrier (layer) thickness."""

    # Specifier phrase: the token pair "layer thickness" or "layer sequences",
    # with the `join` action applied to the match.
    # NOTE(review): the sample sentences in this notebook say "layer sequence"
    # (singular) and "layer thicknesses" (plural); if I() only matches the exact
    # token, neither form is covered by this expression -- confirm and extend.
    specifier_expr = (
        (I('layer') + I('thickness')) | (I('layer') + I('sequences'))
    ).add_action(join)

    specifier = StringType(
        parse_expression=specifier_expr,
        required=True,
        contextual=True,
    )
    compound = ModelType(Compound, required=False, contextual=True)

    # NOTE(review): this sets `required` on the 'names' field of
    # compound.model_class (presumably the Compound class itself), so the change
    # would not be local to this model -- verify that is intended.
    compound.model_class.fields['names'].required = True

In [19]:
#create one AutoSentenceParser instance, bind it to the model and register that same
#instance as the model's parser (assigning `.model` on the AutoSentenceParser class
#itself would change the attribute for every parser instance)
sentence_parser = AutoSentenceParser()
sentence_parser.model = BarrierThickness
BarrierThickness.parsers = [sentence_parser]

In [20]:
#sample sentences from journal papers
#s1-s3: the slash-separated layer sequence is followed by its unit (nm, (nm), Å)
#s4: the unit (Å) is stated before the slash-separated values, which carry no trailing unit
s1=  Sentence('The layer sequence of one period for the active region is as following: 5.2/10.3/1.7/10.75/3.6/8.8/3.95/17.2 nm.')
s2 = Sentence('The Al0.15Ga0.85As/GaAs layer sequence in each stage of the active region is 3.8/14.0/0.6/9.0/0.6/15.8/1.5/12.8/1.8/12.2/2.0/12.0/2.0/11.4/2.7/11.3/3.5/11.6 (nm).')
s3 = Sentence('The improved structure has layer sequence 31/93/14/73.4/23/155.4/11/110.2/14/84.7/20/155.4/17/110.1 Å, where altered layers are highlighted in red (and italicized well is doped to 3.15 × 1016 cm−3).')
s4 = Sentence('Starting at the right-hand injector well, the layer thicknesses in Å are 155/41/65/25/79/55/90/ 30.')

In [21]:
#attach the BarrierThickness property model to every sample sentence
#(each sentence receives its own one-element list)
for sample_sentence in (s1, s2, s3, s4):
    sample_sentence.models = [BarrierThickness]

In [23]:
#pretty-print the serialized records obtained from s1 (values followed by "nm")
pprint(s1.records.serialize())

[{'BarrierThickness': {'raw_units': 'nm',
                       'raw_value': '17.2',
                       'units': 'Nanometers^(1.0)',
                       'value': [17.2]}}]


From the above output, the autosentence parser only recognizes the last value in the layer sequence.

In [24]:
#pretty-print the serialized records obtained from s2 (values followed by "(nm)", compound named)
pprint(s2.records.serialize())

[{'BarrierThickness': {'compound': {'Compound': {'names': ['Al0.15Ga0.85As / '
                                                           'GaAs']}},
                       'raw_units': '(nm)',
                       'raw_value': '11.6',
                       'units': 'Nanometers^(1.0)',
                       'value': [11.6]}},
 {'Compound': {'names': ['Al0.15Ga0.85As / GaAs']}}]


As the output above shows, here too the AutoSentenceParser only recognizes the last value in the layer sequence.

In [25]:
#pretty-print the serialized records obtained from s3 (values followed by "Å")
pprint(s3.records.serialize())

[{'BarrierThickness': {'raw_units': 'Å',
                       'raw_value': '110.1',
                       'units': 'Angstrom^(1.0)',
                       'value': [110.1]}}]


As the output above shows, here too the AutoSentenceParser only recognizes the last value in the layer sequence.

In [26]:
#pretty-print the serialized records obtained from s4 (unit "Å" stated before the values)
pprint(s4.records.serialize())

[]


In the above case, the unit (Å) is mentioned before the layer sequence, so the values are not directly followed by a unit. The AutoSentenceParser does not match this form and returns no record. This is the case for most of the layer sequences/thicknesses mentioned in the papers.