In [1]:
#future imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

In [2]:
#libraries
from chemdataextractor import Document
from chemdataextractor.model import BaseModel, Compound, StringType, ModelType
from chemdataextractor.model.units import TemperatureModel, LengthModel
from chemdataextractor.parse import R, I, W, Optional, merge, join, AutoSentenceParser
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore
from chemdataextractor.doc import Paragraph, Heading, Sentence
from lxml import etree
from pprint import pprint

In [3]:
#Units and models definition for power in mW and µW
#Extends the power model with milliwatt and microwatt unit classes so that power measurements in milliwatts and microwatts can be captured.
from chemdataextractor.model.units.dimension import Dimension
from chemdataextractor.model.units.quantity_model import QuantityModel
from chemdataextractor.model.units.unit import Unit
from chemdataextractor.model.units.charge import Charge
from chemdataextractor.model.units.energy import Energy
from chemdataextractor.model.units.electric_potential import ElectricPotential
from chemdataextractor.model.units.current import ElectricalCurrent
import logging
log = logging.getLogger(__name__)

class Power(Dimension):
    """Dimension for power, composed as electric potential multiplied by electrical current."""
    constituent_dimensions = ElectricPotential() * ElectricalCurrent()


class PowerModel(QuantityModel):
    """Quantity model whose dimensions are those of Power."""
    dimensions = Power()


class PowerUnit(Unit):
    """Common base class for every unit of power defined in this notebook."""

    def __init__(self, magnitude=0.0, powers=None):
        """
        Create a power unit.

        :param magnitude: forwarded unchanged to ``Unit.__init__``.
        :param powers: forwarded unchanged to ``Unit.__init__``.
        """
        # Every power unit is tied to a fresh Power dimension instance.
        power_dimension = Power()
        super(PowerUnit, self).__init__(power_dimension, magnitude, powers)


class Watt(PowerUnit):
    """
    Class for power in watts.

    Watt is installed as the standard unit of Power in this notebook, so
    every conversion below returns its argument unchanged.
    """

    def convert_value_to_standard(self, value):
        # Identity: the value is returned as-is.
        return value

    def convert_value_from_standard(self, value):
        # Identity: the value is returned as-is.
        return value

    def convert_error_to_standard(self, error):
        # Identity: the error is returned as-is.
        return error

    def convert_error_from_standard(self, error):
        # Identity: the error is returned as-is.
        return error
    
class milliwatt(PowerUnit):
    """
    Class for power in milliwatt

    Values and their errors are converted with the same factor of 1000,
    so that an uncertainty stays consistent with the value it belongs to.
    """

    def convert_value_to_standard(self, value):
        """Convert a value given in milliwatts to the standard unit."""
        return value/1000

    def convert_value_from_standard(self, value):
        """Convert a value given in the standard unit to milliwatts."""
        return value*1000

    def convert_error_to_standard(self, error):
        """Convert an error given in milliwatts to the standard unit."""
        # An error is a width on the same scale as the value, so it must be
        # scaled by the same factor rather than passed through unchanged.
        return error/1000

    def convert_error_from_standard(self, error):
        """Convert an error given in the standard unit to milliwatts."""
        return error*1000
    
class microwatt(PowerUnit):
    """
    Class for power in microwatt

    Values and their errors are converted with the same factor of 1000000,
    so that an uncertainty stays consistent with the value it belongs to.
    """

    def convert_value_to_standard(self, value):
        """Convert a value given in microwatts to the standard unit."""
        return value/1000000

    def convert_value_from_standard(self, value):
        """Convert a value given in the standard unit to microwatts."""
        return value*1000000

    def convert_error_to_standard(self, error):
        """Convert an error given in microwatts to the standard unit."""
        # An error is a width on the same scale as the value, so it must be
        # scaled by the same factor rather than passed through unchanged.
        return error/1000000

    def convert_error_from_standard(self, error):
        """Convert an error given in the standard unit to microwatts."""
        return error*1000000

# Patterns that recognise a power unit in text, mapped to the unit class they denote.
# Raw strings keep the regex escapes (``\.``) intact without relying on Python
# passing unknown escape sequences through.
units_dict = {
    R(r'(W|w)(att(s)?)?', group=0): Watt,
    # "mW", or the spelled-out "milliwatt(s)" / "milliWatt(s)".
    R(r'(mW|(milli(W|w)att(s)?))\.?', group=0): milliwatt,
    # "µW", or the spelled-out "microwatt(s)" / "microWatt(s)"; this must map
    # to microwatt, not milliwatt.
    R(r'(µW|(micro(W|w)att(s)?))\.?', group=0): microwatt,
}
Power.units_dict.update(units_dict)
Power.standard_units = Watt()

In [4]:
#optical power property model
class OpticalPower(PowerModel):
    """Property model for optical power, derived from PowerModel."""
    # Declared as a string field; the records serialized in this notebook show
    # the value as a string such as '[2.5]'.
    # NOTE(review): presumably this replaces a field inherited from the parent
    # quantity model - confirm that storing the value as a string is intended.
    value = StringType()
    # String field declared with contextual=True.
    units = StringType(contextual=True)

In [5]:
#grammar rules for identifying the optical power
import re
from chemdataextractor.parse import R, I, W, Optional, merge

# Phrases that introduce an optical power value. Alternatives are tried in the
# order listed; the matched prefix is hidden so it does not appear in the result.
prefix = (
    I('peak') + I('power') + I('of')
    | I('peak') + I('-') + I('power') + I('of')
    | I('peak') + I('power') + I('is') + I('approximately')
    | I('maximum') + I('output') + I('power') + I('of')
    | I('peak') + I('optical') + I('powers') + I('up') + I('to')
).hide()
# Only milliwatt and microwatt unit tokens are accepted by this grammar.
units = (W('mW') | W('µW'))('units').add_action(merge)
# An integer or decimal number occupying a whole token. The pattern is a raw
# string so that ``\d`` and ``\.`` are not treated as (invalid) string escapes.
value = R(r'^\d+(\.\d+)?$')('value')
# Full rule: introducing phrase, then the number, then the unit.
op = (prefix + value + units)('op')

In [6]:
#the logic for parsing the optical power value
from chemdataextractor.parse.base import BaseSentenceParser
from chemdataextractor.utils import first
from lxml import etree

class OpParser(BaseSentenceParser):
    """Sentence parser that turns matches of the ``op`` grammar into model records."""

    root = op

    def interpret(self, result, start, end):
        """
        Build one record of ``self.model`` from a parse result.

        :param result: parse result element queried with XPath for the
            ``value``, ``units`` and ``cem`` children.
        :param start: not used by this method.
        :param end: not used by this method.
        :returns: generator yielding a single record.
        """
        value_text = first(result.xpath('./value/text()'))
        units_text = first(result.xpath('./units/text()'))

        # Derive the structured fields from the raw strings, in the same
        # order as they are passed to the model below.
        parsed_value = self.extract_value(value_text)
        parsed_error = self.extract_error(value_text)
        parsed_units = self.extract_units(units_text, strict=True)

        record = self.model(raw_value=value_text,
                            raw_units=units_text,
                            value=parsed_value,
                            error=parsed_error,
                            units=parsed_units)

        # Attach compound names and labels only when the match contains a cem element.
        compound_node = first(result.xpath('./cem'))
        if compound_node is not None:
            record.compound = Compound()
            record.compound.names = compound_node.xpath('./name/text()')
            record.compound.labels = compound_node.xpath('./label/text()')

        yield record

In [7]:
# Register the optical power parser defined above as the only parser of the
# OpticalPower property model.
OpticalPower.parsers = [OpParser()]

In [8]:
# Sample sentences taken from QCL journal papers; each one states an optical
# power using a different introducing phrase.
s1=  Sentence('Operating at 5 K in pulsed mode, the threshold current density is 840 A/cm2, and the peak power is approximately 2.5 mW.')
s2 = Sentence('The devices exhibit threshold current densities of 0.75 kA/cm2 and provide peak optical powers up to 9mW.')
s3 = Sentence('Laser spectra were measured with a room temperature detector, making the whole setup cryogenic low temperatures (40 K), a maximum output power of 200 mW was measured.')

In [9]:
# Tell each sample sentence to extract records with the OpticalPower model.
for sample_sentence in (s1, s2, s3):
    sample_sentence.models = [OpticalPower]

In [10]:
# Pretty-print the serialized records extracted from the first sample sentence.
pprint(s1.records.serialize())

[{'OpticalPower': {'raw_units': 'mW',
                   'raw_value': '2.5',
                   'units': '(10^-3.0) * Watt^(1.0)',
                   'value': '[2.5]'}}]


In [11]:
# Pretty-print the serialized records extracted from the second sample sentence.
pprint(s2.records.serialize())

[{'OpticalPower': {'raw_units': 'mW',
                   'raw_value': '9',
                   'units': '(10^-3.0) * Watt^(1.0)',
                   'value': '[9.0]'}}]


In [12]:
# Pretty-print the serialized records extracted from the third sample sentence.
pprint(s3.records.serialize())

[{'OpticalPower': {'raw_units': 'mW',
                   'raw_value': '200',
                   'units': '(10^-3.0) * Watt^(1.0)',
                   'value': '[200.0]'}}]


In [13]:
# Read a sample journal paper from its HTML file (opened in binary mode) and
# show a short summary of the parsed document.
with open('345.html', 'rb') as paper_file:
    doc = Document.from_file(paper_file)
print(doc)

<Document: 204 elements>


In [14]:
# Display the metadata of the loaded document.
doc.metadata

{'title': '3.4-THz quantum cascade laser based on longitudinal-optical-phonon scattering for depopulation', 'authors': ['Benjamin S.  Williams', 'Hans  Callebaut', 'Sushil  Kumar', 'Qing  Hu', 'John L.  Reno'], 'publisher': 'American Institute of PhysicsAIP', 'journal': 'Applied Physics Letters', 'date': '2003-02-17', 'volume': '82', 'issue': '7', 'firstpage': '1015', 'doi': '10.1063/1.1554479', 'pdf_url': 'https://aip.scitation.org/doi/pdf/10.1063/1.1554479', 'html_url': 'https://aip.scitation.org/doi/abs/10.1063/1.1554479'}

In [15]:
# Extract the optical power property from the whole document and pretty-print
# the serialized records.
doc.models = [OpticalPower]
results=doc.records.serialize()
pprint(results)

[{'OpticalPower': {'raw_units': 'mW',
                   'raw_value': '2.5',
                   'units': '(10^-3.0) * Watt^(1.0)',
                   'value': '[2.5]'}}]
