In [1]:
#future imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

In [2]:
#libraries
from chemdataextractor import Document
from chemdataextractor.model import BaseModel, Compound, StringType, ModelType
from chemdataextractor.model.units import TemperatureModel, LengthModel
from chemdataextractor.parse import R, I, W, Optional, merge, join, AutoSentenceParser
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore
from chemdataextractor.doc import Paragraph, Heading, Sentence
from lxml import etree
from pprint import pprint

In [3]:
#Defining the frequency model to handle the frequency units and its conversions
from chemdataextractor.model.units.dimension import Dimension
from chemdataextractor.model.units.quantity_model import QuantityModel
from chemdataextractor.model.units.unit import Unit

import logging
log = logging.getLogger(__name__)

class Frequency(Dimension):
    """
    Dimension subclass for frequencies.
    """
    pass


class FrequencyModel(QuantityModel):
    dimensions = Frequency()


class FrequencyUnit(Unit):

    def __init__(self, magnitude=0.0, powers=None):
        super(FrequencyUnit, self).__init__(Frequency(), magnitude, powers)


class Hertz(FrequencyUnit):

    def convert_value_to_standard(self, value):
        return value

    def convert_value_from_standard(self, value):
        return value

    def convert_error_to_standard(self, error):
        return error

    def convert_error_from_standard(self, error):
        return error
    
class Megahertz(FrequencyUnit):
    """
    Class for frequency in Megahertz
    """

    def convert_value_to_standard(self, value):
        return value*1000000

    def convert_value_from_standard(self, value):
        return value/1000000
    def convert_error_to_standard(self, error):
        return error

    def convert_error_from_standard(self, error):
        return error
    
class Terahertz(FrequencyUnit):
    """
    Class for frequency in Terahertz
    """

    def convert_value_to_standard(self, value):
        return value*1000000000000

    def convert_value_from_standard(self, value):
        return value/1000000000000
    def convert_error_to_standard(self, error):
        return error

    def convert_error_from_standard(self, error):
        return error

units_dict = {R('((Hz|(H|h)ertz))\.?', group=0): Hertz, R('mHz|((m|M)ega(H|h)ertz)\.?', group=0): Megahertz, R('THz|((t|T)era(H|h)ertz)\.?', group=0): Terahertz}
Frequency.units_dict.update(units_dict)
Frequency.standard_units = Hertz()

In [4]:
#the lasing frequency property model
from chemdataextractor.model.units import TemperatureModel, Temperature, Kelvin
from chemdataextractor.model import ListType, ModelType, StringType, Compound
from chemdataextractor.parse import I, AutoSentenceParser, join

class LasingFrequency(FrequencyModel):
    value = StringType()
    units = StringType(contextual=True)

In [5]:
#grammar rules for identifyimg the lasing frequency
import re
from chemdataextractor.parse import R, I, W, Optional, merge

prefix = prefix = (I('lasing')+I('frequency')+I('of')|I('lasing')+I('predominantly')+I('at')+I('a')+I('frequency')+I('of')|I('cutoff')+I('frequency')+I('of')|I('centered')+I('around')|I('emmission')+I('frequency')+I('of')|I('corresponding')+I('to') |I('operating')+I('predominantly')+I('at')| I('broad') + I('spectral')+I('range')+I('between')).hide()
units = (W('Hz')|W('mHz')|'THz')('units').add_action(merge)
value = R('^\d+(\.\d+)?$')('value')
lf = (prefix + value + units)('lf')

In [6]:
#the logic for parsing the lasing frequency value
from chemdataextractor.parse.base import BaseSentenceParser
from chemdataextractor.utils import first
from lxml import etree

class LfParser(BaseSentenceParser):
    root = lf

    def interpret(self, result, start, end):
        raw_value = first(result.xpath('./value/text()'))
        raw_units = first(result.xpath('./units/text()'))
        lasing_frequency = self.model(raw_value=raw_value,
                    raw_units=raw_units,
                    value=self.extract_value(raw_value),
                    error=self.extract_error(raw_value),
                    units=self.extract_units(raw_units, strict=True))
        yield lasing_frequency

In [7]:
#setting the parser and the property model
LasingFrequency.parsers = [LfParser()]
sentence_parser = LfParser
sentence_parser.model = LasingFrequency

In [8]:
#sample sentences from the qcl journals
s1 = Sentence('We report terahertz quantum-cascade lasers operating predominantly at 1.90 THz')
s2 = Sentence('We report the development of a quantum cascade laser, at l587.2 mm, corresponding to 3.44 THz or 14.2 meV photon energy.')

In [9]:
#specifying the model to be used for extracting the lasing frequency values
s1.models = [LasingFrequency]
s2.models = [LasingFrequency]

In [10]:
#frequency record in s1
for result in sentence_parser.root.scan(s1.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s1.records.serialize())

b'<lf><value>1.90</value><units>THz</units></lf>' 

[{'LasingFrequency': {'raw_units': 'THz',
                      'raw_value': '1.90',
                      'units': 'Terahertz^(1.0)',
                      'value': '[1.9]'}}]


In [11]:
#frequency record in s2
for result in sentence_parser.root.scan(s1.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s2.records.serialize())

b'<lf><value>1.90</value><units>THz</units></lf>' 

[{'LasingFrequency': {'raw_units': 'THz',
                      'raw_value': '3.44',
                      'units': 'Terahertz^(1.0)',
                      'value': '[3.44]'}}]


In [12]:
#reading a sample journal paper
with open('345.html', 'rb') as f:
    doc = Document.from_file(f)
print(doc)

<Document: 510 elements>


In [13]:
#Document Metadata
doc.metadata

{'title': '3.4-THz quantum cascade laser based on longitudinal-optical-phonon scattering for depopulation', 'authors': ['Williams, Benjamin S.', 'Callebaut, Hans', 'Kumar, Sushil', 'Hu, Qing', 'Reno, John L.'], 'publisher': 'AIP Publishing', 'journal': 'Applied Physics Letters', 'volume': '82', 'issue': '7', 'firstpage': '1015', 'lastpage': '1017', 'doi': '10.1063/1.1554479', 'pdf_url': 'https://pubs.aip.org/aip/apl/article-pdf/82/7/1015/12225149/1015_1_online.pdf'}

In [14]:
#document elements
#doc.elements

In [15]:
#setting the property model for the document
doc.models = [LasingFrequency]

In [16]:
#analyzing the individual tokens in the document and extracting the frequency record
for p in doc.paragraphs:
    for s in p.sentences:
        x=s.tagged_tokens
        for result in sentence_parser.root.scan(x):
            print(etree.tostring(result[0]), '\n')
            pprint(s.records.serialize())

b'<lf><value>3.44</value><units>THz</units></lf>' 

[{'LasingFrequency': {'raw_units': 'THz',
                      'raw_value': '3.44',
                      'units': 'Terahertz^(1.0)',
                      'value': '[3.44]'}}]


In [17]:
#analyzing the whole document tokens at once
doc.models = [LasingFrequency]
results=doc.records.serialize()
pprint(results)

[{'LasingFrequency': {'raw_units': 'THz',
                      'raw_value': '3.44',
                      'units': 'Terahertz^(1.0)',
                      'value': '[3.44]'}}]
