In [1]:
#future imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

In [2]:
#libraries
from chemdataextractor import Document
from chemdataextractor.model import BaseModel, Compound, StringType, ModelType
from chemdataextractor.model.units import TemperatureModel, LengthModel
from chemdataextractor.parse import R, I, W, Optional, merge, join, AutoSentenceParser
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore
from chemdataextractor.doc import Paragraph, Heading, Sentence
from lxml import etree
from pprint import pprint

In [3]:
#Defining the frequency model to handle the frequency units and its conversions
from chemdataextractor.model.units.dimension import Dimension
from chemdataextractor.model.units.quantity_model import QuantityModel
from chemdataextractor.model.units.unit import Unit
from chemdataextractor.model.units.charge import Charge
from chemdataextractor.model.units.energy import Energy
from chemdataextractor.model.units.electric_potential import ElectricPotential
from chemdataextractor.model.units.current import ElectricalCurrent
import logging
log = logging.getLogger(__name__)

class Frequency(Dimension):
    """Physical dimension used for frequency quantities.

    The class body is intentionally empty: the unit lookup table and the
    standard unit are assigned on the class further down, once the unit
    classes have been defined.
    """


class FrequencyModel(QuantityModel):
    """Quantity model whose ``dimensions`` attribute is a ``Frequency`` instance."""
    dimensions = Frequency()


class FrequencyUnit(Unit):
    """Base class shared by all units of the ``Frequency`` dimension."""

    def __init__(self, magnitude=0.0, powers=None):
        """Initialise the unit.

        ``magnitude`` and ``powers`` are forwarded unchanged to
        ``Unit.__init__`` together with a new ``Frequency()`` instance.
        """
        dimension = Frequency()
        super(FrequencyUnit, self).__init__(dimension, magnitude, powers)


class Hertz(FrequencyUnit):
    """Frequency in hertz.

    Every conversion hands its argument back untouched, i.e. a number
    expressed in hertz is already in the form the conversions produce.
    """

    # --- to standard -------------------------------------------------------
    def convert_value_to_standard(self, value):
        """Return ``value`` unchanged."""
        return value

    def convert_error_to_standard(self, error):
        """Return ``error`` unchanged."""
        return error

    # --- from standard -----------------------------------------------------
    def convert_value_from_standard(self, value):
        """Return ``value`` unchanged."""
        return value

    def convert_error_from_standard(self, error):
        """Return ``error`` unchanged."""
        return error
    
class megahertz(FrequencyUnit):
    """
    Class for frequency in megahertz

    One megahertz is 1,000,000 hertz. Values *and* their errors are scaled by
    that same factor: a purely multiplicative unit change scales an
    uncertainty exactly like the value it belongs to.
    """

    # Hertz per megahertz; kept as an int so integer inputs multiply exactly.
    _HZ_PER_MHZ = 1000000

    def convert_value_to_standard(self, value):
        """Convert ``value`` from megahertz to hertz."""
        return value * self._HZ_PER_MHZ

    def convert_value_from_standard(self, value):
        """Convert ``value`` from hertz to megahertz."""
        return value / self._HZ_PER_MHZ

    def convert_error_to_standard(self, error):
        """Convert ``error`` from megahertz to hertz.

        ``None`` (no error available) is returned unchanged.
        """
        if error is None:
            return None
        return error * self._HZ_PER_MHZ

    def convert_error_from_standard(self, error):
        """Convert ``error`` from hertz to megahertz.

        ``None`` (no error available) is returned unchanged.
        """
        if error is None:
            return None
        return error / self._HZ_PER_MHZ
    
class terahertz(FrequencyUnit):
    """
    Class for frequency in terahertz

    One terahertz is 1,000,000,000,000 hertz. Values *and* their errors are
    scaled by that same factor: a purely multiplicative unit change scales an
    uncertainty exactly like the value it belongs to.
    """

    # Hertz per terahertz; kept as an int so integer inputs multiply exactly.
    _HZ_PER_THZ = 1000000000000

    def convert_value_to_standard(self, value):
        """Convert ``value`` from terahertz to hertz."""
        return value * self._HZ_PER_THZ

    def convert_value_from_standard(self, value):
        """Convert ``value`` from hertz to terahertz."""
        return value / self._HZ_PER_THZ

    def convert_error_to_standard(self, error):
        """Convert ``error`` from terahertz to hertz.

        ``None`` (no error available) is returned unchanged.
        """
        if error is None:
            return None
        return error * self._HZ_PER_THZ

    def convert_error_from_standard(self, error):
        """Convert ``error`` from hertz to terahertz.

        ``None`` (no error available) is returned unchanged.
        """
        if error is None:
            return None
        return error / self._HZ_PER_THZ

# Regular expressions that recognise frequency unit strings, mapped to the
# unit class each one stands for.
# - Raw strings are used so that ``\.`` is a regex escape rather than an
#   (invalid) Python string escape.
# - Megahertz is matched as "MHz" (capital M): "mHz" is the SI symbol for
#   millihertz and must not be interpreted as megahertz.
# - Each pattern is wrapped in a group so the optional trailing "." applies to
#   the symbol and the spelled-out name alike, as in the hertz pattern.
units_dict = {
    R(r'((Hz|(H|h)ertz))\.?', group=0): Hertz,
    R(r'(MHz|(m|M)ega(H|h)ertz)\.?', group=0): megahertz,
    R(r'(THz|(t|T)era(H|h)ertz)\.?', group=0): terahertz,
}
# Register the patterns on the dimension and declare hertz as its standard unit.
Frequency.units_dict.update(units_dict)
Frequency.standard_units = Hertz()

In [4]:
#lasing frequency property model
class LasingFrequency(FrequencyModel):
    """The lasing frequency property.

    Inherits from ``FrequencyModel`` so that the extracted value carries
    frequency units.
    """
    # Phrases that introduce a lasing frequency in text. Each alternative is a
    # sequence of ``I`` token matchers; the matched tokens are combined into a
    # single string by the ``join`` action.
    specifier_expr = (
        I('operating') + I('predominantly') + I('at')
        | I('emission') + I('frequency')  # was misspelled 'emmission' and never matched
        | I('lasing') + I('predominantly') + I('at') + I('a') + I('frequency')
        | I('lasing') + I('frequency')
        | I('cutoff') + I('frequency')
        | I('corresponding') + I('to')
        | I('broad') + I('spectral') + I('range')
    ).add_action(join)
    specifier = StringType(parse_expression=specifier_expr, required=True, contextual=True)
    compound = ModelType(Compound, required=False, contextual=True)
    # Marks the ``names`` field of the compound's model class as required.
    # NOTE(review): this modifies the field on the shared ``Compound`` class
    # itself, so it presumably affects every model that uses ``Compound`` in
    # this session -- confirm that is intended.
    compound.model_class.fields['names'].required = True
    parsers = [AutoSentenceParser()]  # using the default parser

In [5]:
# Sample sentence taken from a QCL (quantum cascade laser) journal article,
# used as a small test input for the LasingFrequency model
s1=  Sentence('which is considerably higher than that achieved for previously reported THz QCLs operating around the frequency of 2 THz.')

In [6]:
# Specify the model used for extraction on this sentence: the lasing frequency
# model (not a temperature model)
s1.models = [LasingFrequency]

In [7]:
# Pretty-print the serialized records extracted from the sample sentence
pprint(s1.records.serialize())

[{'LasingFrequency': {'raw_units': 'THz',
                      'raw_value': '2',
                      'units': 'terahertz^(1.0)',
                      'value': [2.0]}}]


In [8]:
# Read a sample journal paper (HTML file) into a Document; the file is opened
# in binary mode and closed automatically by the ``with`` block
with open('345.html', 'rb') as f:
    doc = Document.from_file(f)
print(doc)

<Document: 204 elements>


In [9]:
# Show the metadata of the loaded document (displayed as the cell's output)
doc.metadata

{'title': '3.4-THz quantum cascade laser based on longitudinal-optical-phonon scattering for depopulation', 'authors': ['Benjamin S.  Williams', 'Hans  Callebaut', 'Sushil  Kumar', 'Qing  Hu', 'John L.  Reno'], 'publisher': 'American Institute of PhysicsAIP', 'journal': 'Applied Physics Letters', 'date': '2003-02-17', 'volume': '82', 'issue': '7', 'firstpage': '1015', 'doi': '10.1063/1.1554479', 'pdf_url': 'https://aip.scitation.org/doi/pdf/10.1063/1.1554479', 'html_url': 'https://aip.scitation.org/doi/abs/10.1063/1.1554479'}

In [10]:
# Extract the lasing frequency property (not temperature) from the whole
# document and pretty-print the serialized records
doc.models = [LasingFrequency]
results=doc.records.serialize()
pprint(results)

[{'LasingFrequency': {'compound': {'Compound': {'names': ['SIGN IN']}},
                      'raw_units': 'THz',
                      'raw_value': '3.44',
                      'specifier': 'corresponding to',
                      'units': 'terahertz^(1.0)',
                      'value': [3.44]}}]
