In [79]:
#libraries
from chemdataextractor import Document
from chemdataextractor.model import Compound,BaseModel
from chemdataextractor.doc import Paragraph, Heading,Sentence
from chemdataextractor.reader.pdf import PdfReader
from pprint import pprint

In [50]:
#working temperature property model
from chemdataextractor.model.units import TemperatureModel, Temperature, Kelvin
from chemdataextractor.model import ListType, ModelType, StringType, Compound
from chemdataextractor.parse import I, AutoSentenceParser, join

class WorkingTemperature(TemperatureModel):
    value = StringType()
    units = StringType(contextual=True)

In [51]:
#grammar rules for identifyimg the the working temperature
import re
from chemdataextractor.parse import R, I, W, Optional, merge

prefix = (I('Tmax') +I('of') |I('(')+I('Tmax')+I(')')+I('of')|I('operating')+I('temperature')+I('of')|I('characteristic')+I('temperature')+I('To')+I('=')| (R('^\(T.*max\)$', re.I)+ I('of')) |I('pulsed')+ I('mode') +I('at') + I('temperatures') + I('up')+I('to')|I('working')+I('temperature')+I('of')| I('maximum')+ I('operation') + I('temperature') + I('of')| (I('heat')+ I('-')+I('sink')+I('temperature')+ I('of'))).hide()
units = (Optional(W('°')) + Optional(R('^°?[CFK]\.?$')))('units').add_action(merge)
value = R('^\d+(\.\d+)?$')('value')
wt = (prefix + value + units)('wt')

In [52]:
#the logic for parsing the temperature value
from chemdataextractor.parse.base import BaseSentenceParser
from chemdataextractor.utils import first
from lxml import etree

class WtParser(BaseSentenceParser):
    root = wt

    def interpret(self, result, start, end):
        raw_value = first(result.xpath('./value/text()'))
        raw_units = first(result.xpath('./units/text()'))
        working_temp = self.model(raw_value=raw_value,
                    raw_units=raw_units,
                    value=self.extract_value(raw_value),
                    error=self.extract_error(raw_value),
                    units=self.extract_units(raw_units, strict=True))
        yield working_temp

In [53]:
#setting the parser and the property model
WorkingTemperature.parsers = [WtParser()]
sentence_parser = WtParser
sentence_parser.model = WorkingTemperature

In [54]:
#sample sentences from the qcl journals
s1=  Sentence('A large current density dynamic range is observed, leading to a maximum operation temperature of 150 K for the double metal waveguide device')
s2 = Sentence('The laser operated up to a heat-sink temperature of 110K in pulsed mode, 95K in continuous wave (cw) mode, and the threshold current density at 5K was ∼140A∕cm2.')
s3 = Sentence('In pulsed operation, the device lased up to a maximum heat-sink temperature (Tmax) of 111K. The threshold current density (Jth) at 6K was 142A∕cm2. ')

In [55]:
#specifying the model to be used for extracting the temperature values
s1.models = [WorkingTemperature]
s2.models = [WorkingTemperature]
s3.models = [WorkingTemperature] 

In [56]:
#temperature record in s1
for result in sentence_parser.root.scan(s1.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s1.records.serialize())

b'<wt><value>150</value><units>K</units></wt>' 

[{'WorkingTemperature': {'raw_units': 'K',
                         'raw_value': '150',
                         'units': 'Kelvin^(1.0)',
                         'value': '[150.0]'}}]


In [57]:
#temperature record in s2
for result in sentence_parser.root.scan(s1.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s2.records.serialize())

b'<wt><value>150</value><units>K</units></wt>' 

[{'WorkingTemperature': {'raw_units': 'K',
                         'raw_value': '110',
                         'units': 'Kelvin^(1.0)',
                         'value': '[110.0]'}}]


In [58]:
#temperature record in s3
for result in sentence_parser.root.scan(s1.tagged_tokens):
    print(etree.tostring(result[0]), '\n')
    pprint(s3.records.serialize())

b'<wt><value>150</value><units>K</units></wt>' 

[{'WorkingTemperature': {'raw_units': 'K.',
                         'raw_value': '111',
                         'units': 'Kelvin^(1.0)',
                         'value': '[111.0]'}}]


In [94]:
#reading a sample journal paper
with open('../../Evaluation Data(Articles)/oscillator power-amplifier.pdf', 'rb') as f:
    doc = Document.from_file(f,fname="oscillator power-amplifier.pdf")
print(doc.paragraphs)

[Paragraph(id=None, references=[], text='Terahertz master-oscillator power-amplifier \nquantum cascade laser with a grating coupler \nof extremely low reflectivity'), Paragraph(id=None, references=[], text='HUAN ZHU,1,2 HAIQING ZHU,1,2 FANGFANG WANG,1 GAOLEI CHANG,1,2 \nCHENREN YU,1,2 QUAN YAN,1 JIANXIN CHEN,1 LIANHE LI,3 A. GILES \nDAVIES,3 EDMUND H. LINFIELD,3 ZHOU TANG,4 PINGPING CHEN,4 WEI LU,4 \nGANGYI XU,1,* AND LI HE\n1Key Laboratory of Infrared Imaging Materials and Detectors, Shanghai Institute of Technical Physics, \nChinese Academy of Sciences, Shanghai 200083, China \n2University of Chinese Academy of Sciences, Beijing 100049, China \n3School of Electronic and Electrical Engineering, University of Leeds, Leeds LS2 9JT, United Kingdom \n4National Laboratory of Infrared Physics, Shanghai Institute of Technical Physics, Chinese Academy of \nSciences, Shanghai 200083, China \n*gangyi.xu@mail.sitp.ac.cn'), Paragraph(id=None, references=[], text='1'), Paragraph(id=None, reference

In [69]:
#Document Metadata
doc.metadata

IndexError: list index out of range

In [None]:
#document elements
#doc.elements

In [93]:
#setting the property model for the document
doc.models = [WorkingTemperature]

In [97]:
#analyzing the individual tokens in the document and extracting the temperature record
for p in doc.paragraphs:
    for s in p.sentences:
        x=s.tagged_tokens
        for result in sentence_parser.root.scan(x):
            print("ici")
            print(etree.tostring(result[0]), '\n')
            pprint(s.records.serialize())

In [67]:
#analyzing the whole document tokens at once
doc.models = [WorkingTemperature]
results=doc.records.serialize()
pprint(results)

[]
