### Example extraction of a paper from the Royal Society of Chemistry

#### Import the required modules

In [1]:
from tadf_models.models import *
from chemdataextractor.doc import Document, Citation
import json
import importlib.resources
from chemdataextractor.model import ThemeCompound
import os
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


#### Load the compound blocklists

In [2]:
# Read each blocklist resource shipped inside the ``tadf_models`` package and
# break it into one entry per line.
tadf_blocklist = importlib.resources.read_text(
    'tadf_models', 'tadf_blocklist_6_more_abbrev_enriched', encoding='utf8'
).split('\n')
element_blocklist = importlib.resources.read_text(
    'tadf_models', 'elements', encoding='utf8'
).split('\n')

# Rebind the attribute to a brand-new list (existing entries first, then the
# two blocklists) instead of extending the current list object in place.
ThemeCompound.name_blocklist = [
    *ThemeCompound.name_blocklist,
    *tadf_blocklist,
    *element_blocklist,
]

In [3]:
class TADFExtractor():
    """Class that extracts records from a document and returns them as serialized dictionaries."""

    # Substrings whose presence in the raw file text marks a paper as incomplete.
    _INCOMPLETE_MARKERS = (
        # RSC case
        "(Note: The full text of this document is currently only available in the",
        # Elsevier case
        "<xocs:rawtext",
    )

    def __init__(self, paper_root, models):
        """
        Initialise the class with the directory of the documents and the models to extract.
        :param str paper_root: directory that stores the paper.
        :param list models: list of ChemDataExtractor models to extract.
        """
        # Most recently serialized record; overwritten for every record processed.
        self.dic = None
        self.paper_root = paper_root
        self.models = models
        # Running total of records extracted by this instance; it is never
        # reset, so it accumulates across successive extraction() calls.
        self.count = 0

    @staticmethod
    def _fallback_doi(file):
        """
        Return the bare file name of a document, used in place of a missing DOI.
        :param file: path to the document, as a string or a path-like object.
        :returns: the final component of the path.
        :rtype: str
        """
        return os.path.basename(os.fspath(file))

    def is_incomplete_paper(self, fstring):
        """
        Method that checks if a document is incomplete.
        :param str fstring: document in plain text string.
        :returns: True if the text contains one of the known incompleteness markers, False otherwise.
        :rtype: Boolean.
        """
        print('Checking completeness.')
        return any(marker in fstring for marker in self._INCOMPLETE_MARKERS)

    def load_document(self, file):
        """
        Method that loads a document into a chemdataextractor.doc.Document object.
        :param str file: path to the document.
        :returns: the Document object and if the document is incomplete.
        :rtype: chemdataextractor.doc.Document, Boolean
        """
        d = Document.from_file(file)
        # The raw text is read separately because the completeness check
        # searches the file contents as a plain string.
        with open(file, encoding="utf8") as f:
            fstring = f.read()

        return d, self.is_incomplete_paper(fstring)

    def extraction(self, file):
        """
        Method that extracts records from a document and returns them in serialized form.
        :param str file: name of the document, relative to ``paper_root``.
        :returns: list of record dictionaries, each with the added keys ``doi`` and
            ``record_method``; None if the document is incomplete.
        :rtype: list[dict] or None
        """
        print(f'Attempting to extract {file}.')

        doc, incomplete = self.load_document(os.path.join(self.paper_root, file))

        if incomplete:
            print(f'{file} is incomplete!')
            return None

        print(f"{file} is complete, extracting.")

        try:
            doi = doc.metadata.doi
        except IndexError:
            # An IndexError here is treated as "the document has no metadata";
            # the file name then stands in for the DOI.
            print("MetaData is Empty.")
            doi = self._fallback_doi(file)

        doc.models = self.models
        # NOTE(review): presumably keeps citation entries out of the extraction -
        # confirm against the ChemDataExtractor documentation.
        doc.skip_elements = [Citation]

        records = []
        for r in doc.records:
            self.dic = r.serialize()
            self.dic['doi'] = doi
            self.dic['record_method'] = r.record_method
            records.append(self.dic)
            self.count += 1

        print(f"{self.count} records in total")
        print(f"{file} extracted.")
        return records

#### Start the extraction!

In [6]:
# The extraction takes roughly 5 minutes to run.
# Build an extractor that reads papers from "papers_for_extraction" and
# extracts only the STSplit model.
mdb = TADFExtractor(
    paper_root="papers_for_extraction",
    models=[STSplit],
)
# Extract the single test paper and keep the returned records.
records = mdb.extraction("test.html")

Attempting to extract test.html.
Checking completeness.
test.html is complete, extracting.
10 records in total
test.html extracted.


In [7]:
# Pretty-print the records returned by mdb.extraction("test.html").
pprint(records)

[{'STSplit': {'compound': {'ThemeCompound': {'labels': ['2'],
                                             'names': ['2,4-2CzBN',
                                                       '2,4-di(9H-carbazol-9-yl)benzonitrile',
                                                       'CzBNs'],
                                             'roles': ['nesting theme']}},
              'raw_units': 'eV',
              'raw_value': '0.5',
              'specifier': 'ΔESTs',
              'units': 'ElectronVolt^(1.0)',
              'value': [0.5]},
  'doi': '10.1039/C5SC04755B',
  'record_method': 'QuantityModelTemplateParser'},
 {'STSplit': {'compound': {'ThemeCompound': {'names': ['2,4,6-tri(9H-carbazol-9-yl)benzonitrile',
                                                       '3CzBN'],
                                             'roles': ['nesting theme']}},
              'raw_units': 'eV',
              'raw_value': '0.39',
              'specifier': 'ΔESTs',
              'units': 'Electron