# Document MetaData Extraction

## Getting started


### Set notebook parameters

In [1]:
from dsnotebooks.settings import ProjectNotebookSettings

# Notebook settings are loaded automatically from a .env file or from
# environment variables when ProjectNotebookSettings is instantiated.
notebook_settings = ProjectNotebookSettings()

# PROFILE_NAME is later passed to ds.CpsApi.from_env, PROJ_KEY to ds.convert_documents.
PROFILE_NAME = notebook_settings.profile  # the profile to use
PROJ_KEY = notebook_settings.proj_key     # the project to use

# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456

Project key:  1234567890abcdefghijklmnopqrstvwyz123456


### Import example dependencies

In [2]:
import os
import json

import textwrap

import pandas as pd

import deepsearch as ds

from pathlib import Path
from zipfile import ZipFile

from deepsearch.documents.core.export import export_to_markdown
from IPython.display import display, Markdown, HTML, display_html

from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models

from deepsearch_glm.nlp_utils import (
    extract_references_from_doc,
    init_nlp_model,
    list_nlp_model_configs,
)

from tabulate import tabulate

# Make sure the pretrained NLP models are available locally; with verbose=True
# one status line per model is printed (see the cell output below).
models = load_pretrained_nlp_models(verbose=True)

 -> already downloaded part-of-speech
 -> already downloaded reference
 -> already downloaded material
 -> already downloaded language
 -> already downloaded name
 -> already downloaded semantic
 -> already downloaded geoloc


### Connect to Deep Search

In [3]:
# Build the Deep Search API client from the profile selected in the settings cell.
api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)

## Convert Document

In [4]:
# Sample PDF to convert and the local folder that receives the results.
fname = "2206.00785.pdf"
output_dir = Path("./converted_docs")

# Submit the PDF for conversion in the configured project.
documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    source_path=f"../../data/samples/{fname}",
    progress_bar=True,
)

# Download the converted artifacts, then print the conversion summary.
documents.download_all(result_dir=output_dir)

info = documents.generate_report(result_dir=output_dir)
print(info)

Processing input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:00<00:00, 78.98it/s][38;2;15;98;254m                                                                                                                                                              [0m
Submitting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:03<00:00,  3.27s/it][38;2;15;98;254m                                                                                                                                                              [0m
Converting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:27<00:00, 27.58s/it][38;2;15;98;254m                                                                                                                                                              [0m


{'Total documents': 1, 'Successfully converted documents': 1}


In [5]:
# Iterate over the downloaded result archives; every JSON document found inside
# is written to `output_dir` both as pretty-printed JSON and as Markdown.
for output_file in output_dir.rglob("json*.zip"):
    with ZipFile(output_file) as archive:
        for name in archive.namelist():
            if not name.endswith(".json"):
                continue

            doc_json = json.loads(archive.read(name))

            # Archive members may sit in sub-folders: create the target folder first.
            ofile = output_dir / name
            ofile.parent.mkdir(parents=True, exist_ok=True)
            print(f"writing {ofile}")
            # Explicit UTF-8 so the output does not depend on the platform default encoding.
            with ofile.open("w", encoding="utf-8") as fw:
                fw.write(json.dumps(doc_json, indent=2))

            doc_md = export_to_markdown(doc_json)

            # Swap only the final suffix; str.replace would rewrite every ".json"
            # occurring anywhere in the member name.
            ofile = ofile.with_suffix(".md")
            print(f"writing {ofile}")
            with ofile.open("w", encoding="utf-8") as fw:
                fw.write(doc_md)

            

writing converted_docs/2206.00785.json
writing converted_docs/2206.00785.md


In [49]:
# display last document
# display(Markdown(doc_md))

## Extract references from converted Document

In [6]:
def resolve(path, doc):
    """Follow a JSON-pointer-like path through nested dicts and lists.

    Args:
        path: Sequence of path tokens, typically ``"#/texts/3".split("/")``.
            ``"#"`` denotes the document root. List positions are given as
            strings and converted with ``int``.
        doc: The nested dict/list structure to walk.

    Returns:
        The item the path points to. A path consisting only of ``"#"`` returns
        ``doc`` itself. ``None`` is returned for an empty path or when the walk
        reaches a value that is neither a dict nor a list while tokens remain.

    Raises:
        KeyError: A token is not a key of the dict being traversed.
        IndexError: A list position is out of range.
        ValueError: A token used on a list is not an integer.
    """
    tokens = list(path)

    if not tokens:
        return None

    # The bare root pointer refers to the document itself; previously this
    # ended up as doc["#"] and raised KeyError.
    if tokens == ["#"]:
        return doc

    node = doc
    last = len(tokens) - 1
    for pos, token in enumerate(tokens):
        # "#" carries no navigation step unless it is the final token.
        if token == "#" and pos < last:
            continue

        if isinstance(node, dict):
            node = node[token]
        elif isinstance(node, list):
            node = node[int(token)]
        else:
            # Scalars cannot be descended into.
            return None

    return node
    

In [7]:
# Load the JSON document written by the export cell above.
ifile = "converted_docs/2206.00785.json"
doc = json.loads(Path(ifile).read_text())

# Initialise the NLP model from its spec string and run it on the document.
model = init_nlp_model("language;reference;metadata")
res = model.apply_on_doc(doc)

# Each result table stores its rows under "data" and its column names under "headers".
props, insts = (
    pd.DataFrame(res[table]["data"], columns=res[table]["headers"])
    for table in ("properties", "instances")
)

In [16]:
# Select the instances whose subject is the document as a whole and keep
# only the columns needed to read the extracted metadata.
doc_insts = insts.loc[
    insts["subj_name"] == "DOCUMENT",
    ["subtype", "subj_path", "name"],
]
print(doc_insts)

    subtype  subj_path                                               name
0  abstract          #  Abstract-Document understanding is a key busin...
1     title  #/texts/1  Delivering Document Conversion as a Cloud Serv...
2    author  #/texts/1                                     Christoph Auer
3    author  #/texts/1                                Research Ruschlikon
4    author  #/texts/2                                Research Ruschlikon
5    author  #/texts/3                                      Michele Dolfi
6    author  #/texts/3                                Research Ruschlikon
7    author  #/texts/4                                            J Staar
8    author  #/texts/4                                Research Ruschlikon


In [10]:


MIN_CONFIDENCE = 0.8  # keep only "reference" labels above this confidence
MAX_REFS = 6          # number of references to preview

# Properties labelled as bibliographic references with sufficient confidence.
refs = props[(props["label"]=="reference") & (props["confidence"]>MIN_CONFIDENCE)]

# head() replaces the manual counter/break and shows at most MAX_REFS items.
for _, ref in refs.head(MAX_REFS).iterrows():

    # subj_path looks like "#/texts/<n>"; resolve it against the model output.
    item = resolve(ref["subj_path"].split("/"), res)
    print("\n".join(textwrap.wrap(item["text"], 70)))

    # Entities that were detected inside this particular reference.
    ents = insts[insts["subj_hash"]==item["subj_hash"]][["subtype", "name"]]
    print("\nentities:\n", ents, "\n\n")


[1] C. Gopal, C. L. Marshall, D. Vesset, N. Ward-Dutton, J. Hamel, R.
Jyoti, P. Rutten, C. W. Olofson, J. Rydning, S. Rau, and J. Duke, 'IDC
FutureScape: Worldwide future of intelligence 2022 predictions,'
International Data Group, Inc., Needham, MA, Research Report
US47913321, Oct. 2021. [Online]. Available:
https://www.idc.com/getdoc.jsp?containerId=US47913321

entities:
               subtype                                               name
214  reference-number                                                  1
215           authors  C. Gopal, C. L. Marshall, D. Vesset, N. Ward-D...
216       person-name                                            C Gopal
217       person-name                                       C L Marshall
218       person-name                                           D Vesset
219       person-name                                             N Ward
220       person-name                                            J Hamel
221       person-name                  

## Extract Metadata from ingested documents

In [18]:
# Import standard dependencies
from copy import deepcopy
import pandas as pd
from numerize.numerize import numerize
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

# IPython utilities
from IPython.display import display, HTML

# Import the deepsearch-toolkit
import deepsearch as ds
from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource
from deepsearch.cps.queries import DataQuery

In [19]:
# Retrieve every data collection, ordered case-insensitively by name
collections = sorted(api.elastic.list(), key=lambda coll: coll.name.lower())

In [20]:
# Build one summary row per data collection and render the rows as a table
results = []
for coll in collections:
    results.append(
        {
            "Name": coll.name,
            "Type": coll.metadata.type,
            "Num entries": numerize(coll.documents),
            "Date": coll.metadata.created.strftime("%Y-%m-%d"),
            "Coords": f"{coll.source.elastic_id}/{coll.source.index_key}",
        }
    )

display(pd.DataFrame(results))

Unnamed: 0,Name,Type,Num entries,Date,Coords
0,AAAI,Document,16.02K,2023-08-29,default/aaai
1,ACL Anthology,Document,55.28K,2023-08-22,default/acl
2,Annual Reports,Document,107.38K,2024-01-12,default/annual-report
3,arXiv abstracts,Document,2.37M,2023-12-07,default/arxiv-abstract
4,arXiv category taxonomy,Record,155,2023-12-05,default/arxiv-category
5,arXiv full documents,Document,2.29M,2023-10-29,default/arxiv
6,BioRxiv,Document,357.76K,2023-11-09,default/biorxiv
7,Brenda,Record,7.12K,2023-01-03,default/brenda
8,ChEMBL,Record,2.11M,2023-01-03,default/chembl
9,ChemRxiv,Document,8.82K,2023-11-23,default/chemrxiv


In [22]:
# Query string matched against the main text of each document
search_query = 'main-text.text:("DocLayNet" OR "PubLayNet")'

# Count the matches in every collection of type "Document"
results = []
pbar = tqdm(collections)
for c in pbar:
    pbar.set_description(f"Querying {c.name}")

    # Record collections are not searched
    if c.metadata.type == "Document":
        # limit=0 with an empty source list: only the match count is read below
        query = DataQuery(search_query, source=[], limit=0, coordinates=c.source)
        query_results = api.queries.run(query)
        results.append({
            "name": c.name,
            "matches": query_results.outputs["data_count"]
        })

# Show the five collections with the most matches
results = sorted(results, key=lambda r: r["matches"], reverse=True)
display(pd.DataFrame(results[:5]))

  0%|          | 0/60 [00:00<?, ?it/s]

Unnamed: 0,name,matches
0,arXiv full documents,165
1,Semantic Scholar Academic Graph,40
2,OpenCVF,31
3,arXiv abstracts,24
4,ACL Anthology,16


In [24]:
data_collection = ElasticDataCollectionSource(elastic_id="default", index_key="arxiv")
page_size = 5

# Prepare the data query
query = DataQuery(
    search_query, # The search query to be executed
    #source=["description.title", "description.authors", "identifiers"], # Which fields of documents we want to fetch
    limit=page_size, # The size of each request page
    coordinates=data_collection # The data collection to be queried
)


# [Optional] Compute the number of total results matched. This can be used to monitor the pagination progress.
count_query = deepcopy(query)
count_query.paginated_task.parameters["limit"] = 0
count_results = api.queries.run(count_query)
expected_total = count_results.outputs["data_count"]
expected_pages = (expected_total + page_size - 1) // page_size # this is simply a ceiling formula


# Iterate through all results by fetching `page_size` results at the same time
all_results = []
cursor = api.queries.run_paginated_query(query)
for result_page in tqdm(cursor, total=expected_pages):
    # Iterate through the results of a single page, and add to the total list
    for row in result_page.outputs["data_outputs"]:
        doc = row["_source"]
        filename = doc["file-info"]["filename"]
        print(filename)

        # Run the NLP model (initialised in an earlier cell) on the fetched document
        res = model.apply_on_doc(doc)

        props = pd.DataFrame(res["properties"]["data"], columns=res["properties"]["headers"])
        insts = pd.DataFrame(res["instances"]["data"], columns=res["instances"]["headers"])

        # Document-level metadata instances only
        doc_insts = insts[insts["subj_name"]=="DOCUMENT"][["subtype", "subj_path", "name"]]
        print(doc_insts)

        # Keep the extracted metadata; previously `all_results` was never filled
        all_results.append({"filename": filename, "metadata": doc_insts})



  0%|          | 0/33 [00:00<?, ?it/s]

2007.12238.pdf
    subtype  subj_path                                               name
0  abstract          #  MiniConf is a framework for hosting virtual ac...
1     title  #/texts/2            MiniConf-A Virtual Conference Framework
2    author  #/texts/2                                   Alexander M Rush
3    author  #/texts/2                                   Hendrik Strobelt
2111.06016.pdf
    subtype  subj_path                                               name
0  abstract          #  Analyzing the layout of a document to identify...
1     title  #/texts/2  SYNTHETIC DOCUMENT GENERATOR FOR ANNOTATION-FR...
2    author  #/texts/2                                       Natraj Raman
3    author  #/texts/2                                       Sameena Shah
4    author  #/texts/2                                     Manuela Veloso
2105.14931.pdf
    subtype  subj_path                                               name
0  abstract          #  Abstract. We present d ocument d omain r an