# Document Reference parsing

## Getting started


### Set notebook parameters

In [None]:
from dsnotebooks.settings import ProjectNotebookSettings

# Settings are populated automatically from the .env file / environment variables.
notebook_settings = ProjectNotebookSettings()

# Deep Search profile name and project key, read from the loaded settings.
PROFILE_NAME, PROJ_KEY = notebook_settings.profile, notebook_settings.proj_key

# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456

### Import example dependencies

In [24]:
import os
import json

import textwrap

import pandas as pd

import deepsearch as ds

from pathlib import Path
from zipfile import ZipFile

from deepsearch.documents.core.export import export_to_markdown
from IPython.display import display, Markdown, HTML, display_html

from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models

from deepsearch_glm.nlp_utils import (
    extract_references_from_doc,
    init_nlp_model,
    list_nlp_model_configs,
)

from deepsearch_glm.glm_utils import (
    create_glm_config_from_docs,
    create_glm_dir,
    create_glm_from_docs,
    expand_terms,
    load_glm,
    read_edges_in_dataframe,
    read_nodes_in_dataframe,
    show_query_result,
)

from tabulate import tabulate

# Load the pretrained NLP models up front and keep the returned value in `models`
# (presumably this downloads missing model files on first use — TODO confirm).
models = load_pretrained_nlp_models()

### Connect to Deep Search

In [4]:
# Create the Deep Search API client for the profile selected in the settings cell
# (`from_env` presumably reads the stored credentials for that profile — confirm).
api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)

## Convert Document

In [5]:
# Sample PDF to convert and the local folder that receives the results.
fname = "2206.00785.pdf"
output_dir = Path("./converted_docs")

# Submit the sample file to the Deep Search project for conversion.
documents = ds.convert_documents(
    api=api, proj_key=PROJ_KEY, source_path=f"../../data/samples/{fname}", progress_bar=True
)

# Fetch the conversion results into output_dir, then print the summary report.
documents.download_all(result_dir=output_dir)
info = documents.generate_report(result_dir=output_dir)
print(info)

Processing input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:00<00:00, 122.68it/s][38;2;15;98;254m                                                                                                                                                    [0m
Submitting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:01<00:00,  1.53s/it][38;2;15;98;254m                                                                                                                                                     [0m
Converting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:18<00:00, 18.45s/it][38;2;15;98;254m                                                                                                                                                     [0m


{'Total documents': 1, 'Successfully converted documents': 1}


In [6]:
# Iterate over the downloaded archives and write every converted document
# to disk twice: as pretty-printed JSON and as Markdown.
for output_file in output_dir.rglob("json*.zip"):
    with ZipFile(output_file) as archive:
        for name in archive.namelist():
            # Only the JSON members of the archive are converted documents.
            if not name.endswith(".json"):
                continue

            doc_json = json.loads(archive.read(name))

            ofile = output_dir / name
            # Archive members may sit in sub-folders; make sure the target folder exists.
            ofile.parent.mkdir(parents=True, exist_ok=True)
            print(f"writing {ofile}")
            # Explicit UTF-8 so the written file does not depend on the platform locale.
            with ofile.open("w", encoding="utf-8") as fw:
                fw.write(json.dumps(doc_json, indent=2))

            doc_md = export_to_markdown(doc_json)

            # Swap only the trailing extension; str.replace would rewrite every ".json"
            # occurrence in the member name, including ones in folder names.
            ofile = ofile.with_suffix(".md")
            print(f"writing {ofile}")
            with ofile.open("w", encoding="utf-8") as fw:
                fw.write(doc_md)

            

writing converted_docs/2206.00785.json
writing converted_docs/2206.00785.md


In [49]:
# display last document
# display(Markdown(doc_md))

## Extract references from Documents

In [13]:
def resolve(path, doc):
    """Follow a sequence of path components into a nested dict/list structure.

    Args:
        path: Sequence of components (e.g. the result of ``"#/a/0".split("/")``).
            A ``"#"`` component is skipped unless it is the final component.
        doc: The nested structure of dicts and lists to descend into.

    Returns:
        The value reached after consuming every component, or ``None`` when
        ``path`` is empty or a component has to be applied to something that
        is neither a dict nor a list.

    Raises:
        KeyError: A dict does not contain the requested component.
        IndexError: A list index is out of range.
        ValueError: A component applied to a list is not an integer string.
    """
    if not path:
        return None

    node = doc
    final = len(path) - 1
    for position, component in enumerate(path):
        # "#" is only a marker to skip while further components follow it;
        # in last position it is looked up like any other component.
        if component == "#" and position < final:
            continue

        if isinstance(node, dict):
            node = node[component]
        elif isinstance(node, list):
            node = node[int(component)]
        else:
            # Cannot descend into a scalar value.
            return None

    return node
    

In [28]:
ifile = "converted_docs/2206.00785.json"

MIN_CONFIDENCE = 0.8  # keep only "reference" properties scored above this value
MAX_REFS_SHOWN = 6    # number of references printed by this cell
WRAP_WIDTH = 70       # column width used when printing the reference text

# Load the converted document written by the conversion step.
with open(ifile, encoding="utf-8") as fr:
    doc = json.load(fr)

# Run the language + reference models on the whole document.
model = init_nlp_model("language;reference")
res = model.apply_on_doc(doc)

# The result carries its tables as {"data": rows, "headers": column names}.
props = pd.DataFrame(res["properties"]["data"], columns=res["properties"]["headers"])
insts = pd.DataFrame(res["instances"]["data"], columns=res["instances"]["headers"])

# Items classified as bibliographic references with sufficient confidence.
refs = props[(props["label"] == "reference") & (props["confidence"] > MIN_CONFIDENCE)]

# Show only the first few references; head() replaces a manual counter + break.
for _, ref in refs.head(MAX_REFS_SHOWN).iterrows():
    # "subj_path" is a "/"-separated path into `res` pointing at the labelled item.
    item = resolve(ref["subj_path"].split("/"), res)
    print("\n".join(textwrap.wrap(item["text"], WRAP_WIDTH)))

    # Entities (authors, title, ...) detected inside this particular item.
    ents = insts[insts["subj_hash"] == item["subj_hash"]][["subtype", "name"]]
    print("\nentities:\n", ents, "\n\n")


[1] C. Gopal, C. L. Marshall, D. Vesset, N. Ward-Dutton, J. Hamel, R.
Jyoti, P. Rutten, C. W. Olofson, J. Rydning, S. Rau, and J. Duke, 'IDC
FutureScape: Worldwide future of intelligence 2022 predictions,'
International Data Group, Inc., Needham, MA, Research Report
US47913321, Oct. 2021. [Online]. Available:
https://www.idc.com/getdoc.jsp?containerId=US47913321

entities:
               subtype                                               name
167  reference-number                                                  1
168           authors  C. Gopal, C. L. Marshall, D. Vesset, N. Ward-D...
169             title  'IDC FutureScape: Worldwide future of intellig...
170           journal  International Data Group, Inc., Needham, MA, R...
171            volume                                           47913321
172              date                                          Oct. 2021
173              note                                Online]. Available:
174               url  https://www.idc.