# Document Conversion - Quick start

## Getting started

The [Deep Search Toolkit](https://ds4sd.github.io/deepsearch-toolkit/) allows document conversion with the following few lines of code. It's that simple! For more info or step-by-step guide:
- Visit https://ds4sd.github.io/deepsearch-toolkit/guide/convert_doc/
- Follow this example notebook

### Set notebook parameters

In [44]:
from dsnotebooks.settings import ProjectNotebookSettings

# notebook settings auto-loaded from .env / env vars
notebook_settings = ProjectNotebookSettings()

PROFILE_NAME = notebook_settings.profile  # the profile to use
PROJ_KEY = notebook_settings.proj_key     # the project to use

# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456

Project key:  1234567890abcdefghijklmnopqrstvwyz123456


### Import example dependencies

In [14]:
import os
import json

import pandas as pd

import deepsearch as ds

from pathlib import Path
from zipfile import ZipFile

from deepsearch.documents.core.export import export_to_markdown
from IPython.display import display, Markdown, HTML, display_html

### Connect to Deep Search

In [46]:
api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)

## Convert Document

In [47]:
output_dir = Path("./converted_docs")

fname = "20140197356.pdf"

documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    source_path=f"../../data/samples/{fname}",
    progress_bar=True
)           
documents.download_all(result_dir=output_dir)
info = documents.generate_report(result_dir=output_dir)
print(info) 

Processing input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:00<00:00, 126.92it/s][38;2;15;98;254m                                                                                                                                                           [0m
Submitting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:03<00:00,  3.65s/it][38;2;15;98;254m                                                                                                                                                            [0m
Converting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:14<00:00, 14.24s/it][38;2;15;98;254m                                                                                                                                                            [0m


{'Total documents': 1, 'Successfully converted documents': 1}


In [48]:
# Iterare output files and visualize the output
for output_file in output_dir.rglob("json*.zip"):
    with ZipFile(output_file) as archive:
        all_files = archive.namelist()
        for name in all_files:
            if not name.endswith(".json"):
                continue
            
            #basename = name.rstrip('.json')
            doc_json = json.loads(archive.read(name))
            
            ofile = output_dir / name
            print(f"writing {ofile}")
            with ofile.open("w") as fw:
                fw.write(json.dumps(doc_json, indent=2))
                
            doc_md = export_to_markdown(doc_json)

            ofile = output_dir / name.replace(".json", ".md")
            print(f"writing {ofile}")
            with ofile.open("w") as fw:
                fw.write(doc_md)

            

writing converted_docs/20140197356.json
writing converted_docs/20140197356.md


In [49]:
# display last document
# display(Markdown(doc_md))

## Analyse Document with NLP

### term counting

In [50]:
ifile = "./converted_docs/20140197356.json"

with open(ifile) as fr:
    doc = json.load(fr)

terms = ["METAL",
         "COPPER",
         "COBALT",
         "TUNGSTEN",
         "MOLYBDENUM",
         "RUTHENIUM",
         "Self-assembly material", 
         "Self-assembly molecular layer",
         "surface modification", 
         "inhibitor", 
         "corrosion inhibitor", 
         "adsorption", "selectivity", 
         "Anti-corrosion", 
         "contact angle",
         "Area selective deposition",
         "Advanced interconnect metallization",
         "Integrated circuits",
         "Atomic layer deposition"]

term_hist = [ {"key":term, "count":0} for term in terms]

for i,item in enumerate(doc["main-text"]):

    if "text" not in item:
        continue
    
    for j,term in enumerate(terms):
        term_hist[j]["count"] += item["text"].count(term.lower())

df = pd.DataFrame(term_hist)
print(df)

                                    key  count
0                                 METAL      8
1                                COPPER      2
2                                COBALT      0
3                              TUNGSTEN     10
4                            MOLYBDENUM      0
5                             RUTHENIUM      0
6                Self-assembly material      0
7         Self-assembly molecular layer      0
8                  surface modification      0
9                             inhibitor      2
10                  corrosion inhibitor      2
11                           adsorption      0
12                          selectivity      1
13                       Anti-corrosion      0
14                        contact angle      0
15            Area selective deposition      0
16  Advanced interconnect metallization      0
17                  Integrated circuits      2
18              Atomic layer deposition      0


### term analysis

In [15]:
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models

from deepsearch_glm.nlp_utils import (
    extract_references_from_doc,
    init_nlp_model,
    list_nlp_model_configs,
)

from deepsearch_glm.glm_utils import (
    create_glm_config_from_docs,
    create_glm_dir,
    create_glm_from_docs,
    expand_terms,
    load_glm,
    read_edges_in_dataframe,
    read_nodes_in_dataframe,
    show_query_result,
)

from tabulate import tabulate

models = load_pretrained_nlp_models()
#print(f"models: {models}")

In [23]:
with open(ifile) as fr:
    doc = json.load(fr)

model = init_nlp_model("language;term")

for i,item in enumerate(doc["main-text"]):

    if "text" in item:
        res = model.apply_on_text(item["text"])
        #print(res.keys())

        #print(item["text"])
        #print(tabulate(res["instances"]["data"], 
        #               headers=res["instances"]["headers"]))

    if i>10:
        break


res = model.apply_on_doc(doc)

df = pd.DataFrame(res["instances"]["data"], columns=res["instances"]["headers"])

terms = df[df["type"]=="term"][["type", "name", "subj_path"]]
print(terms)

      type                                        name   subj_path
18    term                               United States   #/texts/0
21    term  Patent Application Publication MOEGGENBORG   #/texts/1
26    term                CMP COMPOSITIONS AND METHODS   #/texts/2
27    term                            CMP COMPOSITIONS   #/texts/2
28    term                                     METHODS   #/texts/2
...    ...                                         ...         ...
2760  term                   organic or inorganic salt  #/texts/98
2761  term                              inorganic salt  #/texts/98
2765  term                                 composition  #/texts/99
2766  term                                       claim  #/texts/99
2768  term   salt additive comprises potassium sulfate  #/texts/99

[1548 rows x 3 columns]


In [31]:
ifile = "./converted_docs/20140197356.json"

odir = "./glm"
os.makedirs(odir, exist_ok=True)

model_names = "spm;term"
json_files = [ifile]

odir, glm = create_glm_from_docs(odir, json_files, model_names)

<deepsearch_glm.andromeda_glm.glm_model object at 0x1235a7f30>


2024-02-20 12:55:01.725 (8965.383s) [          6E06DF]                utils.h:270   INFO| initialising models-expression: spm;term
2024-02-20 12:55:01.725 (8965.383s) [          6E06DF]                utils.h:77    INFO| initialising model: SPM
2024-02-20 12:55:01.749 (8965.407s) [          6E06DF]                utils.h:77    INFO| initialising model: TERM
2024-02-20 12:55:01.749 (8965.407s) [          6E06DF]                utils.h:77    INFO| initialising model: LAPOS
2024-02-20 12:55:03.354 (8967.012s) [          6E06DF]                utils.h:77    INFO| initialising model: LANGUAGE
2024-02-20 12:55:03.453 (8967.111s) [          6E06DF]                utils.h:77    INFO| initialising model: SENTENCE
2024-02-20 12:55:03.453 (8967.111s) [          6E06DF]                utils.h:77    INFO| initialising model: NUMVAL
2024-02-20 12:55:03.453 (8967.111s) [          6E06DF]                utils.h:77    INFO| initialising model: CITE
2024-02-20 12:55:03.453 (8967.111s) [          6E06DF]

In [45]:
nodes = read_nodes_in_dataframe("./glm/nodes.csv")
#print(nodes)

# Get all terms of the document
terms = nodes[nodes["name"]=="term"][["total-count", "nodes-text"]]
print(terms)

# Get all terms of the document with `composition`
res = expand_terms(glm, "composition")
#show_query_result(res)

last_result = res["result"][-1]["nodes"]
expanded_terms = pd.DataFrame(last_result["data"], columns=last_result["headers"])

expanded_terms = expanded_terms[["weight", "prob", "cumul", "text", "count"]]
print(expanded_terms)

      total-count          nodes-text
2590           30         composition
2591           29           invention
2592           27           substrate
2593           24                 ppm
2594           19        compositions
...           ...                 ...
3403            0    Suitable polymer
3404            0                film
3405            0              › tion
3406            0  insulator material
3407            0                   A

[818 rows x 2 columns]
   weight    prob   cumul                   text  count
0  0.5625  0.5625  0.5625        CMP composition      9
1  0.3750  0.3750  0.9375  polishing composition      6
2  0.0625  0.0625  1.0000   abrasive composition      1
