# Material Science on Documents - Quick start

## Getting started

The [Deep Search Toolkit](https://ds4sd.github.io/deepsearch-toolkit/) allows document conversion with the following few lines of code. It's that simple! For more info or step-by-step guide:
- Visit https://ds4sd.github.io/deepsearch-toolkit/guide/convert_doc/
- Follow this example notebook

### Set notebook parameters

In [1]:
from dsnotebooks.settings import ProjectNotebookSettings

# notebook settings auto-loaded from .env / env vars
notebook_settings = ProjectNotebookSettings()

PROFILE_NAME = notebook_settings.profile  # the profile to use
PROJ_KEY = notebook_settings.proj_key  # the project to use

# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456

Project key:  1234567890abcdefghijklmnopqrstvwyz123456


### Import example dependencies

In [2]:
import os
import json

import pandas as pd

import deepsearch as ds

from pathlib import Path
from zipfile import ZipFile

from deepsearch.documents.core.export import export_to_markdown
from IPython.display import display, Markdown, HTML, display_html

### Connect to Deep Search

In [3]:
api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)

## Convert Document

In [4]:
output_dir = Path("./converted_docs")

fname = "20140197356.pdf"

documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    source_path=f"../../data/samples/{fname}",
    progress_bar=True,
)
documents.download_all(result_dir=output_dir)
info = documents.generate_report(result_dir=output_dir)
print(info)

Processing input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:00<00:00, 206.31it/s][38;2;15;98;254m                                                                                                                                                            [0m
Submitting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:03<00:00,  3.48s/it][38;2;15;98;254m                                                                                                                                                             [0m
Converting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:23<00:00, 23.54s/it][38;2;15;98;254m                                                                                                                                                             [0m


{'Total documents': 1, 'Successfully converted documents': 1}


In [5]:
# Iterare output files and visualize the output
for output_file in output_dir.rglob("json*.zip"):
    with ZipFile(output_file) as archive:
        all_files = archive.namelist()
        for name in all_files:
            if not name.endswith(".json"):
                continue

            # basename = name.rstrip('.json')
            doc_json = json.loads(archive.read(name))

            ofile = output_dir / name
            print(f"writing {ofile}")
            with ofile.open("w") as fw:
                fw.write(json.dumps(doc_json, indent=2))

            doc_md = export_to_markdown(doc_json)

            ofile = output_dir / name.replace(".json", ".md")
            print(f"writing {ofile}")
            with ofile.open("w") as fw:
                fw.write(doc_md)

writing converted_docs/20140197356.json
writing converted_docs/20140197356.md


In [6]:
# display last document
# display(Markdown(doc_md))

## Find materials in a local PDF Document

### load models

In [7]:
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models

from deepsearch_glm.nlp_utils import (
    extract_references_from_doc,
    init_nlp_model,
    list_nlp_model_configs,
)

from deepsearch_glm.glm_utils import (
    create_glm_config_from_docs,
    create_glm_dir,
    create_glm_from_docs,
    expand_terms,
    load_glm,
    read_edges_in_dataframe,
    read_nodes_in_dataframe,
    show_query_result,
)

from tabulate import tabulate

models = load_pretrained_nlp_models(verbose=True, force=False)

 -> already downloaded part-of-speech
 -> already downloaded reference
 -> already downloaded material
 -> already downloaded language
 -> already downloaded name
 -> already downloaded semantic
 -> already downloaded geoloc


### Run the model

In [20]:
ifile = "./converted_docs/20140197356.json"

with open(ifile) as fr:
    doc = json.load(fr)

model = init_nlp_model("language;term;material")
model.set_loglevel("INFO")

res = model.apply_on_doc(doc)

insts = pd.DataFrame(res["instances"]["data"], columns=res["instances"]["headers"])

In [21]:
# print(insts.columns)

materials = insts[
    (insts["type"] == "material") & (insts["subtype"] == "complex_chemical")
][["type", "subtype", "name", "subj_path"]]
print(materials.to_string())

          type           subtype                                      name   subj_path
1097  material  complex_chemical                          2-methylpropyl).  #/texts/48
1173  material  complex_chemical    2,4,7-trimethyloctadec-5-yne-4, 7-diol  #/texts/49
1216  material  complex_chemical  2,5,8,11-tetramethyldodec-6-yne-5,8-diol  #/texts/49
1673  material  complex_chemical                          1-naphthoic acid  #/texts/58
1677  material  complex_chemical                          2-naphthoic acid  #/texts/58
1720  material  complex_chemical                   1,2,3,4-butanetetracar›  #/texts/58


## Extracting materials from Document collections

In [11]:
from numerize.numerize import numerize

# Fetch list of all data collections
collections = api.elastic.list()
collections.sort(key=lambda c: c.name.lower())

# Visualize summary table
results = [
    {
        "Name": c.name,
        "Type": c.metadata.type,
        "Num entries": numerize(c.documents),
        "Date": c.metadata.created.strftime("%Y-%m-%d"),
        "Coords": f"{c.source.elastic_id}/{c.source.index_key}",
    }
    for c in collections
]
display(pd.DataFrame(results[0:10]))

Unnamed: 0,Name,Type,Num entries,Date,Coords
0,AAAI,Document,16.02K,2023-08-29,default/aaai
1,ACL Anthology,Document,55.28K,2023-08-22,default/acl
2,Annual Reports,Document,107.38K,2024-01-12,default/annual-report
3,arXiv abstracts,Document,2.37M,2023-12-07,default/arxiv-abstract
4,arXiv category taxonomy,Record,155,2023-12-05,default/arxiv-category
5,arXiv full documents,Document,2.29M,2023-10-29,default/arxiv
6,BioRxiv,Document,357.76K,2023-11-09,default/biorxiv
7,Brenda,Record,7.12K,2023-01-03,default/brenda
8,ChEMBL,Record,2.11M,2023-01-03,default/chembl
9,ChemRxiv,Document,8.82K,2023-11-23,default/chemrxiv


In [12]:
from tqdm import tqdm
from copy import deepcopy

from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource
from deepsearch.cps.queries import DataQuery


# Input query
search_query = '"SUBSTITUTED 6-PHENYLNICOTINIC ACIDS AND THEIR USE"'
data_collection = ElasticDataCollectionSource(
    elastic_id="default", index_key="patent-uspto"
)
page_size = 50

# Prepare the data query
query = DataQuery(
    search_query,  # The search query to be executed
    # source=["description.title", "description.authors", "identifiers"], # Which fields of documents we want to fetch
    limit=page_size,  # The size of each request page
    coordinates=data_collection,  # The data collection to be queries
)


# [Optional] Compute the number of total results matched. This can be used to monitor the pagination progress.
count_query = deepcopy(query)
count_query.paginated_task.parameters["limit"] = 0
count_results = api.queries.run(count_query)
expected_total = count_results.outputs["data_count"]
expected_pages = (
    expected_total + page_size - 1
) // page_size  # this is simply a ceiling formula

print(f"#-found documents: ", count_results)

# Iterate through all results by fetching `page_size` results at the same time
documents = []
cursor = api.queries.run_paginated_query(query)
for result_page in tqdm(cursor, total=expected_pages):
    # Iterate through the results of a single page, and add to the total list
    for row in result_page.outputs["data_outputs"]:
        documents.append(row["_source"])

print(f"Finished fetching all data. Total is {len(documents)} records.")

#-found documents:  RunQueryResult(outputs={'data_outputs': [], 'data_count': 2, 'data_aggs': {'deepsearch_total_size': {'value': 713388.0}}}, next_pages={}, timings=RunQueryResult.QueryTimings(overall=0.28376056300476193, tasks={'0_ElasticQuery': RunQueryResult.QueryTimings.TaskTimings(overall=0.2827732330188155, details={})}))


3it [00:02,  1.13it/s]                                                                                                                                                                                                                                

Finished fetching all data. Total is 2 records.





In [24]:
import textwrap

# Create a TextWrapper object
wrapper = textwrap.TextWrapper(width=100)  # Set the desired width

model = init_nlp_model("language;term;material")
model.set_loglevel("INFO")

max_items = 5

for doc in documents:

    dname = doc["file-info"]["filename"]

    for i, item in enumerate(doc["main-text"]):

        if "text" not in item:
            continue

        if i > max_items:
            break

        res = model.apply_on_text(item["text"])

        insts = pd.DataFrame(
            res["instances"]["data"], columns=res["instances"]["headers"]
        )

        materials = insts[insts["type"] == "material"][
            ["type", "subtype", "name", "subj_path"]
        ]

        if len(materials) > 0:
            lines = wrapper.wrap(item["text"])
            print(f"\n {dname}: text-{i}\n")
            print("\n".join(lines), "\n")
            print(materials.to_string())


 US8143411B2: text-2

The present application relates to novel substituted 6-phenylnicotinic acid derivatives, to
processes for their preparation, to their use for the treatment and/or prophylaxis of diseases and
to their use for preparing medicaments for the treatment and/or prophylaxis of diseases, preferably
for the treatment and/or prophylaxis of cardiovascular disorders, in particular dyslipidaemias,
arteriosclerosis and heart failure. 

       type           subtype                    name subj_path
2  material  complex_chemical  6-phenylnicotinic acid         #

 US8143411B2: text-3

The present invention relates to novel substituted 6-phenylnicotinic acid derivatives, to processes
for their preparation, to their use for the treatment and/or prophylaxis of diseases and to their
use for preparing medicaments for the treatment and/or prophylaxis of diseases, preferably for the
treatment and/or prophylaxis of cardiovascular diseases, in particular dyslipidaemias,
arteriosclerosis 