# Material Science on Documents - Quick start

## Getting started

The [Deep Search Toolkit](https://ds4sd.github.io/deepsearch-toolkit/) allows document conversion with the following few lines of code. It's that simple! For more info or step-by-step guide:
- Visit https://ds4sd.github.io/deepsearch-toolkit/guide/convert_doc/
- Follow this example notebook

### Set notebook parameters

In [1]:
from dsnotebooks.settings import ProjectNotebookSettings

# notebook settings auto-loaded from .env / env vars
notebook_settings = ProjectNotebookSettings()

PROFILE_NAME = notebook_settings.profile  # the profile to use
PROJ_KEY = notebook_settings.proj_key  # the project to use

# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456

### Import example dependencies

In [2]:
import os
import json

import pandas as pd

import deepsearch as ds

from pathlib import Path
from zipfile import ZipFile

from deepsearch.documents.core.export import export_to_markdown
from IPython.display import display, Markdown, HTML, display_html

### Connect to Deep Search

In [3]:
api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)

## Convert Document

In [4]:
output_dir = Path("./converted_docs")

fname = "20140197356.pdf"

documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    source_path=f"../../data/samples/{fname}",
    progress_bar=True,
)
documents.download_all(result_dir=output_dir)
info = documents.generate_report(result_dir=output_dir)
print(info)

Submitting input:     :   0%|[38;2;15;98;254m                              [0m| 0/1 [00:00<?, ?it/s][38;2;15;98;254m[0m

  Expected `list[str]` but got `_LiteralGenericAlias` with value `typing.Literal['ApiServer...lsHttpSource', 'object']` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(
Submitting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:02<00:00,  2.15s/it][38;2;15;98;254m[0m


{'total_pages': 11, 'processed_pages': 11, 'truncated_pages': 0}


In [5]:
# group output files and visualize the output
md_files = list(output_dir.glob("*.md"))
json_files = list(output_dir.glob("*.json"))

In [None]:
# display last document
# display(Markdown(md_files[-1]))

## Find materials in a local PDF Document

### load models

In [6]:
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models

from deepsearch_glm.nlp_utils import (
    extract_references_from_doc,
    init_nlp_model,
    list_nlp_model_configs,
)

from deepsearch_glm.glm_utils import (
    create_glm_config_from_docs,
    create_glm_dir,
    create_glm_from_docs,
    expand_terms,
    load_glm,
    read_edges_in_dataframe,
    read_nodes_in_dataframe,
    show_query_result,
)

from tabulate import tabulate

models = load_pretrained_nlp_models(verbose=True, force=False)

 -> already downloaded part-of-speech
 -> already downloaded reference
 -> already downloaded material
 -> already downloaded language
 -> already downloaded name
 -> already downloaded semantic
 -> already downloaded geoloc


### Run the model

In [7]:
ifile = json_files[-1]

with open(ifile) as fr:
    doc = json.load(fr)

model = init_nlp_model("language;term;material")
model.set_loglevel("INFO")

res = model.apply_on_doc(doc)

insts = pd.DataFrame(res["instances"]["data"], columns=res["instances"]["headers"])

In [8]:
# print(insts.columns)

materials = insts[
    (insts["type"] == "material") & (insts["subtype"] == "complex_chemical")
][["type", "subtype", "name", "subj_path"]]
print(materials.to_string())

          type           subtype                                      name   subj_path
32    material  complex_chemical                          - (21) Appl. No.   #/texts/4
39    material  complex_chemical                                    - (22)   #/texts/6
1095  material  complex_chemical                          2-methylpropyl).  #/texts/57
1170  material  complex_chemical    2,4,7-trimethyloctadec-5-yne-4, 7-diol  #/texts/58
1213  material  complex_chemical  2,5,8,11-tetramethyldodec-6-yne-5,8-diol  #/texts/58
1666  material  complex_chemical                          1-naphthoic acid  #/texts/67
1670  material  complex_chemical                          2-naphthoic acid  #/texts/67
1713  material  complex_chemical                   1,2,3,4-butanetetracar›  #/texts/67


## Extracting materials from Document collections

In [10]:
from numerize.numerize import numerize
from datetime import datetime

# Fetch list of all data collections
collections = api.elastic.list()
collections.sort(key=lambda c: c.name.lower())

# Visualize summary table
results = [
    {
        "Name": c.name,
        "Type": c.metadata.type,
        "Num entries": numerize(c.documents),
        "Date": datetime.fromisoformat(c.metadata.created).strftime("%Y-%m-%d"),
        "Coords": f"{c.source.elastic_id}/{c.source.index_key}",
    }
    for c in collections
]
display(pd.DataFrame(results[0:10]))

Unnamed: 0,Name,Type,Num entries,Date,Coords
0,AAAI,Document,16.02K,2023-08-29,default/aaai
1,ACL Anthology,Document,55.28K,2023-08-22,default/acl
2,Annual Reports,Document,107.38K,2024-04-15,default/annual-report
3,arXiv abstracts,Document,2.48M,2024-05-22,default/arxiv-abstract
4,arXiv category taxonomy,Record,155,2024-05-22,default/arxiv-category
5,arXiv full documents,Document,2.29M,2023-10-29,default/arxiv
6,BioRxiv,Document,357.76K,2023-11-09,default/biorxiv
7,Brenda,Record,7.12K,2023-01-03,default/brenda
8,ChEMBL,Record,2.42M,2024-04-26,default/chembl
9,ChEMBL (DEPRECATED),Record,2.11M,2023-01-03,default/chembl-deprecated


In [11]:
from tqdm import tqdm
from copy import deepcopy

from deepsearch.cps.client.components.elastic import ElasticDataCollectionSource
from deepsearch.cps.queries import DataQuery


# Input query
search_query = '"SUBSTITUTED 6-PHENYLNICOTINIC ACIDS AND THEIR USE"'
data_collection = ElasticDataCollectionSource(
    elastic_id="default", index_key="patent-uspto"
)
page_size = 50

# Prepare the data query
query = DataQuery(
    search_query,  # The search query to be executed
    # source=["description.title", "description.authors", "identifiers"], # Which fields of documents we want to fetch
    limit=page_size,  # The size of each request page
    coordinates=data_collection,  # The data collection to be queries
)


# [Optional] Compute the number of total results matched. This can be used to monitor the pagination progress.
count_query = deepcopy(query)
count_query.paginated_task.parameters["limit"] = 0
count_results = api.queries.run(count_query)
expected_total = count_results.outputs["data_count"]
expected_pages = (
    expected_total + page_size - 1
) // page_size  # this is simply a ceiling formula

print(f"#-found documents: ", count_results)

# Iterate through all results by fetching `page_size` results at the same time
documents = []
cursor = api.queries.run_paginated_query(query)
for result_page in tqdm(cursor, total=expected_pages):
    # Iterate through the results of a single page, and add to the total list
    for row in result_page.outputs["data_outputs"]:
        documents.append(row["_source"])

print(f"Finished fetching all data. Total is {len(documents)} records.")

#-found documents:  RunQueryResult(outputs={'data_outputs': [], 'data_count': 2, 'data_aggs': {'deepsearch_total_size': {'value': 780743.0}}}, next_pages={}, timings=RunQueryResult.QueryTimings(overall=3.0525007601827383, tasks={'0_ElasticQuery': RunQueryResult.QueryTimings.TaskTimings(overall=3.052005048841238, details={})}))


3it [00:01,  1.56it/s]                       

Finished fetching all data. Total is 2 records.





In [None]:
import textwrap

# Create a TextWrapper object
wrapper = textwrap.TextWrapper(width=100)  # Set the desired width

model = init_nlp_model("language;term;material")
model.set_loglevel("INFO")

max_items = 5

for doc in documents:

    dname = doc["file-info"]["filename"]

    for i, item in enumerate(doc["main-text"]):

        if "text" not in item:
            continue

        if i > max_items:
            break

        res = model.apply_on_text(item["text"])

        insts = pd.DataFrame(
            res["instances"]["data"], columns=res["instances"]["headers"]
        )

        materials = insts[insts["type"] == "material"][
            ["type", "subtype", "name", "subj_path"]
        ]

        if len(materials) > 0:
            lines = wrapper.wrap(item["text"])
            print(f"\n {dname}: text-{i}\n")
            print("\n".join(lines), "\n")
            print(materials.to_string())