# Document Conversion - Quick start

## Getting started

The [Deep Search Toolkit](https://ds4sd.github.io/deepsearch-toolkit/) allows document conversion with the following few lines of code. It's that simple! For more information or a step-by-step guide:
- Visit https://ds4sd.github.io/deepsearch-toolkit/guide/convert_doc/
- Follow this example notebook

### Set notebook parameters

In [None]:
from dsnotebooks.settings import ProjectNotebookSettings

# Settings are picked up automatically from .env / environment variables.
notebook_settings = ProjectNotebookSettings()

# Profile and project that the rest of this notebook works with.
PROFILE_NAME, PROJ_KEY = notebook_settings.profile, notebook_settings.proj_key

# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456

### Import example dependencies

In [None]:
import os
import json

import pandas as pd

import deepsearch as ds

from pathlib import Path
from zipfile import ZipFile

from deepsearch.documents.core.export import export_to_markdown
from IPython.display import display, Markdown, HTML, display_html

### Connect to Deep Search

In [None]:
# Create the Deep Search API client from the environment, using the profile
# named in PROFILE_NAME.
api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)

## Convert Document

In [None]:
fname = "20140197356.pdf"
output_dir = Path("./converted_docs")

# Submit the sample document to the project for conversion.
documents = ds.convert_documents(
    source_path=f"../../data/samples/{fname}",
    proj_key=PROJ_KEY,
    api=api,
    progress_bar=True,
)

# Download the results into the output directory, then build and show the
# report for that same directory.
documents.download_all(result_dir=output_dir)
info = documents.generate_report(result_dir=output_dir)
print(info)

In [None]:
# Group the output files by type for the visualisation and analysis cells.
# Glob results come back in a filesystem-dependent order, so they are sorted
# here to make "the last document" (`md_files[-1]`, `json_files[-1]`)
# reproducible from run to run.
# NOTE(review): assumes the converted .md/.json files sit directly inside
# `output_dir` — confirm against what `download_all` actually writes.
md_files = sorted(output_dir.glob("*.md"))
json_files = sorted(output_dir.glob("*.json"))

In [None]:
# display last document
# display(Markdown(md_files[-1]))

## Analyse Document with NLP

### Term counting

In [None]:
# Load the converted document (the last entry of `json_files`). The encoding
# is given explicitly so reading does not depend on the platform default.
with open(json_files[-1], encoding="utf-8") as fr:
    doc = json.load(fr)

# Terms to count. Matching is case-insensitive, so the casing used here only
# determines how each term is labelled in the resulting table.
terms = [
    "METAL",
    "COPPER",
    "COBALT",
    "TUNGSTEN",
    "MOLYBDENUM",
    "RUTHENIUM",
    "Self-assembly material",
    "Self-assembly molecular layer",
    "surface modification",
    "inhibitor",
    "corrosion inhibitor",
    "adsorption",
    "selectivity",
    "Anti-corrosion",
    "contact angle",
    "Area selective deposition",
    "Advanced interconnect metallization",
    "Integrated circuits",
    "Atomic layer deposition",
]

# One counter record per term, in the same order as `terms`.
term_hist = [{"key": term, "count": 0} for term in terms]

# Lower-case the search terms once instead of on every loop iteration.
lowered_terms = [term.lower() for term in terms]

for item in doc["main-text"]:

    # Entries without a "text" field have nothing to search.
    if "text" not in item:
        continue

    # Lower-case the text as well: counting lower-cased terms in the
    # original-case text would miss occurrences such as "Copper" or "COPPER".
    lowered_text = item["text"].lower()

    for entry, lowered_term in zip(term_hist, lowered_terms):
        entry["count"] += lowered_text.count(lowered_term)

df = pd.DataFrame(term_hist)
print(df)

### Term analysis

In [None]:
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models

from deepsearch_glm.nlp_utils import (
    extract_references_from_doc,
    init_nlp_model,
    list_nlp_model_configs,
)

from deepsearch_glm.glm_utils import (
    create_glm_config_from_docs,
    create_glm_dir,
    create_glm_from_docs,
    expand_terms,
    load_glm,
    read_edges_in_dataframe,
    read_nodes_in_dataframe,
    show_query_result,
)

from tabulate import tabulate

# Call `load_pretrained_nlp_models` and keep what it returns in `models`.
# NOTE(review): presumably this fetches or locates the pretrained NLP models
# used by `init_nlp_model` further down — confirm against the deepsearch_glm docs.
models = load_pretrained_nlp_models()
# print(f"models: {models}")

In [None]:
# Load the converted document (the last entry of `json_files`). The encoding
# is given explicitly so reading does not depend on the platform default.
with open(json_files[-1], encoding="utf-8") as fr:
    doc = json.load(fr)

# NLP model initialised with the "language;term" configuration string.
model = init_nlp_model("language;term")

# Paragraph-level demo: apply the model to the text entries found among the
# first 12 items of the main text. The per-paragraph result is only consumed
# by the optional inspection prints below; uncomment them to look at it.
for item in doc["main-text"][:12]:

    if "text" not in item:
        continue

    res = model.apply_on_text(item["text"])
    # print(res.keys())

    # print(item["text"])
    # print(tabulate(res["instances"]["data"],
    #               headers=res["instances"]["headers"]))


# Document-level run: apply the model to the whole document in one call.
res = model.apply_on_doc(doc)

# Put the returned instances into a table, using the headers as column names.
instances = res["instances"]
df = pd.DataFrame(instances["data"], columns=instances["headers"])

# Keep only the rows whose type is "term".
# NOTE: this rebinds `terms`, replacing the search-term list defined in the
# term-counting cell.
terms = df[df["type"] == "term"][["type", "name", "subj_path"]]
print(terms)

In [None]:
# Model names passed to `create_glm_from_docs`.
model_names = "spm;term"

# Make sure the GLM output folder exists before building into it.
odir = "./glm"
Path(odir).mkdir(parents=True, exist_ok=True)

# json_files = [json_files[-1]]

# Build the GLM from the converted JSON documents; the call hands back the
# output folder together with the GLM object.
odir, glm = create_glm_from_docs(odir, json_files, model_names)

In [None]:
# Read the GLM node table from ./glm/nodes.csv.
nodes = read_nodes_in_dataframe("./glm/nodes.csv")
# print(nodes)

# Get all terms of the document
term_rows = nodes[nodes["name"] == "term"]
terms = term_rows[["total-count", "nodes-text"]]
print(terms)

# Get all terms of the document with `composition`
res = expand_terms(glm, "composition")
# show_query_result(res)

# Take the node table of the final entry in the query result and keep only
# the columns of interest.
last_result = res["result"][-1]["nodes"]
expanded_terms = pd.DataFrame(
    last_result["data"], columns=last_result["headers"]
)[["weight", "prob", "cumul", "text", "count"]]
print(expanded_terms)