# Document Conversion - Quick start

## Getting started

The [Deep Search Toolkit](https://ds4sd.github.io/deepsearch-toolkit/) allows document conversion with the following few lines of code. It's that simple! For more info or step-by-step guide:
- Visit https://ds4sd.github.io/deepsearch-toolkit/guide/convert_doc/
- Follow this example notebook

### Set notebook parameters

In [44]:
from dsnotebooks.settings import ProjectNotebookSettings

# notebook settings auto-loaded from .env / env vars
notebook_settings = ProjectNotebookSettings()

PROFILE_NAME = notebook_settings.profile  # the profile to use
PROJ_KEY = notebook_settings.proj_key     # the project to use

# default project_key = 1234567890abcdefghijklmnopqrstvwyz123456

Project key:  1234567890abcdefghijklmnopqrstvwyz123456


### Import example dependencies

In [45]:
import os
import json

import pandas as pd

import deepsearch as ds

from pathlib import Path
from zipfile import ZipFile

from deepsearch.documents.core.export import export_to_markdown
from IPython.display import display, Markdown, HTML, display_html

### Connect to Deep Search

In [46]:
api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)

## Convert Document

In [47]:
output_dir = Path("./converted_docs")

fname = "20140197356.pdf"

documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    source_path=f"../../data/samples/{fname}",
    progress_bar=True
)           
documents.download_all(result_dir=output_dir)
info = documents.generate_report(result_dir=output_dir)
print(info) 

Processing input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:00<00:00, 126.92it/s][38;2;15;98;254m                                                                                                                                                           [0m
Submitting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:03<00:00,  3.65s/it][38;2;15;98;254m                                                                                                                                                            [0m
Converting input:     : 100%|[38;2;15;98;254m██████████████████████████████[0m| 1/1 [00:14<00:00, 14.24s/it][38;2;15;98;254m                                                                                                                                                            [0m


{'Total documents': 1, 'Successfully converted documents': 1}


In [48]:
# Iterare output files and visualize the output
for output_file in output_dir.rglob("json*.zip"):
    with ZipFile(output_file) as archive:
        all_files = archive.namelist()
        for name in all_files:
            if not name.endswith(".json"):
                continue
            
            #basename = name.rstrip('.json')
            doc_json = json.loads(archive.read(name))
            
            ofile = output_dir / name
            print(f"writing {ofile}")
            with ofile.open("w") as fw:
                fw.write(json.dumps(doc_json, indent=2))
                
            doc_md = export_to_markdown(doc_json)

            ofile = output_dir / name.replace(".json", ".md")
            print(f"writing {ofile}")
            with ofile.open("w") as fw:
                fw.write(doc_md)

            

writing converted_docs/20140197356.json
writing converted_docs/20140197356.md


In [49]:
# display last document
# display(Markdown(doc_md))

## Analyse Document with NLP

### term counting

In [50]:
ifile = "./converted_docs/20140197356.json"

with open(ifile) as fr:
    doc = json.load(fr)

terms = ["METAL",
         "COPPER",
         "COBALT",
         "TUNGSTEN",
         "MOLYBDENUM",
         "RUTHENIUM",
         "Self-assembly material", 
         "Self-assembly molecular layer",
         "surface modification", 
         "inhibitor", 
         "corrosion inhibitor", 
         "adsorption", "selectivity", 
         "Anti-corrosion", 
         "contact angle",
         "Area selective deposition",
         "Advanced interconnect metallization",
         "Integrated circuits",
         "Atomic layer deposition"]

term_hist = [ {"key":term, "count":0} for term in terms]

for i,item in enumerate(doc["main-text"]):

    if "text" not in item:
        continue
    
    for j,term in enumerate(terms):
        term_hist[j]["count"] += item["text"].count(term.lower())

df = pd.DataFrame(term_hist)
print(df)

                                    key  count
0                                 METAL      8
1                                COPPER      2
2                                COBALT      0
3                              TUNGSTEN     10
4                            MOLYBDENUM      0
5                             RUTHENIUM      0
6                Self-assembly material      0
7         Self-assembly molecular layer      0
8                  surface modification      0
9                             inhibitor      2
10                  corrosion inhibitor      2
11                           adsorption      0
12                          selectivity      1
13                       Anti-corrosion      0
14                        contact angle      0
15            Area selective deposition      0
16  Advanced interconnect metallization      0
17                  Integrated circuits      2
18              Atomic layer deposition      0


### term analysis

In [43]:
ifile = "./converted_docs/20140197356.json"

with open(ifile) as fr:
    doc = json.load(fr)



