# Deep Search integrations - Argilla.io

In this example we will use the output of the converted document to populate a dataset on
[Argilla](https://argilla.io). This enables the user to annotate text for multiple purposes,
e.g. text classification, named entity recognition, etc., as well as to train custom models fitting their needs.


### Setup your environment

In this example we require the connection to a running Argilla instance.

The [README](./README.md) file of this example describes in more detail how to set it up.


### Set notebook parameters

The following block defines the parameters specific to this example notebook:

- `INPUT_FILE`: the input PDF to be converted and analyzed
- `ARGILLA_API_URL`: the API URL of the Argilla instance
- `ARGILLA_API_KEY`: the API Key of the Argilla instance
- `ARGILLA_DATASET`: the name of the dataset on Argilla
- `SPACY_MODEL`: the spaCy model to use for tokenization
    

In [None]:
from dsnotebooks.settings import ProjectNotebookSettings
import os
from pathlib import Path

# Notebook settings are auto-loaded from .env / environment variables
notebook_settings = ProjectNotebookSettings()

PROFILE_NAME = notebook_settings.profile  # the Deep Search profile to use
PROJ_KEY = notebook_settings.proj_key  # the Deep Search project to use

INPUT_FILE = Path("../../data/samples/2206.00785.pdf")

# Argilla configuration: both variables must be set, otherwise a KeyError is raised here
ARGILLA_API_URL = os.environ["ARGILLA_API_URL"]  # required env var
ARGILLA_API_KEY = os.environ["ARGILLA_API_KEY"]  # required env var
ARGILLA_DATASET = "deepsearch-documents"
# Tokenization: name of the spaCy model
SPACY_MODEL = "en_core_web_sm"

In [None]:
# Import standard dependencies
import json
import tempfile
import typing
from zipfile import ZipFile

# IPython utilities
from IPython.display import display, Markdown, HTML, display_html

# Import the deepsearch-toolkit
import deepsearch as ds

# Import specific to the example
import argilla as rg
import spacy
from pydantic import BaseModel

In [None]:
# Download the spaCy model
!python -m spacy download {SPACY_MODEL}

In [None]:
from typing import Optional

class DocTextSegment(BaseModel):
    """One text item of a converted document, with optional slots for predictions."""

    # page number
    page: int
    # index of text segment in the document
    idx: int
    # title of the document
    title: Optional[str] = None
    # flavour of text segment
    name: str
    # type of text segment
    type: str
    # content of the text segment
    text: str
    # this could be used to store predictions of text classification
    text_classification: typing.Any = None
    # this could be used to store predictions of token classification
    token_classification: typing.Any = None

## Document conversion with Deep Search

In [None]:
# Connect to Deep Search using the profile selected in the notebook parameters
api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)

In [None]:
# Launch the document conversion of INPUT_FILE in the selected project.
# The results are fetched afterwards with `documents.download_all(...)`.
documents = ds.convert_documents(
    api=api, proj_key=PROJ_KEY, source_path=INPUT_FILE, progress_bar=True
)

In [None]:
# Download the conversion results into a fresh temporary directory.
# NOTE: a directory created with mkdtemp() is not removed automatically.
output_dir = tempfile.mkdtemp()

documents.download_all(result_dir=output_dir, progress_bar=True)

# Load every JSON file found below the output directory,
# keyed by "<path of the file>//<file name>"
converted_docs = {}
for output_file in Path(output_dir).rglob("*.json"):
    # Read explicitly as UTF-8 instead of relying on the platform default encoding
    with open(output_file, "r", encoding="utf-8") as file:
        converted_docs[f"{output_file}//{output_file.name}"] = json.load(file)

print(f"{len(converted_docs)} documents have been loaded after conversion.")

## Extract text segments

In [None]:
# Flatten the "main-text" items of all converted documents into DocTextSegment objects
text_segments = []
for doc in converted_docs.values():
    # "description" may be missing or null; the title is optional in DocTextSegment
    doc_title = (doc.get("description") or {}).get("title")

    # `idx` is the position of the item in "main-text", including skipped items
    for idx, text_segment in enumerate(doc.get("main-text") or []):
        # keep only the components which carry text
        if "text" not in text_segment:
            continue

        # "prov" may be missing or an empty list; fall back to an empty entry.
        # NOTE: `page` is a required int in DocTextSegment, so a segment without
        # a page still fails model validation instead of raising an IndexError.
        prov = text_segment.get("prov") or [{}]

        # append the component to the list of segments
        text_segments.append(
            DocTextSegment(
                title=doc_title,
                page=prov[0].get("page"),
                idx=idx,
                name=text_segment.get("name"),
                type=text_segment.get("type"),
                text=text_segment.get("text"),
            )
        )

print(f"{len(text_segments)} text segments got extracted from the document")

## Log the text segments to Argilla

In [None]:
# Initialize the Argilla SDK with the URL and API key from the notebook parameters
client = rg.Argilla(api_url=ARGILLA_API_URL, api_key=ARGILLA_API_KEY)

# Initialize the spaCy NLP model for the tokenization of the text.
# Use the SPACY_MODEL parameter so that the loaded model is the one downloaded above.
nlp = spacy.load(SPACY_MODEL)

In [None]:
# Build one Argilla record per text segment for the text dataset.
# Only the segment attributes which are not excluded below (page and type)
# are attached as record metadata.
records_text_classificaiton = [
    rg.Record(
        fields={"text": segment.text},
        vectors={},
        suggestions=segment.text_classification,
        metadata=segment.dict(
            exclude={
                "text",
                "text_classification",
                "token_classification",
                "idx",
                "title",
                "name",
            }
        ),
    )
    for segment in text_segments
]

In [None]:
# Describe the text dataset: one text field, one text question,
# the "type" and "page" metadata terms and one vector field
settings = rg.Settings(
    fields=[rg.TextField(name="text")],
    questions=[rg.TextQuestion(name="text_generation")],
    metadata=[rg.TermsMetadataProperty(name=key) for key in ("type", "page")],
    vectors=[
        rg.VectorField(name="mini-lm-sentence-transformers", dimensions=384),
    ],
)

# Create the dataset in the "argilla" workspace and upload the prepared records
dataset = rg.Dataset(
    name=f"{ARGILLA_DATASET}-text",
    workspace="argilla",
    settings=settings,
)
dataset.create()
dataset.records.log(records_text_classificaiton)

In [None]:
# Prepare text segments for token classification

records_token_classificaiton = []
for segment in text_segments:
    # Tokenize with spaCy and keep the text of each token.
    # (Joining `token.text` itself would split every token into single characters.)
    tokens = [token.text for token in nlp(segment.text)]
    records_token_classificaiton.append(
        rg.Record(
            # the record text is the sequence of tokens separated by single spaces
            fields={"text": " ".join(tokens)},
            vectors={},
            suggestions=segment.token_classification,
            metadata=segment.dict(
                exclude={"text", "text_classification", "token_classification", "idx", "title", "name"}
            ),
        )
    )

In [None]:
# Describe the token dataset: one text field, one text question,
# the "type" and "page" metadata terms and one vector field
settings = rg.Settings(
    fields=[rg.TextField(name="text")],
    questions=[rg.TextQuestion(name="text_generation")],
    metadata=[rg.TermsMetadataProperty(name=key) for key in ("type", "page")],
    vectors=[
        rg.VectorField(name="mini-lm-sentence-transformers", dimensions=384),
    ],
)

# Create the dataset in the "argilla" workspace and upload the tokenized records
dataset = rg.Dataset(
    name=f"{ARGILLA_DATASET}-token",
    workspace="argilla",
    settings=settings,
)
dataset.create()
dataset.records.log(records_token_classificaiton)

## What's next?

Now that the documents are converted and uploaded to Argilla, you can use the links printed above to annotate them and train your own models.

Visit the <a href="https://docs.argilla.io" rel="nofollow" target="_blank">Argilla documentation</a> to learn about its features and check out the <a href="https://docs.argilla.io/en/latest/guides/guides.html" rel="nofollow" target="_blank">Deep Dive Guides</a> and <a href="https://docs.argilla.io/en/latest/tutorials/tutorials.html" rel="nofollow" target="_blank">Tutorials</a>.
