In [1]:
%load_ext autoreload
%autoreload 2

In [13]:
from dotenv import load_dotenv, find_dotenv
import os
load_dotenv(find_dotenv())

True

In [3]:
AA_TOKEN=os.getenv("AA_TOKEN")
NAMESPACE=os.getenv("AA_NAMESPACE")

# Document Index

## Collection

### Creating A Collection

In [4]:
from intelligence_layer.connectors import (
    CollectionPath,
    DocumentIndexClient,
)
def create_collection(collection_name: str):
    document_index = DocumentIndexClient(
            token=AA_TOKEN,
        )

    collection_path = CollectionPath(
        namespace=NAMESPACE, 
        collection=collection_name,
    )
    document_index.create_collection(collection_path)

COLLECTION_NAME="demo"
COLLECTION_NAME_DELETE="demo-collection-delete"

create_collection(COLLECTION_NAME)
create_collection(COLLECTION_NAME_DELETE)

### Get all the Collections

In [5]:
def get_collections():
    document_index = DocumentIndexClient(
            token=AA_TOKEN,
        )

    collections = document_index.list_collections(
        namespace=NAMESPACE,
    )
    return collections
collections = get_collections()
collections

[CollectionPath(namespace='rewe', collection='demo'),
 CollectionPath(namespace='rewe', collection='demo2'),
 CollectionPath(namespace='rewe', collection='demo-collection-delete'),
 CollectionPath(namespace='rewe', collection='rag-collection'),
 CollectionPath(namespace='rewe', collection='deepset')]

### Delete a Collection

In [6]:
def _delete_collection(collection_name: str):
    document_index = DocumentIndexClient(
            token=AA_TOKEN,
        )
    collection_path = CollectionPath(
        namespace=NAMESPACE, 
        collection=collection_name,
    )
    # delete collection
    document_index.delete_collection(collection_path)

_delete_collection(COLLECTION_NAME_DELETE)
# get collections to see if it was successfull
get_collections()


[CollectionPath(namespace='rewe', collection='demo'),
 CollectionPath(namespace='rewe', collection='demo2'),
 CollectionPath(namespace='rewe', collection='rag-collection'),
 CollectionPath(namespace='rewe', collection='deepset')]

## Documents

### Parse Content of Document (PDF, TXT and DOCX) 

In [10]:
from parser import FileParser

def get_file_content(path) -> tuple[str, str]:
    """Returns content and file name."""
    parser = FileParser()
    file_content, file_name = parser.parse(path)
    return file_content, file_name

PATH_PDF = "..\..\example_documents\Der-Schattenmann.pdf"
PATH_TXT = "..\..\example_documents\Plyscraper.txt"
PATH_DOCX = "..\..\example_documents\Erfunde-Programmiersprache.docx"

file_content, file_name = get_file_content(PATH_DOCX)

print("File name:", file_name, "\n", "-"*50)
print(file_content)


  PATH_PDF = "..\..\example_documents\Der-Schattenmann.pdf"
  PATH_TXT = "..\..\example_documents\Plyscraper.txt"
  PATH_DOCX = "..\..\example_documents\Erfunde-Programmiersprache.docx"
  PATH_PDF = "..\..\example_documents\Der-Schattenmann.pdf"
  PATH_TXT = "..\..\example_documents\Plyscraper.txt"
  PATH_DOCX = "..\..\example_documents\Erfunde-Programmiersprache.docx"


### Upload Content

In [None]:

from intelligence_layer.connectors import (
    CollectionPath,
    DocumentContents,
    DocumentIndexClient,
    DocumentPath,
)
from datetime import datetime

def upload_document(file_path: str, collection_name: str):
    document_index = DocumentIndexClient(
        token=AA_TOKEN,
    )
    collection_path = CollectionPath(
        namespace=NAMESPACE, 
        collection=collection_name,
    )
    file_content, file_name = get_file_content(file_path)
    document_path = DocumentPath(
            collection_path=collection_path,
            document_name=file_name,
        )
    document_contents = DocumentContents.from_text(file_content)
    document_contents.metadata = {
        "upload_date": datetime.now().isoformat()
    }
    document_index.add_document(
        document_path, 
        contents=document_contents,
    )

upload_document(PATH_PDF, COLLECTION_NAME)
upload_document(PATH_TXT, COLLECTION_NAME)
upload_document(PATH_DOCX, COLLECTION_NAME)

### Get all Documents within a Collection

In [None]:

def get_documents(collection_name: str):
    document_index = DocumentIndexClient(
        token=AA_TOKEN,
    )
    collection_path = CollectionPath(
        namespace=NAMESPACE, 
        collection=collection_name,
    )
    documents = document_index.documents(
        collection_path=collection_path,
    )
    return documents

documents = get_documents(COLLECTION_NAME)

for document in documents:
    print(document.model_dump_json(indent=2))

### Get Single Document (and Content)

In [None]:
from intelligence_layer.connectors import (
    CollectionPath,
    DocumentIndexClient,
    DocumentIndexRetriever,
    IndexConfiguration,
    IndexPath,
    DocumentPath
)
def get_document(collection_name: str, document_name: str):
    document_index = DocumentIndexClient(
        token=AA_TOKEN,
    )
    collection_path = CollectionPath(
        namespace=NAMESPACE, 
        collection=collection_name,
    )
    document_path = DocumentPath(
        collection_path=collection_path,
        document_name=document_name,
    )
    return document_index.document(document_path)

document_name_pdf = "Der-Schattenmann.pdf"
document_name_txt = "Plyscraper.txt"
document_name_docx = "Erfunde-Programmiersprache.docx"

document = get_document(
    collection_name=COLLECTION_NAME,
    document_name=document_name_pdf
)

print(document.model_dump_json(indent=2))

## Data Retriever

### Create Index

In [11]:
from intelligence_layer.connectors import (
    CollectionPath,
    DocumentIndexClient,
    IndexConfiguration,
    IndexPath,
)

def _create_index(index_name: str, chunk_size: int):
    document_index = DocumentIndexClient(
        token=AA_TOKEN,
    )
    index_path = IndexPath(
        namespace=NAMESPACE,
        index=index_name,
    )
    index_config = IndexConfiguration(
        embedding_type="asymmetric",
        chunk_size=chunk_size,
    )
    document_index.create_index(
        index_path=index_path,
        index_configuration=index_config,
    )
    return

INDEX_NAME = "demo-index"
chunk_size = 126
_create_index(INDEX_NAME, chunk_size)

### Get all Indexes

In [14]:
def get_all_indexes(collection_name: str):
    document_index = DocumentIndexClient(
        token=AA_TOKEN,
    )
    collection_path = CollectionPath(
        namespace=NAMESPACE, 
        collection=collection_name,
    )
    return document_index.list_assigned_index_names(collection_path)
indexes = get_all_indexes(collection_name=COLLECTION_NAME)
indexes


### Assign Index to Collection

In [None]:

def _assign_index_to_collection(collection_name: str, index_name: str):
    document_index = DocumentIndexClient(
        token=AA_TOKEN,
    )
    collection_path = CollectionPath(
        namespace=NAMESPACE,
        collection=collection_name,
    )
    document_index.assign_index_to_collection(
        collection_path=collection_path,
        index_name=index_name
    )
    return

_assign_index_to_collection(COLLECTION_NAME, INDEX_NAME)

### Similarity Search

In [None]:
from intelligence_layer.connectors import (
    CollectionPath,
    DocumentIndexClient,
    DocumentIndexRetriever,
)
def similariy_search(collection_name:str, index_name: str, query: str):
    document_index = DocumentIndexClient(
        token=AA_TOKEN,
    )
    
    document_index_retriever = DocumentIndexRetriever(
        document_index=document_index,
        index_name=index_name,
        namespace=NAMESPACE,
        collection=collection_name,
        k=5,
        threshold=0.5,
    )
    
    relevant_documents = document_index_retriever.get_relevant_documents_with_scores(
        query=query
    )
    return relevant_documents

query = "plyscraper"
relevant_documents = similariy_search(
    collection_name=COLLECTION_NAME,
    index_name=INDEX_NAME,
    query=query
)
relevant_documents

# Upload Data for RAG

## Upload Custom Dataset

In [None]:
ACCEPTED_FILE_EXTENSTIONS = ["txt", "docx", "pdf"]

In [None]:
collection_name = "rag-collection"
index_name = "rag-index"
chunk_size = 128
file_directory_path = "..\..\example_documents"

create_collection(collection_name=collection_name)
for root, dirs, files in os.walk(file_directory_path):
    for file in files:
        file_extenstion = file[file.rfind(".")+1:]
        if file_extenstion in ACCEPTED_FILE_EXTENSTIONS:
            file_path = os.path.join(root,file)
            upload_document(
                file_path=file_path,
                collection_name=collection_name
            )

_create_index(index_name=index_name, chunk_size=chunk_size)
_assign_index_to_collection(collection_name=collection_name, index_name=index_name)
get_documents(collection_name=collection_name)

## Upload quad dataset

In [None]:
from datasets import DatasetDict, load_dataset
from typing import cast
import pandas as pd
import urllib.parse

collection_name = "rewe-workshop-prep"
index_name = "rag-index"

HF_DATASET_NAME = "deepset/germanquad"

def load_german_quad():
    dataset = load_dataset(HF_DATASET_NAME, trust_remote_code=True)
    dataset = cast(DatasetDict, dataset)
    dataset = dataset["train"]

    data = dataset.to_pandas()
    data = cast(pd.DataFrame, data)

    data = data.sample(10, random_state=4711)

    return data

def store_german_quad_in_di(data: pd.DataFrame):
    texts = data.context.unique()

    di_client = DocumentIndexClient(os.getenv("AA_TOKEN"))
    _create_index(index_name=index_name, chunk_size=chunk_size)
    _assign_index_to_collection(collection_name=collection_name, index_name=index_name)

    for text in texts:
        slug = urllib.parse.quote_plus(text[:10])
        collection_path = CollectionPath(
            namespace=NAMESPACE, 
            collection=collection_name,
        )

        document_path = DocumentPath(
            collection_path=collection_path,
            document_name=slug,
        )

        document_contents = DocumentContents(contents=[text])
        di_client.add_document(
            document_path=document_path, contents=document_contents
        )

data = load_german_quad()
store_german_quad_in_di(data)