In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
from dotenv import load_dotenv, find_dotenv
import os
load_dotenv(find_dotenv())

True

In [14]:
AA_TOKEN=os.getenv("AA_TOKEN")
NAMESPACE=os.getenv("AA_NAMESPACE")

# Document Index

## Collection

### Creating A Collection

In [15]:
from intelligence_layer.connectors import (
    CollectionPath,
    DocumentIndexClient,
)
def create_collection(collection_name: str):
    """Create the collection ``collection_name`` inside ``NAMESPACE``.

    A ``DocumentIndexClient`` authenticated with ``AA_TOKEN`` is built for the
    call. Nothing is returned.

    Args:
        collection_name: Name of the collection to create.
    """
    client = DocumentIndexClient(token=AA_TOKEN)
    target = CollectionPath(namespace=NAMESPACE, collection=collection_name)
    client.create_collection(target)

# Two collections are created: COLLECTION_NAME is used by the cells that
# follow, COLLECTION_NAME_DELETE is removed again in the deletion cell.
COLLECTION_NAME="demo"
COLLECTION_NAME_DELETE="demo-collection-delete"

create_collection(COLLECTION_NAME)
create_collection(COLLECTION_NAME_DELETE)

### Get all the Collections

In [16]:
def get_collections():
    """Return the collections the Document Index lists for ``NAMESPACE``.

    The client is authenticated with ``AA_TOKEN``; the result of
    ``list_collections`` is passed through unchanged.
    """
    client = DocumentIndexClient(token=AA_TOKEN)
    return client.list_collections(namespace=NAMESPACE)
# Fetch the collections of the namespace and display them as the cell output.
collections = get_collections()
collections

[CollectionPath(namespace='aleph-alpha', collection='demo-case-study-bosch-team-sru'),
 CollectionPath(namespace='aleph-alpha', collection='Jan-IL-vs-LC'),
 CollectionPath(namespace='aleph-alpha', collection='image-demo'),
 CollectionPath(namespace='aleph-alpha', collection='search-evaluation'),
 CollectionPath(namespace='aleph-alpha', collection='0010600002Eb5faAAB-demo'),
 CollectionPath(namespace='aleph-alpha', collection='wikipedia-de'),
 CollectionPath(namespace='aleph-alpha', collection='infineon-applikationsbeispiel'),
 CollectionPath(namespace='aleph-alpha', collection='f13-ci-collection'),
 CollectionPath(namespace='aleph-alpha', collection='bpa-demo-conet'),
 CollectionPath(namespace='aleph-alpha', collection='test-collection'),
 CollectionPath(namespace='aleph-alpha', collection='faqs'),
 CollectionPath(namespace='aleph-alpha', collection='BMZ-demo'),
 CollectionPath(namespace='aleph-alpha', collection='topic-test'),
 CollectionPath(namespace='aleph-alpha', collection='docum

### Delete a Collection

In [17]:
def _delete_collection(collection_name: str):
    """Delete the collection ``collection_name`` from ``NAMESPACE``.

    Args:
        collection_name: Name of the collection to remove.
    """
    client = DocumentIndexClient(token=AA_TOKEN)
    target = CollectionPath(namespace=NAMESPACE, collection=collection_name)
    client.delete_collection(target)

_delete_collection(COLLECTION_NAME_DELETE)
# List the collections again to check that the deletion was successful
get_collections()


[CollectionPath(namespace='aleph-alpha', collection='demo-case-study-bosch-team-sru'),
 CollectionPath(namespace='aleph-alpha', collection='Jan-IL-vs-LC'),
 CollectionPath(namespace='aleph-alpha', collection='image-demo'),
 CollectionPath(namespace='aleph-alpha', collection='search-evaluation'),
 CollectionPath(namespace='aleph-alpha', collection='0010600002Eb5faAAB-demo'),
 CollectionPath(namespace='aleph-alpha', collection='wikipedia-de'),
 CollectionPath(namespace='aleph-alpha', collection='infineon-applikationsbeispiel'),
 CollectionPath(namespace='aleph-alpha', collection='f13-ci-collection'),
 CollectionPath(namespace='aleph-alpha', collection='bpa-demo-conet'),
 CollectionPath(namespace='aleph-alpha', collection='test-collection'),
 CollectionPath(namespace='aleph-alpha', collection='faqs'),
 CollectionPath(namespace='aleph-alpha', collection='BMZ-demo'),
 CollectionPath(namespace='aleph-alpha', collection='topic-test'),
 CollectionPath(namespace='aleph-alpha', collection='docum

## Documents

### Parse Content of Document (PDF, TXT and DOCX) 

In [18]:
from pathlib import Path
from parser import FileParser

def get_file_content(path) -> tuple[str, str]:
    """Parse the file at ``path`` with a fresh ``FileParser``.

    Args:
        path: Location of the file handed to ``FileParser.parse``.

    Returns:
        A ``(content, file name)`` tuple, in that order.
    """
    content, name = FileParser().parse(path)
    return content, name

# The example documents live in "example_documents", two directory levels
# above the current working directory.
root = Path().cwd().parent.parent / "example_documents"
PATH_PDF = root / "Der-Schattenmann.pdf"
PATH_TXT = root / "Plyscraper.txt"
PATH_DOCX = root / "Erfunde-Programmiersprache.docx"

# Parse the DOCX example and print its name followed by its text.
file_content, file_name = get_file_content(PATH_DOCX)

print("File name:", file_name, "\n", "-"*50)
print(file_content)


File name: Erfunde-Programmiersprache.docx 
 --------------------------------------------------
Einführung in Nohtyp
Willkommen bei Nohtyp, der Programmiersprache, die alles auf den Kopf stellt! Nohtyp ist eine skurrile und einzigartige Sprache, die entwickelt wurde, um Programmierer herauszufordern und zu unterhalten. Hier sind die Grundlagen, um loszulegen.
1. Syntax
Die Syntax von Nohtyp ist ungewöhnlich und erfordert ein Umdenken. Hier sind einige grundlegende Regeln:
Rückwärts-Schreiben: Alle Befehle und Variablennamen müssen rückwärts geschrieben werden.
Kein Leerzeichen: Leerzeichen sind nicht erlaubt. Verwenden Sie stattdessen Unterstriche _.
Zahlen in Worten: Zahlen müssen als Wörter geschrieben werden (z.B. eins, zwei, drei).
2. Variablen
Variablen in Nohtyp werden wie folgt deklariert:
niam_var = "Hallo Welt"
Hier ist niam_var die Variable main_var, rückwärts geschrieben.
3. Datentypen
Nohtyp unterstützt die folgenden Datentypen:
Zeichenkette: Zeichenketten werden in doppelt

### Upload Content

In [19]:

from intelligence_layer.connectors import (
    CollectionPath,
    DocumentContents,
    DocumentIndexClient,
    DocumentPath,
)
from datetime import datetime

def upload_document(file_path: str, collection_name: str):
    """Parse a local file and add it as a document to a collection.

    The file is read through ``get_file_content``; the file name it returns
    becomes the document name. The current local time (ISO 8601 string) is
    attached as ``upload_date`` metadata before the document is added.

    Args:
        file_path: Location of the file to parse and upload.
        collection_name: Collection in ``NAMESPACE`` that receives the document.
    """
    client = DocumentIndexClient(token=AA_TOKEN)
    target_collection = CollectionPath(
        namespace=NAMESPACE, collection=collection_name
    )

    text, name = get_file_content(file_path)
    destination = DocumentPath(
        collection_path=target_collection, document_name=name
    )

    payload = DocumentContents.from_text(text)
    payload.metadata = {"upload_date": datetime.now().isoformat()}

    client.add_document(destination, contents=payload)

# Upload one example document of each format into the demo collection.
upload_document(PATH_PDF, COLLECTION_NAME)
upload_document(PATH_TXT, COLLECTION_NAME)
upload_document(PATH_DOCX, COLLECTION_NAME)

### Get all Documents within a Collection

In [20]:

def get_documents(collection_name: str):
    """Return what ``DocumentIndexClient.documents`` yields for a collection.

    Args:
        collection_name: Collection in ``NAMESPACE`` whose documents are listed.
    """
    client = DocumentIndexClient(token=AA_TOKEN)
    source = CollectionPath(namespace=NAMESPACE, collection=collection_name)
    return client.documents(collection_path=source)

documents = get_documents(COLLECTION_NAME)

# Print each listed entry as indented JSON.
for document in documents:
    print(document.model_dump_json(indent=2))

{
  "document_path": {
    "collection_path": {
      "namespace": "aleph-alpha",
      "collection": "demo"
    },
    "document_name": "Erfunde-Programmiersprache.docx"
  },
  "created": "2024-11-27T11:49:18.615543",
  "version": 1
}
{
  "document_path": {
    "collection_path": {
      "namespace": "aleph-alpha",
      "collection": "demo"
    },
    "document_name": "Plyscraper.txt"
  },
  "created": "2024-11-27T11:49:18.515651",
  "version": 1
}
{
  "document_path": {
    "collection_path": {
      "namespace": "aleph-alpha",
      "collection": "demo"
    },
    "document_name": "Der-Schattenmann.pdf"
  },
  "created": "2024-11-27T11:49:18.403193",
  "version": 1
}
{
  "document_path": {
    "collection_path": {
      "namespace": "aleph-alpha",
      "collection": "demo"
    },
    "document_name": "Apollo 11 Mission"
  },
  "created": "2024-08-29T09:04:02.557691",
  "version": 1
}
{
  "document_path": {
    "collection_path": {
      "namespace": "aleph-alpha",
      "collectio

### Get Single Document (and Content)

In [21]:
from intelligence_layer.connectors import (
    CollectionPath,
    DocumentIndexClient,
    DocumentIndexRetriever,
    IndexConfiguration,
    IndexPath,
    DocumentPath
)
def get_document(collection_name: str, document_name: str):
    """Fetch a single document by name from a collection in ``NAMESPACE``.

    Args:
        collection_name: Collection that holds the document.
        document_name: Name under which the document was stored.

    Returns:
        The result of ``DocumentIndexClient.document`` for that path.
    """
    client = DocumentIndexClient(token=AA_TOKEN)
    location = DocumentPath(
        collection_path=CollectionPath(
            namespace=NAMESPACE, collection=collection_name
        ),
        document_name=document_name,
    )
    return client.document(location)

# Document names as stored in the collection (the uploaded files' names).
document_name_pdf = "Der-Schattenmann.pdf"
document_name_txt = "Plyscraper.txt"
document_name_docx = "Erfunde-Programmiersprache.docx"

# Only the PDF document is fetched here; the other two names are unused in this cell.
document = get_document(
    collection_name=COLLECTION_NAME,
    document_name=document_name_pdf
)

print(document.model_dump_json(indent=2))

{
  "contents": [
    "Der Schattenmann  \nvon Lukas Böhl  \n \n„Er ist verrückt“, dachte sich der kleine Jan, der im Treppenhaus des Mehrfamilienhauses saß, in \ndem er mit seinen Eltern wohnte und den Kater der Nachbarin streichelte. „Herr Veith ist alt und \nverrückt“. Seine Tür stand immer einen Spalt weit offen, egal ob bei Tag oder Nacht, und man \nhörte ihn wirres Zeug flüstern. Wenn es so leise war wie jetzt, glaubte man, die Worte \nentschlüsseln zu können, die er brabbelte. Doch so sehr man sich auch darauf konzentrierte, es \nwar unmöglich, den Sinn dieses Kauderwelschs zu entziffern.  \n \nJan fuhr Felix, dem Kater, in gleichmäßigen Zügen über den Rücken. Mit der anderen Hand stützte \ner seinen Kopf und starrte durch das große Fenster hinunter in den dunklen Hof vor dem Haus, \nder gera de von einem heftigen Sommerregen überschwemmt wurde. Felix hasste den Regen, für \nJan dagegen war es im Moment ein willkommener Zeitvertreib, den Tropfen beim Fallen \nzuzusehen. Der Rege

## Data Retriever

### Create Index

In [22]:
from intelligence_layer.connectors import (
    CollectionPath,
    DocumentIndexClient,
    IndexConfiguration,
    IndexPath,
)

def _create_index(index_name: str, chunk_size: int):
    """Create an index named ``index_name`` in ``NAMESPACE``.

    The index is configured with the ``"asymmetric"`` embedding type and the
    given chunk size. Nothing is returned.

    Args:
        index_name: Name of the index to create.
        chunk_size: Value passed as ``chunk_size`` to ``IndexConfiguration``.
    """
    client = DocumentIndexClient(token=AA_TOKEN)
    configuration = IndexConfiguration(
        embedding_type="asymmetric", chunk_size=chunk_size
    )
    client.create_index(
        index_path=IndexPath(namespace=NAMESPACE, index=index_name),
        index_configuration=configuration,
    )

INDEX_NAME = "demo-index"
# NOTE(review): 126 here, while the RAG upload cell later uses 128 — confirm
# that 126 is intended and not a typo.
chunk_size = 126
_create_index(INDEX_NAME, chunk_size)

### Get all Indexes Assigned to a Collection

In [23]:
def get_all_indexes(collection_name: str):
    """Return the result of ``list_assigned_index_names`` for a collection.

    Args:
        collection_name: Collection in ``NAMESPACE`` to inspect.
    """
    client = DocumentIndexClient(token=AA_TOKEN)
    target = CollectionPath(namespace=NAMESPACE, collection=collection_name)
    return client.list_assigned_index_names(target)
# Show the index names reported for the demo collection.
indexes = get_all_indexes(collection_name=COLLECTION_NAME)
indexes


['asymmetric', 'asym-32', 'asym-128', 'demo-index']

### Assign Index to Collection

In [24]:

def _assign_index_to_collection(collection_name: str, index_name: str):
    """Assign the index ``index_name`` to a collection in ``NAMESPACE``.

    Args:
        collection_name: Collection that receives the index.
        index_name: Name of the index to assign.
    """
    client = DocumentIndexClient(token=AA_TOKEN)
    target = CollectionPath(namespace=NAMESPACE, collection=collection_name)
    client.assign_index_to_collection(
        collection_path=target, index_name=index_name
    )

# Attach the index created above to the demo collection.
_assign_index_to_collection(COLLECTION_NAME, INDEX_NAME)

### Similarity Search

In [29]:
from intelligence_layer.connectors import (
    CollectionPath,
    DocumentIndexClient,
    DocumentIndexRetriever,
)
def similariy_search(
    collection_name: str,
    index_name: str,
    query: str,
    k: int = 5,
    threshold: float = 0.5,
):
    """Run a query against a collection through a ``DocumentIndexRetriever``.

    The function name keeps its original spelling so existing calls keep
    working.

    Args:
        collection_name: Collection in ``NAMESPACE`` to search.
        index_name: Index used for the search.
        query: Query text.
        k: Forwarded as ``k`` to ``DocumentIndexRetriever``; presumably the
            maximum number of results — confirm against the retriever docs.
            Defaults to the previously hard-coded 5.
        threshold: Forwarded as ``threshold`` to ``DocumentIndexRetriever``;
            presumably a minimum score — confirm against the retriever docs.
            Defaults to the previously hard-coded 0.5.

    Returns:
        The result of ``get_relevant_documents_with_scores`` for ``query``.
    """
    document_index = DocumentIndexClient(
        token=AA_TOKEN,
    )

    document_index_retriever = DocumentIndexRetriever(
        document_index=document_index,
        index_name=index_name,
        namespace=NAMESPACE,
        collection=collection_name,
        k=k,
        threshold=threshold,
    )

    return document_index_retriever.get_relevant_documents_with_scores(
        query=query
    )

# Search the demo collection and display the scored results as cell output.
query = "plyscraper"
relevant_documents = similariy_search(
    collection_name=COLLECTION_NAME,
    index_name=INDEX_NAME,
    query=query
)
relevant_documents

[SearchResult(id=DocumentPath(collection_path=CollectionPath(namespace='aleph-alpha', collection='demo'), document_name='Plyscraper.txt'), score=0.7895327, document_chunk=DocumentChunk(text='Plyscraper', start=0, end=9, metadata=None)),
 SearchResult(id=DocumentPath(collection_path=CollectionPath(namespace='aleph-alpha', collection='demo'), document_name='Plyscraper.txt'), score=0.6021297, document_chunk=DocumentChunk(text='For example, in Tokyo, a plyscraper that is 350 meters (1,150 ft) tall called the W350 Project has been proposed with plans to be complete in 2041.[10]  The W350 Project plans to be a hybrid plyscraper using only 10% steel and the remainder engineered wood.  In London, research and planning are underway for the Oakwood Tower which is estimated to be 300 meters (980 ft) tall.[11]  This would be an 80-story building integrated into the London skyline.', start=3927, end=4379, metadata=None)),
 SearchResult(id=DocumentPath(collection_path=CollectionPath(namespace='aleph

# Upload Data for RAG

## Upload Custom Dataset

In [26]:
# File extensions (without the leading dot) that the upload loop accepts.
ACCEPTED_FILE_EXTENSTIONS = ["txt", "docx", "pdf"]

In [31]:
collection_name = "rag-collection"
index_name = "rag-index1"
chunk_size = 128
file_directory_path = Path().cwd().parent.parent / "example_documents"

create_collection(collection_name=collection_name)

# Walk the directory tree and upload every file with an accepted extension.
# The loop variables are deliberately not called `root` / `file_name`, so the
# `root` Path and the `file_name` defined in the parsing cell stay intact.
for walk_dir, _sub_dirs, dir_entries in os.walk(file_directory_path):
    for dir_entry in dir_entries:
        # splitext yields "" for names without an extension, so a file that
        # is literally called "txt" or "pdf" is no longer treated as accepted.
        file_extension = os.path.splitext(dir_entry)[1].lstrip(".")
        if file_extension in ACCEPTED_FILE_EXTENSTIONS:
            upload_document(
                file_path=os.path.join(walk_dir, dir_entry),
                collection_name=collection_name,
            )

_create_index(index_name=index_name, chunk_size=chunk_size)
_assign_index_to_collection(collection_name=collection_name, index_name=index_name)
get_documents(collection_name=collection_name)

[DocumentInfo(document_path=DocumentPath(collection_path=CollectionPath(namespace='aleph-alpha', collection='rag-collection'), document_name='zehn_euro_gehen_auf_reisen.txt'), created=datetime.datetime(2024, 11, 27, 11, 53, 54, 624227), version=2),
 DocumentInfo(document_path=DocumentPath(collection_path=CollectionPath(namespace='aleph-alpha', collection='rag-collection'), document_name='Plyscraper.txt'), created=datetime.datetime(2024, 11, 27, 11, 53, 54, 484826), version=2),
 DocumentInfo(document_path=DocumentPath(collection_path=CollectionPath(namespace='aleph-alpha', collection='rag-collection'), document_name='Erfunde-Programmiersprache.docx'), created=datetime.datetime(2024, 11, 27, 11, 53, 54, 387434), version=2),
 DocumentInfo(document_path=DocumentPath(collection_path=CollectionPath(namespace='aleph-alpha', collection='rag-collection'), document_name='Der-Schattenmann.pdf'), created=datetime.datetime(2024, 11, 27, 11, 53, 54, 276451), version=2)]

## Upload GermanQuAD Dataset

In [33]:
from datasets import DatasetDict, load_dataset
from typing import cast
import pandas as pd
import urllib.parse

# collection_name and index_name are read as globals by
# store_german_quad_in_di below.
collection_name = "demo"
index_name = "rag-index1"

# Hugging Face dataset identifier passed to load_dataset.
HF_DATASET_NAME = "deepset/germanquad"

def load_german_quad():
    """Load ``HF_DATASET_NAME`` and return a fixed 10-row sample of its train split.

    The dataset is loaded with ``trust_remote_code=True``, the ``"train"``
    split is converted to a pandas DataFrame, and 10 rows are sampled with
    ``random_state=4711`` so the selection is repeatable.

    Returns:
        A DataFrame holding the 10 sampled rows.
    """
    splits = cast(
        DatasetDict, load_dataset(HF_DATASET_NAME, trust_remote_code=True)
    )
    train_frame = cast(pd.DataFrame, splits["train"].to_pandas())
    return train_frame.sample(10, random_state=4711)

def store_german_quad_in_di(data: pd.DataFrame):
    """Store every distinct ``context`` text of ``data`` as a document.

    Reads the module-level ``collection_name``, ``index_name`` and
    ``chunk_size``: the index is created and assigned to the collection
    first, then each unique context is added as its own document.

    Args:
        data: DataFrame with a ``context`` column holding the texts.
    """
    texts = data.context.unique()

    # Use the token loaded at the top of the notebook, like every other
    # client in this file, rather than re-reading the environment.
    di_client = DocumentIndexClient(token=AA_TOKEN)
    _create_index(index_name=index_name, chunk_size=chunk_size)
    _assign_index_to_collection(collection_name=collection_name, index_name=index_name)

    # The target collection is the same for every document, so build it once.
    collection_path = CollectionPath(
        namespace=NAMESPACE,
        collection=collection_name,
    )

    for text in texts:
        # The document name is the URL-quoted first 10 characters of the text.
        # NOTE(review): two different texts sharing their first 10 characters
        # map to the same document name — confirm this cannot happen for the
        # sampled data, or derive a unique name.
        slug = urllib.parse.quote_plus(text[:10])
        document_path = DocumentPath(
            collection_path=collection_path,
            document_name=slug,
        )

        document_contents = DocumentContents(contents=[text])
        di_client.add_document(
            document_path=document_path, contents=document_contents
        )

# Load the sampled rows and store their contexts in the Document Index.
data = load_german_quad()
store_german_quad_in_di(data)