In [1]:
from constants import (JINA_API_KEY,
                       EMBEDDING_MODEL,
                       LLM,
                       OPENAI_API_KEY,
                       QDRANT_CLUSTER,
                       QDRANT_API_KEY
                       )

from utils import num_tokens

In [2]:
# LLM
from llama_index.llms.openai import OpenAI

# Load data from Directory
from llama_index.core import SimpleDirectoryReader

# TEST LLM

In [3]:
# Smoke test: send a short completion prompt to the configured OpenAI model.
llm_client = OpenAI(model=LLM, api_key=OPENAI_API_KEY)
response = llm_client.complete("Paul Graham is ")
print(response)

a computer scientist, entrepreneur, and venture capitalist. He is best known for co-founding the startup accelerator Y Combinator and for his work as an essayist and author on topics related to technology, startups, and entrepreneurship. Graham has been influential in the tech industry and is considered a thought leader in the startup community.


# TEST EMBEDDING MODEL

In [5]:
from llama_index.embeddings.jinaai import JinaEmbedding

# Jina embedding client; key and model name come from the constants module.
embed_model = JinaEmbedding(
    model=EMBEDDING_MODEL,
    api_key=JINA_API_KEY,
)

In [6]:
# Embed one English and one German sentence; for each, print the vector
# length followed by its first five components.
for sample_text in (
    "This is the text to embed",
    "Heute ist Freitag und ich habe keine Lust zu arbeiten",
):
    embeddings = embed_model.get_text_embedding(sample_text)
    print(len(embeddings))
    print(embeddings[:5])

768
[-0.004714012, -0.16542053, -0.040442467, 0.0311203, 0.31619263]
768
[0.3563029, 0.29810587, -0.07292429, 0.026819864, 0.03765996]


### Count Tokens

In [4]:
# Short sample sentence used to try out the token counter.
text = "I am awesome"

In [5]:
# Token count of the sample text as reported by num_tokens for model LLM.
token_count = num_tokens(text, model=LLM)
print(token_count)

3


# DATA UNDERSTANDING

**Quoting Jonathan from the N3XTCODER TEAM**

`We have prepared a synthetic dataset based on real documents and structure from a municipal authority. This dataset is simplified to be text-only (no pdfs, docs) and emulates some of the folder structure of the real data. It also includes some evaluation questions within the file eval_questions_v2.txt.`

`Everything in this dataset is fiction, and it obviously has shortcomings with regards to proving that a solution will work on the real data. However we hope it will help in the design of your solutions. If a solution is promising, then further evaluation work using real data from our challenge partner, can be undertaken at the end of the programme.`


### Loaders

Before an LLM can act on your data, you need to load it. LlamaIndex does this via data connectors, also called **Readers**.
Data connectors ingest data from different data sources and format it into **Document** objects. A **Document** is a collection of data (text) and metadata about that data.

In [14]:
import os
from typing import List
from llama_index.core import Document

In [16]:
def load_data(input_dir: str = "data/raw/v2_unzipped/txt", file_ext: List[str] = [".txt"], recursive: bool = True, filename_as_id: bool = True) -> List[Document]:
    """
    Load documents from a specified directory with given file extensions.

    This function utilizes the SimpleDirectoryReader to read files from the given
    directory and create documents. The function supports reading files with
    specified extensions and can optionally read files recursively from subdirectories.
    Additionally, it can use filenames as document IDs.

    Parameters:
    ----------
    input_dir : str, optional
        The path to the directory from which to load the files. Defaults to "data/raw/v2_unzipped/txt".
    file_ext : List[str], optional
        A list of file extensions to filter the files that need to be read. Defaults to [".txt"].
        A single extension given as a plain string is treated as a one-element list.
        None is forwarded unchanged to the reader.
    recursive : bool, optional
        If True, the reader will include files from subdirectories recursively. Defaults to True.
    filename_as_id : bool, optional
        If True, the filenames will be used as document IDs. Defaults to True.

    Returns:
    -------
    List[Document]
        A list of documents read from the specified directory.

    Example:
    -------
    >>> documents = load_data(input_dir="data/my_texts", file_ext=[".txt", ".md"], recursive=False, filename_as_id=False)
    Loaded 10 docs
    >>> print(len(documents))
    10
    """

    # Normalise the extension filter. The list is copied so the reader never
    # holds a reference to the shared default list (or to the caller's list),
    # which would let a later mutation leak into subsequent calls.
    if file_ext is None:
        required_exts = None
    elif isinstance(file_ext, str):
        required_exts = [file_ext]
    else:
        required_exts = list(file_ext)

    # SimpleDirectoryReader creates documents out of every file in a given directory
    reader = SimpleDirectoryReader(input_dir=input_dir,
                                   required_exts=required_exts,
                                   recursive=recursive,
                                   filename_as_id=filename_as_id
                                   )
    documents = reader.load_data()
    print(f"Loaded {len(documents)} docs")
    return documents


In [17]:
# Load the dataset using load_data's default directory and extension filter.
docs = load_data()

Loaded 44 docs


In [21]:
# Peek at the first three documents: source path, then the first 20 characters of text.
for sample_doc in docs[:3]:
    source_path = sample_doc.metadata['file_path']
    text_preview = sample_doc.text[:20]
    print(source_path)
    print(text_preview)

/workspaces/Intranet_Innovation_Nx3tCoder/data/raw/v2_unzipped/txt/Büro Bürgermeister/Da01-02_Presse.txt
 
Guidelines for Pre
/workspaces/Intranet_Innovation_Nx3tCoder/data/raw/v2_unzipped/txt/Büro Bürgermeister/da01-03.txt
13 
 
Eine neue DSFA
/workspaces/Intranet_Innovation_Nx3tCoder/data/raw/v2_unzipped/txt/ENNI Stadt und Service Niederrhein AöR/daenni01_dienstleistungsrahmenvertrag.txt
Max Mustermann
01.02


In [25]:
# Display the full raw text of the first loaded document.
docs[0].text

" \nGuidelines for Press and Public Relations of the City Administration of Fictitiousville\nThe press and public relations of the City Administration of Fictitiousville primarily aims to publicize the services provided by all organizational units as shown in the Departmental Distribution Plan. The goal is to create a positive image of the entire administration. Press and public relations are understood as professional communication with the aim of strengthening and improving the image of the city and administration. This should be achieved as a unified voice.\n\nThe press work of Fictitiousville is to be proactively shaped. This includes reporting on public relevant issues at an early stage.\n\nTo achieve these goals, the following regulations are made:\n\n1. Responsibilities\nThe responsibility for press and public relations lies with the Mayor. The coordination, central planning, and implementation are taken over by the Department 1.1 - Press Office. The content is coordinated with 

In [29]:
from llama_index.core import PromptTemplate

# Prompt text asking the model to name the predominant language of a document
# as either 'English' or 'German'. The {text} placeholder is where the document
# text is substituted.
identify_language_template_str = (
    """
    # ROLE
    Act as an Expert Document Language Detector, specialized in identifying the language of a given text.
    
    # CONTEXT
    You are provided with text that may contain content in English, German, or a mix of both languages.
    Your task is to identify the predominant language of the text, determining whether it is mostly in English or German.

    # RESPONSIBILITY
    The text is as follows:\n{text}\n
    Please identify the predominant language as either 'English' or 'German'.

    # RULES
    1. Output the predominant language as a one-word string: either 'English' or 'German'.
    2. If the text contains a mix of both languages, determine which language is more predominant.
    3. Ignore the presence of a few words or sentences in German or English in the text of a document. Focus on identifying the main language.
    """
)

In [23]:
# Wrap the template string in a PromptTemplate so its {text} placeholder can be filled per document.
bilingual_text_identifier = PromptTemplate(identify_language_template_str)

In [27]:
# Ask the LLM for the predominant language of the first loaded document:
# fill the language-detection template with the document text and send it
# as a completion prompt.
language_prompt = bilingual_text_identifier.format(text=docs[0].text)
response = OpenAI(model=LLM, api_key=OPENAI_API_KEY).complete(language_prompt)
print(response)

/2009


In [None]:
def update_metadata(documents: List[Document], llm=None) -> List[Document]:
    """
    Enrich the metadata of every document with folder name, token count and language.

    For each document three metadata fields are added (or overwritten):
    'folder_name' (name of the directory that contains the file),
    'num_tokens' (result of num_tokens on the document text for model LLM) and
    'language' (answer of the LLM to the language-detection prompt).

    Parameters:
    ----------
    documents : List[Document]
        Documents whose metadata contains a 'file_path' entry.
    llm : optional
        Object with a `complete(prompt)` method used for language detection.
        Defaults to None, in which case an OpenAI client is created from
        LLM and OPENAI_API_KEY.

    Returns:
    -------
    List[Document]
        The same list that was passed in; the documents are modified in place.
    """
    # Build the client once per call rather than once per document.
    if llm is None:
        llm = OpenAI(model=LLM, api_key=OPENAI_API_KEY)

    for doc in documents:

        # Extracting the directory path
        directory_path = os.path.dirname(doc.metadata['file_path'])
        folder_name = os.path.basename(directory_path)

        # Adding the folder name as a new metadata field
        doc.metadata['folder_name'] = folder_name

        # Adding the number of tokens in the text as a new metadata field
        tokens = num_tokens(doc.text, model=LLM)
        doc.metadata['num_tokens'] = tokens

        # Adding the language of the text as a new metadata field.
        # The prompt asks for a one-word answer; surrounding whitespace and
        # quote characters are stripped in case the model echoes them.
        prompt = bilingual_text_identifier.format(text=doc.text)
        language = str(llm.complete(prompt)).strip().strip("'\"")
        doc.metadata['language'] = language

    return documents

In [None]:


# Sample metadata record with the same fields as the metadata of a loaded document.
metadata = {'file_path': '/workspaces/Intranet_Innovation_Nx3tCoder/data/raw/v2_unzipped/txt/ENNI Stadt und Service Niederrhein AöR/daenni01_dienstleistungsrahmenvertrag.txt',
 'file_name': 'daenni01_dienstleistungsrahmenvertrag.txt',
 'file_type': 'text/plain',
 'file_size': 6215,
 'creation_date': '2024-05-10',
 'last_modified_date': '2024-04-30'}

# Directory that contains the file; it must be derived here because no
# notebook-level variable of this name exists otherwise.
directory_path = os.path.dirname(metadata['file_path'])
print(directory_path)
# Extracting the folder name
folder_name = os.path.basename(directory_path)

print(folder_name)


### Transformations

After loading the data, you need to process and transform it before putting it into a storage system. Transformations include 
- chunking
- extracting metadata
- embedding each chunk

This is necessary to make sure that the data can be retrieved and used optimally by the LLM.


Transformation input/outputs are Node objects (a Document is a subclass of a Node). Transformations can also be stacked and reordered.


In [None]:
# Imported here because no earlier cell brings these names into scope.
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter

# Chunk the loaded documents with a token-based splitter. Further
# transformations (metadata extractors, an embedding model, ...) can be
# appended to the list.
pipeline = IngestionPipeline(transformations=[TokenTextSplitter()])

nodes = pipeline.run(documents=docs)

## USE THIS IN MARKDOWN FILE TO EXPLAIN NODE
Under the hood, this splits your Document into Node objects, which are similar to Documents (they contain text and metadata) but have a relationship to their parent Document.

If you want to customize core components, like the text splitter, you can pass a custom list of transformations through this abstraction or apply them to the global Settings.

In [11]:
# Metadata of the first loaded document (the notebook-level list is `docs`).
docs[0].metadata

{'file_path': '/workspaces/Intranet_Innovation_Nx3tCoder/data/raw/v2_unzipped/txt/Büro Bürgermeister/Da01-02_Presse.txt',
 'file_name': 'Da01-02_Presse.txt',
 'file_type': 'text/plain',
 'file_size': 4398,
 'creation_date': '2024-05-10',
 'last_modified_date': '2024-04-30'}

### Qdrant Vector Database

In [None]:
from qdrant_client import QdrantClient
from llama_index.vector_stores.qdrant import QdrantVectorStore

# Qdrant client; cluster URL and API key come from the constants module.
client = QdrantClient(api_key=QDRANT_API_KEY, url=QDRANT_CLUSTER)

# Vector store on the "hacker-news-hybrid" collection with enable_hybrid turned on.
# this is important for the ingestion
vector_store_hybrid = QdrantVectorStore(
    collection_name="hacker-news-hybrid",
    client=client,
    batch_size=16,
    enable_hybrid=True,
)


# ETL

# FINAL ETL PIPELINE

In [None]:
from llama_index.embeddings.jinaai import JinaEmbedding
from llama_index.postprocessor.jinaai_rerank import JinaRerank

# Jina reranker configured with top_n=5.
jina_rerank = JinaRerank(top_n=5, api_key=JINA_API_KEY)

# Jina embedding client for the final pipeline, configured with embed_batch_size=16.
embed_model = JinaEmbedding(
    model=EMBEDDING_MODEL,
    api_key=JINA_API_KEY,
    embed_batch_size=16,
)