From cf71d8b0893bed4c1da1ccf054329b229bf74b25 Mon Sep 17 00:00:00 2001 From: Oleh Kostromin Date: Tue, 25 Jun 2024 19:52:42 +0200 Subject: [PATCH] ner-docs --- .github/workflows/deploy.yml | 1 + src/app/docs/ner/page.md | 89 ++++++ src/app/docs/quick-start/page.md | 92 ------- src/app/docs/tagging-overview/page.md | 23 ++ src/app/docs/use-cases-local-chatbot/page.md | 78 ------ src/app/docs/use-cases-rag-agent/page.md | 246 ----------------- src/app/docs/use-cases-rag-chatbot/page.md | 272 ------------------- src/lib/navigation.js | 19 +- src/markdoc/tags.js | 10 + 9 files changed, 131 insertions(+), 699 deletions(-) create mode 100644 src/app/docs/ner/page.md delete mode 100644 src/app/docs/quick-start/page.md create mode 100644 src/app/docs/tagging-overview/page.md delete mode 100644 src/app/docs/use-cases-local-chatbot/page.md delete mode 100644 src/app/docs/use-cases-rag-agent/page.md delete mode 100644 src/app/docs/use-cases-rag-chatbot/page.md diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 682da5c..fc337b3 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -33,3 +33,4 @@ jobs: with: github_token: ${{ secrets.GH_ACCESS_TOKEN }} publish_dir: ./out + cname: skllm.beastbyte.ai diff --git a/src/app/docs/ner/page.md b/src/app/docs/ner/page.md new file mode 100644 index 0000000..9b1454d --- /dev/null +++ b/src/app/docs/ner/page.md @@ -0,0 +1,89 @@ +--- +title: Named Entity Recognition +nextjs: + metadata: + title: Named Entity Recognition + description: Learn about NER. +--- + +## Overview + +{% callout title="Warning" type="warning" %} +Named Entity Recognition is an experimental feature and may be subject to instability. Please be aware that the API and/or functionality could change. +{% /callout %} + +Named Entity Recognition is a process of locating and classifying the named entities in a provided text. + + +Currently, Scikit-LLM has a single NER estimator (only works with the GPT family) called `Explainable NER`. + +Exemplary usage: + +```python +from skllm.models.gpt.tagging.ner import GPTExplainableNER as NER + +entities = { + "PERSON": "A name of an individual.", + "ORGANIZATION": "A name of a company.", + "DATE": "A specific time reference." +} + +data = [ + "Tim Cook announced new Apple products in San Francisco on June 3, 2022.", + "Elon Musk visited the Tesla factory in Austin on January 10, 2021.", + "Mark Zuckerberg introduced Facebook Metaverse in Silicon Valley on May 5, 2023." +] + +ner = NER(entities=entities, display_predictions=True) +tagged = ner.fit_transform(data) +``` + +The model will tag the entities and provide a short reasoning behind its choice. If the `display_predictions` output is set to `True`, the outputs of the model are parsed automatically and presented in a human readable way: each entity is highlighted and the explanation is displayed on hovering over the entity. + +Exemplary output: + +============================== +{% html innerHTML="
<p><b>Entities:</b> <mark>PERSON</mark> <mark>ORGANIZATION</mark> <mark>DATE</mark></p>
<p><mark>Tim Cook [PERSON]</mark> announced new <mark>Apple [ORGANIZATION]</mark> products in San Francisco on <mark>June 3, 2022 [DATE]</mark>.</p>
<p><mark>Elon Musk [PERSON]</mark> visited the <mark>Tesla [ORGANIZATION]</mark> factory in Austin on <mark>January 10, 2021 [DATE]</mark>.</p>
<p><mark>Mark Zuckerberg [PERSON]</mark> introduced <mark>Facebook [ORGANIZATION]</mark> Metaverse in Silicon Valley on <mark>May 5, 2023 [DATE]</mark>.</p>
"%} + +{% /html %} +============================== + +The `display_output` functionality works in both Jupyter Notebook and plain Python scripts. When used outside Jupyter, a new HTML page will be auto-generated and opened in a new browser window. + + +## Sparse vs Dense NER + +We distinguish between two modes of generating the predictions: sparse and dense. + +In dense mode the model produces a complete (tagged) output right away, while in sparse mode only a list of entities is produced which is then mapped to the text via regex. + +In most of the scenarios the usage of sparse mode should be preferable for the following reasons: + - lower number of output tokens (cheaper to use); + - strict validation -> it is guaranteed that the output is invertable and only contains the specified entities; + - higher accuracy, especially with smaller models. + +Dense mode should only be used when the following conditions are met: + - a larger model is used (e.g. gpt-4); + - the text is expected to contain multiple (distinct) instances of lexically ambiguous words. + +For example, in a sentence "**Apple** is the favorite fruit of the CEO of **Apple**", the first and second occurrences of the word "Apple" should be classified as different entities, which is only possible using the dense mode. + +## API Reference + +The following API reference only lists the parameters needed for the initialization of the estimator. The remaining methods follow the syntax of a scikit-learn transformer. + +### GPTExplainableNER + +```python +from skllm.models.gpt.tagging.ner import GPTExplainableNER +``` + +| **Parameter** | **Type** | **Description** | +| ------------- | -------- | ------------------------ | +| `entities` | `dict` | A dictionary of entities to recognize, with keys as **uppercase** entity names and values as descriptions. | +| `display_predictions` | `Optional[bool]` | Determines whether to display predictions, by default False. | +| `sparse_output` | `Optional[bool]` | Determines whether to generate a sparse representation of the predictions, by default True. | +| `model` | `Optional[str]` | A model to use, by default "gpt-4o". | +| `key` | `Optional[str]` | Estimator-specific API key; if None, retrieved from the global config, by default None. | +| `org` | `Optional[str]` | Estimator-specific ORG key; if None, retrieved from the global config, by default None. | +| `num_workers` | `Optional[int]` | Number of workers (threads) to use, by default 1. | \ No newline at end of file diff --git a/src/app/docs/quick-start/page.md b/src/app/docs/quick-start/page.md deleted file mode 100644 index b9554e4..0000000 --- a/src/app/docs/quick-start/page.md +++ /dev/null @@ -1,92 +0,0 @@ ---- -title: Quick start -nextjs: - metadata: - title: Quick start - description: Get started with Dingo. ---- - -## - -To get started with Dingo, you can install the framework using pip: - -```bash -pip install agent-dingo -``` - -Now we can create a simple pipeline that summarizes the text provided as input and translates it into French. In this particular example, we will use GPT-3.5 model from OpenAI, but Dingo supports other LLM providers as well. 
- -Firstly, make sure to set the `OPENAI_API_KEY` environment variable to your OpenAI API key: - -```bash -export OPENAI_API_KEY=your-api-key -``` - -Next, create a new Python script and import the necessary modules: - -```python -from agent_dingo.llm.openai import OpenAI -from agent_dingo.core.blocks import PromptBuilder -from agent_dingo.core.message import UserMessage -from agent_dingo.core.state import ChatPrompt -``` - -Then, define the pipeline by creating and chaining the building blocks together: - -````python -# Model -gpt = OpenAI("gpt-3.5-turbo") - -# Summary prompt block -summary_pb = PromptBuilder( - [UserMessage("Summarize the text in 10 words: ```{text}```.")] -) - -# Translation prompt block -translation_pb = PromptBuilder( - [UserMessage("Translate the text into {language}: ```{summarized_text}```.")], - from_state=["summarized_text"], -) - -# Pipeline -pipeline = summary_pb >> gpt >> translation_pb >> gpt -```` - -Finally, run the pipeline with the input text: - -```python -input_text = """ -Dingo is an ancient lineage of dog found in Australia, exhibiting a lean and sturdy physique adapted for speed and endurance, dingoes feature a wedge-shaped skull and come in colorations like light ginger, black and tan, or creamy white. They share a close genetic relationship with the New Guinea singing dog, diverging early from the domestic dog lineage. Dingoes typically form packs composed of a mated pair and their offspring, indicating social structures that have persisted through their history, dating back approximately 3,500 years in Australia. -""" - -output = pipeline.run(text = input_text, language = "french") -print(output) -``` - -To deploy the pipeline as a web service, you can use the following code: - -```python -#server.py -from agent_dingo.serve import serve_pipeline - -serve_pipeline({"gpt-summary-translation": pipeline}, port=8000) -``` - -This will start a web server on port 8000, exposing the pipeline as a REST API. You can now send requests using any HTTP client, or even using the official OpenAI python client library: - -```python -# client.py -import openai - -client = openai.OpenAI(base_url = "http://localhost:8000") - -messages = [ - {"role": "context_text", "content": ""}, - {"role": "context_language", "content": "french"}, -] - -out = client.chat.completions.create(messages = messages, model = "gpt-summary-translation") -print(out) -``` - -In this example, we have created a simple pipeline which is not designed for multi-turn conversations. To make it compatible with OpenAI Chat structure, Dingo defines special message roles like `context_text` and `context_language` which are used to pass the input arguments to the pipeline. Section [Core](/docs/core-overview) goes into more details on differences between context and chat inputs and how to handle them in Dingo. \ No newline at end of file diff --git a/src/app/docs/tagging-overview/page.md b/src/app/docs/tagging-overview/page.md new file mode 100644 index 0000000..d61d4b9 --- /dev/null +++ b/src/app/docs/tagging-overview/page.md @@ -0,0 +1,23 @@ +--- +title: Overview +nextjs: + metadata: + title: Overview + description: Learn about text tagging. +--- + +Tagging in Scikit-LLM can be an arbitrary task that takes a raw text and returns the same text with inserted XML-like tags. + +For example, a sentiment analysis task could look as follows: + +Input: +```bash +I love my new phone, but I am disappointed with the battery life. 
+``` + +Output: +```xml +I love my new phone, but I am disappointed with the battery life. +``` + +In an ideal scenario, such tagging process should be invertible, so the original text can always be reconstructed from the tagged one. However, this is not always feasible and hence not considered to be a mandatory requirement. diff --git a/src/app/docs/use-cases-local-chatbot/page.md b/src/app/docs/use-cases-local-chatbot/page.md deleted file mode 100644 index c735143..0000000 --- a/src/app/docs/use-cases-local-chatbot/page.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -title: Building a local chatbot -nextjs: - metadata: - title: Building a local chatbot - description: Learn how to build a local chat bot. ---- - -## Overview - -In the [previous tutorial](/docs/use-cases-rag-chatbot), we have built a RAG chatbot using a closed-source LLM and embedding model from OpenAI. Since some users prefer running LLMs locally, this tutorial will demonstrate how to build a RAG chatbot using a fully local, open-source solution by changing just two Dingo components. - ---- - -## Chatbot Architecture and Technical Stack - -![Local App Architecture](https://gist.githubusercontent.com/iryna-kondr/f4779bfaa918e8af9ab1d455d63e142c/raw/4ef5627a6ce5ac37ce3ffacb786a35e49558f674/dingo_local_app_architecture.svg) - -The application will consist of the following components: - -1. [Streamlit](https://streamlit.io/) application: provides a frontend interface for users to interact with a chatbot. - -2. FastAPI: facilitates communication between the frontend and backend. - -3. [CapybaraHermes-2.5-Mistral-7B-GGUF](https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF): LLM that generates responses upon receiving user queries. - -4. Embedding model from [SentenceTransformers](https://sbert.net/): computes text embeddings. - -5. [QDrant](https://qdrant.tech/): vector database that stores embedded chunks of text. - -There are two main differences to an architecture used in the previous tutorial: - -- **Usage of quantized open-source LLM:** - -For running the model locally, Dingo can use [`llama-cpp-python`](https://github.com/abetlen/llama-cpp-python) that is a Python binding for [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library which allows to run models converted to GGUF, a binary file format for storing models for inference with `llama.cpp`. - -You can find many GGUF models on [Hugging Face Hub](https://huggingface.co/models?library=gguf). We have chosen `CapybaraHermes-2.5-Mistral-7B-GGUF` model [prvided by TheBloke](https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF) for this tutorial. - -In order to download the model, you must go to [`Files and versions`](https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF/tree/main), where you will find many different files to choose from. They correspond to different [quantization types](https://huggingface.co/docs/hub/en/gguf#quantization-types) of the model. Quantization involves reducing the memory needed to store model weights by decreasing their precision (for example, from 32-bit floating points to 4-bit integers). Higher precision usually leads to a higher accuracy but also requires more computational resources, which can make the model slower and more costly to operate. Decreasing the precision allows loading large models that typically would not fit into memory, and accelerating the inference. Usually, a 4-bit quantization is considered to be an optimal balance between performance, and size/speed for LLMs. 
- -- **Usage of open-source embedding model:** - -SentenceTransformers is a Python toolkit that is built on top of Hugging Face's transformers library. It facilitates using transformer models, like BERT, RoBERTa, and others, for generating sentence embeddings. These embeddings can be used for tasks such as clustering, semantic search, and classification of texts. You can check the provided pre-trained models tuned for specific tasks either on the page of SentenceTransformers [here](https://sbert.net/docs/pretrained_models.html#model-overview), or on the [Hugging Face Hub](https://huggingface.co/models?library=sentence-transformers&sort=downloads). The models on Hugging Face Hub have a [widget](https://huggingface.co/docs/hub/models-widgets#whats-a-widget) that allows running inferences and playing with the model directly in the browser. - ---- - -## Implementation - -As the first step, we need to initialize an embedding model, a chat model and a vector store that will be populated with embedded chunks in the next step. - -```python -# components.py -from agent_dingo.rag.embedders.sentence_transformer import SentenceTransformer -from agent_dingo.rag.vector_stores.qdrant import Qdrant -from agent_dingo.llm.llama_cpp import LlamaCPP - -# Initialize an embedding model -embedder = SentenceTransformer(model_name="paraphrase-MiniLM-L6-v2") - -# Initialize a vector store -vector_store = Qdrant(collection_name="phi_3_docs", embedding_size=384, path="./qdrant_db") - -# Initialize an LLM -model = "capybarahermes-2.5-mistral-7b.Q4_K_M.gguf" -llm = LlamaCPP(model=model, n_ctx = 2048) -``` - -The subsequent steps involve populating the vector store, creating a RAG pipeline, and building a chatbot UI. These steps are exactly the same as in the [previous tutorial](/docs/use-cases-rag-chatbot). - -By asking a question about the Phi-3 family of models, we can verify that our local model accurately retrieves the relevant information: - -![Dingo Local Chatbot](https://i.ibb.co/23VmG8Y/Screenshot-2024-05-04-at-21-12-59.png) - ---- - -## Conclusion - -In this tutorial we have built a simple local chatbot that utilizes RAG technique and successfully retrieves information from a vector store to generate up-to-date responses. It can be seen that Dingo provides developers with flexibility, as the components of a LLM pipeline can be easily exchanged. For example, we were able to switch from a proprietary solution to a fully open-source solution running locally by simply changing two components of the pipeline. diff --git a/src/app/docs/use-cases-rag-agent/page.md b/src/app/docs/use-cases-rag-agent/page.md deleted file mode 100644 index 7904c41..0000000 --- a/src/app/docs/use-cases-rag-agent/page.md +++ /dev/null @@ -1,246 +0,0 @@ ---- -title: Building a RAG agent -nextjs: - metadata: - title: Building a RAG agent - description: Learn how to build a RAG agent. ---- - -## Overview - -In previous tutorials, we built a pipeline that embeds the chunks of text similar to user's query to a system message, which allows the chatbot to access the external knowledge base. However, in practice, this approach may be too naive, as it: - -- Embeds the data regardless its necessity; -- Does not provide a mechanism to selectively access different data sources; -- Does not allow to modify the query before retrieving the data; -- Does not allow to pass multiple queries. - -All of these limitations can be addressed by building a more sophisticated pipeline logic, that might have a routing and query-rewriting mechanisms. 
However, a viable alternative is to use an `Agent` which can inherently perform all of these tasks. - -The fundamental concept of agents involves using a language model to determine a sequence of actions (including the usage of external tools) and their order. One possible action could be retrieving data from an external knowledge base in response to a user's query. In this tutorial, we will develop a simple Agent that accesses multiple data sources and invokes data retrieval when needed. - -As an example of external knowledge bases, we will use three webpages containing release announcement posts about recently released generative models: - -1. [Phi-3 family of models](https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/) from Microsoft; -2. [Llama 3 model](https://ai.meta.com/blog/meta-llama-3/) from Meta; -3. [OpenVoice model](https://research.myshell.ai/open-voice) from MyShell. - -Since all of these models were released recently and this information was not included in GPT-4's training data, GPT can either provide no information about these topics, or may hallucinate and generate incorrect responses (see example in my previous article [here](/docs/use-cases-rag-chatbot)). By creating an agent that is able to retrieve data from external datasources (such as webpages linked above), we will provide an LLM with relevant contextual information that will be used for generating responses. - ---- - -## RAG Agent Architecture and Technical Stack - -![App Architecture](https://gist.githubusercontent.com/iryna-kondr/f4779bfaa918e8af9ab1d455d63e142c/raw/f33293fd26a27e636286b8a9285b56d120bf1cab/dingo_agent_architecture.svg) - -The application will consist of the following components: - -1. [Streamlit](https://streamlit.io/) application: provides a frontend interface for users to interact with a chatbot. - -2. `FastAPI`: facilitates communication between the frontend and backend. - -3. `Dingo Agent`: `GPT-4 Turbo` model from OpenAI that has access to provided knowledge bases and invokes data retrieval from them if needed. - -4. `LLMs docs`: a vector store containing documentation about two recently released Phi-3 family of models and Llama 3. - -5. `Audio gen docs`: a vector store containing documentation about recently released OpenVoice model. - -6. `Embedding V3 small` model from OpenAI: computes text embeddings. - -7. [QDrant](https://qdrant.tech/): vector database that stores embedded chunks of text. - ---- - -## Implementation - -### Indexing - -#### Step 1: - -As the first step, we need to initialize an embedding model, a chat model, and two vector stores: one for storing documentation for Llama 3 and Phi-3, and another for storing documentation for OpenVoice. - -{% callout title="Note" %} -It is needed to set OPENAI_API_KEY environment variable. 
-{% /callout %} - -```python -# components.py -from agent_dingo.rag.embedders.openai import OpenAIEmbedder -from agent_dingo.rag.vector_stores.qdrant import Qdrant -from agent_dingo.llm.openai import OpenAI - -# Initialize an embedding model -embedder = OpenAIEmbedder(model="text-embedding-3-small") - -# Initialize a vector store with information about Phi-3 and Llama 3 models -llm_vector_store = Qdrant(collection_name="llm", embedding_size=1536, path="./qdrant_db_llm") - -# Initialize a vector store with information about OpenVoice model -audio_gen_vector_store = Qdrant(collection_name="audio_gen", embedding_size=1536, path="./qdrant_db_audio_gen") - -# Initialize an LLM -llm = OpenAI(model = "gpt-3.5-turbo") -``` - -#### Step 2: - -Then, the above-mentioned websites have to be parsed, chunked into smaller pieces, and embedded. The embedded chunks are used to populate the corresponding vector stores. - -```python -# build.py -from components import llm_vector_store, audio_gen_vector_store, embedder -from agent_dingo.rag.readers.web import WebpageReader -from agent_dingo.rag.chunkers.recursive import RecursiveChunker - -# Read the content of the websites -reader = WebpageReader() -phi_3_docs = reader.read("https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/") -llama_3_docs = reader.read("https://ai.meta.com/blog/meta-llama-3/") -openvoice_docs = reader.read("https://research.myshell.ai/open-voice") - -# Chunk the documents -chunker = RecursiveChunker(chunk_size=512) -phi_3_chunks = chunker.chunk(phi_3_docs) -llama_3_chunks = chunker.chunk(llama_3_docs) -openvoice_chunks = chunker.chunk(openvoice_docs) - -# Embed the chunks -for doc in [phi_3_chunks, llama_3_chunks, openvoice_chunks]: - embedder.embed_chunks(doc) - -# Populate LLM vector store with embedded chunks about Phi-3 and Llama 3 -for chunk in [phi_3_chunks, llama_3_chunks]: - llm_vector_store.upsert_chunks(chunk) - -# Populate audio gen vector store with embedded chunks about OpenVoice -audio_gen_vector_store.upsert_chunks(openvoice_chunks) -``` - -Run this script: - -```bash -python build.py -``` - -#### Step 3: - -Once the vector store is created, we can create a RAG pipeline. To access the pipeline from the streamlit application, we can serve it using the `serve_pipeline` function, which provides a REST API compatible with the OpenAI API (this means that we can use an official OpenAI Python client to interact with the pipeline). - -```python -# serve.py -from agent_dingo.agent import Agent -from agent_dingo.serve import serve_pipeline -from components import llm_vector_store, audio_gen_vector_store, embedder, llm - -agent = Agent(llm, max_function_calls=3) - -# Define a function that an agent can call if needed -@agent.function -def retrieve(topic: str, query: str) -> str: - """Retrieves the documents from the vector store based on the similarity to the query. - This function is to be used to retrieve the additional information in order to answer users' queries. - - Parameters - ---------- - topic : str - The topic, can be either "large_language_models" or "audio_generation_models". - "large_language_models" covers the documentation of Phi-3 family of models from Microsoft and Llama 3 model from Meta. - "audio_generation_models" covers the documentation of OpenVoice voice cloning model from MyShell. - Enum: ["large_language_models", "audio_generation_models"] - query : str - A string that is used for similarity search of document chunks. 
- - Returns - ------- - str - JSON-formatted string with retrieved chunks. - """ - print(f'called retrieve with topic {topic} and query {query}') - if topic == "large_language_models": - vs = llm_vector_store - elif topic == "audio_generation_models": - vs = audio_gen_vector_store - else: - return "Unknown topic. The topic must be one of `large_language_models` or `audio_generation_models`" - query_embedding = embedder.embed(query)[0] - retrieved_chunks = vs.retrieve(k=5, query=query_embedding) - print(f'retrieved data: {retrieved_chunks}') - return str([chunk.content for chunk in retrieved_chunks]) - -# Create a pipeline -pipeline = agent.as_pipeline() - -# Serve the pipeline -serve_pipeline( - {"gpt-agent": pipeline}, - host="127.0.0.1", - port=8000, - is_async=False, -) -``` - -Run the script: - -```bash -python serve.py -``` - -At this stage, we have an openai-compatible compatible backend with a model named `gpt-agent`, running on `http://127.0.0.1:8000/`. The Streamlit application will send requests to this backend. - -#### Step 4: - -Finally, we can proceed with building a chatbot UI: - -```python -# app.py -import streamlit as st -from openai import OpenAI - -st.title("🦊 Agent") - -# provide any string as an api_key parameter -client = OpenAI(base_url="http://127.0.0.1:8000", api_key="123") - -if "openai_model" not in st.session_state: - st.session_state["openai_model"] = "gpt-agent" -if "messages" not in st.session_state: - st.session_state.messages = [] - -for message in st.session_state.messages: - avatar = "🦊" if message["role"] == "assistant" else "👤" - with st.chat_message(message["role"], avatar=avatar): - st.markdown(message["content"]) - -if prompt := st.chat_input("How can I assist you today?"): - st.session_state.messages.append({"role": "user", "content": prompt}) - with st.chat_message("user", avatar="👤"): - st.markdown(prompt) - - with st.chat_message("assistant", avatar="🦊"): - stream = client.chat.completions.create( - model=st.session_state["openai_model"], - messages=[ - {"role": m["role"], "content": m["content"]} - for m in st.session_state.messages - ], - stream=False, - ) - response = st.write_stream((i for i in stream.choices[0].message.content)) - st.session_state.messages.append({"role": "assistant", "content": response}) -``` - -Run the application: - -```bash -streamlit run app.py -``` - -🎉 We have successfully developed an agent that is augmented with the technical documentation of several newly released generative models, and can retrieve information from these documents if necessary. To assess the agent's ability to decide when to call the `retrieve` function and its effectiveness in retrieving data from external sources, we can pose some questions about the documents provided. As you can see below, the agent generated correct responses to these questions: - -![Dingo Agent](https://i.ibb.co/Kh3zVGV/Screenshot-2024-05-05-at-15-33-02.png) - ---- - -## Conclusion - -In this tutorial, we have developed a RAG agent that can access external knowledge bases and retrieve data from them if needed. Unlike a "naive" RAG pipeline, the agent can selectively decide whether to access the external data, which data source to use (and how many times), and how to rewrite the user's query before retrieving the data. This approach allows the agent to provide more accurate and relevant responses, while the high-level pipeline logic remains as simple as of a "naive" RAG pipeline. 
diff --git a/src/app/docs/use-cases-rag-chatbot/page.md b/src/app/docs/use-cases-rag-chatbot/page.md deleted file mode 100644 index 2cf8547..0000000 --- a/src/app/docs/use-cases-rag-chatbot/page.md +++ /dev/null @@ -1,272 +0,0 @@ ---- -title: Building a RAG chatbot -nextjs: - metadata: - title: Building a RAG chatbot - description: Learn how to build a RAG chat bot. ---- - -## Overview - -Chatbots are among the most popular use cases for large language models (LLMs). They are designed to understand and respond to user inquiries, provide answers, perform tasks, or direct users to resources. Utilizing chatbots can significantly decrease customer support costs and improve response times to user requests. However, a common issue with chatbots is their tendency to deliver generic information when users expect domain-specific responses. Additionally, they may generate outdated information when users need current updates. - -For demonstrations, I have chosen the webpage about [Phi-3](https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/) — a family of open AI models by Microsoft released in April 2024. - -If we ask how many parameters Phi-3-mini model has, GPT-4 will generate a response indicating that it does not know the answer: - -```python -from openai import OpenAI -client = OpenAI() - -completion = client.chat.completions.create( - model="gpt-4-turbo", - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "How many parameters does the Phi-3-mini model from Microsoft have?"} - ] -) - -print(completion.choices[0].message) - -# As of the last update, the Phi model variants by Microsoft, including the Phi-3-mini, are not explicitly defined in publicly available resources. There has been no detailed information released about a specific "Phi-3-mini" model. -``` - -If we ask GPT-3.5 the same question, it will hallucinate and provide incorrect information: - -```python -completion = client.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "How many parameters does the Phi-3-mini model from Microsoft have?"} - ] -) - -print(completion.choices[0].message) - -# The Phi-3-mini model from Microsoft has 121 million parameters. -``` - -These problems can be addressed by using the retrieval-augmented generation (RAG) technique. This technique supplements the LLM with a knowledge base external to its training data sources. For instance, an organization's internal knowledge base, such as a Wiki or internal PDFs, can be provided. - -The tutorial below will demonstrate how to build a simple chatbot that utilizes RAG technique and can retrieve information about a recently released family of Phi-3 models. - ---- - -## RAG Architecture - -![RAG Architecture](https://gist.githubusercontent.com/iryna-kondr/f4779bfaa918e8af9ab1d455d63e142c/raw/ce8e33447a34db0259f888d39c58256c2cbf43b1/dingo_rag_use_case.svg) - -The basic steps of the Naive RAG include: - -**1. Indexing** - -Indexing starts with extraction of raw data from various formats such as webpage, PDF, etc. To manage the context restrictions of language models and increase the response accuracy, the extracted text is broken down into smaller, manageable chunks. For now, Dingo supports a recursive chunking that involves breaking down a large text input into smaller segments recursively until the chunks are of a desired size. 
The choice of the chunking size is heavily dependent on the needs of RAG application. Thus, it is recommeded to experiment with different sizes to select the best one that will allow preserving the context and maintaining the accuracy. The extracted chunks are encoded into vector representations using an embedding model and stored in a vector database. - -**2. Retrieval** - -When a user submits a query, the RAG system uses the encoding model from the indexing phase to convert the query into a vector representation. It then calculates similarity scores between the query vector and the vectors of chunks in the vector database. The system identifies and retrieves the top K chunks with the highest similarity to the query. These chunks serve as the expanded context for the prompt. - -**3. Generation** - -The users query and selected chunks are combined into a single prompt and passed to LLM. Thus, the model is provided with the necessary contextual information to formulate and deliver a response. - ---- - -## Chatbot Architecture and Technical Stack - -![App Architecture](https://gist.githubusercontent.com/iryna-kondr/f4779bfaa918e8af9ab1d455d63e142c/raw/7f8f41d5bf00a23638b8958cc970281857a43a6f/dingo_app_architecture.svg) - -On a high level, the application will consist of the following components: - -1. [Streamlit](https://streamlit.io/) application: provides a frontend interface for users to interact with a chatbot. - -2. `FastAPI`: facilitates communication between the frontend and backend. - -3. `GPT-4 Turbo` model from OpenAI: LLM that generates responses upon receiving user queries. - -4. `Embedding V3 small` model from OpenAI: computes text embeddings. - -5. [QDrant](https://qdrant.tech/): vector database that stores embedded chunks of text. - ---- - -## Implementation - -### Indexing - -#### Step 1: - -As the first step, we need to initialize an embedding model, a chat model and a vector store that will be populated with embedded chunks in the next step. - -{% callout title="Note" %} -It is needed to set OPENAI_API_KEY environment variable. -{% /callout %} - -```python -# components.py -from agent_dingo.rag.embedders.openai import OpenAIEmbedder -from agent_dingo.rag.vector_stores.qdrant import Qdrant -from agent_dingo.llm.openai import OpenAI - -# Initialize an embedding model -embedder = OpenAIEmbedder(model="text-embedding-3-small") - -# Initialize a vector store -vector_store = Qdrant(collection_name="phi_3_docs", embedding_size=1536, path="./qdrant_db") - -# Initialize an LLM -llm = OpenAI(model="gpt-4-turbo") -``` - -#### Step 2: - -Then, the website about [Phi-3](https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/) family of models has to be parsed, chunked into smaller pieces, and embedded. The embedded chunks are used to populate a vector store. 
- -```python -# build.py -from components import vector_store, embedder -from agent_dingo.rag.readers.web import WebpageReader -from agent_dingo.rag.chunkers.recursive import RecursiveChunker - -# Read the content of the website -reader = WebpageReader() -docs = reader.read("https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/") - -# Chunk the document -chunker = RecursiveChunker(chunk_size=512) -chunks = chunker.chunk(docs) - -# Embed the chunks -embedder.embed_chunks(chunks) - -# Populate vector store with embedded chunks -vector_store.upsert_chunks(chunks) -``` - -Run this script: - -```bash -python build.py -``` - -At this stage, the vector store is created, allowing chunks to be retrieved and incorporated into the prompt based on a user's query. - -#### [Optional Step] - -It is also possible to identify which chunks are retrieved and check their similarity scores to the user's query: - -```python -# test.py -from components import vector_store, embedder -query = "How many parameters does Phi-3-mini model from Microsoft have?" -query_embedding = embedder.embed([query])[0] -# select a single chunk (k=1) with the highest similarity to the query -retrieved_chunks = vector_store.retrieve(k=1, query=query_embedding) -print(retrieved_chunks) -#[RetrievedChunk(content=' Starting today, Phi-3-mini , a 3.8B language model is available...', document_metadata={'source': 'https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/'}, score=0.7154231207501476)] -``` - -We can see that the correct chunk was retrieved, which indeed contains information about the number of parameters in the Phi-3-mini model. - -### Retrieval and Augmentation - -#### Step 3: - -Once the vector store is created, we can create a RAG pipeline and serve it. - -Streamlit [only supports](https://docs.streamlit.io/develop/api-reference/chat/st.chat_message) two types of messages: `User` and `Assistant`. However, it us often more appropriate to include the retrieved data into the `System` message. Therefore, we use a custom block that injects a `System` message into the chat prompt before passing it to the RAG modifier. - -```python -# serve.py -from agent_dingo.rag.prompt_modifiers import RAGPromptModifier -from agent_dingo.serve import serve_pipeline -from agent_dingo.core.blocks import InlineBlock -from agent_dingo.core.state import ChatPrompt -from agent_dingo.core.message import SystemMessage -from components import vector_store, embedder, llm - -@InlineBlock() -def inject_system_message(state: ChatPrompt, context, store): - messages = state.messages - system_message = SystemMessage("You are a helpful assistant.") - return ChatPrompt([system_message]+messages) - -rag = RAGPromptModifier(embedder, vector_store) -pipeline = inject_system_message>>rag>>llm - -serve_pipeline( - {"gpt-rag": pipeline}, - host="127.0.0.1", - port=8000, - is_async=False, -) -``` - -Run the script: - -```bash -python serve.py -``` - -At this stage, we have a RAG pipeline compatible with the OpenAI API, named `gpt-rag`, running on `http://127.0.0.1:8000/`. The Streamlit application will send requests to this backend. 
- -#### Step 4: - -Finally, we can proceed with building a chatbot UI: - -```python -# app.py -import streamlit as st -from openai import OpenAI - -st.title("🦊 LLM Expert") - -# provide any string as an api_key parameter -client = OpenAI(base_url="http://127.0.0.1:8000", api_key="123") - -if "openai_model" not in st.session_state: - st.session_state["openai_model"] = "gpt-rag" - -if "messages" not in st.session_state: - st.session_state.messages = [] - -for message in st.session_state.messages: - avatar = "🦊" if message["role"] == "assistant" else "👤" - with st.chat_message(message["role"], avatar=avatar): - st.markdown(message["content"]) - -if prompt := st.chat_input("How can I assist you today?"): - st.session_state.messages.append({"role": "user", "content": prompt}) - with st.chat_message("user", avatar="👤"): - st.markdown(prompt) - - with st.chat_message("assistant", avatar="🦊"): - stream = client.chat.completions.create( - model=st.session_state["openai_model"], - messages=[ - {"role": m["role"], "content": m["content"]} - for m in st.session_state.messages - ], - stream=False, - ) - response = st.write_stream((i for i in stream.choices[0].message.content)) - st.session_state.messages.append({"role": "assistant", "content": response}) -``` - -Run the application: - -```bash -streamlit run app.py -``` - -🎉 We have successfully developed a chatbot that is augmented with the technical documentation of Phi-3 family of models. -If we pose the same question to this chatbot as we previously did to GPT-4 and GPT-3.5 models, we will observe that it correctly answers the question: - -![Dingo Chatbot](https://i.ibb.co/rQm0m41/Dingo-Chatbot.png) - ---- - -## Conclusion - -In this tutorial we have built a simple chatbot that utilizes RAG technique and successfully retrieves information from a vector store to generate up-to-date responses. It can be seen that Dingo enhances the development of LLM-based applications by offering essential (core) features and flexibility. That allows developers to quickly and easily create application prototypes. diff --git a/src/lib/navigation.js b/src/lib/navigation.js index 5e9472a..e7cd6c4 100644 --- a/src/lib/navigation.js +++ b/src/lib/navigation.js @@ -1,3 +1,4 @@ + export const navigation = [ { title: 'Introduction', @@ -29,17 +30,13 @@ export const navigation = [ { title: 'Overview', href: '/docs/text-vectorization' }, ], }, - // { - // title: 'Use cases', - // links: [ - // { title: 'Building a RAG chatbot', href: '/docs/use-cases-rag-chatbot' }, - // { - // title: 'Building a local chatbot', - // href: '/docs/use-cases-local-chatbot', - // }, - // { title: 'Building a RAG agent', href: '/docs/use-cases-rag-agent' }, - // ], - // }, + { + title: 'Tagging', + links: [ + { title: 'Overview', href: '/docs/tagging-overview' }, + { title: 'Named Entity Recognition', href: '/docs/ner' }, + ] + }, { title: 'Contributing', links: [{ title: 'How to contribute', href: '/docs/how-to-contribute' }], diff --git a/src/markdoc/tags.js b/src/markdoc/tags.js index 957b1d0..b7fa7ff 100644 --- a/src/markdoc/tags.js +++ b/src/markdoc/tags.js @@ -42,6 +42,16 @@ const tags = { href: { type: String }, }, }, + html: { + parse: true, + attributes: { + innerHTML: { type: String }, + }, + render: ({ innerHTML, children }) => { + console.log(innerHTML); + return
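+        <div dangerouslySetInnerHTML={{ __html: innerHTML }} />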
+ }, + }, } export default tags