Skip to content

Commit

Permalink
Merge pull request #9 from FederalCSUMission/VectorDB_IPYNB_Yassin
Browse files Browse the repository at this point in the history
Added notebook for querying VDB w/o reloading docs
  • Loading branch information
josephyassin committed Sep 11, 2023
2 parents a91e0d5 + 4e986af commit b819da4
Show file tree
Hide file tree
Showing 2 changed files with 299 additions and 3 deletions.
6 changes: 3 additions & 3 deletions 10-VectorDB_Load.ipynb → 11-VectorDB_Load.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -189,10 +189,10 @@
"import weaviate\n",
"\n",
"WEAVIATE_URL = os.environ[\"VECTOR_DB_WEAVIATE_URL\"] \n",
"WEAVIATE_API_KEY = os.environ[\"VECOTR_DB_WEVIATE_API_KEY\"]\n",
"WEAVIATE_API_KEY = os.environ[\"VECTOR_DB_WEVIATE_API_KEY\"]\n",
"\n",
"client = weaviate.Client(url=WEAVIATE_URL, auth_client_secret=weaviate.AuthApiKey(WEAVIATE_API_KEY))\n",
"vectorstore = Weaviate.from_documents(docs, embeddings, client=client, by_text=False)"
"vectorstore = Weaviate.from_documents(docs, embeddings, client=client, by_text=False, index_name=\"arxivcs_index\")"
]
},
{
Expand Down Expand Up @@ -256,7 +256,7 @@
"\n",
"llm = AzureChatOpenAI(\n",
" openai_api_base=os.environ[\"AZURE_OPENAI_ENDPOINT\"] ,\n",
" openai_api_version=\"2023-05-15\",\n",
" openai_api_version=os.environ[\"AZURE_OPENAI_API_VERSION\"],\n",
" deployment_name=os.environ[\"AZURE_OPENAI_LLM_DEPLOYMENT\"],\n",
" openai_api_key=os.environ[\"AZURE_OPENAI_API_KEY\"],\n",
" openai_api_type=\"azure\",\n",
Expand Down
296 changes: 296 additions & 0 deletions 12-VectorDB_QA.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,296 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
},
"source": [
"# Question and answer LLM using the Vector Database data as context"
]
},
{
"cell_type": "markdown",
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
},
"source": [
"Prerequisites: \n",
"\n",
"1. This notebook assumes you've completed the previous notebook and have data loaded into your vector store. \n",
"\n",
"2. Azure OpenAI endpoint\n",
" Confirm that you've deployed both an embedding model and a LLM. https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/create-resource?pivots=web-portal\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"jupyter": {
"outputs_hidden": true,
"source_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
},
"outputs": [],
"source": [
"%pip install weaviate-client\n",
"%pip install langchain\n",
"%pip install openai[datalib]\n",
"%pip install tiktoken\n",
"%pip install python-dotenv"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"gather": {
"logged": 1694446501722
},
"jupyter": {
"outputs_hidden": false,
"source_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"from dotenv import load_dotenv\n",
"load_dotenv(\"credentials.env\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"gather": {
"logged": 1694446501905
},
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [],
"source": [
"from langchain.vectorstores import Weaviate\n",
"import weaviate\n",
"\n",
"WEAVIATE_URL = os.environ[\"VECTOR_DB_WEAVIATE_URL\"]\n",
"# NOTE(review): notebook 11 reads the API key from VECTOR_DB_WEVIATE_API_KEY — confirm which variable name credentials.env actually defines\n",
"WEAVIATE_API_KEY = os.environ[\"WEAVIATE_API_KEY\"]\n",
"\n",
"# Create client to interact with Weaviate\n",
"client = weaviate.Client(url=WEAVIATE_URL, auth_client_secret=weaviate.AuthApiKey(WEAVIATE_API_KEY))\n",
"#Print schemas in weaviate, you should see your index from the previous notebook named \"arxivcs_index\"\n",
"client.schema.get()"
]
},
{
"cell_type": "markdown",
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
},
"source": [
"### Set embedding parameters and LLM parameters"
]
},
{
"cell_type": "markdown",
"metadata": {
"nteract": {
"transient": {
"deleting": false
}
}
},
"source": [
"We are not embedding documents into Weaviate but we still need the embedding model to convert our prompt into a vector and do a similarity search. Make sure you are using the same embedding model here as you did to write data to the vector database. "
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"gather": {
"logged": 1694446502048
},
"jupyter": {
"outputs_hidden": false,
"source_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
},
"outputs": [],
"source": [
"#embedding parameters, should be the same as what was used to embed documents into weaviate\n",
"import openai\n",
"\n",
"openai.api_type = \"azure\"\n",
"# NOTE(review): key name 'openai_api_key' is lowercase here, while notebook 11 uses AZURE_OPENAI_API_KEY — confirm against credentials.env\n",
"openai.api_key = os.environ['openai_api_key']\n",
"openai.api_base = os.environ[\"AZURE_OPENAI_ENDPOINT\"]\n",
"openai.api_version = os.environ[\"AZURE_OPENAI_API_VERSION\"]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"gather": {
"logged": 1694446502196
},
"jupyter": {
"outputs_hidden": false,
"source_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
},
"outputs": [],
"source": [
"# LLM parameters\n",
"from langchain.chat_models import AzureChatOpenAI\n",
"\n",
"llm = AzureChatOpenAI(\n",
" openai_api_base=os.environ[\"AZURE_OPENAI_ENDPOINT\"],\n",
" openai_api_version=os.environ[\"AZURE_OPENAI_API_VERSION\"],\n",
" deployment_name=os.environ[\"AZURE_OPENAI_LLM_DEPLOYMENT\"],\n",
" openai_api_key=os.environ['openai_api_key'],\n",
" openai_api_type=\"azure\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"gather": {
"logged": 1694446250628
},
"jupyter": {
"outputs_hidden": false,
"source_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
}
},
"outputs": [
{
"data": {
"text/plain": [
"'The main themes in these retrieved documents are quantum mechanics, measurement in quantum mechanics, the limitations of quantum equations and amplitudes, the physical meaning and interpretation of long numbers in quantum computing, and the untestable nature of claims about individual mega-states in quantum computing. The documents also discuss the relationship between quantum mechanics and materialism, as well as the challenges and limitations of quantum computing.'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain import PromptTemplate, LLMChain\n",
"\n",
"# Prompt\n",
"prompt = PromptTemplate.from_template(\n",
" \"Summarize the main themes in these retrieved docs: {docs}\"\n",
")\n",
"\n",
"# Chain\n",
"llm_chain = LLMChain(llm=llm, prompt=prompt)\n",
"\n",
"#Converting our input question to an Embedding to use for search\n",
"response = openai.Embedding.create(\n",
" input=\"What is Quantum mechanics?\",\n",
" engine=\"textembedding\"\n",
")\n",
"embeddings = response['data'][0]['embedding']\n",
"\n",
"# Run\n",
"db = Weaviate(client=client, index_name=\"arxivcs_index\", text_key=\"text\")\n",
"docs = db.similarity_search_by_vector(embedding=embeddings)\n",
"result = llm_chain(docs)\n",
"\n",
"# Output\n",
"result['text']"
]
}
],
"metadata": {
"kernel_info": {
"name": "python310-sdkv2"
},
"kernelspec": {
"display_name": "Python 3.10 - SDK v2",
"language": "python",
"name": "python310-sdkv2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"microsoft": {
"host": {
"AzureML": {
"notebookHasBeenCompleted": true
}
},
"ms_spell_check": {
"ms_spell_check_language": "en"
}
},
"nteract": {
"version": "nteract-front-end@1.0.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit b819da4

Please sign in to comment.