In [None]:
!pip install langchain_community langchain_text_splitters chromadb datasets sentence-transformers langchain ctransformers langchain_experimental

# Using ChromaDB and LangChain for an open-source RAG system for natural language data

In [1]:
# import
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

from langchain_core.documents import Document
import re

In [3]:
def create_vector_db_from_docs(docs: list[Document], embedding_function):
  # load embeddings of docs into Chroma
  db = Chroma.from_documents(docs, embedding_function)

  return db

In [4]:
def query_vector_db(db, docs: list[Document], query: str, k: int = 5):
  """Get the top-k most similar documents to the query."""
  # define number of docs to return
  k = min(k, len(docs))

  # find top-k most similar docs
  similar_docs = db.similarity_search_with_score(query, k)

  return similar_docs

## Create and query vector database on open-source essay data (learning)

### Chunk and embed data

In [None]:
from datasets import load_dataset

dataset = load_dataset("iamketan25/essay-instructions-dataset")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/10.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# get 1 sample
text = str(dataset['train'][4]['chosen'])

In [None]:
# save for loading with langchain
f = open('essay.txt', 'w')
f.write(text)
f.close()

In [None]:
# load the document
loader = TextLoader("/content/essay.txt")
documents = loader.load()

# split it into chunks
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000, chunk_overlap=0, separators=[" ", ",", "\n"]
)
docs = text_splitter.split_documents(documents)

In [None]:
len(docs)

21

In [None]:
docs

In [None]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

db = create_vector_db_from_docs(docs, embedding_function)

### Query vector database

In [None]:
# query the db
query = "What approach do Google China take?"

similar_docs = query_vector_db(docs, query=query, k=5)

# print results
print(similar_docs)

# NOTE: this uses cosine distance, not similarity, which can be greater than 1 and a lower value is better - it is the complement to cosine similarity
# https://stackoverflow.com/questions/76678783/langchains-chroma-vectordb-similarity-search-with-score-and-vectordb-simil

[(Document(page_content='sales, as an essential output for novelty as suggested by diamond model. Google China also experiences competition in the advertising industry from other organisations such as Facebook and other social networks.\n\nStrengths\n\nIrrespective of the model that is deployed to assess innovation within an organisation, the focus should be on utilising opportunities and strengths to overcome weakness and threats. Google China has established a central position on the search industry. It holds the highest share in the market of more than 60%. China is also experiencing an immense growth in android applications.\n\nThrough Smartphone, android applications make it possible to access many of the Google products. The company has low acquisition costs in comparison with revenues. The advancement of Google chrome encompasses one of the major achievements of the company with the capacity to enhance its dominance in the Chinese market in comparison with its competitors such a

In [None]:
len(similar_docs)

5

## Create and query vector database on open-source chat data

In [2]:
import sqlite3

### Loading from google drive
Skip if you have manually dragged the db file to /content.

In [3]:
# load dialogsum data from google drive (have to manually upload db file from repo to google drive first)
from google.colab import drive
drive.mount('/content/db', force_remount=True)
drive_path_to_db = 'db/MyDrive/Research/Prompt_Routing/'
gdrive_fp = f'/content/{drive_path_to_db}ConversationHistoryRecords.db'
content_fp = '/content/ConversationHistoryRecords.db'

Mounted at /content/db


In [7]:
import shutil

shutil.copyfile(gdrive_fp, content_fp)

'/content/ConversationHistoryRecords.db'

### Loading chat data

In [None]:
conn = sqlite3.connect(content_fp)
cursor = conn.cursor()
cursor.execute('SELECT * FROM messageRecords;')

rows = cursor.fetchall()
print("All rows:", rows, "\n\n", "Individual rows:\n")
all_chat_data = ""
all_chat_data_list = []
for row in rows:
    all_chat_data+= f" {row[1]}"
    all_chat_data_list.append(row[1])
    print(row[1])

conn.close()

In [11]:
all_chat_data

" #Person2#: Oh, we always look forward to going on a holiday. We always make sure we can get away at least once a year. #Person1#: It's too late. You see, in our market October is the season for the kind of commodity. So the goods must be shipped before October or we won't be ready for the season. #Person1#: Harry, do you like the opera? #Person1#: It is not an easy job. The government is working on it. Lots of trees have been planted in the north of the capital. Also, methods of generating rain are being used by the government to fight against the drought. But of course, it will not work overnight. So, have you come out with a new plan for today? #Person1#: Did you go to school today? #Person2#: Yeh. I work part-time at a supermarket. #Person1#: I really want to go to China for vacation, but I can't find a cheap plane ticket. #Person2#: well, she's alright, but not really my cup of tea. What about the blond with the red dress?  #Person1#: That's right. How much will that be? #Person1

In [None]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [13]:
def get_text_chunks(text, separator, cleaning_regex, is_seperator_regex):
  """Get chunks from chat data. Chat data is one long string and different separators generate different results."""
  # split text on custom separator
  text_splitter = CharacterTextSplitter(
    chunk_size=10, chunk_overlap=0, separator=separator, is_separator_regex=is_seperator_regex
  ) # splitter will combine chunks split on ```separator``` until it is at least ```chunk_size```
  split_text = text_splitter.split_text(text)

  # create Documents, cleaning the page_content for better embeddings
  docs = [Document(page_content=re.sub(cleaning_regex, "", x)) for x in split_text]

  return docs

### Chunking on each speaker turn

In [None]:
speaker_turn_chunks = get_text_chunks(text=all_chat_data, separator="#Person", cleaning_regex="[12]#:\s",is_seperator_regex=True)
speaker_turn_chunks

In [None]:
len(speaker_turn_chunks)

In [None]:
speaker_turn_db = create_vector_db_from_docs(docs=speaker_turn_chunks, embedding_function=embedding_function)

In [None]:
# query the chat data
query = "Where were trees planted?"
similar_docs = query_vector_db(db=speaker_turn_db, docs=speaker_turn_chunks, query=query, k=5)
similar_docs

[(Document(page_content='It is not an easy job. The government is working on it. Lots of trees have been planted in the north of the capital. Also, methods of generating rain are being used by the government to fight against the drought. But of course, it will not work overnight. So, have you come out with a new plan for today?'),
  1.164899468421936),
 (Document(page_content='So, what did you do out there? I mean besides bask in the sun, obviously.'),
  1.4934577941894531),
 (Document(page_content='where would you like to be located?'),
  1.5621901750564575),
 (Document(page_content='Do you mean the red flowers? They are peony flowers?'),
  1.5675867795944214),
 (Document(page_content='Great, my brother and I went to a lakeside house with my uncle and aunt.'),
  1.6061367988586426)]

Results are poor. Answer is getting lost in long speaker turns.

### Chunking on each sentence

In [None]:
sentence_chunks = get_text_chunks(text=all_chat_data, separator=". ", cleaning_regex="#Person[12]#: ", is_seperator_regex=False)
sentence_chunks

In [None]:
sentence_chunks = get_text_chunks(text=all_chat_data, separator=". ", cleaning_regex="", is_seperator_regex=False)
sentence_chunks

In [15]:
len(sentence_chunks)

237

In [16]:
sentence_db = create_vector_db_from_docs(docs=sentence_chunks, embedding_function=embedding_function)
# query the chat data
query = "Where were trees planted?"
similar_docs = query_vector_db(db=sentence_db, docs=sentence_chunks, query=query, k=5)
similar_docs

[(Document(page_content='Lots of trees have been planted in the north of the capital'),
  0.6489994525909424),
 (Document(page_content='Also, methods of generating rain are being used by the government to fight against the drought'),
  1.4273008108139038),
 (Document(page_content='So, what did you do out there? I mean besides bask in the sun, obviously'),
  1.492201805114746),
 (Document(page_content='I spent lots of time doing research on the folk art of our country'),
  1.5239131450653076),
 (Document(page_content="I'm a Reporter for the Sun"), 1.5554907321929932)]

## Generate natural language response

In [17]:
import torch
from langchain.llms import CTransformers

print(torch.cuda.is_available())

False


In [18]:
# configure model
config = {'max_new_tokens': 100, 'temperature': 0}
llm = CTransformers(model='TheBloke/Mistral-7B-Instruct-v0.1-GGUF', model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf", config=config)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

mistral-7b-instruct-v0.1.Q4_K_M.gguf:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

In [20]:
from langchain import PromptTemplate, LLMChain

# create the llm chain with the prompt template
template = """<s>[INST] You are a helpful, respectful and honest assistant. Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the question:
Question: {question} [/INST] </s>
"""

prompt = PromptTemplate(template=template, input_variables=["context", "question"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [32]:
# extract the text from the most similar docs to the query
page_content_list = [doc[0].page_content for doc in similar_docs]

# clean up context examples for injection into prompt
context_p = "\n".join(page_content_list)

question_p = "Where were trees planted?"

# view prompt
prompt.format(context=context_p, question=question_p)

"<s>[INST] You are a helpful, respectful and honest assistant. Context information is below.\n---------------------\nLots of trees have been planted in the north of the capital\nAlso, methods of generating rain are being used by the government to fight against the drought\nSo, what did you do out there? I mean besides bask in the sun, obviously\nI spent lots of time doing research on the folk art of our country\nI'm a Reporter for the Sun\n---------------------\nGiven the context information and not prior knowledge, answer the question:\nQuestion: Where were trees planted? [/INST] </s>\n"

In [29]:
# get natural language response for query based on most similar docs
response = llm_chain.invoke({"context":context_p, "question":question_p})

In [30]:
response["text"]

'Based on the provided context information, it is mentioned that lots of trees have been planted in the north of the capital. However, there is no specific location or address given for where these trees were planted.'

## Getting metadata for chunks

In [53]:
def get_langchain_docs(text_list: list[str], separators: list[str], cleaning_regex: str, is_seperator_regex: bool)->list[docs]:
  """Get chunks from chat data, adding a conversation id to the metadata.

  Chat data is a list of strings, where one string represents a chat. Each
  chat has a unique id.

  Reference for adding metadata:
  https://python.langchain.com/docs/integrations/vectorstores/chroma/#update-and-delete

  Args:
    text_list: list of strings to be chunked. Each item is the string of a full chat.
    separators: what to split the chunks on.
    cleaning_regex: regex for anything to clean from text.
    is_seperator_regex: bool of whether the separator is regex or a raw string.
  """
  # split text on custom separator
  # TODO: improve this - currently produces texts of just "?"
  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10, chunk_overlap=0, separators=separators, is_separator_regex=is_seperator_regex
  ) # splitter will combine chunks split on ```separator``` until it is at least ```chunk_size```

  langchain_docs = []
  for conversation_id, text in enumerate(text_list):
    split_text = text_splitter.split_text(text)

    langchain_docs+=[Document(
        page_content=re.sub(cleaning_regex, "", doc).strip(),
        # add id for conversation
        metadata={"conversation_id": conversation_id}
        )
    for doc in split_text
    ]

  return langchain_docs

In [56]:
docs_with_ids = get_langchain_docs(text_list=all_chat_data_list, separators=[". ", "#Person1#: ", "#Person1#: ", "?"], cleaning_regex="#Person[12]#:\s", is_seperator_regex=False)
docs_with_ids[0]

Document(page_content='Oh, we always look forward to going on a holiday', metadata={'conversation_id': 0})

## Quering database based on metadata

In [None]:
# create new db
sentence_metadata_db = create_vector_db_from_docs(docs=docs_with_ids, embedding_function=embedding_function)

In [67]:
def query_filtered_vector_db(db, docs: list[Document], query: str, conversation_id: int = "null", k: int = 5)->list[Document]:
  """Get the top-k most similar documents to the query with the specific conversation id.

  Reference for filtering on metadata: https://python.langchain.com/docs/integrations/vectorstores/chroma/#filtering-on-metadata

  Args:
    db: vectorstore with vectorised chunks in.
    docs: original list of langchain docs.
    query: question we are attempting to answer.
    conversation_id: id of conversation we want to filter on. Value is "null" if we are not filtering.
    k: maximum number of similar documents to return.
  """
  # define number of docs to return
  k = min(k, len(docs))

  # find top-k most similar docs
  similar_docs = db.similarity_search_with_score(query, k, filter={"conversation_id": conversation_id})

  return similar_docs

In [71]:
# search for top documents with that id
query = "What is Harry being asked?"
similar_docs_with_ids = query_filtered_vector_db(db=sentence_metadata_db, docs=docs_with_ids, conversation_id=2, query=query, k=2)
similar_docs_with_ids

[(Document(page_content='Harry, do you like the opera', metadata={'conversation_id': 2}),
  1.0387530326843262),
 (Document(page_content='Harry, do you like the opera', metadata={'conversation_id': 2}),
  1.0387530326843262)]

## Query SQL database
- need to implement text2sql work on the mocked VA data

In [2]:
from langchain.sql_database import SQLDatabase
from langchain_experimental.sql import SQLDatabaseChain
from langchain_community.llms import HuggingFaceEndpoint
import sqlite3
import os
# import requests
# from urllib.parse import urlparse
# from langchain.output_parsers import ResponseSchema, StructuredOutputParser
# from langchain_core.prompts import PromptTemplate

In [3]:
# set up hugging face access
HUGGINGFACEHUB_API_TOKEN = "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"
os.environ['HUGGINGFACEHUB_API_TOKEN'] = HUGGINGFACEHUB_API_TOKEN

In [9]:
# set up db with mocked data
from sqlalchemy import create_engine, MetaData

def get_schema(uri):
    #create engine
    engine = create_engine(uri)

    #get schema
    metadata = MetaData()
    metadata.reflect(bind=engine)

    result = ""
    #return table names and type
    for table in metadata.sorted_tables:
        result += f'CREATE TABLE {table} (\n'
        # print(f"Table: {table.name}")
        for column in table.columns:
            result += f'\t {column.name} - {column.type}\n'
            # print(f"\tColumn: {column.name} - Type: {column.type}")
        result += ')\n'
    return result

In [14]:
#### PICK UP FROM HERE
#### NEED TO ADAPT ABOVE FUNCTION FOR LOCAL DB OR MAKE URI FOR LOCAL DB AND USE AS IS

In [None]:
def get_text_chunks_with_metadata(text, separator, cleaning_regex, is_seperator_regex):

    text_splitter = CharacterTextSplitter(
        chunk_size=10, chunk_overlap=0, separator=separator, is_separator_regex=is_seperator_regex
    )
    split_text = text_splitter.split_text(text)

    docs_with_metadata = []
    conversation_id = 0
    for chunk in split_text:
        cleaned_chunk = re.sub(cleaning_regex, "", chunk)
        docs_with_metadata.append({
            'text': cleaned_chunk,
            'metadata': {'conversation_id': conversation_id}
        })

        if "#Person1#:" in chunk or "#Person2#:" in chunk:
            conversation_id += 1

    return docs_with_metadata


conversation_chunks_with_metadata = get_text_chunks_with_metadata(
    text=all_chat_data,
    separator="#Person1#: ",
    cleaning_regex="#Person[12]#: ",
    is_seperator_regex=False
)



In [None]:
conversation_chunks_with_metadata

[{'text': 'Oh, we always look forward to going on a holiday. We always make sure we can get away at least once a year.',
  'metadata': {'conversation_id': 0}},
 {'text': "It's too late. You see, in our market October is the season for the kind of commodity. So the goods must be shipped before October or we won't be ready for the season.",
  'metadata': {'conversation_id': 1}},
 {'text': 'Harry, do you like the opera?', 'metadata': {'conversation_id': 1}},
 {'text': 'It is not an easy job. The government is working on it. Lots of trees have been planted in the north of the capital. Also, methods of generating rain are being used by the government to fight against the drought. But of course, it will not work overnight. So, have you come out with a new plan for today?',
  'metadata': {'conversation_id': 1}},
 {'text': 'Did you go to school today? Yeh. I work part-time at a supermarket.',
  'metadata': {'conversation_id': 1}},
 {'text': "I really want to go to China for vacation, but I can