# LibRAG Proof of Concept

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pandas as pd

In [5]:
#!pip install sentence_transformers

### We are going to ensure that we have our data downloaded from the SCC.
### We are going to download one interval of the full text, as well as the entire metadata file

In [6]:
# replace with sample_full_text.json
# Open via a context manager so the handle is closed as soon as the JSON has
# been parsed; JSON is read explicitly as UTF-8 rather than the platform default.
with open("", encoding="utf-8") as file:
    full_text = json.load(file)

FileNotFoundError: [Errno 2] No such file or directory: '../EDA Phase/bpl-digital-commonwealth/ft_13_checkpoint_10_133.json'

In [None]:
# Sanity check: number of top-level entries in the loaded full-text JSON.
print(len(full_text))

In [None]:
# file path for metadata file on SCC: /projectnb/sparkgrp/ml-bpl-rag-data/full_data/bpl_data.json
# Open via a context manager so the handle is closed as soon as the JSON has
# been parsed; JSON is read explicitly as UTF-8 rather than the platform default.
with open("", encoding="utf-8") as meta:
    bpl_metadata = json.load(meta)

In [None]:
# Show the 'text' field of one entry, looked up by its item key, to inspect the raw content.
print(full_text['commonwealth:w3764603d']['text'])

### Embedding a paragraph using Sentence-BERT

In [None]:
# from sentence_transformers import SentenceTransformer

# # Load a pre-trained Sentence-BERT model
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# # Example paragraph
# paragraph = full_text['commonwealth:w3764603d']['text']
# paragraph_embedding = model.encode(paragraph)

# # Output: a vector representation of the paragraph
# print(paragraph_embedding)


### Setting up a Retriever

In [None]:
#!pip install langchain openai faiss-cpu

#### After ensuring we have the necessary dependencies, we are going to make our retriever

In [None]:
from typing import List

from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever


class ToyRetriever(BaseRetriever):
    """A toy retriever that returns the titles of the first k documents containing the query.

    Matching is a case-insensitive substring test against each document's
    ``page_content``. Documents are scanned in list order and the scan stops
    once ``k`` matches have been collected, so "first k" means list order,
    not relevance ranking.

    This retriever only implements the sync method _get_relevant_documents.

    If the retriever were to involve file access or network access, it could benefit
    from a native async implementation of `_aget_relevant_documents`.

    As usual, with Runnables, there's a default async implementation that's provided
    that delegates to the sync implementation running on another thread.
    """

    documents: List[Document]
    """List of documents to retrieve from."""
    k: int
    """Maximum number of results to return"""

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[str]:
        """Return the titles of up to ``k`` documents whose text contains ``query``.

        Args:
            query: Text to look for (case-insensitive substring match).
            run_manager: Callback manager supplied by the caller; not used here.

        Returns:
            The ``metadata['title']`` value of each matching document, in the
            order the documents appear in ``self.documents``.

        Raises:
            KeyError: If a matching document has no ``'title'`` metadata entry.
        """
        # Lower-case the query once instead of on every comparison.
        needle = query.lower()
        matching_titles: List[str] = []
        # Search this retriever's own documents, not a module-level global.
        for document in self.documents:
            if len(matching_titles) >= self.k:
                break
            if needle in document.page_content.lower():
                matching_titles.append(document.metadata['title'])
        return matching_titles


In [None]:
# Build a DataFrame from the 'Data' entry of the loaded metadata JSON.
df = pd.DataFrame(bpl_metadata['Data'])

In [None]:
# Remove the first column in place. Re-running this cell drops another column.
df.drop(columns=[df.columns[0]], inplace=True)

In [None]:
# Preview the first rows of the metadata frame.
df.head()

In [None]:
# Expand the nested 'attributes' records into flat columns and place them next
# to the remaining top-level columns (the original 'attributes' column is dropped).
df_attributes = pd.concat(
    [df.drop(columns=['attributes']), pd.json_normalize(df['attributes'])],
    axis=1,
)
df_attributes.head(10)

In [None]:
# index=False keeps the row index out of the file, so reading it back with
# pd.read_csv does not add a spurious "Unnamed: 0" column.
df_attributes.to_csv("metadata_attributes.csv", index=False)

In [None]:
# TODO: incomplete filter, disabled because `df_attributes.loc[df_attributes[]]`
# is a syntax error. Supply a condition before re-enabling, for example:
# df_attributes.loc[df_attributes["id"] == "commonwealth:w3764603d"]

In [None]:
# Reload the flattened metadata from the CSV written earlier, so the JSON
# flattening does not have to be repeated.
df_attributes = pd.read_csv("metadata_attributes.csv")

In [None]:
# Per-column count of non-missing values in the metadata frame.
df.count()

### Turn full text into Documents type

In [None]:
import re
def get_title(text, default=""):
    """Extract the first value from the printed form of a pandas Series.

    The pattern looks for a run of digits (the row index), whitespace, and then
    captures everything up to the first newline, e.g.
    ``"0    Some title\\nName: ..."`` yields ``"Some title"``.

    Args:
        text: String to search, typically ``str(series)``.
        default: Value returned when the pattern does not match (for example
            when the Series is empty and its string form has no index line).

    Returns:
        The captured text, or ``default`` when there is no match.
    """
    match = re.search(r'\d+\s+(.+?)\n', text)
    # Without a fallback, a non-matching input left the result unbound and
    # raised UnboundLocalError.
    return match.group(1) if match else default

In [None]:
# Turn the BPL data into a Document
from langchain.schema import Document


def _metadata_value(rows, column):
    """Return the first non-null value of `column` in `rows` as a string, or "" if none."""
    values = rows[column].dropna()
    return str(values.iloc[0]) if not values.empty else ""


documents = []
for doc in full_text:
    # Select this item's metadata row(s) once instead of re-filtering per field.
    rows = df_attributes.loc[df_attributes["id"] == doc]
    # Read the cell values directly: str(Series) would embed the row index and
    # the "Name: ..., dtype: ..." footer, and can truncate long values.
    documents.append(
        Document(
            page_content=full_text[doc]['text'],
            metadata={
                "title": _metadata_value(rows, "title_info_primary_tsi"),
                "abstract": _metadata_value(rows, "abstract_tsi"),
                "subtitle": _metadata_value(rows, "title_info_primary_subtitle_tsi"),
                # Rows are selected by id == doc, so the key itself is the ID.
                "ID": doc,
            },
        )
    )


In [None]:
# retriever = ToyRetriever(documents=documents, k=1)
# retriever.invoke("Richmond")

In [None]:
# import openai
# NOTE: IPython runs each `!` command in its own subshell, so sourcing the
# virtualenv here does not change the environment of the running kernel.
!source poc_venv/bin/activate

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

In [None]:
#!pip install langchain_community

### Using Chroma Vector Store

In [None]:
# import os
# os.chmod('mydatabase.db', 0o666)

In [None]:
#!pip install chromadb==0.5.0
#!pip install --upgrade openai langchain
# !pip install --upgrade langchain langchain_community langchain_openai openai python-dotenv chromadb
# !pip install --upgrade transformers
#!pip install --upgrade transformers torchvision

# !pip install openai==1.37.1
# !pip install langchain==0.2.11
# !pip install langchain-openai==0.1.19
# !pip install langchain-community==0.2.10
# !pip install langchain-experimental==0.0.63
# !pip install transformers

Now we can embed our data into a Chroma vector store:

In [None]:
# from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv
import openai
import os
import shutil
import time
from langchain.embeddings import HuggingFaceEmbeddings

# Load environment variables. Assumes that project contains .env file with API keys
load_dotenv()

import tempfile
# Create a fresh, uniquely named temporary directory and use it as the Chroma
# persistence location. The path differs on every run, so any later cell that
# hard-codes it must be updated to match.
CHROMA_PATH = tempfile.mkdtemp()  # Use a temporary directory

def main(documents):
    """Entry point for indexing: build the Chroma data store from `documents`."""
    generate_data_store(documents)


def generate_data_store(documents):
    """Split `documents` into chunks and write those chunks to the Chroma store."""
    save_to_chroma(split_text(documents))


def split_text(documents: list[Document]):
    """Split documents into overlapping character chunks.

    Uses a RecursiveCharacterTextSplitter with 1000-character chunks, a
    100-character overlap, and ``add_start_index=True``. Prints a summary line
    and one sample chunk as a sanity check.

    Args:
        documents: Documents to split.

    Returns:
        The chunks produced by the splitter (empty when nothing was produced).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Preview one chunk. Index 10 is used when available; smaller inputs fall
    # back to the last chunk so a short corpus does not raise IndexError.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)

    return chunks

def save_to_chroma(chunks):
    """Rebuild the Chroma store at CHROMA_PATH from `chunks`.

    Any existing directory at CHROMA_PATH is deleted and recreated first.
    Chunks are embedded with OpenAI's ``text-embedding-3-large`` model. Errors
    raised while building or persisting the store are printed, not re-raised.
    """
    # Start from an empty directory so stale data never mixes with the new index.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
        print(f"Removed existing database at {CHROMA_PATH}.")
    os.makedirs(CHROMA_PATH, exist_ok=True)

    # Alternative local embedder that was tried:
    # HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    embedder = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=3072)
    try:
        vector_store = Chroma.from_documents(
            chunks, embedder, persist_directory=CHROMA_PATH
        )
        vector_store.persist()
    except Exception as err:
        print(f"An error occurred: {err}")
    else:
        print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")



# Build the vector store from the previously assembled `documents` list when
# this code is executed as the main module.
if __name__ == "__main__":
    main(documents)

### Making the Query

We'll install langserve to build a sample UI for our app:

In [None]:
#!pip install "langserve[all]"

In [None]:
import argparse
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModel
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser

# For LangServe
from fastapi import FastAPI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langserve import add_routes
import nest_asyncio
import uvicorn


# copy from above
# NOTE(review): this is a machine-specific temporary directory pasted from an
# earlier run; it must match the directory the vector store was written to.
CHROMA_PATH = "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/tmpcp1qkd0k"

# Prompt sent to the chat model; the {context} and {question} placeholders are
# filled in by main() with the retrieved chunks and the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

# Initialize LangSmith App
# app = App()

# @langsmith_route("/answer-question")
def main(query: str):
    """Answer a question using context retrieved from the Chroma vector store.

    Runs a similarity search (3 results) against the store at CHROMA_PATH,
    fills PROMPT_TEMPLATE with the retrieved text and the question, sends the
    prompt to the chat model, and prints the answer together with its sources.

    Args:
        query: The question to answer.

    Returns:
        The formatted response string that was printed, or None when the
        search returns nothing or the first result scores below 0.1.
    """
    # Use the query directly: routing it through argparse rejects questions
    # that begin with "-" because they are parsed as command-line options.
    query_text = query

    # Prepare the database
    embedding_function = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=3072)
    #embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    results = db.similarity_search_with_relevance_scores(query_text, k=3)
    # Bail out when nothing was found or the first hit is too weak. Checking
    # this inside a loop over `results` would skip the empty case entirely.
    # NOTE(review): assumes results[0] is the best-scoring match — confirm.
    if not results or results[0][1] < 0.1:
        print(f"Unable to find matching results for \"{query_text}\"")
        if results:
            print(results[0][1])
        return None

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(prompt)

    model = ChatOpenAI()
    # `invoke` replaces the deprecated `predict`; the reply text is in `.content`.
    response_text = model.invoke(prompt).content

    # Build "title: ID" labels without assuming the title metadata is present
    # (string concatenation would fail on a missing title).
    sources = [
        f"{doc.metadata.get('title') or 'Unknown title'}: {doc.metadata.get('ID')}"
        for doc, _score in results
    ]
    formatted_response = f"Response: {response_text}\n\nSources: {sources}"
    # response with context, sources, and answer to my query
    print(formatted_response)
    return formatted_response

if __name__ == "__main__":
    # Sample questions for exercising the retrieval + answer pipeline.
    queries = [
        "Who did Z.B Oakes receive a letter from?",
        "What did Henry M. Sikes say about India Goods?",
        "What are some of the most controversial topics in this database?",
        "What happened in World War II?",
        "Who critiqued India Goods?",
        "Tell me about Barnstable Public Schools",
    ]
    query1, query2, query3, query4, query5, query6 = queries
    #query7 = "What did Thos. L Gelzia talk about in their letter to Mr Z. B. Oakes, but not in the Tocsin of Liberty?"

    # To run every sample question instead of only the first:
    # print("-------------------New Query-------------------")
    # for query in queries:
    #     main(query)
    #     print("-------------------New Query-------------------")
    main(query1)
        


# Notes from Gardos

This is the list of fields. If you need clarification about any of these fields, ask about them.

Vectorize all of the fields

Give this to the LLM as a preface prompt.

Maybe two vector stores?