# LibRAG Proof of Concept

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pandas as pd

In [3]:
#!pip install sentence_transformers

### We are going to ensure that we have our data downloaded from the SCC.
### We are going to download one interval of the full text, as well as the entire metadata file

In [6]:
# replace with sample_full_text.json
file = open("../EDA Phase/bpl-digital-commonwealth/ft_13_checkpoint_10_133.json")

full_text = json.load(file)

In [15]:
print(len(full_text))

133


In [5]:
# file path for metadata file on SCC: /projectnb/sparkgrp/ml-bpl-rag-data/full_data/bpl_data.json
meta = open("../EDA Phase/bpl-digital-commonwealth/bpl_data.json")
bpl_metadata = json.load(meta)

In [6]:
print(full_text['commonwealth:w3764603d']['text'])

Charlotte N.C.
Feb 21st/57
Z.B. Oaks Esq
Charleston S.C.
Dr Sr
I take the
Liberty to Address you as Regards
your Negro Market & your
Opinion as to how it will continue
through the Spring & Summer
I have an Idea of Trying To puchase
in the Mountians of N.C. & Va
& Selling in your Market or in
Richmond Va. I expect to Trade
on the Small Scale for Some
Market & if I can Sell in
your Market for a fair profit
I Shall do my Buisiness with
you & C

I wish To no what Boys from 18 to
20 yrs old [deletion]and[/deletion] both no 1 & no 2 Boys
also Boys 12 ys old Say weigs 80 to 90
lbs & girls 12 ys old weighs Say 60 to
80 lb & from 14 To ys old To 20 ys old
Please write me by Return Mail
& give me the Market prices of
above Negros & [deletion][/deletion] are they [deletion][/deletion]
Brisk Sale or dull Address
me Charlotte N.C.
Yours Respectfully
T.H. Jones




### Embedding a paragraph using Word2Vec

In [6]:
# from sentence_transformers import SentenceTransformer

# # Load a pre-trained Sentence-BERT model
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# # Example paragraph
# paragraph = full_text['commonwealth:w3764603d']['text']
# paragraph_embedding = model.encode(paragraph)

# # Output: a vector representation of the paragraph
# print(paragraph_embedding)


### Setting up a Retriever

In [7]:
#!pip install langchain openai faiss-cpu

#### After ensuring we have the necessary dependencies, we are going to make our retriever

In [8]:
from typing import List

from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever


class ToyRetriever(BaseRetriever):
    """A toy retriever that contains the top k documents that contain the user query.

    This retriever only implements the sync method _get_relevant_documents.

    If the retriever were to involve file access or network access, it could benefit
    from a native async implementation of `_aget_relevant_documents`.

    As usual, with Runnables, there's a default async implementation that's provided
    that delegates to the sync implementation running on another thread.
    """

    documents: List[Document]
    """List of documents to retrieve from."""
    k: int
    """Number of top results to return"""
    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[str]:
        matching_documents = []
        for document in documents:
            if len(matching_documents) >= self.k:
                return matching_documents

            if query.lower() in document.page_content.lower():
                matching_documents.append(document.metadata['title'])
        return matching_documents


In [10]:
df = pd.DataFrame(bpl_metadata['Data'])

In [11]:
df.drop(columns=df.columns[0], axis=1, inplace=True)

In [12]:
df.head()

Unnamed: 0,type,attributes,links
0,DigitalObject,"{'id': 'commonwealth-oai:xp68md23x', 'system_c...",{'self': 'https://www.digitalcommonwealth.org/...
1,DigitalObject,"{'id': 'commonwealth-oai:xp68m844v', 'system_c...",{'self': 'https://www.digitalcommonwealth.org/...
2,DigitalObject,"{'id': 'commonwealth-oai:xp68mb49n', 'system_c...",{'self': 'https://www.digitalcommonwealth.org/...
3,DigitalObject,"{'id': 'commonwealth-oai:xp68mc60v', 'system_c...",{'self': 'https://www.digitalcommonwealth.org/...
4,DigitalObject,"{'id': 'commonwealth-oai:xp68mc72n', 'system_c...",{'self': 'https://www.digitalcommonwealth.org/...


In [14]:
df_attributes = pd.json_normalize(df['attributes'])
df_attributes = pd.concat([df.drop(columns=['attributes']), df_attributes], axis=1)
df_attributes.head(10)

Unnamed: 0,type,links,id,system_create_dtsi,system_modified_dtsi,curator_model_ssi,curator_model_suffix_ssi,title_info_primary_tsi,genre_basic_ssim,genre_specific_ssim,...,storage_key_base_ss,identifier_issn_ssim,frequency_tsi,contained_by_ssi,note_credits_tsim,identifier_isbn_ssim,identifier_music_publisher_ssim,note_arrangement_tsim,transcription_ark_id_ssi,transcription_key_base_ss
0,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth-oai:xp68md23x,2021-03-04T00:13:09Z,2021-09-02T20:40:00Z,Curator::DigitalObject,DigitalObject,من فضلكم توقفوا الأشخاص الذين ارتكبوا أسوأ الج...,[Posters],[Political posters],...,,,,,,,,,,
1,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth-oai:xp68m844v,2021-03-03T23:58:44Z,2021-09-02T20:21:32Z,Curator::DigitalObject,DigitalObject,海员们 : 要警惕航运事故,[Posters],[Political posters],...,,,,,,,,,,
2,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth-oai:xp68mb49n,2021-03-04T00:06:25Z,2021-09-02T20:30:29Z,Curator::DigitalObject,DigitalObject,人間としての尊厳を保てる : 生活賃金を,[Posters],[Political posters],...,,,,,,,,,,
3,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth-oai:xp68mc60v,2021-03-04T00:10:40Z,2021-09-02T20:35:20Z,Curator::DigitalObject,DigitalObject,野火,[Posters],[Political posters],...,,,,,,,,,,
4,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth-oai:xp68mc72n,2021-03-04T00:11:07Z,2021-09-02T20:35:52Z,Curator::DigitalObject,DigitalObject,野火,[Posters],[Political posters],...,,,,,,,,,,
5,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth-oai:xp68mc992,2021-03-04T00:12:14Z,2021-09-02T20:36:59Z,Curator::DigitalObject,DigitalObject,團結 抗強權,[Posters],[Political posters],...,,,,,,,,,,
6,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth-oai:xp68m804w,2021-03-03T23:57:00Z,2021-09-02T20:19:35Z,Curator::DigitalObject,DigitalObject,大队的夜晩 (年画) 史惠芳作,[Posters],[Political posters],...,,,,,,,,,,
7,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth-oai:xp68m8365,2021-03-03T23:58:27Z,2021-09-02T20:21:12Z,Curator::DigitalObject,DigitalObject,морякам,[Posters],[Political posters],...,,,,,,,,,,
8,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth:8k71nz966,2015-09-14T22:06:01Z,2022-07-08T19:59:21Z,Curator::DigitalObject,DigitalObject,A,[Prints],,...,,,,,,,,,,
9,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth:8k71p000r,2015-09-14T22:06:33Z,2022-07-08T19:59:21Z,Curator::DigitalObject,DigitalObject,E,[Prints],,...,,,,,,,,,,


In [15]:
df_attributes.to_csv("metadata_attributes.csv")

In [13]:
df_attributes.loc[df_attributes[]]

Unnamed: 0,type,links,id,system_create_dtsi,system_modified_dtsi,curator_model_ssi,curator_model_suffix_ssi,title_info_primary_tsi,genre_basic_ssim,genre_specific_ssim,...,storage_key_base_ss,identifier_issn_ssim,frequency_tsi,contained_by_ssi,note_credits_tsim,identifier_isbn_ssim,identifier_music_publisher_ssim,note_arrangement_tsim,transcription_ark_id_ssi,transcription_key_base_ss
0,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth-oai:xp68md23x,2021-03-04T00:13:09Z,2021-09-02T20:40:00Z,Curator::DigitalObject,DigitalObject,من فضلكم توقفوا الأشخاص الذين ارتكبوا أسوأ الج...,[Posters],[Political posters],...,,,,,,,,,,
1,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth-oai:xp68m844v,2021-03-03T23:58:44Z,2021-09-02T20:21:32Z,Curator::DigitalObject,DigitalObject,海员们 : 要警惕航运事故,[Posters],[Political posters],...,,,,,,,,,,
2,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth-oai:xp68mb49n,2021-03-04T00:06:25Z,2021-09-02T20:30:29Z,Curator::DigitalObject,DigitalObject,人間としての尊厳を保てる : 生活賃金を,[Posters],[Political posters],...,,,,,,,,,,
3,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth-oai:xp68mc60v,2021-03-04T00:10:40Z,2021-09-02T20:35:20Z,Curator::DigitalObject,DigitalObject,野火,[Posters],[Political posters],...,,,,,,,,,,
4,DigitalObject,{'self': 'https://www.digitalcommonwealth.org/...,commonwealth-oai:xp68mc72n,2021-03-04T00:11:07Z,2021-09-02T20:35:52Z,Curator::DigitalObject,DigitalObject,野火,[Posters],[Political posters],...,,,,,,,,,,


In [3]:
df_attributes = pd.read_csv("metadata_attributes.csv")

  df_attributes = pd.read_csv("metadata_attributes.csv")


In [16]:
df.count()

type          1303800
attributes    1303800
links         1303800
dtype: int64

In [46]:
str(df_attributes.loc[df_attributes["id"] == "commonwealth-oai:xp68m844v"]["title_info_primary_tsi"]).split(" ")[4:]

['海员们', ':', '要警惕航运事故\nName:', 'title_info_primary_tsi,', 'dtype:', 'object']

### Turn full text into Documents type

In [18]:
print(str(df_attributes.loc[df_attributes["id"] == "commonwealth-oai:xp68m844v", "abstract_tsi"]))

1    Poster produced by the International Transport...
Name: abstract_tsi, dtype: object


In [19]:
#df_attributes.columns.tolist()

In [65]:
df_attributes.loc[df_attributes["genre_basic_ssim"] == "[Posters]"]

Unnamed: 0,type,links,id,system_create_dtsi,system_modified_dtsi,curator_model_ssi,curator_model_suffix_ssi,title_info_primary_tsi,genre_basic_ssim,genre_specific_ssim,...,storage_key_base_ss,identifier_issn_ssim,frequency_tsi,contained_by_ssi,note_credits_tsim,identifier_isbn_ssim,identifier_music_publisher_ssim,note_arrangement_tsim,transcription_ark_id_ssi,transcription_key_base_ss


In [7]:
import re
def get_title(text):
    match = re.search(r'\d+\s+(.+?)\n', text)

    # Extracting and printing the title if there's a match
    if match:
        title = match.group(1)
    return title

In [8]:
# Turn the BPL data into a Document
from langchain.schema import Document
documents = []
for doc in full_text:
    title = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_tsi"]))
    ID = get_title(str(df_attributes.loc[df_attributes["id"] == doc, "id"]))
    abstract = str(df_attributes.loc[df_attributes["id"] == doc, "abstract_tsi"])
    title_subtitle = str(df_attributes.loc[df_attributes["id"] == doc, "title_info_primary_subtitle_tsi"])
    documents += [Document(page_content=full_text[doc]['text'], metadata={"title": title, "abstract": abstract, "subtitle": title_subtitle, "ID":ID})]
    #documents += [Document(page_content=full_text[doc]['text'])]


In [9]:
documents[0].metadata

{'title': 'T.G. Hudson, Oglethorpe, Ga., autograph letter...',
 'abstract': '1161475    Asks market price at Charleston.\nName: abstract_tsi, dtype: object',
 'subtitle': '1161475    NaN\nName: title_info_primary_subtitle_tsi, dtype: object',
 'ID': 'commonwealth:9k41zk460'}

In [144]:
# retriever = ToyRetriever(documents=documents, k=1)
# retriever.invoke("Richmond")

In [21]:
# import openai

In [22]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

In [23]:
#!pip install langchain_community

In [25]:
print(documents[1].metadata)

{'title': '1161521    T.H. Jones, Charlotte, N.C., autograph letter ...\nName: title_info_primary_tsi, dtype: object', 'abstract': '1161521    Expects to make purchases in North Carolina an...\nName: abstract_tsi, dtype: object'}


### Using Chroma Vector Store

In [66]:
# import os
# os.chmod('mydatabase.db', 0o666)

In [67]:
#!pip install chromadb==0.5.0
#!pip install --upgrade openai langchain
# !pip install --upgrade langchain langchain_community langchain_openai openai python-dotenv chromadb
# !pip install --upgrade transformers
#!pip install --upgrade transformers torchvision

# !pip install openai==1.37.1
# !pip install langchain==0.2.11
# !pip install langchain-openai==0.1.19
# !pip install langchain-community==0.2.10
# !pip install langchain-experimental==0.0.63
# !pip install transformers

Now we can embed our data into a Chroma vector store:

In [12]:
# from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv
import openai
import os
import shutil
import time
from langchain.embeddings import HuggingFaceEmbeddings

# Load environment variables. Assumes that project contains .env file with API keys
load_dotenv()

import tempfile
CHROMA_PATH = tempfile.mkdtemp()  # Use a temporary directory

def main(documents):
    generate_data_store(documents)


def generate_data_store(documents):
    chunks = split_text(documents)
    save_to_chroma(chunks)


def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    document = chunks[10]
    print(document.page_content)
    print(document.metadata)

    return chunks

def save_to_chroma(chunks):
    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
        print(f"Removed existing database at {CHROMA_PATH}.")

    # Create a new DB from the documents.
    os.makedirs(CHROMA_PATH, exist_ok=True)  # Ensure the directory exists

    #embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=3072)
    try:
        db = Chroma.from_documents(
            chunks, embeddings, persist_directory=CHROMA_PATH
        )
        db.persist()
        print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
    except Exception as e:
        print(f"An error occurred: {e}")



if __name__ == "__main__":
    main(documents)

Split 133 documents into 13931 chunks.
Then again, perhaps as we become accustomed
to the inflated value of the dollar, in time
that price may not seem as exhorbitant as it
does to me now.

When I was a child, soup was never served as
an appetizer, but always as a meal. It didn't
come from a can, either. It was made from
bones left over from Sunday's roast and had
some strength to it. When cooled, it jelled,
and suspended in it were bits and pieces of
meat and vegetables.

With soup, corn or clam chowder, or oyster
stew came crackers. Not those skimpy skinny
saltines, but thick rich common or pilot cra-
ckers. Three or four of those, along with your
soup, and you had a meal.

Father liked pie, and Mother baked them fre-
quently. All kinds: mince, apple, pumpkin,
squash, and lemon meringue. When I think of
the calories we consumed, I'm surprised we
weren't chubby. Instead, we were all thin as
rails.
{'title': 'Thanksgiving', 'abstract': '1161754    NaN\nName: abstract_tsi, dtype: object

  db.persist()


### Making the Query

We'll download langserve to make a sample UI for our app:

In [None]:
#!pip install "langserve[all]"

In [14]:
import argparse
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModel
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser

# For LangServe
from fastapi import FastAPI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langserve import add_routes
import nest_asyncio
import uvicorn


# copy from above
CHROMA_PATH = "/var/folders/xq/fj3st__56r54gz9tdvb7d2k40000gn/T/tmpcp1qkd0k"

PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""

# Initialize LangSmith App
# app = App()

# @langsmith_route("/answer-question")
def main(query: str):
    # Create CLI with a default value for Jupyter testing
    parser = argparse.ArgumentParser()
    parser.add_argument("query_text", type=str, help="The query text.")
    args = parser.parse_args(args=[query])  # Add a default value here for testing
    query_text = args.query_text

    # Prepare the database
    embedding_function = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=3072)
    #embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    results = db.similarity_search_with_relevance_scores(query_text, k=3)
    for i in range(len(results)):
        if len(results) == 0 or results[0][1] < 0.1:
            print(f"Unable to find matching results for \"{query_text}\"")
            print(results[0][1])
            return

    #print(results)
    
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(prompt)

    model = ChatOpenAI()
    response_text = model.predict(prompt)

    sources = [doc.metadata.get("title") + ": " + str(doc.metadata.get("ID")) for doc, _score in results]
    formatted_response = f"Response: {response_text}\n\nSources: {sources}"
    # response with context, sources, and answer to my query
    print(formatted_response)

if __name__ == "__main__":
    query1 = "Who did Z.B Oakes receive a letter from?"
    query2 = "What did Henry M. Sikes say about India Goods?"
    query3 = "What are some of the most controversial topics in this database?"
    query4 = "What happened in World War II?"
    query5 = "Who critiqued India Goods?"
    query6 = "Tell me about Barnstable Public Schools"
    #query7 = "What did Thos. L Gelzia talk about in their letter to Mr Z. B. Oakes, but not in the Tocsin of Liberty?"
    queries = [query1, query2, query3, query4, query5, query6]
    # print("-------------------New Query-------------------")
    # for query in queries:
    #     main(query)
    #     print("-------------------New Query-------------------")
    main(query1)
        


Human: 
Answer the question based only on the following context:

Columbia Jany 8th / 55
Mr Z B. Oakes
Dear Sir
I read
your letter, desiring me to
pay the amount of the [unclear]
over to you, but I cannot do
so until Col. Bauskett gives
me notice to do so. I have
seen him since I read your
letter, I regret very much that
I cannot comply with your
request. The arrangement
which Mr Mazyck made with
the Bank, was to meet it
when we received notice to
do so,
Yours respectfully
Thos. Taylor

---

and the Ballance with all other obligatory and Kind
favors I will have to make straight when I see you
you will no doubt feel hurt at this step of
mine but when you consider all things, and that
upon this very step our mutual safety & welfare
depended, why like General Jackson at orleans
I take the Responsibility, and time I hope
will alike prove I was Right
with much Esteem & Regard
I Remain faithfully yours
Theo. C Tharin

Mount Holly } {Three Mount Holly PCV
June 22 { Theo C Tharin PCV
Z.B. Oake

# Notes from Gardos

These are the list of fields, if you need any clarification about these fields ask about them.

Vectorize all of the fields

Give this to the LLM as a preface prompt.

Maybe two vector stores?