In [1]:
import openai
import langchain
import pinecone
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
import warnings
from langchain_community.vectorstores import SupabaseVectorStore
from supabase import create_client, Client
# For Suppress specific warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import os

In [4]:
## Reading the Document
def read_doc(directory):
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents


In [5]:
doc=read_doc('documents/')
doc

[Document(metadata={'source': 'documents\\GDP.pdf', 'page': 0}, page_content=' \n \n \n \n \nA u g u s t  2 0 1 8  \n \n________________________________________________________________________  \n  \n \n \n Hutchins Center Working Paper #43  \nTHIS PAPER IS ONLINE  AT \nhttps://www.brookings.edu/research/gdp -as-a-\nmeasure -of-economic -well-being  \n  \n \n \n \n \n GDP as a Measure of Economic  \nWell -being  \n \nKaren Dynan  \nHarvard University  \nPeterson Institute for International Economics  \n \nLouise Sheiner  \nHutchins Center on Fiscal and Monetary Policy, The Brookings Institution  \nThe authors thank Katharine Abraham, Ana Aizcorbe, Martin Baily, Barry Bosworth, David Byrne, Richard Cooper, Carol \nCorrado, Diane Coyle, Abe Dunn, Marty Feldstein, Martin Fleming, Ted Gayer, Greg Ip, Billy Jack, Ben Jones, Chad Jones, Dale \nJorgenson, Greg  Mankiw, Dylan Rassier, Marshall Reinsdorf, Matthew Shapiro, Dan Sichel, Jim Stock, Hal Varian, David \nWessel, Cliff Winston, and par

In [6]:
len(doc)

53

In [7]:
## Divide into Chunks
def chunk_data(docs, chunk_size=1500, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc = text_splitter.split_documents(docs)
    return doc


In [8]:
documents = chunk_data(docs=doc)
documents

[Document(metadata={'source': 'documents\\GDP.pdf', 'page': 0}, page_content='A u g u s t  2 0 1 8  \n \n________________________________________________________________________  \n  \n \n \n Hutchins Center Working Paper #43  \nTHIS PAPER IS ONLINE  AT \nhttps://www.brookings.edu/research/gdp -as-a-\nmeasure -of-economic -well-being  \n  \n \n \n \n \n GDP as a Measure of Economic  \nWell -being  \n \nKaren Dynan  \nHarvard University  \nPeterson Institute for International Economics  \n \nLouise Sheiner  \nHutchins Center on Fiscal and Monetary Policy, The Brookings Institution  \nThe authors thank Katharine Abraham, Ana Aizcorbe, Martin Baily, Barry Bosworth, David Byrne, Richard Cooper, Carol \nCorrado, Diane Coyle, Abe Dunn, Marty Feldstein, Martin Fleming, Ted Gayer, Greg Ip, Billy Jack, Ben Jones, Chad Jones, Dale \nJorgenson, Greg  Mankiw, Dylan Rassier, Marshall Reinsdorf, Matthew Shapiro, Dan Sichel, Jim Stock, Hal Varian, David \nWessel, Cliff Winston, and participants at th

In [9]:
len(documents)
# documents[1]

140

In [10]:
documents[0]

Document(metadata={'source': 'documents\\GDP.pdf', 'page': 0}, page_content='A u g u s t  2 0 1 8  \n \n________________________________________________________________________  \n  \n \n \n Hutchins Center Working Paper #43  \nTHIS PAPER IS ONLINE  AT \nhttps://www.brookings.edu/research/gdp -as-a-\nmeasure -of-economic -well-being  \n  \n \n \n \n \n GDP as a Measure of Economic  \nWell -being  \n \nKaren Dynan  \nHarvard University  \nPeterson Institute for International Economics  \n \nLouise Sheiner  \nHutchins Center on Fiscal and Monetary Policy, The Brookings Institution  \nThe authors thank Katharine Abraham, Ana Aizcorbe, Martin Baily, Barry Bosworth, David Byrne, Richard Cooper, Carol \nCorrado, Diane Coyle, Abe Dunn, Marty Feldstein, Martin Fleming, Ted Gayer, Greg Ip, Billy Jack, Ben Jones, Chad Jones, Dale \nJorgenson, Greg  Mankiw, Dylan Rassier, Marshall Reinsdorf, Matthew Shapiro, Dan Sichel, Jim Stock, Hal Varian, David \nWessel, Cliff Winston, and participants at the

In [11]:
## Embedding Technique of OpenAI
embeddings = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001944FC71790>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001944FBFD910>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='sk-proj-mRrh6d4LeOex9lB2VrMBT3BlbkFJH7gvc0s2YsBWQ07ZWH65', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)

In [12]:
vectors = embeddings.embed_query("How are you?")
vectors

[-0.016745297276263428,
 -0.012179519077485991,
 0.006614207305135477,
 -0.02596323198016492,
 -0.016177659891036346,
 0.01754739325753732,
 -0.011130624476023606,
 -0.009927480504112768,
 -0.01815205071518945,
 -0.010421078675812332,
 0.027838902611184563,
 0.0016736041326834588,
 -0.007379283213984322,
 -0.011642731752613118,
 0.00728056395217344,
 -0.015338543837337407,
 0.028357179899844917,
 -0.011883361292053345,
 0.013981150494978117,
 -0.02058301870729449,
 0.002477242616975061,
 0.006355068660805301,
 0.0010265286569515525,
 -0.008224569279754103,
 -0.01580746102943103,
 -0.007805011252904633,
 0.025087095854040935,
 -0.01236461850828865,
 0.02231060904861394,
 -0.02514879597474935,
 0.005633181905129768,
 0.007718631549574145,
 -0.01311735532431839,
 0.004087604591593686,
 0.008823055794012812,
 -0.02228592900033058,
 0.00401047990636946,
 -0.010451927804843962,
 0.020360898272744208,
 -0.0063026235582031515,
 0.02702446660576899,
 0.001300321079949976,
 -0.005247558944669925

In [13]:
len(vectors)

1536

In [47]:
# Supabase API URL
url = "YOUR_SUPABASE_URL"  
# Supabase API key
key = "YOUR_SUPABASE_API_KEY"  
supabase: Client = create_client(url, key)

# pinecone.init(
#     api_key="b47e9efc-5012-47df-baf7-d7b748ffd012",
#     environment="gcp-starter"
# )

# index_name="langchainvector"

# index=Pinecone.from_documents(doc,embeddings,index_name=index_name)

# def retrieve_query(query, k=2):
#     matching_results = index.similarity_Search(query, k=k)
#     return matching_results

# pc = Pinecone(api_key="b47e9efc-5012-47df-baf7-d7b748ffd012")
# index = pc.Index("langchainvector")



In [47]:
# from pinecone.grpc import PineconeGRPC as Pinecone
# from pinecone import ServerlessSpec
# pc = Pinecone(api_key="b47e9efc-5012-47df-baf7-d7b748ffd012")
# index_name = pc.Index("langchainvector")

In [82]:

# # Insert documents and embeddings into Supabase
# def insert_data(documents, embeddings):
#     for doc in documents:
#         embedding = embeddings.embed_documents([doc.page_content])[0]
#         data = {
#             "document": doc.page_content,
#             "embedding": embedding
#         }
#         response = supabase.table('langchainvector').insert(data).execute()
#         print(response)

# insert_data(documents, embeddings)



In [81]:
# # Fetch and print documents and embeddings from Supabase
# def fetch_data():
#     response = supabase.table('langchainvector').select("*").execute()
#     return response.data

# data = fetch_data()
# print(data)

In [62]:
index=SupabaseVectorStore.from_documents(documents,embeddings, client=supabase ,table_name="documents")

In [72]:
## Cosine Similarity to retrieve results from VectorDB

def retrieve_query(query, k=2):
    matching_results = index.similarity_search(query,k=k)
    return matching_results

In [74]:
from langchain.chains.question_answering import load_qa_chain
from langchain import OpenAI

In [75]:
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0.5)
chain = load_qa_chain(llm,chain_type="stuff")

In [76]:
# ## Search answers from vectorDB
def retrieve_answer(query):
    doc_search = retrieve_query(query)
    print(doc_search)
    response=chain.run(input_documents=doc_search, question=query)
    return response 


In [80]:
our_query = "The differences between GDP and welfare"
answer = retrieve_answer(our_query)
print(answer)

[Document(metadata={'page': 4, 'source': 'documents\\GDP.pdf'}, page_content='example, in a 1934 report to Congress, Kuznets stated that “the welfare of a nation … can scarcely be \ninferred from a measure of national income” (Bureau of Fo reign and Domestic Commerce and Kuznets, \n1934).  \nSome of the differences between GDP and welfare are outside the scope of this paper. For example, \nGDP does not include important societal features such as discrimination and crime. In addition, as an \neconomy -wide concept, GDP does not provide information about the distribution of income, which bears \nimportantly on the welfare of individuals within an economy.5 Nor does GDP capture features of the \nenvironment such as climate change and the availability of natur al resources.  \nMuch of the discussion of GDP and welfare in this paper will focus on a narrower distinction —the \ndifference between GDP and what we call aggregate  economic  well -being , defined as the consumer \nwelfare derived