In [16]:
# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio

In [17]:
from PyPDF2 import PdfReader

In [18]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment so the
# credentials never have to be typed into the notebook itself.
load_dotenv()

# Astra DB application token: the "AstraCS:..." string found in your Token JSON file.
ASTRA_DB_APPLICATION_TOKEN = os.getenv('ASTRA_DB_APPLICATION_TOKEN')
# Astra DB database ID.
ASTRA_DB_ID = os.getenv('ASTRA_DB_ID')
# OpenAI API key.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# os.getenv returns None for anything that is not set. Stop here with a clear
# message instead of passing None on to the database / OpenAI clients and
# failing later with a less obvious error.
_missing_settings = [
    name
    for name, value in (
        ('ASTRA_DB_APPLICATION_TOKEN', ASTRA_DB_APPLICATION_TOKEN),
        ('ASTRA_DB_ID', ASTRA_DB_ID),
        ('OPENAI_API_KEY', OPENAI_API_KEY),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in the environment or in a .env file."
    )





In [19]:
# Open the PDF to read. Change `pdf_path` to use a different document.
pdf_path = 'UK_Budget.pdf'
pdfreader = PdfReader(pdf_path)

In [20]:
from typing_extensions import Concatenate
# Read the text of every page and concatenate it into one string.
# Pages for which extract_text() yields nothing (None or '') are skipped.
# The pieces are collected in a list and joined once, rather than growing a
# string with += on every iteration.
# NOTE(review): pages are joined with no separator, so the last line of one
# page runs straight into the first line of the next — confirm that this is
# acceptable for the downstream chunking.
page_texts = []
for page in pdfreader.pages:
    content = page.extract_text()
    if content:
        page_texts.append(content)
raw_text = ''.join(page_texts)

In [21]:
# Preview only the beginning of the extracted text; echoing the whole
# document would flood the cell output and bloat the saved notebook.
raw_text[:1000]

"SPRING BUDGET 2023\nHC 1183 March 2023SPRING BUDGET 2023\nReturn to an order of the House of Commons \ndated 15 March 2023 \nCopy of the Budget Report – March 2023 as\xa0Laid before the House of Commons by the Chancellor of the Exchequer when opening the Budget.\nVictoria Atkins \nHis Majesty’s Treasury 15 March 2023\nOrdered by the House of Commons to be \nprinted 15 March 2023\nHC 1183© Crown copyright 2023\nThis publication is licensed under the terms of the Open Government \nLicence v3.0 except where otherwise stated. To view this licence, visit \nnationalarchives.gov.uk/doc/open-government-licence/version/3\nWhere we have identified any third party copyright information you \nwill need to obtain permission from the copyright holders concerned.\nThis publication is available at www.gov.uk/official-documents\nAny enquiries regarding this publication should be sent to us at \npublic.enquiries@hmtreasury.gov.uk\nISBN 978-1-5286-3950-7\nE02872049 03/23\nPrinted on paper containing 40%

In [22]:
# Initialize the Astra DB connection through CassIO using the credentials
# loaded from the environment.
cassio.init(
    token=ASTRA_DB_APPLICATION_TOKEN,
    database_id=ASTRA_DB_ID,
)

In [23]:
# Both OpenAI-backed components are configured with the same API key:
# `llm` is the model handed to the index when answering questions, and
# `embedding` is the embedding model handed to the vector store.
openai_settings = {"openai_api_key": OPENAI_API_KEY}
llm = OpenAI(**openai_settings)
embedding = OpenAIEmbeddings(**openai_settings)

In [24]:
# Vector store over the "qa_mini_demo" table, configured with the OpenAI
# embedding model created earlier.
# session and keyspace are passed as None explicitly — presumably so the store
# falls back to the connection set up by cassio.init(); confirm against the
# LangChain Cassandra documentation.
astra_vector_store = Cassandra(
    table_name="qa_mini_demo",
    embedding=embedding,
    keyspace=None,
    session=None,
)

In [25]:
from langchain.text_splitter import CharacterTextSplitter
# Split the raw text into overlapping chunks so that no single piece grows
# large enough to exceed the model's token limit.
# Lengths are measured with len(), i.e. in characters.
CHUNK_SEPARATOR = "\n"
CHUNK_SIZE = 800
CHUNK_OVERLAP = 200

text_splitter = CharacterTextSplitter(
    separator=CHUNK_SEPARATOR,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [26]:
# Spot-check the first few chunks; printing fifty of them buries the rest of
# the notebook under tens of thousands of characters of output.
texts[:5]

['SPRING BUDGET 2023\nHC 1183 March 2023SPRING BUDGET 2023\nReturn to an order of the House of Commons \ndated 15 March 2023 \nCopy of the Budget Report – March 2023 as\xa0Laid before the House of Commons by the Chancellor of the Exchequer when opening the Budget.\nVictoria Atkins \nHis Majesty’s Treasury 15 March 2023\nOrdered by the House of Commons to be \nprinted 15 March 2023\nHC 1183© Crown copyright 2023\nThis publication is licensed under the terms of the Open Government \nLicence v3.0 except where otherwise stated. To view this licence, visit \nnationalarchives.gov.uk/doc/open-government-licence/version/3\nWhere we have identified any third party copyright information you \nwill need to obtain permission from the copyright holders concerned.',
 'nationalarchives.gov.uk/doc/open-government-licence/version/3\nWhere we have identified any third party copyright information you \nwill need to obtain permission from the copyright holders concerned.\nThis publication is available at 

In [27]:

# Add every chunk of the PDF text to the vector store.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts the same chunks a second time — confirm and clear the table if needed.
astra_vector_store.add_texts(texts)

# `texts` holds chunks of the PDF (not headlines), so report them as such.
print("Inserted %i text chunks." % len(texts))

# Wrap the store in an index object; its query() method is what answers
# questions against the stored chunks.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 444 headlines.


In [28]:
# Interactive question-answering loop.
# Each round reads a question, asks the index for an answer using `llm`, and
# then lists the four stored chunks the vector store scores as closest to the
# question. Typing 'quit' (any letter case) ends the loop; blank input
# re-prompts.
first_question = True
while True:
    # The wording of the prompt changes once the first question has been asked.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    try:
        query_text = input(prompt).strip()
    except (EOFError, KeyboardInterrupt):
        # Stdin was closed or the kernel was interrupted while waiting at the
        # prompt: end the session quietly instead of leaving a traceback.
        break

    if query_text.lower() == "quit":
        break
    if not query_text:
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    hits = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for doc, score in hits:
        # Chunks contain line breaks; flatten them so that every hit stays on
        # a single, aligned line of the listing.
        snippet = doc.page_content[:84].replace("\n", " ")
        print("    [%0.4f] \"%s ...\"" % (score, snippet))


QUESTION: "what is current gdp"
ANSWER: "According to the given context, the current GDP (as of February 2023) is 0.7% lower than the post-pandemic monthly peak reached in May 2022."

FIRST DOCUMENTS BY RELEVANCE:
    [0.9006] "quarter, before growth returns and strengthens over the rest of the forecast period. ..."
    [0.9005] "Statistics, February 2021.
b The Atkinson review: final report. Measurement of gover ..."
    [0.8995] "it better reflects changes in the actual services delivered rather than simply 
chan ..."
    [0.8993] "stock of inward foreign direct investment than any country other than the US.17 
In  ..."
