## Install the required dependencies:

In [None]:
!pip install -q cassio datasets langchain openai tiktoken

## Importing the packages

In [None]:
# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper        # Wrap all vectors in one specific package
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings       # Responsible to convert text to vectors

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

# Integrates the Cassandra DB with LangChain and also helps to initialise the DB connection
import cassio

In [None]:
!pip install PyPDF2

In [None]:
from PyPDF2 import PdfReader

## Setup
API integrations

In [None]:
import os

# Credentials are read from environment variables of the same name so that real
# secrets never have to be saved inside the notebook. When a variable is not
# set, the original placeholder is used and must be replaced by hand before the
# cells below are run.
ASTRA_DB_APPLICATION_TOKEN = os.environ.get("ASTRA_DB_APPLICATION_TOKEN", "Use your own")  # Application Token

ASTRA_DB_ID = os.environ.get("ASTRA_DB_ID", "Use your own")  # Database ID

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "Use your own")  # OpenAI key

In [None]:
# Open the PDF to be indexed; replace the path with the PDF file you want to query.
pdfreader = PdfReader('budget_speech.pdf')

In [None]:
from typing_extensions import Concatenate  # NOTE(review): unused in the visible cells; kept in case a later cell relies on it

# Read the text of every page of the PDF and concatenate it into one string.
# Pages whose extract_text() result is falsy (e.g. an empty string) are skipped.
# The pieces are joined once instead of growing the string with repeated `+=`.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(text for text in page_texts if text)

In [None]:
# Display the full extracted text as the cell output.
raw_text

## Initialising the connection to the database (any warnings shown after running this cell can be ignored)

In [None]:
# Initialise cassio with the Astra DB application token and database ID defined above.
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

#### Create the LangChain embedding and LLM objects

In [None]:
# llm: the OpenAI model later passed to the index when answering questions.
# embedding: the OpenAI embedding object that converts text to vectors for the vector store.
# Both are configured with the same OpenAI API key.
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

#### Create your LangChain vector store

In [None]:
# Vector store that keeps the embedded text in the "qa_mini_demo" table.
# NOTE(review): session and keyspace are passed as None -- presumably the
# connection configured by cassio.init() is then used as the default; confirm
# against the LangChain / cassio documentation.
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None,
)

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Split the extracted text on newlines into overlapping chunks so that each
# piece stays small enough to keep the token count down. Sizes are measured in
# characters (length_function=len): chunks target 800 characters and
# consecutive chunks overlap by 200.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [None]:
# Preview the first 50 chunks produced by the splitter.
texts[:50] 

## Load the dataset into the vector store

In [None]:
# Only the first 50 chunks are inserted. The slice is taken once so the data
# that is inserted and the count that is reported cannot drift apart.
chunks_to_insert = texts[:50]
astra_vector_store.add_texts(chunks_to_insert)

# The inserted items are chunks of PDF text, not headlines.
print(f"Inserted {len(chunks_to_insert)} text chunks.")

# Wrap the vector store so it can be queried together with an LLM.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

## Run the QA cycle
Simply run the cells and ask a question, or type `quit` to stop. (You can also stop execution with the "▪" button on the top toolbar.)

Here are some suggested questions with respect to the pdf file we used:

- What is the current GDP?
- How much will the agriculture target be increased to, and what will the focus be?

In [None]:
# Interactive question/answer loop: keeps prompting until the user types
# 'quit' (case-insensitive). Blank input is ignored and the prompt is repeated.
first_question = True
while True:
    # The wording of the prompt changes once the first question has been asked.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if not query_text:
        continue
    if query_text.lower() == "quit":
        break

    first_question = False

    # Answer the question with the LLM through the vector index.
    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    # Show the four best-matching chunks with their scores, truncated to 84 characters.
    print("FIRST DOCUMENTS BY RELEVANCE:")
    ranked_hits = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for doc, score in ranked_hits:
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))