#  Install and import required libraries

In [22]:
# Install PyPDF2 for reading PDF files (run once; %pip installs into this kernel's environment)
# %pip install PyPDF2

# Import necessary libraries for LangChain and PDF reading
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

from PyPDF2 import PdfReader
import cassio
import os
from dotenv import load_dotenv

# Load API keys and connect to AstraDB

In [None]:
# Load variables from a .env file (if one is found) into the process environment
load_dotenv()

# Credentials are read from the environment rather than hard-coded in the notebook:
#   OPENAI_API_KEY             - OpenAI API key
#   ASTRA_DB_APPLICATION_TOKEN - AstraDB application token
#   ASTRA_DB_ID                - AstraDB database ID
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")

# os.getenv returns None for unset variables; fail fast with a readable message
# instead of handing None (or an empty string) to cassio.init below.
_missing_settings = [
    name
    for name, value in (
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("ASTRA_DB_APPLICATION_TOKEN", ASTRA_DB_APPLICATION_TOKEN),
        ("ASTRA_DB_ID", ASTRA_DB_ID),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Set them in your shell or in a .env file."
    )

# Initialize connection to AstraDB
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)


# Read text from the PDF file

In [None]:
# Load the PDF file
pdf_reader = PdfReader("budget_speech.pdf")

# Collect the text of every page; pages for which extract_text() returns
# nothing (None or an empty string) are skipped.
page_texts = []
for page in pdf_reader.pages:
    content = page.extract_text()
    if content:
        page_texts.append(content)

# Join the pages with a newline so the last line of one page is not fused with
# the first line of the next one (the text splitter below breaks on "\n").
raw_text = "\n".join(page_texts)


# Split the text into smaller chunks for easier processing

In [None]:
# Chunking settings: break on line boundaries, target chunks of about 800
# characters (measured with len), and overlap neighbouring chunks by 200
# characters so context carries over from one chunk to the next.
splitter_settings = {
    "separator": "\n",
    "chunk_size": 800,
    "chunk_overlap": 200,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_settings)

# Cut the extracted text into chunks
texts = text_splitter.split_text(raw_text)

# Optional sanity check: show a list holding the first chunk (empty if there are none)
first_chunk_preview = texts[:1]
print(first_chunk_preview)


#  Set up OpenAI LLM and embedding models

In [None]:
# Both OpenAI clients are configured with the same API key
openai_auth = {"openai_api_key": OPENAI_API_KEY}

# Language model (passed to the index query later on)
llm = OpenAI(**openai_auth)
# Embedding model (passed to the vector store later on)
embeddings = OpenAIEmbeddings(**openai_auth)

# Create a vector store backed by AstraDB and insert text chunks

In [None]:
# Number of leading chunks to store in the vector store
MAX_CHUNKS_TO_INSERT = 10

# Create the Cassandra (AstraDB) vector store with embeddings.
# NOTE(review): session and keyspace are left as None, presumably so the
# connection configured by cassio.init() is used — confirm against the cassio docs.
astra_vector_store = Cassandra(
    embedding=embeddings,
    table_name="demo_table",
    session=None,
    keyspace=None
)

# Take the first chunks once so the insert and the message below agree
chunks_to_insert = texts[:MAX_CHUNKS_TO_INSERT]

# Give every chunk a stable, position-based ID. Without explicit IDs each run of
# this cell stores the same chunks again as new rows, and the duplicates skew
# similarity-search results; with stable IDs a re-run overwrites the same rows.
chunk_ids = [
    f"budget_speech-chunk-{position}" for position in range(len(chunks_to_insert))
]
astra_vector_store.add_texts(chunks_to_insert, ids=chunk_ids)

print(f"Successfully inserted first {len(chunks_to_insert)} chunks of text.")


#  Create an index and query the stored text

In [None]:
# Put the vector store behind LangChain's index wrapper, which exposes query()
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

# Ask a question about the stored text; the LLM is passed in to produce the answer
speech_question = "what are the contents of the speech"
response = astra_vector_index.query(
    question=speech_question,
    llm=llm,
)

print(response)


# (Optional) Perform similarity search directly on the vector store

In [None]:
# Find the single most similar stored chunk (k=1) to the query
results = astra_vector_store.similarity_search(query="what are the contents of the speech", k=1)

# Print the best matching chunk. The search can return an empty list (for
# example when nothing has been inserted yet), so guard before indexing.
if results:
    print(results[0].page_content)
else:
    print("No matching chunks found in the vector store.")
