In [12]:
%pip install cassio datasets langchain astrapy langchain-openai tiktoken cassandra-driver PyPDF2 langchain-community --quiet --user

Note: you may need to restart the kernel to use updated packages.


In [17]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_openai import OpenAI,OpenAIEmbeddings
from datasets import load_dataset
import cassio
from typing_extensions import Concatenate
import os
from dotenv import load_dotenv
load_dotenv()

ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY","")

In [21]:
from PyPDF2 import PdfReader


pdfReader = PdfReader("sample.pdf")
raw_text = ""

for i,page in enumerate(pdfReader.pages):
    content = page.extract_text()
    if content:
        raw_text += content
    


In [None]:
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

In [18]:
llm= OpenAI(api_key=OPENAI_API_KEY) # type: ignore
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY) # type: ignore

In [19]:
astra_vector_store = Cassandra(
    embedding= embeddings,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None
)

In [22]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

texts = text_splitter.split_text(raw_text)

In [26]:
(texts[:50])

['top-500-movies\nPage 1rankrelease_datetitle\n12019-04-23Avengers: Endgame\n22011-05-20Pirates of the Caribbean: On Stranger Tides\n32015-04-22Avengers: Age of Ultron\n42015-12-16Star Wars Ep. VII: The Force Awakens\n52018-04-25Avengers: Infinity War\n62007-05-24Pirates of the Caribbean: At World’s End\n72017-11-13Justice League\n82015-10-06Spectre\n92023-07-11Mission: Impossible Dead Reckoning Part One\n102019-12-18Star Wars: The Rise of Skywalker\n112018-05-23Solo: A Star Wars Story\n122012-03-07John Carter\n132016-03-23Batman v Superman: Dawn of Justice\n142017-12-13Star Wars Ep. VIII: The Last Jedi\n152019-07-11The Lion King\n162010-11-24Tangled\n172007-05-04Spider-Man 3\n182016-04-22Captain America: Civil War\n192022-07-01Thor: Love and Thunder\n202009-07-15Harry Potter and the Half-Blood Prince\n212013-12-12The Hobbit: The Desolation of Smaug\n222014-12-10The Hobbit: The Battle of the Five Armies\n232017-04-07The Fate of the Furious\n242021-09-29No Time to Die\n252009-12-17Avata

In [27]:
astra_vector_store.add_texts(texts[:50])
print("Texts added to vector store")
astra_vector_index = VectorStoreIndexWrapper(
    vectorstore=astra_vector_store,
)

Texts added to vector store


In [None]:
first_question = True
question = ""
while True:
    if first_question:
        question = input("Enter your question: or type 'exit' to quit: ")
        first_question = False
    else:
        question = input("What is your next question: or type 'exit' to quit: ")
    
    if question.lower() == "exit":
        break

    if question == "":
        print("Please enter a question")
        continue
    
    print("Question: ", question)
    answer = astra_vector_index.query(question,llm=llm).strip()
    print("Answer: ", answer)




 Question:  what is 4 rank movie
Answer:  Star Wars Ep. VII: The Force Awakens
