In [2]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

from datasets import load_dataset

import cassio

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from PyPDF2 import PdfReader

In [4]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Validate every credential up front so a missing one produces a single,
# readable error instead of an opaque TypeError from assigning None into
# os.environ.
_REQUIRED_ENV_VARS = ("OPENAI_API_KEY", "ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_ID")
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): %s. "
        "Set them in the shell or in a .env file next to this notebook."
        % ", ".join(_missing_env_vars)
    )

# The values are already present in os.environ (that is where they are read
# from), so there is no need to write them back.
openai_key = os.environ["OPENAI_API_KEY"]
astra_token = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
astra_db_id = os.environ["ASTRA_DB_ID"]

In [5]:
# Open the Ringworld guide PDF. The path is relative, so the file is expected
# to be in the directory the notebook is run from.
ringworld_pdf_path = "The Guide to Larry Niven's Ringworld.pdf"
pdfreader = PdfReader(ringworld_pdf_path)

In [6]:
from typing_extensions import Concatenate
# Read the PDF: extract the text of every page and join it into one string.
# Pages whose extract_text() result is falsy (None or "") are skipped.
# Page texts are joined with no separator between them.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = "".join(text for text in page_texts if text)

### Connecting to the database

In [8]:
# Initialise cassio with the Astra DB id and application token that were
# read from the environment.
cassio.init(
    database_id=astra_db_id,
    token=astra_token,
)

### Creating the OpenAI chat model and embeddings

In [9]:
# Chat model passed to the index when answering questions.
llm = ChatOpenAI(
    temperature=0.7,
    model="gpt-3.5-turbo",
)

# Embedding model handed to the vector store; constructed with its defaults.
embeddings = OpenAIEmbeddings()

In [10]:
# Vector store over the "qa_mini" table, using the embedding model above.
# session and keyspace are explicitly None -- presumably so the connection
# configured through cassio.init() is picked up; confirm against the
# LangChain Cassandra documentation.
astra_vector_store = Cassandra(
    table_name="qa_mini",
    embedding=embeddings,
    keyspace=None,
    session=None,
)

In [11]:
from langchain.text_splitter import CharacterTextSplitter

# Splitter settings: break on newlines, target chunks of 800 characters
# (length measured with len) and carry 200 characters of overlap between
# consecutive chunks.
splitter_options = {
    "separator": "\n",
    "chunk_size": 800,
    "chunk_overlap": 200,
    "length_function": len,
}
text_splitter = CharacterTextSplitter(**splitter_options)

# Chunk the full PDF text; `texts` is what gets pushed to the vector store.
texts = text_splitter.split_text(raw_text)

## Converting the texts into embeddings by pushing them into the vector store

In [13]:
# Only the chunks at positions 100-199 are pushed into the vector store.
# The slice is taken once so the inserted batch and the reported count
# cannot drift apart.
chunk_batch = texts[100:200]
astra_vector_store.add_texts(chunk_batch)

# NOTE(review): the message says "headlines" although the items are PDF text
# chunks; the wording is left unchanged.
print("Inserted %i headlines." % len(chunk_batch))

# Wrap the store so it can be queried together with an LLM.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 100 headlines.


### Q&A cycle

In [14]:
# Interactive question/answer loop.
# Keeps prompting until the user types 'quit' (case-insensitive). Blank input
# is ignored and the same prompt is shown again.
first_question = True
while True:
    # The first prompt differs from the follow-up prompt.
    prompt = (
        "\nEnter your question about Ringworld (or type 'quit' to exit): "
        if first_question
        else "\nAsk another question to know more (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if not query_text:
        continue
    if query_text.lower() == 'quit':
        break

    # A real question was asked, so later prompts use the follow-up wording.
    first_question = False

    # Answer the question through the vector index and the chat model.
    print("\nQuestion: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    # Show the four best-matching stored chunks with their scores, each
    # truncated to its first 100 characters.
    print("First Documents by Relevance:")
    for document, relevance in astra_vector_store.similarity_search_with_score(query_text, k=4):
        print(" [%0.4f] \"%s......\"" %(relevance, document.page_content[:100]))



Question: "What is Ringworld?"
ANSWER: "Ringworld is a science fiction novel by Larry Niven set in Known Space, a universe in which humanity shares the galaxy with other alien species. In the story, a group of beings embark on a manned expedition to the Ringworld, a massive artificial ring-shaped structure surrounding a star. The Ringworld itself was abandoned and reduced to barbarism by a plague, but the expedition is sent to explore it for potential treasures and discoveries. The novel explores themes of exploration, alien civilizations, and the mysteries of the Ringworld itself."

First Documents by Relevance:
 [0.9249] "Ringworld was abandoned. The Experimentalists regained power 
under the threat of Kzinti expansion a......"
 [0.9240] "ominous crimson sphere, 200 meters in diameter with ugly traces of 
black and silver. The captain's ......"
 [0.9152] "The Guide to Lurry Niven's Ringworld Major Races of Known Space y//\ 
Trj gm |K /Jft ® V/ -<42m4 
fc......"
 [0.9098] "from the g