In [1]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

from datasets import load_dataset

import cassio


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# PyPDF2 provides PdfReader, which is used below to read the PDF's text content
from PyPDF2 import PdfReader

### Setup

In [3]:
# Secrets are never written into the notebook: each one is read from an
# environment variable, and only if that variable is unset (or empty) is the
# user prompted for it with hidden input.
import os
from getpass import getpass

ASTRA_DB_APPLICATION_TOKEN = os.environ.get("ASTRA_DB_APPLICATION_TOKEN") or getpass(
    "Astra DB application token: "
)
ASTRA_DB_ID = os.environ.get("ASTRA_DB_ID") or getpass("Astra DB database ID: ")

OpenAI_API_Key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [4]:
# Loading pdf file
# The location is kept in one named constant and can be overridden with the
# PDF_PATH environment variable, so the notebook is not tied to one machine.
# The default is the path this notebook originally used.
import os

PDF_PATH = os.environ.get(
    "PDF_PATH",
    '/Users/sparshnagpal/Desktop/projects/pdfquery/book.pdf',
)
pdfreader = PdfReader(PDF_PATH)

In [5]:
# NOTE(review): Concatenate is not used in this cell and looks like an
# accidental auto-import; delete it once confirmed nothing else relies on it.
from typing_extensions import Concatenate

# read text from pdf
# Pages are extracted one by one; pages for which extract_text() returns an
# empty/None value are skipped. The pieces are joined once at the end, with no
# separator between pages, instead of growing a string with repeated `+=`.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(content for content in page_texts if content)

In [6]:
# Show only the beginning of the extracted text as a sanity check; rendering
# the full document as cell output bloats the notebook and buries the narrative.
raw_text[:1000]

'VISUALIZATION\n1Interactive Dynamics for \nVisual Analysis\nA taxonomy of tools that support the fluent and flexible use of visualizations\nJeffrey Heer,  Stanford University \nBen Shneiderman, University of Maryland, College Park\nThe increasing scale and availability of digital data provides an extraordinary resource for informing \npublic policy, scientific discovery, business strategy, and even our personal lives. To get the most out of such data, however, users must be able to make sense of it: to pursue questions, uncover patterns of interest, and identify (and potentially correct) errors. In concert with data-management systems and statistical algorithms, analysis requires contextualized human judgments regarding the domain-specific significance of the clusters, trends, and outliers discovered in data. \nVisualization provides a powerful means of making sense of data. By mapping data attributes to \nvisual properties such as position, size, shape, and color, visualization desig

In [8]:
# Initialise cassio with the Astra DB credentials (the vector database used
# by the cells below).
cassio.init(
    database_id=ASTRA_DB_ID,
    token=ASTRA_DB_APPLICATION_TOKEN,
)

Create the LangChain LLM and embedding objects that the later cells use.

In [9]:
# Both OpenAI clients are built with the same API key:
# `llm` is passed to the index query in the question loop, and
# `embedding` is handed to the Cassandra vector store.
llm = OpenAI(
    openai_api_key=OpenAI_API_Key,
)
embedding = OpenAIEmbeddings(
    openai_api_key=OpenAI_API_Key,
)

  warn_deprecated(
  warn_deprecated(


In [13]:
# Vector store over the "qa_mini_demo" table, using `embedding` to embed text.
# session and keyspace are passed explicitly as None.
# NOTE(review): presumably this makes the store reuse the connection set up
# by cassio.init() -- confirm against the installed langchain version.
astra_vector_store = Cassandra(
    table_name="qa_mini_demo",
    embedding=embedding,
    keyspace=None,
    session=None,
)

In [10]:
from langchain.text_splitter import CharacterTextSplitter

# Cut the extracted text into overlapping pieces so each piece stays small.
# Splitting happens on newlines, sizes are measured in characters (len), the
# target size is 800 with a 200-character overlap between neighbours.
# A piece can still come out larger than the target: the recorded run logged
# a chunk of size 1091.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)

texts = text_splitter.split_text(raw_text)

Created a chunk of size 1091, which is longer than the specified 800


In [11]:
# Peek at the first few chunks only; rendering 50 chunks floods the cell
# output without adding information.
texts[:3]

['VISUALIZATION\n1Interactive Dynamics for \nVisual Analysis\nA taxonomy of tools that support the fluent and flexible use of visualizations\nJeffrey Heer,  Stanford University \nBen Shneiderman, University of Maryland, College Park\nThe increasing scale and availability of digital data provides an extraordinary resource for informing',
 'Jeffrey Heer,  Stanford University \nBen Shneiderman, University of Maryland, College Park\nThe increasing scale and availability of digital data provides an extraordinary resource for informing \npublic policy, scientific discovery, business strategy, and even our personal lives. To get the most out of such data, however, users must be able to make sense of it: to pursue questions, uncover patterns of interest, and identify (and potentially correct) errors. In concert with data-management systems and statistical algorithms, analysis requires contextualized human judgments regarding the domain-specific significance of the clusters, trends, and outlier

### Load the dataset into the vector store

In [14]:
import hashlib

# Only the first 50 chunks are loaded into the vector store.
chunks_to_index = texts[:50]

# Use a deterministic id per chunk (hash of its text) so that re-running this
# cell overwrites the same rows instead of inserting the same chunks again
# under fresh ids. The recorded search output lists the same passage twice
# with an identical score, which is what such duplicates look like.
# NOTE(review): rows duplicated by earlier runs are not removed by this change;
# clear the table once if they are present.
chunk_ids = [
    hashlib.sha256(chunk.encode("utf-8")).hexdigest() for chunk in chunks_to_index
]
inserted_ids = astra_vector_store.add_texts(chunks_to_index, ids=chunk_ids)
# These are PDF text chunks, not headlines, and the count reported is the
# number of ids the store returned.
print("Inserted %i text chunks" % len(inserted_ids))

# Wrap the store so it can be queried together with an LLM.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 50 headlines


In [15]:
# Interactive question/answer loop.
# Keeps prompting until the user types "quit" (any letter case). Blank input
# is ignored and the prompt is shown again. For every question it prints the
# answer from the index, then the top 4 similarity hits with their scores and
# the first 84 characters of each hit.
first_question = True
while True:
    # The wording changes once the first non-empty question has been asked.
    prompt = (
        '\nEnter your question (or type "quit" to exit): '
        if first_question
        else '\nWhats your next  question (or type "quit" to exit): '
    )
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break
    if not query_text:
        continue
    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    hits = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for doc, score in hits:
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))


QUESTION: "what is this book about?"
ANSWER: "This book is about visual analysis tools and techniques, including a taxonomy of interactive dynamics and examples of systems that exhibit those dynamics. It also covers topics such as data and view specification, filtering, sorting, deriving values or models from data, and navigation and coordination of views for exploration."

    [0.8659] "Our focus on interactive elements presumes a basic familiarity with visualization de ..."
    [0.8659] "Our focus on interactive elements presumes a basic familiarity with visualization de ..."
    [0.8651] "The goal of this article is to assist designers, researchers, professional analysts, ..."
    [0.8650] "E R U G I F 
  
E R U G I F 
  
E R U G I F 
  
E R U G I F 
  E R U G I F 
  
  
   ..."

QUESTION: "what does this book talk about?"
ANSWER: "The book discusses a taxonomy of interactive dynamics for successful analytic dialogues in visual analysis tools. It also mentions the importance of fam