## Importing Packages

In [9]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain_core.output_parsers import JsonOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import LLMChain
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain_core.pydantic_v1 import BaseModel, Field
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import List, Dict, Optional
from PyPDF2 import PdfReader
import cassio
import json
import os
import docx

In [3]:
def extract_text_from_pdfs(directory_path):
    pdf_files = os.listdir(directory_path)
    all_text = ""

    for pdf_file in pdf_files:
        if pdf_file.endswith(".pdf"):
            pdf_path = os.path.join(directory_path, pdf_file)
            
            with open(pdf_path, "rb") as f:
                pdf_reader = PdfReader(f)
                
                for page in pdf_reader.pages:
                    content = page.extract_text()
                    if content:
                        all_text += content + "\n\n"  # Add newlines between pages

    return all_text

# Usage
directory_path = "../SOR_HEADLAMP/Regulation_standards"
raw_text = extract_text_from_pdfs(directory_path)
print(f"Total extracted text length: {len(raw_text)} characters")

Total extracted text length: 1928564 characters


In [4]:
load_dotenv(dotenv_path='../.env')

True

In [5]:
ASTRA_DB_APPLICATION_TOKEN = os.getenv('ASTRA_DB_APPLICATION_TOKEN')
ASTRA_DB_ID = os.getenv('ASTRA_DB_ID')
groq_api_key = os.getenv('GROQ_API_KEY')
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN')

In [6]:
# Initializing the the database, llm and embedding
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)
llm = ChatGroq(model="llama-3.1-70b-versatile",groq_api_key = groq_api_key)
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


In [7]:
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo_1024",
    session=None,
    keyspace=None,
)

In [8]:
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

In [10]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
chunks = text_splitter.split_text(raw_text)

In [11]:
astra_vector_store.add_texts(chunks)

print("Inserted %i docs." % len(chunks))

astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 2487 docs.


In [12]:
first_question = True
while True:
    if first_question:
        query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text = input("\nWhat's your next question (or type 'quit' to exit): ").strip()

    if query_text.lower() == "quit":
        break

    if query_text == "":
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))


QUESTION: "What are the COP test for fuel tanks"
ANSWER: "The COP (Conformity of Production) tests for fuel tanks vary depending on the type of tank:

For Metallic Fuel Tanks:
1. Leakage test
2. Pressure test

For Plastic (Non-Metallic) Fuel Tanks:
1. Overturn Test
2. Mechanical Strength
3. Resistance to High Temperature Test"

FIRST DOCUMENTS BY RELEVANCE:
    [0.8696] "10/12 
  
Sl. No Component Applicable 
standard COP Tests COP 
Frequency 
18 Fuel ta ..."
    [0.7811] "Frequency 
 18 Fuel tanks IS :14681- 1999 For Metallic Fuel Tank   
i. Leakage test  ..."
    [0.7452] "In Column 4 “CoP tests” substitute following text f or existing text 
 
 
For Metall ..."
    [0.7432] "7/12 
 Annex B 
(See 11.1) 
LIST OF COP TESTS AND COP FREQ UENCY FOR SAFETY COMPONEN ..."
